Tune Histogram on H100 #266

Merged
merged 4 commits on Jul 26, 2023
Changes from 3 commits
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/histogram/even.cu
@@ -28,12 +28,14 @@
#include "histogram_common.cuh"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_ITEMS ipt 4:28:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_RLE_COMPRESS rle 0:1:1
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -138,5 +140,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
33 changes: 19 additions & 14 deletions cub/benchmarks/bench/histogram/histogram_common.cuh
@@ -39,6 +39,8 @@
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

#define TUNE_VEC_SIZE (1 << TUNE_VEC_SIZE_POW)

#if TUNE_MEM_PREFERENCE == 0
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::GMEM;
#elif TUNE_MEM_PREFERENCE == 1
@@ -47,29 +49,32 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::SMEM;
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
#endif // TUNE_MEM_PREFERENCE

#if TUNE_LOAD_ALGORITHM_ID == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#elif TUNE_LOAD_ALGORITHM_ID == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#else
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_STRIPED
#endif // TUNE_LOAD_ALGORITHM_ID

template <typename SampleT, int NUM_ACTIVE_CHANNELS>
template <typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS>
struct policy_hub_t
{
template <int NOMINAL_ITEMS_PER_THREAD>
struct TScale
{
enum
{
V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
};
};

struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
{
static constexpr cub::BlockLoadAlgorithm load_algorithm =
(TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED)
? (NUM_CHANNELS == 1 ? cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT)
: TUNE_LOAD_ALGORITHM;

using AgentHistogramPolicyT = cub::AgentHistogramPolicy<TUNE_THREADS,
TScale<TUNE_ITEMS>::VALUE,
cub::BLOCK_LOAD_DIRECT,
TUNE_ITEMS,
load_algorithm,
TUNE_LOAD_MODIFIER,
TUNE_RLE_COMPRESS,
MEM_PREFERENCE,
TUNE_WORK_STEALING>;
TUNE_WORK_STEALING,
TUNE_VEC_SIZE>;
};

using MaxPolicy = policy_t;
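
Relative to the old policy, TUNE_ITEMS now feeds AgentHistogramPolicy directly instead of being routed through TScale<TUNE_ITEMS>::VALUE, and the load algorithm comes from the new TUNE_LOAD_ALGORITHM axis, with BLOCK_LOAD_STRIPED honored only in the single-channel case (multi-channel histograms fall back to BLOCK_LOAD_DIRECT). For reference, TScale normalized the nominal count by sample size and active-channel count; a stand-alone check of that arithmetic, using a hypothetical helper and assuming C++14 for constexpr std::max:

#include <algorithm>
#include <cstdint>

template <typename SampleT, int NUM_ACTIVE_CHANNELS, int NOMINAL_ITEMS_PER_THREAD>
constexpr int tscale_value()
{
  // Round the sample size up to whole ints, then divide the nominal count by the
  // channel count and that scale, clamping at one pixel per thread.
  constexpr int v_scale = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
  return std::max(NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / v_scale, 1);
}

static_assert(tscale_value<std::uint8_t, 1, 12>() == 12, "1-byte samples keep the nominal count");
static_assert(tscale_value<double, 1, 12>() == 6, "8-byte samples halve it");
static_assert(tscale_value<float, 4, 12>() == 3, "four active channels quarter it");
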
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -34,6 +34,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -148,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -36,6 +36,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -46,7 +48,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -155,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/range.cu
@@ -35,6 +35,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -45,7 +47,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -143,5 +145,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
90 changes: 62 additions & 28 deletions cub/cub/agent/agent_histogram.cuh
@@ -69,7 +69,8 @@ template <
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
int _VEC_SIZE = 4> ///< Number of samples per vectorized load
struct AgentHistogramPolicy
{
enum
@@ -81,6 +82,8 @@ struct AgentHistogramPolicy
IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
};

static constexpr int VEC_SIZE = _VEC_SIZE;

static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
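
AgentHistogramPolicy now carries the vector width as a template parameter; the default of 4 reproduces the old behavior, where aligned single-channel tiles were always loaded as quads. The agent below derives the load type from it through CubVector, so the per-load transaction width scales with the tuned size. A small stand-alone illustration, assuming a CUDA toolchain and cub/util_type.cuh (where CubVector lives):

#include <cub/util_type.cuh>

// The sample type bundled into vectors of the tuned width. With float samples,
// VEC_SIZE == 4 yields float4, i.e. the previous 16-byte quad loads.
using Vec1 = cub::CubVector<float, 1>::Type;
using Vec2 = cub::CubVector<float, 2>::Type;
using Vec4 = cub::CubVector<float, 4>::Type;

static_assert(sizeof(Vec2) == 2 * sizeof(float), "vector width scales the per-load byte count");
static_assert(sizeof(Vec4) == 4 * sizeof(float), "VEC_SIZE == 4 matches the old quad path");

Note that SAMPLES_PER_THREAD must be a whole multiple of the chosen width; the aligned code path checks SAMPLES_PER_THREAD % VecSize == 0 before vectorizing.
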
@@ -116,8 +119,9 @@ struct AgentHistogram
/// The pixel type of SampleT
using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type;

/// The quad type of SampleT
using QuadT = typename CubVector<SampleT, 4>::Type;
/// The vec type of SampleT
static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE;
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
@@ -126,7 +130,7 @@

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
@@ -157,8 +161,8 @@
WrappedPixelIteratorT;

/// Vec input iterator type (for applying cache modifier)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
WrappedQuadIteratorT;
typedef CacheModifiedInputIterator<LOAD_MODIFIER, VecT, OffsetT>
WrappedVecsIteratorT;

/// Parameterized BlockLoad type for samples
typedef BlockLoad<
@@ -176,13 +180,13 @@
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadPixelT;

/// Parameterized BlockLoad type for quads
/// Parameterized BlockLoad type for vecs
typedef BlockLoad<
QuadT,
VecT,
BLOCK_THREADS,
QUADS_PER_THREAD,
VECS_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadQuadT;
BlockLoadVecT;

/// Shared memory type required by this thread block
struct _TempStorage
@@ -196,7 +200,7 @@
{
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples
typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels
typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads
typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs

} aliasable;
};
Expand Down Expand Up @@ -453,21 +457,21 @@ struct AgentHistogram
reinterpret_cast<AliasedPixels&>(samples));
}

// Load full, aligned tile using quad iterator (single-channel)
// Load full, aligned tile using vec iterator (single-channel)
__device__ __forceinline__ void LoadFullAlignedTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<1> num_active_channels)
{
typedef QuadT AliasedQuads[QUADS_PER_THREAD];
typedef VecT AliasedVecs[VECS_PER_THREAD];

WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset));

// Load using a wrapped quad iterator
BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
d_wrapped_quads,
reinterpret_cast<AliasedQuads&>(samples));
// Load using a wrapped vec iterator
BlockLoadVecT(temp_storage.aliasable.vec_load).Load(
d_wrapped_vecs,
reinterpret_cast<AliasedVecs&>(samples));
}
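
LoadFullAlignedTile simply reinterprets the per-thread sample array as VECS_PER_THREAD vectors, which is only sound because SAMPLES_PER_THREAD is a whole multiple of the vector width. A minimal host-side illustration of the same aliasing, with assumed sizes and plain arrays instead of CUDA vector types (not part of the PR):

#include <cstdio>

int main()
{
  constexpr int SAMPLES_PER_THREAD = 8;
  constexpr int VEC_SIZE           = 4;
  constexpr int VECS_PER_THREAD    = SAMPLES_PER_THREAD / VEC_SIZE;

  float samples[SAMPLES_PER_THREAD] = {0, 1, 2, 3, 4, 5, 6, 7};

  // View the same storage as packs of VEC_SIZE samples, the way BlockLoadVecT
  // fills the register array through reinterpret_cast<AliasedVecs&>.
  using Vec = float[VEC_SIZE];
  Vec *vecs = reinterpret_cast<Vec *>(samples);

  for (int v = 0; v < VECS_PER_THREAD; ++v)
  {
    std::printf("vec %d starts at sample %.0f\n", v, vecs[v][0]);
  }
  return 0;
}
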

// Load full, aligned tile
@@ -534,14 +538,39 @@
valid_samples);
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<false> /* is_striped = false */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<true> /* is_striped = true */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}
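
The validity test used to hard-code the blocked assignment, where thread t owns a contiguous run of PIXELS_PER_THREAD pixels. Since BLOCK_LOAD_STRIPED is now a legal policy, the test has to match the load layout: under striping, thread t instead owns every BLOCK_THREADS-th pixel. A toy host-side sketch of the two mappings, with made-up sizes and not part of the PR:

#include <cstdio>

int main()
{
  constexpr int BLOCK_THREADS     = 4; // illustrative sizes only
  constexpr int PIXELS_PER_THREAD = 2;

  for (int t = 0; t < BLOCK_THREADS; ++t)
  {
    for (int p = 0; p < PIXELS_PER_THREAD; ++p)
    {
      int blocked = t * PIXELS_PER_THREAD + p; // index tested by the Int2Type<false> overload
      int striped = t + BLOCK_THREADS * p;     // index tested by the Int2Type<true> overload
      std::printf("thread %d, slot %d: blocked pixel %d, striped pixel %d\n", t, p, blocked, striped);
    }
  }
  return 0;
}
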

//---------------------------------------------------------------------
// Tile processing
//---------------------------------------------------------------------

// Consume a tile of data samples
template <
bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_FULL_TILE> // Whether the tile is full
__device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
{
@@ -557,15 +586,20 @@
Int2Type<IS_ALIGNED>());

// Set valid flags
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
MarkValid<IS_FULL_TILE>(
is_valid,
valid_samples,
Int2Type<AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED>{});

// Accumulate samples
if (prefer_smem)
{
AccumulateSmemPixels(samples, is_valid);
}
else
{
AccumulateGmemPixels(samples, is_valid);
}
}


@@ -725,21 +759,21 @@
int tiles_per_row, ///< Number of image tiles per row
GridQueue<int> tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks
{
// Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
int quad_mask = AlignBytes<QuadT>::ALIGN_BYTES - 1;
// Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel)
int vec_mask = AlignBytes<VecT>::ALIGN_BYTES - 1;
int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
size_t row_bytes = sizeof(SampleT) * row_stride_samples;

bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel
((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned
((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad
bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel
((size_t(d_native_samples) & vec_mask) == 0) && // ptr is vec-aligned
((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the vec

bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel
((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

// Whether rows are aligned and can be vectorized
if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows))
ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
else
ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());