From 5ce10a3321340fc438d46c60311f9b64d664334c Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 24 Jul 2023 16:22:24 +0000 Subject: [PATCH 1/4] Better histogram benchmarks --- cub/benchmarks/bench/histogram/even.cu | 10 +++--- .../bench/histogram/histogram_common.cuh | 33 +++++++++++-------- cub/benchmarks/bench/histogram/multi/even.cu | 8 +++-- cub/benchmarks/bench/histogram/multi/range.cu | 8 +++-- cub/benchmarks/bench/histogram/range.cu | 8 +++-- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu index 3346501371..761c609e95 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -28,12 +28,14 @@ #include "histogram_common.cuh" #include -// %RANGE% TUNE_ITEMS ipt 7:24:1 +// %RANGE% TUNE_ITEMS ipt 4:28:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 +// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 +// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state &state, nvbench::type_list) @@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram +template struct policy_hub_t { - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> { + static constexpr cub::BlockLoadAlgorithm load_algorithm = + (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED) + ? (NUM_CHANNELS == 1 ? 
cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT) + : TUNE_LOAD_ALGORITHM; + using AgentHistogramPolicyT = cub::AgentHistogramPolicy::VALUE, - cub::BLOCK_LOAD_DIRECT, + TUNE_ITEMS, + load_algorithm, TUNE_LOAD_MODIFIER, TUNE_RLE_COMPRESS, MEM_PREFERENCE, - TUNE_WORK_STEALING>; + TUNE_WORK_STEALING, + TUNE_VEC_SIZE>; }; using MaxPolicy = policy_t; diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 2e82a1a15c..3c93269e18 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -34,6 +34,8 @@ // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 +// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 +// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state &state, nvbench::type_list) @@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram static void range(nvbench::state &state, nvbench::type_list) @@ -46,7 +48,7 @@ static void range(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram static void range(nvbench::state &state, nvbench::type_list) @@ -45,7 +47,7 @@ static void range(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram Date: Mon, 24 Jul 2023 16:22:38 +0000 Subject: [PATCH 2/4] Tune histogram for SM90 --- cub/cub/agent/agent_histogram.cuh | 90 +++++--- .../device/dispatch/dispatch_histogram.cuh | 35 +--- .../dispatch/tuning/tuning_histogram.cuh | 192 ++++++++++++++++++ 3 files changed, 257 insertions(+), 60 deletions(-) create mode 100644 cub/cub/device/dispatch/tuning/tuning_histogram.cuh diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index af4c0e347b..9a44ecb3f2 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -69,7 +69,8 @@ template < CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue + bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + int _VEC_SIZE = 4> struct AgentHistogramPolicy { enum @@ -81,6 +82,8 @@ struct AgentHistogramPolicy IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue }; + static constexpr int VEC_SIZE = _VEC_SIZE; + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements }; @@ -116,8 +119,9 @@ struct AgentHistogram /// The pixel type of SampleT using PixelT = typename CubVector::Type; - /// The quad type of SampleT - using QuadT = typename CubVector::Type; + /// The vec type of SampleT + static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE; + using VecT = typename CubVector::Type; /// Constants enum @@ -126,7 +130,7 @@ struct AgentHistogram PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + 
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize, TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, @@ -157,8 +161,8 @@ struct AgentHistogram WrappedPixelIteratorT; /// Qaud input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedQuadIteratorT; + typedef CacheModifiedInputIterator + WrappedVecsIteratorT; /// Parameterized BlockLoad type for samples typedef BlockLoad< @@ -176,13 +180,13 @@ struct AgentHistogram AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT; - /// Parameterized BlockLoad type for quads + /// Parameterized BlockLoad type for vecs typedef BlockLoad< - QuadT, + VecT, BLOCK_THREADS, - QUADS_PER_THREAD, + VECS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadQuadT; + BlockLoadVecT; /// Shared memory type required by this thread block struct _TempStorage @@ -196,7 +200,7 @@ struct AgentHistogram { typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs } aliasable; }; @@ -453,21 +457,21 @@ struct AgentHistogram reinterpret_cast(samples)); } - // Load full, aligned tile using quad iterator (single-channel) + // Load full, aligned tile using vec iterator (single-channel) __device__ __forceinline__ void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<1> num_active_channels) { - typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + typedef VecT AliasedVecs[VECS_PER_THREAD]; - WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset)); - // Load using a wrapped quad iterator - BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( - d_wrapped_quads, - reinterpret_cast(samples)); + // Load using a wrapped vec iterator + BlockLoadVecT(temp_storage.aliasable.vec_load).Load( + d_wrapped_vecs, + reinterpret_cast(samples)); } // Load full, aligned tile @@ -534,6 +538,31 @@ struct AgentHistogram valid_samples); } + template + __device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], + int valid_samples, + Int2Type /* is_striped = false */) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * + NUM_CHANNELS) < valid_samples); + } + } + + template + __device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], + int valid_samples, + Int2Type /* is_striped = true */) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) * + NUM_CHANNELS) < valid_samples); + } + } //--------------------------------------------------------------------- // Tile processing @@ -541,7 +570,7 @@ struct AgentHistogram // Consume a tile of data samples template < - bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel) bool IS_FULL_TILE> // Whether the tile is full __device__ __forceinline__ 
void ConsumeTile(OffsetT block_offset, int valid_samples) { @@ -557,15 +586,20 @@ struct AgentHistogram Int2Type()); // Set valid flags - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + MarkValid( + is_valid, + valid_samples, + Int2Type{}); // Accumulate samples if (prefer_smem) + { AccumulateSmemPixels(samples, is_valid); + } else + { AccumulateGmemPixels(samples, is_valid); + } } @@ -725,21 +759,21 @@ struct AgentHistogram int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks { - // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) - int quad_mask = AlignBytes::ALIGN_BYTES - 1; + // Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel) + int vec_mask = AlignBytes::ALIGN_BYTES - 1; int pixel_mask = AlignBytes::ALIGN_BYTES - 1; size_t row_bytes = sizeof(SampleT) * row_stride_samples; - bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel - ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned - ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel + ((size_t(d_native_samples) & vec_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel // Whether rows are aligned and can be vectorized - if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows)) ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); else ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index c5fb3e105d..b393d3c824 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -242,38 +243,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK namespace detail { -template -struct device_histogram_policy_hub -{ - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using AgentHistogramPolicyT = - AgentHistogramPolicy<128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>; - }; - - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using 
AgentHistogramPolicyT = - AgentHistogramPolicy<384, TScale<16>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false>; - }; - - using MaxPolicy = Policy500; -}; - template , + CounterT, + NUM_CHANNELS, NUM_ACTIVE_CHANNELS>> struct DispatchHistogram : SelectedPolicy { diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh new file mode 100644 index 0000000000..b17c525c8e --- /dev/null +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -0,0 +1,192 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +namespace histogram +{ + +enum class primitive_sample +{ + no, + yes +}; + +enum class sample_size +{ + _1, + _2, + unknown +}; + +enum class counter_size +{ + _4, + unknown +}; + +template +constexpr primitive_sample is_primitive_sample() +{ + return Traits::PRIMITIVE ? primitive_sample::yes : primitive_sample::no; +} + +template +constexpr counter_size classify_counter_size() +{ + return sizeof(CounterT) == 4 ? counter_size::_4 : counter_size::unknown; +} + +template +constexpr sample_size classify_sample_size() +{ + return sizeof(SampleT) == 1 ? sample_size::_1 + : sizeof(SampleT) == 2 ? 
sample_size::_2
+                              : sample_size::unknown;
+}
+
+template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
+constexpr int t_scale()
+{
+  constexpr int V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
+  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / V_SCALE), 1);
+}
+
+template <class SampleT, int NumChannels, int NumActiveChannels, counter_size CounterSize, primitive_sample PrimitiveSample = is_primitive_sample<SampleT>(),
+          sample_size SampleSize = classify_sample_size<SampleT>()>
+struct sm90_tuning
+{
+  static constexpr int threads = 384;
+  static constexpr int items = t_scale<SampleT, NumActiveChannels, 16>();
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = true;
+  static constexpr bool work_stealing = false;
+};
+
+template <class SampleT, int NumChannels, int NumActiveChannels>
+struct sm90_tuning<SampleT, NumChannels, NumActiveChannels, counter_size::_4, primitive_sample::yes, sample_size::_1>
+{
+  static constexpr int threads = 768;
+  static constexpr int items = 12;
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = false;
+  static constexpr bool work_stealing = false;
+};
+
+template <class SampleT, int NumChannels, int NumActiveChannels>
+struct sm90_tuning<SampleT, NumChannels, NumActiveChannels, counter_size::_4, primitive_sample::yes, sample_size::_2>
+{
+  static constexpr int threads = 960;
+  static constexpr int items = 10;
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = true;
+  static constexpr bool work_stealing = false;
+};
+
+} // namespace histogram
+
+template <class SampleT, class CounterT, int NumChannels, int NumActiveChannels>
+struct device_histogram_policy_hub
+{
+  template <int NOMINAL_ITEMS_PER_THREAD>
+  struct TScale
+  {
+    enum
+    {
+      V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
+      VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NumActiveChannels / V_SCALE), 1)
+    };
+  };
+
+  /// SM35
+  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
+  {
+    // TODO This might be worth it to separate usual histogram and the multi one
+    using AgentHistogramPolicyT =
+      AgentHistogramPolicy<128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>;
+  };
+
+  /// SM50
+  struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+  {
+    // TODO This might be worth it to separate usual histogram and the multi one
+    using AgentHistogramPolicyT =
+      AgentHistogramPolicy<384, TScale<16>::VALUE, cub::BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false>;
+  };
+
+  /// SM900
+  struct Policy900 : ChainedPolicy<900, Policy900, Policy500>
+  {
+    using tuning = detail::histogram::sm90_tuning<SampleT, NumChannels, NumActiveChannels, histogram::classify_counter_size<CounterT>()>;
+
+    using AgentHistogramPolicyT = AgentHistogramPolicy<tuning::threads, tuning::items, tuning::load_algorithm, tuning::load_modifier, tuning::rle_compress, tuning::mem_preference, tuning::work_stealing>;
+  };
+
+  using MaxPolicy = Policy900;
+};
+
+} // namespace detail
+
+CUB_NAMESPACE_END

From be867555223aaaa5550756802d6902227afa60c5 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 25 Jul 2023 09:10:14 +0000
Subject: [PATCH 3/4] Fix histogram tuning for C++11

---
 cub/cub/device/dispatch/tuning/tuning_histogram.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
index b17c525c8e..f53ae3a27b 100644
--- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
@@ -79,11 +79,16 @@ constexpr sample_size classify_sample_size()
                               : sample_size::unknown;
 }
 
+template <class SampleT>
+constexpr int v_scale()
+{
+  return (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
+}
+
 template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
 constexpr int t_scale()
 {
-  constexpr int V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
-  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / V_SCALE), 1);
+  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()), 1);
 }
 
 template <class SampleT,

From: Georgy Evtushenko
Date: Wed, 26 Jul 2023 10:59:13 +0000
Subject: [PATCH 4/4] [skip-tests] Document new template parameter

---
 cub/cub/agent/agent_histogram.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 9a44ecb3f2..f9770bd715 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -70,7 +70,7 @@ template <
     bool _RLE_COMPRESS,                             ///< Whether to perform localized RLE to compress samples before histogramming
     BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
     bool _WORK_STEALING,                            ///< Whether to dequeue tiles from a global work queue
-    int _VEC_SIZE = 4>
+    int _VEC_SIZE = 4>                              ///< Vector size for sample loading (1, 2, or 4)
 struct AgentHistogramPolicy
 {
     enum
@@ -82,7 +82,7 @@ struct AgentHistogramPolicy
         IS_WORK_STEALING = _WORK_STEALING,          ///< Whether to dequeue tiles from a global work queue
     };
 
-    static constexpr int VEC_SIZE = _VEC_SIZE;
+    static constexpr int VEC_SIZE = _VEC_SIZE;      ///< Vector size for sample loading (1, 2, or 4)
 
     static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;   ///< The BlockLoad algorithm to use
     static const CacheLoadModifier LOAD_MODIFIER   = _LOAD_MODIFIER;    ///< Cache load modifier for reading input elements
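
For readers adjusting these policies, the items-per-thread scaling used above can be checked in isolation. The following is a minimal standalone sketch: it re-declares helpers that mirror v_scale/t_scale from tuning_histogram.cuh rather than including any CUB header, assumes a 4-byte int, and uses 16 nominal items per thread only as an example value (the same nominal value the pre-SM90 Policy500 feeds into TScale<16>). The resulting per-thread item counts can then be verified with static_assert.

// Standalone illustration of the per-thread scaling behind the histogram tunings.
// These helpers mirror v_scale()/t_scale() from the patch; they are not CUB itself.
#include <cstdint>

static_assert(sizeof(int) == 4, "the example values below assume a 4-byte int");

template <class SampleT>
constexpr int v_scale()
{
  // How many int-sized slots one sample occupies, rounded up.
  return (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
}

template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
constexpr int t_scale()
{
  // Scale the nominal items-per-thread down by channel count and sample width,
  // but never drop below one item per thread. Written as a single expression so
  // it is a valid constexpr function even in C++11 (the constraint PATCH 3/4 fixes).
  return (NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()) > 1
           ? NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()
           : 1;
}

// Single channel, 16 nominal items per thread (the value Policy500 uses via TScale<16>):
static_assert(t_scale<std::uint8_t, 1, 16>() == 16, "1-byte samples keep the nominal value");
static_assert(t_scale<float, 1, 16>() == 16, "4-byte samples still occupy one int slot");
static_assert(t_scale<double, 1, 16>() == 8, "8-byte samples halve the items per thread");

// Four active channels (e.g. RGBA) divide the per-thread pixel count accordingly:
static_assert(t_scale<std::uint8_t, 4, 16>() == 4, "multi-channel input loads fewer pixels per thread");

int main() { return 0; }

Wider samples and additional active channels consume more of a thread's register and shared-memory budget per tile, which is why the scaling trims the per-thread item count as sizeof(SampleT) and the channel count grow.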