Merge pull request #266 from senior-zero/enh-main/github/sm90_hist
Tune Histogram on H100
gevtushenko committed Jul 26, 2023
2 parents 372535c + 4af6ef0 commit 5531a47
Showing 8 changed files with 302 additions and 87 deletions.
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/histogram/even.cu
@@ -28,12 +28,14 @@
#include "histogram_common.cuh"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_ITEMS ipt 4:28:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_RLE_COMPRESS rle 0:1:1
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -138,5 +140,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
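
An aside on the harness: as I read the %RANGE% annotations above, each one declares a compile-time tuning axis (macro name, shorthand, start:end:step) that the tuning scripts sweep by recompiling the benchmark, while the nvbench axes at the bottom of the file are runtime axes. A minimal nvbench sketch with the same axis shape (illustrative only, not part of this commit; axes_demo is a hypothetical name):

#include <nvbench/nvbench.cuh>

// Each (Elements, Bins, Entropy) combination becomes one measured state.
void axes_demo(nvbench::state &state)
{
  const auto elements = state.get_int64("Elements{io}"); // 2^16 .. 2^28
  const auto bins     = state.get_int64("Bins");
  const auto entropy  = state.get_string("Entropy");
  (void) elements; (void) bins; (void) entropy;
  state.exec([](nvbench::launch &) { /* histogram launch would go here */ });
}

NVBENCH_BENCH(axes_demo)
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
  .add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
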
33 changes: 19 additions & 14 deletions cub/benchmarks/bench/histogram/histogram_common.cuh
@@ -39,6 +39,8 @@
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

#define TUNE_VEC_SIZE (1 << TUNE_VEC_SIZE_POW)

#if TUNE_MEM_PREFERENCE == 0
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::GMEM;
#elif TUNE_MEM_PREFERENCE == 1
@@ -47,29 +49,32 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::SMEM;
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
#endif // TUNE_MEM_PREFERENCE

#if TUNE_LOAD_ALGORITHM_ID == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#elif TUNE_LOAD_ALGORITHM_ID == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#else
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_STRIPED
#endif // TUNE_LOAD_ALGORITHM_ID

template <typename SampleT, int NUM_ACTIVE_CHANNELS>
template <typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS>
struct policy_hub_t
{
template <int NOMINAL_ITEMS_PER_THREAD>
struct TScale
{
enum
{
V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
};
};

struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
{
static constexpr cub::BlockLoadAlgorithm load_algorithm =
(TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED)
? (NUM_CHANNELS == 1 ? cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT)
: TUNE_LOAD_ALGORITHM;

using AgentHistogramPolicyT = cub::AgentHistogramPolicy<TUNE_THREADS,
TScale<TUNE_ITEMS>::VALUE,
cub::BLOCK_LOAD_DIRECT,
TUNE_ITEMS,
load_algorithm,
TUNE_LOAD_MODIFIER,
TUNE_RLE_COMPRESS,
MEM_PREFERENCE,
TUNE_WORK_STEALING>;
TUNE_WORK_STEALING,
TUNE_VEC_SIZE>;
};

using MaxPolicy = policy_t;
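
A note on the load-algorithm fallback introduced above: a striped arrangement hands sample i to thread i % BLOCK_THREADS, which would split a multi-channel pixel across threads, so striped tuning candidates are only kept for single-channel input and otherwise degrade to direct loads. The same selection, restated as a standalone sketch (not from the commit):

#include <cub/block/block_load.cuh>

constexpr cub::BlockLoadAlgorithm
select_load_algorithm(cub::BlockLoadAlgorithm tuned, int num_channels)
{
  // A multi-channel pixel must stay within one thread, so striped loads
  // are only legal when each sample is a whole pixel (one channel).
  return tuned == cub::BLOCK_LOAD_STRIPED
           ? (num_channels == 1 ? cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT)
           : tuned;
}

static_assert(select_load_algorithm(cub::BLOCK_LOAD_STRIPED, 4) == cub::BLOCK_LOAD_DIRECT,
              "multi-channel input falls back to direct loads");
static_assert(select_load_algorithm(cub::BLOCK_LOAD_WARP_TRANSPOSE, 4) == cub::BLOCK_LOAD_WARP_TRANSPOSE,
              "other algorithms pass through unchanged");
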
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -34,6 +34,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -148,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
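
For reference, what this multi-channel benchmark exercises, expressed through the public API rather than the custom dispatch above; a sketch with illustrative values (256 bins per channel, RGBA input with the alpha channel ignored):

#include <cub/device/device_histogram.cuh>
#include <cuda_runtime.h>

// Histogram the R, G, B channels of interleaved RGBA data:
// 4 channels in memory, 3 of them active.
void rgb_histograms(const unsigned char *d_rgba, int num_pixels, int *d_hist[3])
{
  int num_levels[3]  = {257, 257, 257}; // 257 boundaries -> 256 bins per channel
  int lower_level[3] = {0, 0, 0};
  int upper_level[3] = {256, 256, 256};

  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // First call sizes the temporary storage; second call runs the histogram.
  cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
                                                 d_rgba, d_hist, num_levels,
                                                 lower_level, upper_level, num_pixels);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
                                                 d_rgba, d_hist, num_levels,
                                                 lower_level, upper_level, num_pixels);
  cudaFree(d_temp_storage);
}
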
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -36,6 +36,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -46,7 +48,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -155,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
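
The Entropy axis used throughout these benchmarks controls how skewed the generated samples are. A common construction (and, to my understanding, the one behind nvbench_helper's entropy values, though that is an assumption) is to bitwise-AND k independent uniform words: each bit is then set with probability 2^-k, and the per-bit entropy H(2^-k) gives 1.000, 0.544, and 0.201 for k = 1, 3, and 5. A sketch:

#include <cstdint>
#include <random>

// AND-ing k uniform draws lowers per-bit entropy: each AND halves the
// probability that a given bit is set.
uint32_t sample_with_entropy(std::mt19937 &rng, int k)
{
  uint32_t x = 0xFFFFFFFFu;
  for (int i = 0; i < k; ++i)
  {
    x &= static_cast<uint32_t>(rng());
  }
  return x; // k = 1 -> entropy 1.000, k = 3 -> 0.544, k = 5 -> 0.201 per bit
}

Lower entropy means more duplicate samples, which is exactly the case the RLE-compression knob (TUNE_RLE_COMPRESS) is meant to exploit.
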
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/range.cu
@@ -35,6 +35,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -45,7 +47,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -143,5 +145,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
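
For context, range.cu benchmarks the variant with caller-supplied bin boundaries, which corresponds to cub::DeviceHistogram::HistogramRange in the public API. A minimal sketch (illustrative names, not from the commit):

#include <cub/device/device_histogram.cuh>
#include <cuda_runtime.h>

// Single-channel histogram with explicit bin boundaries d_levels[0..num_levels).
void histogram_range(const float *d_samples, int num_samples,
                     const float *d_levels, int num_levels, int *d_histogram)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // First call sizes the temporary storage; second call runs the histogram.
  cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
                                       d_samples, d_histogram, num_levels,
                                       d_levels, num_samples);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
                                       d_samples, d_histogram, num_levels,
                                       d_levels, num_samples);
  cudaFree(d_temp_storage);
}
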
90 changes: 62 additions & 28 deletions cub/cub/agent/agent_histogram.cuh
@@ -69,7 +69,8 @@ template <
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
int _VEC_SIZE = 4> ///< Vector size for loading samples (1, 2, or 4)
struct AgentHistogramPolicy
{
enum
Expand All @@ -81,6 +82,8 @@ struct AgentHistogramPolicy
IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
};

static constexpr int VEC_SIZE = _VEC_SIZE; ///< Vector size for loading samples (1, 2, or 4)

static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
@@ -116,8 +119,9 @@ struct AgentHistogram
/// The pixel type of SampleT
using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type;

/// The quad type of SampleT
using QuadT = typename CubVector<SampleT, 4>::Type;
/// The vec type of SampleT
static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE;
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
@@ -126,7 +130,7 @@

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
@@ -157,8 +161,8 @@
WrappedPixelIteratorT;

/// Vec input iterator type (for applying cache modifier)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
WrappedQuadIteratorT;
typedef CacheModifiedInputIterator<LOAD_MODIFIER, VecT, OffsetT>
WrappedVecsIteratorT;

/// Parameterized BlockLoad type for samples
typedef BlockLoad<
@@ -176,13 +180,13 @@
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadPixelT;

/// Parameterized BlockLoad type for quads
/// Parameterized BlockLoad type for vecs
typedef BlockLoad<
QuadT,
VecT,
BLOCK_THREADS,
QUADS_PER_THREAD,
VECS_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadQuadT;
BlockLoadVecT;

/// Shared memory type required by this thread block
struct _TempStorage
@@ -196,7 +200,7 @@
{
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples
typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels
typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads
typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs

} aliasable;
};
@@ -453,21 +457,21 @@
reinterpret_cast<AliasedPixels&>(samples));
}

// Load full, aligned tile using quad iterator (single-channel)
// Load full, aligned tile using vec iterator (single-channel)
__device__ __forceinline__ void LoadFullAlignedTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<1> num_active_channels)
{
typedef QuadT AliasedQuads[QUADS_PER_THREAD];
typedef VecT AliasedVecs[VECS_PER_THREAD];

WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset));

// Load using a wrapped quad iterator
BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
d_wrapped_quads,
reinterpret_cast<AliasedQuads&>(samples));
// Load using a wrapped vec iterator
BlockLoadVecT(temp_storage.aliasable.vec_load).Load(
d_wrapped_vecs,
reinterpret_cast<AliasedVecs&>(samples));
}

// Load full, aligned tile
@@ -534,14 +538,39 @@
valid_samples);
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<false> /* is_striped = false */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<true> /* is_striped = true */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}

//---------------------------------------------------------------------
// Tile processing
//---------------------------------------------------------------------

// Consume a tile of data samples
template <
bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_FULL_TILE> // Whether the tile is full
__device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
{
@@ -557,15 +586,20 @@
Int2Type<IS_ALIGNED>());

// Set valid flags
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
MarkValid<IS_FULL_TILE>(
is_valid,
valid_samples,
Int2Type<AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED>{});

// Accumulate samples
if (prefer_smem)
{
AccumulateSmemPixels(samples, is_valid);
}
else
{
AccumulateGmemPixels(samples, is_valid);
}
}


@@ -725,21 +759,21 @@
int tiles_per_row, ///< Number of image tiles per row
GridQueue<int> tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks
{
// Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
int quad_mask = AlignBytes<QuadT>::ALIGN_BYTES - 1;
// Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel)
int vec_mask = AlignBytes<VecT>::ALIGN_BYTES - 1;
int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
size_t row_bytes = sizeof(SampleT) * row_stride_samples;

bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel
((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned
((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad
bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel
((size_t(d_native_samples) & vec_mask) == 0) && // ptr is vec-aligned
((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the vec

bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel
((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

// Whether rows are aligned and can be vectorized
if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows))
ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
else
ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
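
The two MarkValid overloads added above exist because the valid-pixel test must match the load arrangement: with blocked loads thread t owns pixels t*PIXELS_PER_THREAD + p, while with striped loads it owns t + BLOCK_THREADS*p. Both orderings enumerate every tile pixel exactly once, so they flag the same number of pixels for any partial tile. A host-side check of that invariant (illustrative, not from the commit):

#include <cassert>

int main()
{
  const int block_threads = 4, pixels_per_thread = 3, num_channels = 1;
  const int tile_samples  = block_threads * pixels_per_thread * num_channels;
  for (int valid_samples = 0; valid_samples <= tile_samples; ++valid_samples)
  {
    int blocked = 0, striped = 0;
    for (int t = 0; t < block_threads; ++t)
    {
      for (int p = 0; p < pixels_per_thread; ++p)
      {
        blocked += ((t * pixels_per_thread + p) * num_channels) < valid_samples;
        striped += ((t + block_threads * p) * num_channels) < valid_samples;
      }
    }
    assert(blocked == striped); // same count, different assignment to threads
  }
  return 0;
}
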
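
Similarly, the quad-specific alignment gate became vector-size generic: vectorized loads are only attempted when the sample pointer and the per-row byte stride are aligned to the vector type (and, per the code above, only for single-channel input with SAMPLES_PER_THREAD divisible by VecSize). The core test, restated as a host-side sketch:

#include <cstddef>
#include <cstdint>

template <typename VecT, typename SampleT>
bool rows_are_vec_aligned(const SampleT *d_samples, size_t row_stride_samples, size_t num_rows)
{
  const size_t vec_mask  = alignof(VecT) - 1;              // power-of-two alignment
  const size_t row_bytes = sizeof(SampleT) * row_stride_samples;
  return ((reinterpret_cast<uintptr_t>(d_samples) & vec_mask) == 0) && // base pointer aligned
         ((num_rows == 1) || ((row_bytes & vec_mask) == 0));           // each row starts aligned
}
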