Tune Histogram on H100 #266

Merged
merged 4 commits on Jul 26, 2023
Changes from 3 commits
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/histogram/even.cu
@@ -28,12 +28,14 @@
#include "histogram_common.cuh"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_ITEMS ipt 4:28:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_RLE_COMPRESS rle 0:1:1
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -138,5 +140,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
33 changes: 19 additions & 14 deletions cub/benchmarks/bench/histogram/histogram_common.cuh
@@ -39,6 +39,8 @@
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

#define TUNE_VEC_SIZE (1 << TUNE_VEC_SIZE_POW)

#if TUNE_MEM_PREFERENCE == 0
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::GMEM;
#elif TUNE_MEM_PREFERENCE == 1
@@ -47,29 +49,32 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::SMEM;
constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
#endif // TUNE_MEM_PREFERENCE

#if TUNE_LOAD_ALGORITHM_ID == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#elif TUNE_LOAD_ALGORITHM_ID == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#else
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_STRIPED
#endif // TUNE_LOAD_ALGORITHM_ID

template <typename SampleT, int NUM_ACTIVE_CHANNELS>
template <typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS>
struct policy_hub_t
{
template <int NOMINAL_ITEMS_PER_THREAD>
struct TScale
{
enum
{
V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
};
};

struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
{
static constexpr cub::BlockLoadAlgorithm load_algorithm =
(TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED)
? (NUM_CHANNELS == 1 ? cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT)
: TUNE_LOAD_ALGORITHM;

using AgentHistogramPolicyT = cub::AgentHistogramPolicy<TUNE_THREADS,
TScale<TUNE_ITEMS>::VALUE,
cub::BLOCK_LOAD_DIRECT,
TUNE_ITEMS,
load_algorithm,
TUNE_LOAD_MODIFIER,
TUNE_RLE_COMPRESS,
MEM_PREFERENCE,
TUNE_WORK_STEALING>;
TUNE_WORK_STEALING,
TUNE_VEC_SIZE>;
};

using MaxPolicy = policy_t;
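
Relative to the old policy, TUNE_ITEMS now feeds AgentHistogramPolicy directly instead of being routed through TScale<TUNE_ITEMS>::VALUE, and the load algorithm comes from the new TUNE_LOAD_ALGORITHM axis, with BLOCK_LOAD_STRIPED honored only in the single-channel case (multi-channel histograms fall back to BLOCK_LOAD_DIRECT). For reference, TScale normalized the nominal count by sample size and active-channel count; a stand-alone check of that arithmetic, using a hypothetical helper and assuming C++14 for constexpr std::max:

#include <algorithm>
#include <cstdint>

template <typename SampleT, int NUM_ACTIVE_CHANNELS, int NOMINAL_ITEMS_PER_THREAD>
constexpr int tscale_value()
{
  // Round the sample size up to whole ints, then divide the nominal count by the
  // channel count and that scale, clamping at one pixel per thread.
  constexpr int v_scale = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
  return std::max(NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / v_scale, 1);
}

static_assert(tscale_value<std::uint8_t, 1, 12>() == 12, "1-byte samples keep the nominal count");
static_assert(tscale_value<double, 1, 12>() == 6, "8-byte samples halve it");
static_assert(tscale_value<float, 4, 12>() == 3, "four active channels quarter it");
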
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -34,6 +34,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -148,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -36,6 +36,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -46,7 +48,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -155,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
8 changes: 5 additions & 3 deletions cub/benchmarks/bench/histogram/range.cu
@@ -35,6 +35,8 @@
// %RANGE% TUNE_WORK_STEALING ws 0:1:1
// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
// %RANGE% TUNE_LOAD ld 0:2:1
// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1
// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1

template <typename SampleT, typename CounterT, typename OffsetT>
static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
@@ -45,7 +47,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
using sample_iterator_t = SampleT *;

#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, num_active_channels>;
using policy_t = policy_hub_t<key_t, num_channels, num_active_channels>;
using dispatch_t = cub::DispatchHistogram<num_channels, //
num_active_channels,
sample_iterator_t,
@@ -143,5 +145,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {128, 2048, 2097152})
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
90 changes: 62 additions & 28 deletions cub/cub/agent/agent_histogram.cuh
@@ -69,7 +69,8 @@ template <
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
int _VEC_SIZE = 4> ///< Number of samples per vectorized load
struct AgentHistogramPolicy
{
enum
@@ -81,6 +82,8 @@ struct AgentHistogramPolicy
IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
};

static constexpr int VEC_SIZE = _VEC_SIZE;

static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
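
AgentHistogramPolicy now carries the vector width as a template parameter; the default of 4 reproduces the old behavior, where aligned single-channel tiles were always loaded as quads. The agent below derives the load type from it through CubVector, so the per-load transaction width scales with the tuned size. A small stand-alone illustration, assuming a CUDA toolchain and cub/util_type.cuh (where CubVector lives):

#include <cub/util_type.cuh>

// The sample type bundled into vectors of the tuned width. With float samples,
// VEC_SIZE == 4 yields float4, i.e. the previous 16-byte quad loads.
using Vec1 = cub::CubVector<float, 1>::Type;
using Vec2 = cub::CubVector<float, 2>::Type;
using Vec4 = cub::CubVector<float, 4>::Type;

static_assert(sizeof(Vec2) == 2 * sizeof(float), "vector width scales the per-load byte count");
static_assert(sizeof(Vec4) == 4 * sizeof(float), "VEC_SIZE == 4 matches the old quad path");

Note that SAMPLES_PER_THREAD must be a whole multiple of the chosen width; the aligned code path checks SAMPLES_PER_THREAD % VecSize == 0 before vectorizing.
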
@@ -116,8 +119,9 @@ struct AgentHistogram
/// The pixel type of SampleT
using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type;

/// The quad type of SampleT
using QuadT = typename CubVector<SampleT, 4>::Type;
/// The vec type of SampleT
static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE;
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
@@ -126,7 +130,7 @@

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
@@ -157,8 +161,8 @@
WrappedPixelIteratorT;

/// Vec input iterator type (for applying cache modifier)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
WrappedQuadIteratorT;
typedef CacheModifiedInputIterator<LOAD_MODIFIER, VecT, OffsetT>
WrappedVecsIteratorT;

/// Parameterized BlockLoad type for samples
typedef BlockLoad<
@@ -176,13 +180,13 @@
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadPixelT;

/// Parameterized BlockLoad type for quads
/// Parameterized BlockLoad type for vecs
typedef BlockLoad<
QuadT,
VecT,
BLOCK_THREADS,
QUADS_PER_THREAD,
VECS_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadQuadT;
BlockLoadVecT;

/// Shared memory type required by this thread block
struct _TempStorage
@@ -196,7 +200,7 @@
{
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples
typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels
typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads
typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs

} aliasable;
};
Expand Down Expand Up @@ -453,21 +457,21 @@ struct AgentHistogram
reinterpret_cast<AliasedPixels&>(samples));
}

// Load full, aligned tile using quad iterator (single-channel)
// Load full, aligned tile using vec iterator (single-channel)
__device__ __forceinline__ void LoadFullAlignedTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<1> num_active_channels)
{
typedef QuadT AliasedQuads[QUADS_PER_THREAD];
typedef VecT AliasedVecs[VECS_PER_THREAD];

WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset));

// Load using a wrapped quad iterator
BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
d_wrapped_quads,
reinterpret_cast<AliasedQuads&>(samples));
// Load using a wrapped vec iterator
BlockLoadVecT(temp_storage.aliasable.vec_load).Load(
d_wrapped_vecs,
reinterpret_cast<AliasedVecs&>(samples));
}
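
LoadFullAlignedTile simply reinterprets the per-thread sample array as VECS_PER_THREAD vectors, which is only sound because SAMPLES_PER_THREAD is a whole multiple of the vector width. A minimal host-side illustration of the same aliasing, with assumed sizes and plain arrays instead of CUDA vector types (not part of the PR):

#include <cstdio>

int main()
{
  constexpr int SAMPLES_PER_THREAD = 8;
  constexpr int VEC_SIZE           = 4;
  constexpr int VECS_PER_THREAD    = SAMPLES_PER_THREAD / VEC_SIZE;

  float samples[SAMPLES_PER_THREAD] = {0, 1, 2, 3, 4, 5, 6, 7};

  // View the same storage as packs of VEC_SIZE samples, the way BlockLoadVecT
  // fills the register array through reinterpret_cast<AliasedVecs&>.
  using Vec = float[VEC_SIZE];
  Vec *vecs = reinterpret_cast<Vec *>(samples);

  for (int v = 0; v < VECS_PER_THREAD; ++v)
  {
    std::printf("vec %d starts at sample %.0f\n", v, vecs[v][0]);
  }
  return 0;
}
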

// Load full, aligned tile
@@ -534,14 +538,39 @@
valid_samples);
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<false> /* is_striped = false */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}

template <bool IS_FULL_TILE>
__device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD],
int valid_samples,
Int2Type<true> /* is_striped = true */)
{
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) *
NUM_CHANNELS) < valid_samples);
}
}
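
The validity test used to hard-code the blocked assignment, where thread t owns a contiguous run of PIXELS_PER_THREAD pixels. Since BLOCK_LOAD_STRIPED is now a legal policy, the test has to match the load layout: under striping, thread t instead owns every BLOCK_THREADS-th pixel. A toy host-side sketch of the two mappings, with made-up sizes and not part of the PR:

#include <cstdio>

int main()
{
  constexpr int BLOCK_THREADS     = 4; // illustrative sizes only
  constexpr int PIXELS_PER_THREAD = 2;

  for (int t = 0; t < BLOCK_THREADS; ++t)
  {
    for (int p = 0; p < PIXELS_PER_THREAD; ++p)
    {
      int blocked = t * PIXELS_PER_THREAD + p; // index tested by the Int2Type<false> overload
      int striped = t + BLOCK_THREADS * p;     // index tested by the Int2Type<true> overload
      std::printf("thread %d, slot %d: blocked pixel %d, striped pixel %d\n", t, p, blocked, striped);
    }
  }
  return 0;
}
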

//---------------------------------------------------------------------
// Tile processing
//---------------------------------------------------------------------

// Consume a tile of data samples
template <
bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_FULL_TILE> // Whether the tile is full
__device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
{
@@ -557,15 +586,20 @@
Int2Type<IS_ALIGNED>());

// Set valid flags
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
MarkValid<IS_FULL_TILE>(
is_valid,
valid_samples,
Int2Type<AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED>{});

// Accumulate samples
if (prefer_smem)
{
AccumulateSmemPixels(samples, is_valid);
}
else
{
AccumulateGmemPixels(samples, is_valid);
}
}


@@ -725,21 +759,21 @@
int tiles_per_row, ///< Number of image tiles per row
GridQueue<int> tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks
{
// Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
int quad_mask = AlignBytes<QuadT>::ALIGN_BYTES - 1;
// Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel)
int vec_mask = AlignBytes<VecT>::ALIGN_BYTES - 1;
int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
size_t row_bytes = sizeof(SampleT) * row_stride_samples;

bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel
((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned
((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad
bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel
((size_t(d_native_samples) & vec_mask) == 0) && // ptr is vec-aligned
((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the vec

bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel
((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

// Whether rows are aligned and can be vectorized
if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows))
ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
else
ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());