From 5ce10a3321340fc438d46c60311f9b64d664334c Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 24 Jul 2023 16:22:24 +0000 Subject: [PATCH 1/4] Better histogram benchmarks --- cub/benchmarks/bench/histogram/even.cu | 10 +++--- .../bench/histogram/histogram_common.cuh | 33 +++++++++++-------- cub/benchmarks/bench/histogram/multi/even.cu | 8 +++-- cub/benchmarks/bench/histogram/multi/range.cu | 8 +++-- cub/benchmarks/bench/histogram/range.cu | 8 +++-- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu index 3346501371..761c609e95 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -28,12 +28,14 @@ #include "histogram_common.cuh" #include -// %RANGE% TUNE_ITEMS ipt 7:24:1 +// %RANGE% TUNE_ITEMS ipt 4:28:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 +// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 +// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state &state, nvbench::type_list) @@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram +template struct policy_hub_t { - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> { + static constexpr cub::BlockLoadAlgorithm load_algorithm = + (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED) + ? (NUM_CHANNELS == 1 ? 
cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT) + : TUNE_LOAD_ALGORITHM; + using AgentHistogramPolicyT = cub::AgentHistogramPolicy::VALUE, - cub::BLOCK_LOAD_DIRECT, + TUNE_ITEMS, + load_algorithm, TUNE_LOAD_MODIFIER, TUNE_RLE_COMPRESS, MEM_PREFERENCE, - TUNE_WORK_STEALING>; + TUNE_WORK_STEALING, + TUNE_VEC_SIZE>; }; using MaxPolicy = policy_t; diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 2e82a1a15c..3c93269e18 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -34,6 +34,8 @@ // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 +// %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 +// %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state &state, nvbench::type_list) @@ -44,7 +46,7 @@ static void even(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram static void range(nvbench::state &state, nvbench::type_list) @@ -46,7 +48,7 @@ static void range(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram static void range(nvbench::state &state, nvbench::type_list) @@ -45,7 +47,7 @@ static void range(nvbench::state &state, nvbench::type_list; + using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram Date: Mon, 24 Jul 2023 16:22:38 +0000 Subject: [PATCH 2/4] Tune histogram for SM90 --- cub/cub/agent/agent_histogram.cuh | 90 +++++--- .../device/dispatch/dispatch_histogram.cuh | 35 +--- .../dispatch/tuning/tuning_histogram.cuh | 192 ++++++++++++++++++ 3 files changed, 257 insertions(+), 60 deletions(-) create mode 100644 cub/cub/device/dispatch/tuning/tuning_histogram.cuh diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index af4c0e347b..9a44ecb3f2 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -69,7 +69,8 @@ template < CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue + bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + int _VEC_SIZE = 4> struct AgentHistogramPolicy { enum @@ -81,6 +82,8 @@ struct AgentHistogramPolicy IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue }; + static constexpr int VEC_SIZE = _VEC_SIZE; + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements }; @@ -116,8 +119,9 @@ struct AgentHistogram /// The pixel type of SampleT using PixelT = typename CubVector::Type; - /// The quad type of SampleT - using QuadT = typename CubVector::Type; + /// The vec type of SampleT + static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE; + using VecT = typename CubVector::Type; /// Constants enum @@ -126,7 +130,7 @@ struct AgentHistogram PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + 
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize, TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, @@ -157,8 +161,8 @@ struct AgentHistogram WrappedPixelIteratorT; /// Qaud input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedQuadIteratorT; + typedef CacheModifiedInputIterator + WrappedVecsIteratorT; /// Parameterized BlockLoad type for samples typedef BlockLoad< @@ -176,13 +180,13 @@ struct AgentHistogram AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT; - /// Parameterized BlockLoad type for quads + /// Parameterized BlockLoad type for vecs typedef BlockLoad< - QuadT, + VecT, BLOCK_THREADS, - QUADS_PER_THREAD, + VECS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadQuadT; + BlockLoadVecT; /// Shared memory type required by this thread block struct _TempStorage @@ -196,7 +200,7 @@ struct AgentHistogram { typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs } aliasable; }; @@ -453,21 +457,21 @@ struct AgentHistogram reinterpret_cast(samples)); } - // Load full, aligned tile using quad iterator (single-channel) + // Load full, aligned tile using vec iterator (single-channel) __device__ __forceinline__ void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<1> num_active_channels) { - typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + typedef VecT AliasedVecs[VECS_PER_THREAD]; - WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset)); - // Load using a wrapped quad iterator - BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( - d_wrapped_quads, - reinterpret_cast(samples)); + // Load using a wrapped vec iterator + BlockLoadVecT(temp_storage.aliasable.vec_load).Load( + d_wrapped_vecs, + reinterpret_cast(samples)); } // Load full, aligned tile @@ -534,6 +538,31 @@ struct AgentHistogram valid_samples); } + template + __device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], + int valid_samples, + Int2Type /* is_striped = false */) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * + NUM_CHANNELS) < valid_samples); + } + } + + template + __device__ __forceinline__ void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], + int valid_samples, + Int2Type /* is_striped = true */) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) * + NUM_CHANNELS) < valid_samples); + } + } //--------------------------------------------------------------------- // Tile processing @@ -541,7 +570,7 @@ struct AgentHistogram // Consume a tile of data samples template < - bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel) bool IS_FULL_TILE> // Whether the tile is full __device__ __forceinline__ 
void ConsumeTile(OffsetT block_offset, int valid_samples) { @@ -557,15 +586,20 @@ struct AgentHistogram Int2Type()); // Set valid flags - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + MarkValid( + is_valid, + valid_samples, + Int2Type{}); // Accumulate samples if (prefer_smem) + { AccumulateSmemPixels(samples, is_valid); + } else + { AccumulateGmemPixels(samples, is_valid); + } } @@ -725,21 +759,21 @@ struct AgentHistogram int tiles_per_row, ///< Number of image tiles per row GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks { - // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) - int quad_mask = AlignBytes::ALIGN_BYTES - 1; + // Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel) + int vec_mask = AlignBytes::ALIGN_BYTES - 1; int pixel_mask = AlignBytes::ALIGN_BYTES - 1; size_t row_bytes = sizeof(SampleT) * row_stride_samples; - bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel - ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned - ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel + ((size_t(d_native_samples) & vec_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel // Whether rows are aligned and can be vectorized - if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows)) ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); else ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index c5fb3e105d..b393d3c824 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -242,38 +243,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK namespace detail { -template -struct device_histogram_policy_hub -{ - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using AgentHistogramPolicyT = - AgentHistogramPolicy<128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>; - }; - - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using 
AgentHistogramPolicyT = - AgentHistogramPolicy<384, TScale<16>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false>; - }; - - using MaxPolicy = Policy500; -}; - template , + CounterT, + NUM_CHANNELS, NUM_ACTIVE_CHANNELS>> struct DispatchHistogram : SelectedPolicy { diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh new file mode 100644 index 0000000000..b17c525c8e --- /dev/null +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -0,0 +1,192 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +namespace histogram +{ + +enum class primitive_sample +{ + no, + yes +}; + +enum class sample_size +{ + _1, + _2, + unknown +}; + +enum class counter_size +{ + _4, + unknown +}; + +template +constexpr primitive_sample is_primitive_sample() +{ + return Traits::PRIMITIVE ? primitive_sample::yes : primitive_sample::no; +} + +template +constexpr counter_size classify_counter_size() +{ + return sizeof(CounterT) == 4 ? counter_size::_4 : counter_size::unknown; +} + +template +constexpr sample_size classify_sample_size() +{ + return sizeof(SampleT) == 1 ? sample_size::_1 + : sizeof(SampleT) == 2 ? 
sample_size::_2
+                              : sample_size::unknown;
+}
+
+template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
+constexpr int t_scale()
+{
+  constexpr int V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
+  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / V_SCALE), 1);
+}
+
+template <class SampleT, int NumChannels, int NumActiveChannels, counter_size CounterSize, primitive_sample PrimitiveSample = is_primitive_sample<SampleT>(),
+          sample_size SampleSize = classify_sample_size<SampleT>()>
+struct sm90_tuning
+{
+  static constexpr int threads = 384;
+  static constexpr int items = t_scale<SampleT, NumActiveChannels, 16>();
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = true;
+  static constexpr bool work_stealing = false;
+};
+
+template <class SampleT, int NumChannels, int NumActiveChannels>
+struct sm90_tuning<SampleT, NumChannels, NumActiveChannels, counter_size::_4, primitive_sample::yes, sample_size::_1>
+{
+  static constexpr int threads = 768;
+  static constexpr int items = 12;
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = false;
+  static constexpr bool work_stealing = false;
+};
+
+template <class SampleT, int NumChannels, int NumActiveChannels>
+struct sm90_tuning<SampleT, NumChannels, NumActiveChannels, counter_size::_4, primitive_sample::yes, sample_size::_2>
+{
+  static constexpr int threads = 960;
+  static constexpr int items = 10;
+
+  static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
+  static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
+
+  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
+
+  static constexpr bool rle_compress = true;
+  static constexpr bool work_stealing = false;
+};
+
+} // namespace histogram
+
+template <class SampleT, class CounterT, int NumChannels, int NumActiveChannels>
+struct device_histogram_policy_hub
+{
+  template <int NOMINAL_ITEMS_PER_THREAD>
+  struct TScale
+  {
+    enum
+    {
+      V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
+      VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NumActiveChannels / V_SCALE), 1)
+    };
+  };
+
+  /// SM35
+  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
+  {
+    // TODO This might be worth it to separate usual histogram and the multi one
+    using AgentHistogramPolicyT =
+      AgentHistogramPolicy<128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>;
+  };
+
+  /// SM50
+  struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+  {
+    // TODO This might be worth it to separate usual histogram and the multi one
+    using AgentHistogramPolicyT =
+      AgentHistogramPolicy<384, TScale<16>::VALUE, cub::BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false>;
+  };
+
+  /// SM900
+  struct Policy900 : ChainedPolicy<900, Policy900, Policy500>
+  {
+    using tuning = detail::histogram::sm90_tuning<SampleT, NumChannels, NumActiveChannels, histogram::classify_counter_size<CounterT>()>;
+
+    using AgentHistogramPolicyT = AgentHistogramPolicy<tuning::threads, tuning::items, tuning::load_algorithm, tuning::load_modifier, tuning::rle_compress, tuning::mem_preference, tuning::work_stealing>;
+  };
+
+  using MaxPolicy = Policy900;
+};
+
+} // namespace detail
+
+CUB_NAMESPACE_END

From be867555223aaaa5550756802d6902227afa60c5 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 25 Jul 2023 09:10:14 +0000
Subject: [PATCH 3/4] Fix histogram tuning for C++11

---
 cub/cub/device/dispatch/tuning/tuning_histogram.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
index b17c525c8e..f53ae3a27b 100644
--- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
@@ -79,11 +79,16 @@ constexpr sample_size classify_sample_size()
                               : sample_size::unknown;
 }
 
+template <class SampleT>
+constexpr int v_scale()
+{
+  return (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
+}
+
 template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
 constexpr int t_scale()
 {
-  constexpr int V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
-  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / V_SCALE), 1);
+  return CUB_MAX((NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()), 1);
 }
 
 template <class SampleT,

From: Georgy Evtushenko
Date: Wed, 26 Jul 2023 10:59:13 +0000
Subject: [PATCH 4/4] [skip-tests] Document new template parameter

---
 cub/cub/agent/agent_histogram.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 9a44ecb3f2..f9770bd715 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -70,7 +70,7 @@ template <
     bool _RLE_COMPRESS,                             ///< Whether to perform localized RLE to compress samples before histogramming
     BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
     bool _WORK_STEALING,                            ///< Whether to dequeue tiles from a global work queue
-    int _VEC_SIZE = 4>
+    int _VEC_SIZE = 4>                              ///< Vector size for sample loading (1, 2, or 4)
 struct AgentHistogramPolicy
 {
     enum
@@ -82,7 +82,7 @@ struct AgentHistogramPolicy
         IS_WORK_STEALING = _WORK_STEALING,          ///< Whether to dequeue tiles from a global work queue
     };
 
-    static constexpr int VEC_SIZE = _VEC_SIZE;
+    static constexpr int VEC_SIZE = _VEC_SIZE;      ///< Vector size for sample loading (1, 2, or 4)
 
     static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;   ///< The BlockLoad algorithm to use
     static const CacheLoadModifier LOAD_MODIFIER   = _LOAD_MODIFIER;    ///< Cache load modifier for reading input elements
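
For readers adjusting these policies, the items-per-thread scaling used above can be checked in isolation. The following is a minimal standalone sketch: it re-declares helpers that mirror v_scale/t_scale from tuning_histogram.cuh rather than including any CUB header, assumes a 4-byte int, and uses 16 nominal items per thread only as an example value (the same nominal value the pre-SM90 Policy500 feeds into TScale<16>). The resulting per-thread item counts can then be verified with static_assert.

// Standalone illustration of the per-thread scaling behind the histogram tunings.
// These helpers mirror v_scale()/t_scale() from the patch; they are not CUB itself.
#include <cstdint>

static_assert(sizeof(int) == 4, "the example values below assume a 4-byte int");

template <class SampleT>
constexpr int v_scale()
{
  // How many int-sized slots one sample occupies, rounded up.
  return (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int);
}

template <class SampleT, int NumActiveChannels, int NominalItemsPerThread>
constexpr int t_scale()
{
  // Scale the nominal items-per-thread down by channel count and sample width,
  // but never drop below one item per thread. Written as a single expression so
  // it is a valid constexpr function even in C++11 (the constraint PATCH 3/4 fixes).
  return (NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()) > 1
           ? NominalItemsPerThread / NumActiveChannels / v_scale<SampleT>()
           : 1;
}

// Single channel, 16 nominal items per thread (the value Policy500 uses via TScale<16>):
static_assert(t_scale<std::uint8_t, 1, 16>() == 16, "1-byte samples keep the nominal value");
static_assert(t_scale<float, 1, 16>() == 16, "4-byte samples still occupy one int slot");
static_assert(t_scale<double, 1, 16>() == 8, "8-byte samples halve the items per thread");

// Four active channels (e.g. RGBA) divide the per-thread pixel count accordingly:
static_assert(t_scale<std::uint8_t, 4, 16>() == 4, "multi-channel input loads fewer pixels per thread");

int main() { return 0; }

Wider samples and additional active channels consume more of a thread's register and shared-memory budget per tile, which is why the scaling trims the per-thread item count as sizeof(SampleT) and the channel count grow.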