diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h index 425bc1ef8a701..8854b02176260 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -1,19 +1,21 @@ -#ifndef HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H -#define HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H +#ifndef HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h +#define HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h -#include #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + class AtomicPairCounter { public: - using c_type = unsigned long long int; - AtomicPairCounter(){} - AtomicPairCounter(c_type i) { counter.ac=i;} + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } - __device__ __host__ - AtomicPairCounter & operator=(c_type i) { counter.ac=i; return *this;} + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } struct Counters { uint32_t n; // in a "One to Many" association is the number of "One" @@ -25,30 +27,26 @@ class AtomicPairCounter { c_type ac; }; -#ifdef __CUDACC__ + static constexpr c_type incr = 1UL << 32; - static constexpr c_type incr = 1UL<<32; - - __device__ __host__ - Counters get() const { return counter.counters;} + __device__ __host__ Counters get() const { return counter.counters; } // increment n by 1 and m by i. return previous value - __device__ - Counters add(uint32_t i) { - c_type c = i; - c+=incr; + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; Atomic2 ret; - ret.ac = atomicAdd(&counter.ac,c); +#ifdef __CUDA_ARCH__ + ret.ac = atomicAdd(&counter.ac, c); +#else + ret.ac = counter.ac; + counter.ac += c; +#endif return ret.counters; } -#endif - private: - Atomic2 counter; - }; - -#endif +#endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 66525d13db8dc..0a8df34b5125e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -6,97 +6,116 @@ #include #include -#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" namespace GPU { -template struct SimpleVector { - constexpr SimpleVector() = default; + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; + template + constexpr int emplace_back_unsafe(Ts &&... 
args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } } - } - template constexpr int emplace_back_unsafe(Ts &&... args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour } - } - inline constexpr T & back() const { + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } -#ifdef __CUDACC__ - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ - int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } } - } - template - __device__ - int emplace_back(Ts &&... 
args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } } - } -#endif // __CUDACC__ - inline constexpr bool empty() const { return m_size==0;} - inline constexpr bool full() const { return m_size==m_capacity;} - inline constexpr T& operator[](int i) { return m_data[i]; } - inline constexpr const T& operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const * data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T * data) { m_data = data; } - -private: - int m_size; - int m_capacity; - - T *m_data; -}; + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; // ownership of m_data stays within the caller template @@ -109,11 +128,11 @@ template struct SimpleVector { // ownership of m_data stays within the caller template SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new(mem) SimpleVector(); + auto ret = new (mem) SimpleVector(); ret->construct(capacity, data); return ret; } -} // namespace GPU +} // namespace GPU -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h +#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index a060fe6799b6c..514a85a421ab1 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -5,100 +5,100 @@ // Author: Felice Pantaleo, CERN // -#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" namespace GPU { -template struct VecArray { - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } } - } - template constexpr int emplace_back_unsafe(Ts &&... 
args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } } - } - inline constexpr T & back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - -#ifdef __CUDACC__ - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ - int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour } - } - template - __device__ - int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } } - } - -#endif // __CUDACC__ - - __host__ __device__ - inline T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } - inline constexpr T const * begin() const { return m_data;} - inline constexpr T const * end() const { return m_data+m_size;} - inline constexpr T * begin() { return m_data;} - inline constexpr T * end() { return m_data+m_size;} - inline constexpr int size() const { return m_size; } - inline constexpr T& operator[](int i) { return m_data[i]; } - inline constexpr const T& operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int capacity() const { return maxSize; } - inline constexpr T const * data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } - - int m_size = 0; - - T m_data[maxSize]; -}; + template + __device__ int emplace_back(Ts &&... 
args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } -} + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int capacity() const { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace GPU + +#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 61ccc7484d68b..2584ae7a4e011 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -1,11 +1,11 @@ -#ifndef HeterogeneousCore_CUDAUtilities_HistoContainer_h -#define HeterogeneousCore_CUDAUtilities_HistoContainer_h +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h #include #ifndef __CUDA_ARCH__ #include -#endif // __CUDA_ARCH__ -#include +#endif // __CUDA_ARCH__ +#include #include #include @@ -13,119 +13,149 @@ #include #endif -#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#endif -#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" - -#ifdef __CUDACC__ namespace cudautils { - template - __global__ - void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).count(v[i], ih); + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < offsets[nh]; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).count(v[i], ih); + } } - 
template - __global__ - void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).fill(v[i], i, ih); + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < offsets[nh]; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ih); + } } - template - void launchZero(Histo * __restrict__ h, cudaStream_t stream) { - uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); - cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); + template + void launchZero(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = 0 +#endif + ) { + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); +#ifdef __CUDACC__ + cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream); +#else + ::memset(off, 0, 4 * Histo::totbins()); +#endif } - template - void launchFinalize(Histo * __restrict__ h, uint8_t * __restrict__ ws, cudaStream_t stream) { - uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + template + void launchFinalize(Histo *__restrict__ h, + uint8_t *__restrict__ ws +#ifndef __CUDACC__ + = nullptr +#endif + , + cudaStream_t stream +#ifndef __CUDACC__ + = 0 +#endif + ) { +#ifdef __CUDACC__ + assert(ws); + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); size_t wss = Histo::wsSize(); + assert(wss > 0); CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); +#else + h->finalize(); +#endif } - - template - void fillManyFromVector(Histo * __restrict__ h, uint8_t * __restrict__ ws, - uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, - int nthreads, cudaStream_t stream) { - launchZero(h,stream); + template + void fillManyFromVector(Histo *__restrict__ h, + uint8_t *__restrict__ ws, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = 0 +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h,ws,stream); + launchFinalize(h, ws, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); +#else + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif } - - template - __global__ - void finalizeBulk(AtomicPairCounter const * apc, Assoc * __restrict__ assoc) { - assoc->bulkFinalizeFill(*apc); + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); } -} // namespace cudautils -#endif - +} // namespace cudautils // iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ -__forceinline__ -void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = 
std::min(int(Hist::nbins()-1),bs+n); - bs = std::max(0,bs-n); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj +__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } } // iteratate over bins containing all values in window wmin, wmax -template -__host__ __device__ -__forceinline__ -void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj +__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } } - - -template< - typename T, // the type of the discretized input values - uint32_t NBINS, // number of bins - uint32_t SIZE, // max number of element - uint32_t S=sizeof(T) * 8, // number of significant bits in T - typename I=uint32_t, // type stored in the container (usually an index in a vector of the input values) - uint32_t NHISTS=1 // number of histos stored -> +template class HistoContainer { public: #ifdef __CUDACC__ @@ -134,202 +164,172 @@ class HistoContainer { using Counter = std::atomic; #endif + using CountersOnly = HistoContainer; + using index_type = I; using UT = typename std::make_unsigned::type; static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) if (v & b[i]) { - v >>= s[i]; - r |= s[i]; - } + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } return r; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t nhists() { return NHISTS;} - static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} - static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} - static constexpr uint32_t capacity() { return SIZE; } - - static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + __host__ static size_t wsSize() { #ifdef __CUDACC__ - __host__ - static size_t wsSize() { - uint32_t * v =nullptr; - void * d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; + uint32_t *v = nullptr; + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); return temp_storage_bytes; - } +#else + return 0; #endif - + } static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1<> shift) & mask; } - void zero() { - for (auto & i : off) + __host__ __device__ void zero() { + for (auto &i : off) i = 0; } - static __host__ 
__device__ - __forceinline__ - uint32_t atomicIncrement(Counter & x) { - #ifdef __CUDA_ARCH__ + __host__ __device__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off + i, co.off[i]); +#else + off[i] += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ return atomicAdd(&x, 1); - #else +#else return x++; - #endif +#endif } - static __host__ __device__ - __forceinline__ - uint32_t atomicDecrement(Counter & x) { - #ifdef __CUDA_ARCH__ + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ return atomicSub(&x, 1); - #else +#else return x--; - #endif +#endif } - __host__ __device__ - __forceinline__ - void countDirect(T b) { - assert(b0); - bins[w-1] = j; + assert(w > 0); + bins[w - 1] = j; } - -#ifdef __CUDACC__ - __device__ - __forceinline__ - uint32_t bulkFill(AtomicPairCounter & apc, index_type const * v, uint32_t n) { + __device__ __host__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); off[c.m] = c.n; - for(int j=0; j=totbins()) return; - off[i]=n; + __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } } - -#endif - - - __host__ __device__ - __forceinline__ - void count(T t) { + __host__ __device__ __forceinline__ void count(T t) { uint32_t b = bin(t); - assert(b0); - bins[w-1] = j; + assert(w > 0); + bins[w - 1] = j; } - - __host__ __device__ - __forceinline__ - void count(T t, uint32_t nh) { + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { uint32_t b = bin(t); - assert(b0); - bins[w-1] = j; + assert(w > 0); + bins[w - 1] = j; } -#ifdef __CUDACC__ - __device__ - __forceinline__ - void finalize(Counter * ws) { - assert(off[totbins()-1]==0); - blockPrefixScan(off,totbins(),ws); - assert(off[totbins()-1]==off[totbins()-2]); - } - __host__ -#endif - void finalize() { - assert(off[totbins()-1]==0); - for(uint32_t i=1; i +template using OneToManyAssoc = HistoContainer; -#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index acad1e2cabdc1..20bea6ecb6f7a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -1,55 +1,169 @@ -#ifndef HeterogeneousCore_CUDAUtilities_prefixScan_h -#define HeterogeneousCore_CUDAUtilities_prefixScan_h +#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -template -__device__ -void -__forceinline__ -warpPrefixScan(T * c, uint32_t i, uint32_t mask) { - auto x = c[i]; - auto laneId = threadIdx.x & 0x1f; - #pragma unroll - for( int offset = 1 ; offset < 32 ; offset <<= 1 ) { - auto y = __shfl_up_sync(mask,x, offset); - if(laneId >= offset) x += y; - } - c[i] = x; +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ 
warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { + // ci and co may be the same + auto x = ci[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + co[i] = x; } +#endif -// limited to 32*32 elements.... -template -__device__ -void -__forceinline__ -blockPrefixScan(T * c, uint32_t size, T* ws) { - assert(size<=1024); - assert(0==blockDim.x%32); +//same as above may remove +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + c[i] = x; +} +#endif +// limited to 32*32 elements.... +template +__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff,first +__device__ __host__ void __forceinline__ blockPrefixScan(T* c, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif +} + +// limited to 1024*1024 elements.... +template +__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) + assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (gridDim.x - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... 
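+  // psum[k - 1] now holds the sum of all elements in blocks 0..k-1, so adding it
+  // to every element of block k (k >= 1) turns the per-block scans into a global one.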
+ int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } +} + +#endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 6abf71bbcebf4..fe67e2a99a602 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -63,6 +63,10 @@ + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu index 6d9cfe69b9489..3d452e851ba12 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu @@ -1,161 +1 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" - -constexpr uint32_t MaxElem=64000; -constexpr uint32_t MaxTk=8000; -constexpr uint32_t MaxAssocs = 4*MaxTk; -using Assoc = OneToManyAssoc; - -using TK = std::array; - -__global__ -void count(TK const * __restrict__ tk, Assoc * __restrict__ assoc, uint32_t n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto k = i/4; - auto j = i - 4*k; - assert(j<4); - if (k>=n) return; - if (tk[k][j]countDirect(tk[k][j]); -} - -__global__ -void fill(TK const * __restrict__ tk, Assoc * __restrict__ assoc, uint32_t n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto k = i/4; - auto j = i - 4*k; - assert(j<4); - if (k>=n) return; - if (tk[k][j]fillDirect(tk[k][j],k); - -} - -__global__ -void verify(Assoc * __restrict__ assoc) { - assert(assoc->size()=n) return; - auto m = tk[k][3]bulkFill(*apc,&tk[k][0],m); -} - -int main() { - exitSansCUDADevices(); - - std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' '<< Assoc::wsSize() << std::endl; - - auto current_device = cuda::device::current::get(); - - std::mt19937 eng; - - std::geometric_distribution rdm(0.8); - - constexpr uint32_t N = 4000; - - std::vector> tr(N); - - // fill with "index" to element - long long ave=0; - int imax=0; - auto n=0U; - auto z=0U; - auto nz=0U; - for (auto i=0U; i<4U; ++i) { - auto j=0U; - while(j[]>(current_device, N); - assert(v_d.get()); - auto a_d = cuda::memory::device::make_unique(current_device,1); - auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); - - cuda::memory::copy(v_d.get(), tr.data(), N*sizeof(std::array)); - - cudautils::launchZero(a_d.get(),0); - - auto nThreads = 256; - auto nBlocks = (4*N + nThreads - 1) / nThreads; - - count<<>>(v_d.get(),a_d.get(),N); - - cudautils::launchFinalize(a_d.get(),ws_d.get(),0); - verify<<<1,1>>>(a_d.get()); - fill<<>>(v_d.get(),a_d.get(),N); - - Assoc la; - cuda::memory::copy(&la,a_d.get(),sizeof(Assoc)); - std::cout << la.size() << std::endl; - imax = 0; - ave=0; - z=0; - for (auto i=0U; i>>(dc_d,v_d.get(),a_d.get(),N); - cudautils::finalizeBulk<<>>(dc_d,a_d.get()); - - AtomicPairCounter dc; - cudaMemcpy(&dc, dc_d, 
sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); - - std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; - - cuda::memory::copy(&la,a_d.get(),sizeof(Assoc)); - std::cout << la.size() << std::endl; - imax = 0; - ave=0; - for (auto i=0U; i +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; +using Assoc = OneToManyAssoc; + +using SmallAssoc = OneToManyAssoc; + +using Multiplicity = OneToManyAssoc; + +using TK = std::array; + +__global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) { + __shared__ Multiplicity::CountersOnly local; + if (threadIdx.x == 0) + local.zero(); + __syncthreads(); + local.countDirect(2 + i % 4); + __syncthreads(); + if (threadIdx.x == 0) + assoc->add(local); + } +} + +__global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) + assoc->countDirect(2 + i % 4); +} + +__global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { + auto first = blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + assert(m1->off[i] == m2->off[i]); +} + +__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->countDirect(tk[k][j]); + } +} + +__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->fillDirect(tk[k][j], k); + } +} + +__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } + +template +__global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int k = first; k < n; k += gridDim.x * blockDim.x) { + auto m = tk[k][3] < MaxElem ? 
4 : 3; + assoc->bulkFill(*apc, &tk[k][0], m); + } +} + +template +__global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { + if (apc->get().m >= Assoc::nbins()) + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + assert(assoc->size() < Assoc::capacity()); +} + +int main() { +#ifdef __CUDACC__ + exitSansCUDADevices(); + auto current_device = cuda::device::current::get(); +#else + // make sure cuda emulation is working + std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; + std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl; + std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl; + assert(threadIdx.x == 0); + assert(threadIdx.y == 0); + assert(threadIdx.z == 0); + assert(blockIdx.x == 0); + assert(blockIdx.y == 0); + assert(blockIdx.z == 0); + assert(blockDim.x == 1); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(gridDim.x == 1); + assert(gridDim.y == 1); + assert(gridDim.z == 1); +#endif + + std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl; + std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' ' + << SmallAssoc::wsSize() << std::endl; + + std::mt19937 eng; + + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + std::vector> tr(N); + + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + +#ifdef __CUDACC__ + auto v_d = cuda::memory::device::make_unique[]>(current_device, N); + assert(v_d.get()); + auto a_d = cuda::memory::device::make_unique(current_device, 1); + auto sa_d = cuda::memory::device::make_unique(current_device, 1); + auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); + + cuda::memory::copy(v_d.get(), tr.data(), N * sizeof(std::array)); +#else + auto a_d = std::make_unique(); + auto sa_d = std::make_unique(); + auto v_d = tr.data(); +#endif + + cudautils::launchZero(a_d.get(), 0); + +#ifdef __CUDACC__ + auto nThreads = 256; + auto nBlocks = (4 * N + nThreads - 1) / nThreads; + + count<<>>(v_d.get(), a_d.get(), N); + + cudautils::launchFinalize(a_d.get(), ws_d.get(), 0); + verify<<<1, 1>>>(a_d.get()); + fill<<>>(v_d.get(), a_d.get(), N); +#else + count(v_d, a_d.get(), N); + cudautils::launchFinalize(a_d.get()); + verify(a_d.get()); + fill(v_d, a_d.get(), N); +#endif + + Assoc la; + +#ifdef __CUDACC__ + cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); +#else + memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier +#endif + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la.size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = 
std::max(imax, int(x)); + } + assert(0 == la.size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) + AtomicPairCounter* dc_d; + AtomicPairCounter dc; + +#ifdef __CUDACC__ + cudaMalloc(&dc_d, sizeof(AtomicPairCounter)); + cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + nBlocks = (N + nThreads - 1) / nThreads; + fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); + cudautils::finalizeBulk<<>>(dc_d, a_d.get()); + verifyBulk<<<1, 1>>>(a_d.get(), dc_d); + + cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); + cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); + + cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); + cudautils::finalizeBulk<<>>(dc_d, sa_d.get()); + verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); + +#else + dc_d = &dc; + fillBulk(dc_d, v_d, a_d.get(), N); + cudautils::finalizeBulk(dc_d, a_d.get()); + verifyBulk(a_d.get(), dc_d); + memcpy(&la, a_d.get(), sizeof(Assoc)); + + AtomicPairCounter sdc; + fillBulk(&sdc, v_d, sa_d.get(), N); + cudautils::finalizeBulk(&sdc, sa_d.get()); + verifyBulk(sa_d.get(), &sdc); + +#endif + + std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la.size(i); + if (!(x == 4 || x == 3)) + std::cout << i << ' ' << x << std::endl; + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters +#ifdef __CUDACC__ + auto m1_d = cuda::memory::device::make_unique(current_device, 1); + auto m2_d = cuda::memory::device::make_unique(current_device, 1); +#else + auto m1_d = std::make_unique(); + auto m2_d = std::make_unique(); +#endif + cudautils::launchZero(m1_d.get(), 0); + cudautils::launchZero(m2_d.get(), 0); + +#ifdef __CUDACC__ + nBlocks = (4 * N + nThreads - 1) / nThreads; + countMulti<<>>(v_d.get(), m1_d.get(), N); + countMultiLocal<<>>(v_d.get(), m2_d.get(), N); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudautils::launchFinalize(m1_d.get(), ws_d.get(), 0); + cudautils::launchFinalize(m2_d.get(), ws_d.get(), 0); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); +#else + countMulti(v_d, m1_d.get(), N); + countMultiLocal(v_d, m2_d.get(), N); + verifyMulti(m1_d.get(), m2_d.get()); + + cudautils::launchFinalize(m1_d.get()); + cudautils::launchFinalize(m2_d.get()); + verifyMulti(m1_d.get(), m2_d.get()); +#endif + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 29c80a62e3946..f8266898323cd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -1,60 +1,73 @@ #include -#include +#include #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -template -__global__ -void testPrefixScan(uint32_t size) { - +template +__global__ void testPrefixScan(uint32_t size) { __shared__ T ws[32]; __shared__ T c[1024]; + __shared__ T co[1024]; + auto first = threadIdx.x; - for (auto i=first; i -__global__ -void 
testWarpPrefixScan(uint32_t size) { - assert(size<=32); +template +__global__ void testWarpPrefixScan(uint32_t size) { + assert(size <= 32); __shared__ T c[1024]; + __shared__ T co[1024]; auto i = threadIdx.x; - c[i]=1; + c[i] = 1; __syncthreads(); - warpPrefixScan(c,i,0xffffffff); - __syncthreads(); + warpPrefixScan(c, co, i, 0xffffffff); + warpPrefixScan(c, i, 0xffffffff); + __syncthreads(); - assert(1==c[0]); - if(i!=0) { - if (c[i]!=c[i-1]+1) printf("failed %d %d %d: %d %d\n",size, i, blockDim.x, c[i],c[i-1]); - assert(c[i]==c[i-1]+1); assert(c[i]==i+1); + assert(1 == c[0]); + assert(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf("failed %d %d %d: %d %d\n", size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); } } -__global__ -void init(uint32_t * v, uint32_t val, uint32_t n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i<<<1,32>>>(32); + testWarpPrefixScan<<<1, 32>>>(32); cudaDeviceSynchronize(); // std::cout << "warp 16" << std::endl; - testWarpPrefixScan<<<1,32>>>(16); + testWarpPrefixScan<<<1, 32>>>(16); cudaDeviceSynchronize(); // std::cout << "warp 5" << std::endl; - testWarpPrefixScan<<<1,32>>>(5); + testWarpPrefixScan<<<1, 32>>>(5); cudaDeviceSynchronize(); std::cout << "block level" << std::endl; - for(int bs=32; bs<=1024; bs+=32) { -// std::cout << "bs " << bs << std::endl; - for (int j=1;j<=1024; ++j) { -// std::cout << j << std::endl; - testPrefixScan<<<1,bs>>>(j); - cudaDeviceSynchronize(); - testPrefixScan<<<1,bs>>>(j); - cudaDeviceSynchronize(); - }} + for (int bs = 32; bs <= 1024; bs += 32) { + // std::cout << "bs " << bs << std::endl; + for (int j = 1; j <= 1024; ++j) { + // std::cout << j << std::endl; + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + } + } cudaDeviceSynchronize(); - // test cub - std::cout << "cub" << std::endl; -// Declare, allocate, and initialize device-accessible pointers for input and output - int num_items = 10000; - uint32_t *d_in; - uint32_t *d_out; - - cudaMalloc(&d_in,num_items*sizeof(uint32_t)); - // cudaMalloc(&d_out,num_items*sizeof(uint32_t)); - - d_out = d_in; - - auto nthreads = 256; - auto nblocks = (num_items + nthreads - 1) / nthreads; - - init<<>>(d_in, 1, num_items); - - // Determine temporary device storage requirements for inclusive prefix sum - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - - std::cout << "temp storage " << temp_storage_bytes << std::endl; - - // Allocate temporary storage for inclusive prefix sum - // fake larger ws already available - temp_storage_bytes *=8; - cudaMalloc(&d_temp_storage, temp_storage_bytes); - std::cout << "temp storage " << temp_storage_bytes << std::endl; - // Run inclusive prefix sum - CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); - std::cout << "temp storage " << temp_storage_bytes << std::endl; - - verify<<>>(d_out, num_items); - cudaDeviceSynchronize(); - + int num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + // test multiblock + std::cout << "multiblok" << std::endl; + // Declare, allocate, and initialize device-accessible pointers for input and output + num_items *= 10; + uint32_t *d_in; + uint32_t *d_out1; + uint32_t *d_out2; + + cudaMalloc(&d_in, num_items * sizeof(uint32_t)); + cudaMalloc(&d_out1, num_items * sizeof(uint32_t)); + 
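+    // d_out1 receives the multiBlockPrefixScan result, d_out2 the cub::DeviceScan reference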
cudaMalloc(&d_out2, num_items * sizeof(uint32_t)); + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // the block counter + int32_t *d_pc; + cudaMalloc(&d_pc, sizeof(int32_t)); + cudaMemset(d_pc, 0, 4); + + nthreads = 1024; + nblocks = (num_items + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + verify<<>>(d_out1, num_items); + cudaDeviceSynchronize(); + + // test cub + std::cout << "cub" << std::endl; + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *= 8; + cudaMalloc(&d_temp_storage, temp_storage_bytes); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out2, num_items); + cudaDeviceSynchronize(); + } // ksize return 0; }
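
For reference, a minimal host-side usage sketch of the reworked AtomicPairCounter (not part of the patch). It assumes, as the new OneToManyAssoc_t.cpp test does, that cudaCompat.h maps __host__/__device__/__forceinline__ to host-friendly definitions for plain C++ compilation and that the interface headers above are on the include path. Each add(i) records one "One" holding i "Many" and returns the previous packed value, which is exactly how HistoContainer::bulkFill() consumes it:

// Hedged sketch, not part of the patch: exercises the host fallback of
// AtomicPairCounter::add() introduced above. Assumes cudaCompat.h provides
// the __host__/__device__/__forceinline__ macros when compiled without nvcc,
// as the new OneToManyAssoc_t.cpp test relies on.
#include <cassert>
#include <cstdint>
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"

int main() {
  AtomicPairCounter apc(0);

  // Mimic what bulkFill() does: each call registers one "One" made of sz "Many".
  uint32_t sizes[] = {3, 4, 4, 3};
  for (auto sz : sizes) {
    auto prev = apc.add(sz);  // value *before* the update
    std::cout << "one #" << prev.m << " starts at offset " << prev.n << '\n';
  }

  auto c = apc.get();
  assert(c.m == 4u);                 // four "Ones" registered
  assert(c.n == 3u + 4u + 4u + 3u);  // total number of "Many"
  std::cout << "final counter value " << c.n << ' ' << c.m << std::endl;
  return 0;
}

Note that the host branch of add() is a plain read-modify-write rather than an atomic, so it only serves the single-threaded compatibility path exercised by the new .cpp test; device code still goes through atomicAdd.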