Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster2TP assoc on GPU #105

Merged
merged 20 commits into from
Jul 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,6 @@
#include <atomic>
#endif // __CUDA_ARCH__

#ifdef __CUDACC__
#include <cuda_runtime.h>
#else
#define __device__
#define __global__
#define __host__
#endif // __CUDACC__

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"

#ifdef __CUDACC__
Expand Down
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
#define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h

#include <cuda_runtime.h>


#include <utility>

// reimplementation of std algorithms able to compile with CUDA and run on GPUs,
Expand All @@ -10,6 +13,7 @@ namespace cuda_std {

template<typename T = void>
struct less {
__device__ __host__
constexpr bool operator()(const T &lhs, const T &rhs) const {
return lhs < rhs;
}
Expand All @@ -18,10 +22,12 @@ namespace cuda_std {
template<>
struct less<void> {
template<typename T, typename U>
__device__ __host__
constexpr bool operator()(const T &lhs, const U &rhs ) const { return lhs < rhs;}
};

template<typename RandomIt, typename T, typename Compare = less<T>>
__device__ __host__
constexpr
RandomIt lower_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
{
Expand All @@ -43,6 +49,7 @@ namespace cuda_std {
}

template<typename RandomIt, typename T, typename Compare = less<T>>
__device__ __host__
constexpr
RandomIt upper_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
{
Expand All @@ -64,6 +71,7 @@ namespace cuda_std {
}

template<typename RandomIt, typename T, typename Compare = cuda_std::less<T>>
__device__ __host__
constexpr
RandomIt binary_find(RandomIt first, RandomIt last, const T& value, Compare comp={})
{
Expand Down
16 changes: 8 additions & 8 deletions HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
<use name="HeterogeneousCore/CUDAUtilities"/>

<bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector"/>

<bin file="cudastdAlgorithm_t.cpp"/>
<bin file="cudastdAlgorithm_t.cpp">
</bin>

<bin file="cudastdAlgorithm_t.cu" name="gpuCudastdAlgorithm_t">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
</bin>

<bin file="radixSort_t.cu" name="gpuRadixSort_t">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
</bin>

<bin file="radixSort_t.cu" name="gpuRadixSort_debug">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<flags CUDA_FLAGS="-g -G"/>
</bin>

<bin file="HistoContainer_t.cpp"/>
<bin file="HistoContainer_t.cpp">
</bin>

<bin file="HistoContainer_t.cu" name="gpuHistoContainer_t">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
</bin>

<bin file="HistoContainer_t.cpp"/>
<bin file="HistoContainer_t.cu" name="gpuHistoContainer_debug">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<flags CUDA_FLAGS="-g -G"/>
</bin>
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
cudaMallocHost(&word, sizeof(unsigned int)*WSIZE);
cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE);
Expand Down Expand Up @@ -90,6 +90,13 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) ));

cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
gpuProduct = getProduct();
assert(xx_d==gpuProduct.xx_d);

cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));

}

SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
Expand All @@ -111,6 +118,10 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(clus_d));
cudaCheck(cudaFree(clusInModule_d));
cudaCheck(cudaFree(moduleId_d));

cudaCheck(cudaFree(gpuProduct_d));


}

void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,22 @@ namespace pixelgpudetails {
(adc << thePacking.adc_shift);
}

constexpr
uint32_t pixelToChannel( int row, int col) {
constexpr Packing thePacking = packing();
return (row << thePacking.column_width) | col;
}


using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;


class SiPixelRawToClusterGPUKernel {
public:
SiPixelRawToClusterGPUKernel();

using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;

SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
~SiPixelRawToClusterGPUKernel();


Expand All @@ -170,8 +180,9 @@ namespace pixelgpudetails {
auto getProduct() const {
return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
nDigis, nModulesActive,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
gpuProduct_d,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
nDigis, nModulesActive
};
}

Expand All @@ -181,6 +192,11 @@ namespace pixelgpudetails {
unsigned char *fedId_h = nullptr; // to hold fed index for each word

// output
GPUProduct gpuProduct;
GPUProduct * gpuProduct_d;

// FIXME cleanup all these are in the gpuProduct above...

uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output
uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
pixelgpudetails::error_obj *data_h = nullptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c
// -----------------------------------------------------------------------------
void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
// Allocate GPU resources here
gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>();
gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>(cudaStream);
gpuModulesToUnpack_ = std::make_unique<SiPixelFedCablingMapGPUWrapper::ModulesToUnpack>();
}

Expand Down Expand Up @@ -523,6 +523,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv

void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) {
auto output = std::make_unique<GPUProduct>(gpuAlgo_->getProduct());
assert(output->me_d);
ev.put<Output>(std::move(output), [this](const GPUProduct& gpu, CPUProduct& cpu) {
this->convertGPUtoCPU(gpu, cpu);
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ namespace siPixelRawToClusterHeterogeneousProduct {
{ }
};

// FIXME split in two
struct GPUProduct {
// Needed for digi and cluster CPU output
uint32_t const * pdigi_h = nullptr;
Expand All @@ -44,9 +45,9 @@ namespace siPixelRawToClusterHeterogeneousProduct {
uint16_t const * adc_h = nullptr;
GPU::SimpleVector<error_obj> const * error_h = nullptr;

GPUProduct const * me_d = nullptr;

// Needed for GPU rechits
uint32_t nDigis;
uint32_t nModules;
uint16_t const * xx_d;
uint16_t const * yy_d;
uint16_t const * adc_d;
Expand All @@ -55,6 +56,10 @@ namespace siPixelRawToClusterHeterogeneousProduct {
int32_t const * clus_d;
uint32_t const * clusInModule_d;
uint32_t const * moduleId_d;

uint32_t nDigis;
uint32_t nModules;

};

using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
Expand Down
7 changes: 4 additions & 3 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,10 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));

cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

}
Expand All @@ -59,7 +60,7 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(gpu_.sortIndex_d));
cudaCheck(cudaFree(gpu_.mr_d));
cudaCheck(cudaFree(gpu_.mc_d));
// cudaCheck(cudaFree(gpu_.hist_d));
cudaCheck(cudaFree(gpu_.hist_d));

cudaCheck(cudaFree(gpu_d));
}
Expand Down Expand Up @@ -119,7 +120,7 @@ namespace pixelgpudetails {
// for timing test
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);

// fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());


}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <cstdint>
#include <vector>

// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"


namespace siPixelRecHitsHeterogeneousProduct {
Expand All @@ -31,22 +31,26 @@ namespace siPixelRecHitsHeterogeneousProduct {
uint16_t * mr_d;
uint16_t * mc_d;

// using Hist = HistoContainer<int16_t,7,8>;
// Hist * hist_d;
using Hist = HistoContainer<int16_t,7,8>;
Hist * hist_d;

HitsOnGPU const * me_d=nullptr;
};


struct HitsOnCPU {
HitsOnCPU() = default;
explicit HitsOnCPU(uint32_t nhits) :
charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits){}
charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits),
nHits(nhits){}
uint32_t hitsModuleStart[2001];
std::vector<int32_t> charge;
std::vector<float> xl, yl;
std::vector<float> xe, ye;
std::vector<uint16_t> mr;
std::vector<uint16_t> mc;


uint32_t nHits;
HitsOnGPU const * gpu_d=nullptr;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@

#include "CAHitQuadrupletGeneratorGPU.h"

// gpu
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"



namespace {
void fillNtuplets(RegionsSeedingHitSets::RegionFiller &seedingHitSetsFiller,
const OrderedHitSeeds &quadruplets) {
Expand All @@ -31,6 +36,10 @@ class CAHitNtupletHeterogeneousEDProducer
: public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
heterogeneous::GPUCuda, heterogeneous::CPU>> {
public:

using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;


CAHitNtupletHeterogeneousEDProducer(const edm::ParameterSet &iConfig);
~CAHitNtupletHeterogeneousEDProducer() = default;

Expand All @@ -50,6 +59,9 @@ class CAHitNtupletHeterogeneousEDProducer
private:
edm::EDGetTokenT<IntermediateHitDoublets> doubletToken_;

edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;


edm::RunningAverage localRA_;
CAHitQuadrupletGeneratorGPU GPUGenerator_;
CAHitQuadrupletGenerator CPUGenerator_;
Expand All @@ -63,6 +75,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer(
: HeterogeneousEDProducer(iConfig),
doubletToken_(consumes<IntermediateHitDoublets>(
iConfig.getParameter<edm::InputTag>("doublets"))),
tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
GPUGenerator_(iConfig, consumesCollector()),
CPUGenerator_(iConfig, consumesCollector()) {
produces<RegionsSeedingHitSets>();
Expand All @@ -73,6 +86,9 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions(
edm::ParameterSetDescription desc;

desc.add<edm::InputTag>("doublets", edm::InputTag("hitPairEDProducer"));

desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));

CAHitQuadrupletGeneratorGPU::fillDescriptions(desc);
HeterogeneousEDProducer::fillPSetDescription(desc);
auto label = "caHitQuadrupletHeterogeneousEDProducer";
Expand Down Expand Up @@ -106,6 +122,13 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(

seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();

edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
auto const & gHits = *gh;
// auto nhits = gHits.nHits;

GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id());

if (regionDoublets.empty()) {
emptyRegionDoublets = true;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "GPUCACell.h"
#include "CAHitQuadrupletGeneratorGPU.h"

#include "gpuPixelDoublets.h"

__global__ void
kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_,
const GPULayerDoublets *gpuDoublets,
Expand Down Expand Up @@ -490,3 +492,18 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cud
}
return quadsInterface;
}




void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) {
auto nhits = hh.nHits;

float phiCut=0.06;
int threadsPerBlock = 256;
int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;

gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(hh.gpu_d,phiCut);


}
Loading