Skip to content

Commit

Permalink
Heterogeneous ClusterTPAssociation (#105)
Browse files Browse the repository at this point in the history
Implement a heterogeneous Cluster-to-TrackingParticle associator running on the GPU.
  • Loading branch information
VinInn authored and fwyzard committed Oct 23, 2020
1 parent 4b333c1 commit 1603b2c
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
cudaMallocHost(&word, sizeof(unsigned int)*WSIZE);
cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE);
Expand Down Expand Up @@ -90,6 +90,12 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) ));

cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
gpuProduct = getProduct();
assert(xx_d==gpuProduct.xx_d);

cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
}

SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
Expand All @@ -111,6 +117,7 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(clus_d));
cudaCheck(cudaFree(clusInModule_d));
cudaCheck(cudaFree(moduleId_d));
cudaCheck(cudaFree(gpuProduct_d));
}

void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
Expand Down Expand Up @@ -478,7 +485,7 @@ namespace pixelgpudetails {
XX[gIndex] = 0; // 0 is an indicator of a noise/dead channel
YY[gIndex] = 0; // skip these pixels during clusterization
ADC[gIndex] = 0;
continue ; // 0: bad word
continue; // 0: bad word
}

uint32_t link = getLink(ww); // Extract link
Expand Down Expand Up @@ -521,9 +528,9 @@ namespace pixelgpudetails {
// endcap ids
layer = 0;
panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
//disk = (rawId >> diskStartBit_) & diskMask_ ;
//disk = (rawId >> diskStartBit_) & diskMask_;
side = (panel == 1)? -1 : 1;
//blade = (rawId>>bladeStartBit_) & bladeMask_;
//blade = (rawId >> bladeStartBit_) & bladeMask_;
}

// ***special case of layer 1 to be handled here
Expand Down Expand Up @@ -558,8 +565,8 @@ namespace pixelgpudetails {
}

pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
XX[gIndex] = globalPix.row ; // origin shifting by 1 0-159
YY[gIndex] = globalPix.col ; // origin shifting by 1 0-415
XX[gIndex] = globalPix.row; // origin shifting by 1 0-159
YY[gIndex] = globalPix.col; // origin shifting by 1 0-415
ADC[gIndex] = getADC(ww);
pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]);
moduleId[gIndex] = detId.moduleId;
Expand All @@ -583,7 +590,6 @@ namespace pixelgpudetails {
const int threadsPerBlock = 512;
const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all


assert(0 == wordCounter%2);
// wordCounter is the total number of words in each event to be transferred to the device
cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
Expand Down Expand Up @@ -630,7 +636,6 @@ namespace pixelgpudetails {
int threadsPerBlock = 256;
int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;


gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream.id()>>>(
moduleInd_d,
xx_d, yy_d, adc_d,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ namespace pixelgpudetails {
class Packing {
public:
using PackedDigiType = uint32_t;

// Constructor: pre-computes masks and shifts from field widths
__host__ __device__
inline
Expand Down Expand Up @@ -144,22 +144,32 @@ namespace pixelgpudetails {
(adc << thePacking.adc_shift);
}

// Pack a pixel's (row, col) position into a single channel word: the row is
// shifted left past the column field and the column occupies the low bits,
// using the field widths pre-computed by packing().
constexpr uint32_t pixelToChannel(int row, int col) {
  constexpr Packing channelLayout = packing();
  return (row << channelLayout.column_width) | col;
}


using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;


class SiPixelRawToClusterGPUKernel {
public:
SiPixelRawToClusterGPUKernel();

using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;

SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
~SiPixelRawToClusterGPUKernel();


SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete;
SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete;
SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete;
SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete;

void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);

// Not really very async yet...
void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
const SiPixelGainForHLTonGPU *gains,
Expand All @@ -170,8 +180,9 @@ namespace pixelgpudetails {
auto getProduct() const {
return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
nDigis, nModulesActive,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
gpuProduct_d,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
nDigis, nModulesActive
};
}

Expand All @@ -181,6 +192,11 @@ namespace pixelgpudetails {
unsigned char *fedId_h = nullptr; // to hold fed index for each word

// output
GPUProduct gpuProduct;
GPUProduct * gpuProduct_d;

// FIXME cleanup all these are in the gpuProduct above...

uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output
uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
pixelgpudetails::error_obj *data_h = nullptr;
Expand Down Expand Up @@ -209,7 +225,7 @@ namespace pixelgpudetails {
uint32_t * clusInModule_d;
uint32_t * moduleId_d;
};

// configuration and memory buffers allocated on the GPU
struct context {
uint32_t * word_d;
Expand All @@ -223,7 +239,7 @@ namespace pixelgpudetails {

GPU::SimpleVector<error_obj> * error_d;
error_obj * data_d;

// these are for the clusterizer (to be moved)
uint32_t * moduleStart_d;
int32_t * clus_d;
Expand Down
20 changes: 8 additions & 12 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));

cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

}

PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
Expand All @@ -59,8 +58,7 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(gpu_.sortIndex_d));
cudaCheck(cudaFree(gpu_.mr_d));
cudaCheck(cudaFree(gpu_.mc_d));
// cudaCheck(cudaFree(gpu_.hist_d));

cudaCheck(cudaFree(gpu_.hist_d));
cudaCheck(cudaFree(gpu_d));
}

Expand All @@ -78,7 +76,7 @@ namespace pixelgpudetails {
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules,
&gpu_.hitsModuleStart_d[1]);

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
Expand All @@ -96,20 +94,20 @@ namespace pixelgpudetails {
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
gpu_.iphi_d,
gpu_.xl_d, gpu_.yl_d,
gpu_.xerr_d, gpu_.yerr_d,
gpu_.xerr_d, gpu_.yerr_d,
gpu_.mr_d, gpu_.mc_d
);

// needed only if hits on CPU are required...
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));

// to be moved to gpu?
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
hitsLayerStart_[10]=nhits;

#ifdef GPU_DEBUG
std::cout << "hit layerStart ";
std::cout << "hit layerStart ";
for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
std::cout << "end:" << hitsLayerStart_[10] << std::endl;
#endif
Expand All @@ -119,9 +117,7 @@ namespace pixelgpudetails {
// for timing test
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);

// fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);


cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
}

HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
Expand Down

0 comments on commit 1603b2c

Please sign in to comment.