From 6b9ac389544692f3ff3df0393d6ffcc7e6f21920 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 17 Aug 2018 16:16:42 +0200 Subject: [PATCH] Add optional flags to disable SOA->legacy conversion and GPU->CPU transfer (cms-patatrack#132) Always produce the CPU cluster and rechit collections, since they are needed anyway. Add transfer and conversion flags to clusterizer, rechits and CA. Add a skeleton for the future pixel track producer. Add customize functions to disable conversions to legacy formats, and to disable unnecessary GPU->CPU transfers. --- .../python/RecoLocalTracker_cff.py | 14 +----- .../plugins/SiPixelRawToClusterGPUKernel.cu | 49 ++++++++++--------- .../plugins/SiPixelRawToClusterGPUKernel.h | 2 +- .../SiPixelClusterizerPreSplitting_cfi.py | 7 ++- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index 9fd64188802fc..ad975fa183566 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -17,19 +17,9 @@ striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits) trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer) -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import * -from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * -from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * - from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import * -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter -gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone()) - -_pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy() -_pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting) -_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting) -gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu) +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous +gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous) from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import * from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import * diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 0ab7682911f1c..f3242a11d7ae6 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -614,7 +614,7 @@ namespace pixelgpudetails { const SiPixelGainForHLTonGPU *gains, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool debug, + bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, cuda::stream_t<>& stream) { nDigis = wordCounter; @@ -646,25 +646,26 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // copy data to host variable - - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). - // - //error_h->set_data(data_h); - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + if(transferToCPU) { + cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + if (includeErrors) { + cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + // If we want to transfer only the minimal amount of data, we + // need a synchronization point. A single ExternalWork (of + // SiPixelRawToClusterHeterogeneous) does not help because it is + // already used to synchronize the data movement. So we'd need + // two ExternalWorks (or explicit use of TBB tasks). The + // prototype of #100 would allow this easily (as there would be + // two ExternalWorks). + // + //error_h->set_data(data_h); + //cudaCheck(cudaStreamSynchronize(stream.id())); + //int size = error_h->size(); + //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + } } // End of Raw2Digi and passing data for cluserisation @@ -682,7 +683,9 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // calibrated adc - cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + if(transferToCPU) { + cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + } /* std::cout @@ -730,7 +733,9 @@ namespace pixelgpudetails { // clusters - cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + if(transferToCPU) { + cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 9cff737140a90..ca8bd73106c2c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -173,7 +173,7 @@ namespace pixelgpudetails { void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool debug, + bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); auto getProduct() { diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index 4e6ff591fb78a..bb0bb85697a99 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -4,5 +4,8 @@ from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters siPixelClustersPreSplitting = _siPixelClusters.clone() -# In principle we could remove `siPixelClustersPreSplitting` from the `pixeltrackerlocalreco` -# sequence when the `gpu` modufier is active; for the time being we keep it for simplicity. +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import siPixelClustersHeterogeneous as _siPixelClustersHeterogeneous +from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * +from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * +gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersHeterogeneous.clone())