From 6b9ac389544692f3ff3df0393d6ffcc7e6f21920 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 17 Aug 2018 16:16:42 +0200
Subject: [PATCH] Add optional flags to disable SOA->legacy conversion and
 GPU->CPU transfer (cms-patatrack#132)

Always produce the CPU cluster and rechit collections, since they are needed anyway.
Add transfer and conversion flags to clusterizer, rechits and CA.
Add a skeleton for the future pixel track producer.
Add customize functions to disable conversions to legacy formats, and to disable unnecessary GPU->CPU transfers.
---
 .../python/RecoLocalTracker_cff.py            | 14 +-----
 .../plugins/SiPixelRawToClusterGPUKernel.cu   | 49 ++++++++++---------
 .../plugins/SiPixelRawToClusterGPUKernel.h    |  2 +-
 .../SiPixelClusterizerPreSplitting_cfi.py     |  7 ++-
 4 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
index 9fd64188802fc..ad975fa183566 100644
--- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
+++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
@@ -17,19 +17,9 @@
 striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits)
 trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer)
 
-from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import *
-from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
-from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *
-
 from Configuration.ProcessModifiers.gpu_cff import gpu
-from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import *
-from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter
-gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone())
-
-_pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy()
-_pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting)
-_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting)
-gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu)
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous
+gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous)
 
 from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
 from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 0ab7682911f1c..f3242a11d7ae6 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -614,7 +614,7 @@ namespace pixelgpudetails {
       const SiPixelGainForHLTonGPU *gains,
       const uint32_t wordCounter, const uint32_t fedCounter,
       bool convertADCtoElectrons,
-      bool useQualityInfo, bool includeErrors, bool debug,
+      bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug,
       cuda::stream_t<>& stream)
   {
     nDigis = wordCounter;
@@ -646,25 +646,26 @@ namespace pixelgpudetails {
     cudaCheck(cudaGetLastError());
 
     // copy data to host variable
-
-    cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-
-    if (includeErrors) {
-      cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id()));
-      cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id()));
-      // If we want to transfer only the minimal amount of data, we
-      // need a synchronization point. A single ExternalWork (of
-      // SiPixelRawToClusterHeterogeneous) does not help because it is
-      // already used to synchronize the data movement. So we'd need
-      // two ExternalWorks (or explicit use of TBB tasks). The
-      // prototype of #100 would allow this easily (as there would be
-      // two ExternalWorks).
-      //
-      //error_h->set_data(data_h);
-      //cudaCheck(cudaStreamSynchronize(stream.id()));
-      //int size = error_h->size();
-      //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id()));
+    if(transferToCPU) {
+      cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+      cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+      if (includeErrors) {
+        cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id()));
+        cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id()));
+        // If we want to transfer only the minimal amount of data, we
+        // need a synchronization point. A single ExternalWork (of
+        // SiPixelRawToClusterHeterogeneous) does not help because it is
+        // already used to synchronize the data movement. So we'd need
+        // two ExternalWorks (or explicit use of TBB tasks). The
+        // prototype of #100 would allow this easily (as there would be
+        // two ExternalWorks).
+        //
+        //error_h->set_data(data_h);
+        //cudaCheck(cudaStreamSynchronize(stream.id()));
+        //int size = error_h->size();
+        //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id()));
+      }
     }
     // End  of Raw2Digi and passing data for cluserisation
 
@@ -682,7 +683,9 @@ namespace pixelgpudetails {
       cudaCheck(cudaGetLastError());
 
       // calibrated adc
-      cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+      if(transferToCPU) {
+        cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+      }
 
       /*
          std::cout
@@ -730,7 +733,9 @@ namespace pixelgpudetails {
 
 
       // clusters
-      cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+      if(transferToCPU) {
+        cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+      }
     } // end clusterizer scope
   }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 9cff737140a90..ca8bd73106c2c 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -173,7 +173,7 @@ namespace pixelgpudetails {
     void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
                            const SiPixelGainForHLTonGPU *gains,
                            const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons,
-                           bool useQualityInfo, bool includeErrors, bool debug,
+                           bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug,
                            cuda::stream_t<>& stream);
 
     auto getProduct() {
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
index 4e6ff591fb78a..bb0bb85697a99 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
@@ -4,5 +4,8 @@
 from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters
 siPixelClustersPreSplitting = _siPixelClusters.clone()
 
-# In principle we could remove `siPixelClustersPreSplitting` from the `pixeltrackerlocalreco` 
-# sequence when the `gpu` modufier is active; for the time being we keep it for simplicity.
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import siPixelClustersHeterogeneous as _siPixelClustersHeterogeneous
+from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
+from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *
+gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersHeterogeneous.clone())