From 25c7697e4b011f1399361b2d78fbfcc8ea9252c4 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sat, 7 Jul 2018 12:22:01 +0200
Subject: [PATCH 01/19] enable sorting hits in layers by histogramming

---
 RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu     | 6 +++---
 .../plugins/siPixelRecHitsHeterogeneousProduct.h            | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 999fdcd6eff19..1e85afd0443e9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,7 +36,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
-//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
+    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
 
     cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
@@ -59,7 +59,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
     cudaCheck(cudaFree(gpu_.mc_d));
-    // cudaCheck(cudaFree(gpu_.hist_d));
+    cudaCheck(cudaFree(gpu_.hist_d));
 
     cudaCheck(cudaFree(gpu_d));
   }
@@ -119,7 +119,7 @@ namespace pixelgpudetails {
     // for timing test
     // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
 
-    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
+    cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
 
 
   }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 6ccd435faa7ba..dfe352f1ed8d5 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <vector>
 
-// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
 
 namespace siPixelRecHitsHeterogeneousProduct {
@@ -31,8 +31,8 @@ namespace siPixelRecHitsHeterogeneousProduct {
      uint16_t * mr_d;
      uint16_t * mc_d;
 
-     // using Hist = HistoContainer<int16_t,7,8>;
-     // Hist * hist_d;
+     using Hist = HistoContainer<int16_t,7,8>;
+     Hist * hist_d;
   };
 
 

From 6160220e6e927cd718d70bfe4f5dc7a775a10cbe Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sat, 7 Jul 2018 16:44:54 +0200
Subject: [PATCH 02/19] clusterSimLink on gpu

---
 .../plugins/SiPixelRawToClusterGPUKernel.cu   |  12 +-
 .../plugins/SiPixelRawToClusterGPUKernel.h    |  18 +-
 .../SiPixelRawToClusterHeterogeneous.cc       |   3 +-
 .../siPixelRawToClusterHeterogeneousProduct.h |   3 +
 .../SiPixelRecHits/plugins/PixelRecHits.cu    |   1 +
 .../siPixelRecHitsHeterogeneousProduct.h      |   2 +
 .../plugins/BuildFile.xml                     |   4 +-
 .../plugins/ClusterSLOnGPU.cu                 | 163 ++++++++++++++++++
 .../plugins/ClusterSLOnGPU.h                  |  33 ++++
 9 files changed, 235 insertions(+), 4 deletions(-)
 create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
 create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 29e5e82049b5c..92ea71ae5ac05 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -38,7 +38,7 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
+  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
     int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
@@ -90,6 +90,12 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
+    gpuProduct = getProduct();
+      
+    cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
+
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -111,6 +117,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
+
+    cudaCheck(cudaFree(gpuProduct_d));
+
+
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 2b0b205c9f536..2552f72584ee2 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -144,12 +144,22 @@ namespace pixelgpudetails {
            (adc << thePacking.adc_shift);
   }
 
+  constexpr
+  uint32_t pixelToChannel( int row, int col) {
+    constexpr Packing thePacking = packing();
+    return (row << thePacking.column_width) | col;
+  }
+
+
   using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;
 
 
   class SiPixelRawToClusterGPUKernel {
   public:
-    SiPixelRawToClusterGPUKernel();
+
+    using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+
+    SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
     ~SiPixelRawToClusterGPUKernel();
 
     
@@ -170,6 +180,7 @@ namespace pixelgpudetails {
     auto getProduct() const {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
+        gpuProduct_d,
         nDigis, nModulesActive,
         xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
       };
@@ -181,6 +192,11 @@ namespace pixelgpudetails {
     unsigned char *fedId_h = nullptr;    // to hold fed index for each word
 
     // output
+    GPUProduct gpuProduct;
+    GPUProduct * gpuProduct_d;
+
+    // FIXME cleanup all these are in the gpuProduct above...
+    
     uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr;                   // host copy of output
     uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
     pixelgpudetails::error_obj *data_h = nullptr;
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
index 196a08b85d861..e2c2dcf828685 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
@@ -425,7 +425,7 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c
 // -----------------------------------------------------------------------------
 void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
   // Allocate GPU resources here
-  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>();
+  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>(cudaStream);
   gpuModulesToUnpack_ = std::make_unique<SiPixelFedCablingMapGPUWrapper::ModulesToUnpack>();
 }
 
@@ -523,6 +523,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv
 
 void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) {
   auto output = std::make_unique<GPUProduct>(gpuAlgo_->getProduct());
+  assert(output->me_d);
   ev.put<Output>(std::move(output), [this](const GPUProduct& gpu, CPUProduct& cpu) {
       this->convertGPUtoCPU(gpu, cpu);
     });
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
index 6d6da10934532..bafe6fb2fed58 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
@@ -36,6 +36,7 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     { }
   };
 
+  // FIXME split in two 
   struct GPUProduct {
     // Needed for digi and cluster CPU output
     uint32_t const * pdigi_h = nullptr;
@@ -44,6 +45,8 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     uint16_t const * adc_h = nullptr;
     GPU::SimpleVector<error_obj> const * error_h = nullptr;
 
+    GPUProduct const * me_d = nullptr;
+
     // Needed for GPU rechits
     uint32_t nDigis;
     uint32_t nModules;
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 1e85afd0443e9..1bf3c7dce3d94 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -39,6 +39,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
 
     cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
+    gpu_.me_d = gpu_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
 
   }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index dfe352f1ed8d5..b48ba2352625c 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -33,6 +33,8 @@ namespace siPixelRecHitsHeterogeneousProduct {
 
      using Hist = HistoContainer<int16_t,7,8>;
      Hist * hist_d;
+    
+     HitsOnGPU const * me_d=nullptr;
   };
 
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
index 7cc02d9313e26..3ef0459183c1d 100644
--- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
@@ -1,5 +1,7 @@
 <use name="SimTracker/TrackerHitAssociation" />
 <use name="SimDataFormats/TrackingAnalysis" />
-<library file="*.cc" name="SimTrackerTrackerHitAssociationPlugins">
+<library file="*.cc *.cu" name="SimTrackerTrackerHitAssociationPlugins">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
new file mode 100644
index 0000000000000..e773887b484d3
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -0,0 +1,163 @@
+#include "ClusterSLOnGPU.h"
+
+// for the "packing"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
+#include<cassert>
+#include<atomic>
+
+/*
+struct ClusterSLGPU {
+ ClusterSLGPU(){alloc();}
+ void alloc();
+
+ ClusterSLGPU * me_d;
+ std::array<uint32_t,4> * links_d;
+ uint32_t * tkId_d;
+ uint32_t * tkId2_d;
+ uint32_t * n1_d;
+ uint32_t * n2_d;
+
+ static constexpr uint32_t MAX_DIGIS = 2000*150;
+ static constexpr uint32_t MaxNumModules = 2000;
+
+};
+*/
+
+
+template<class ForwardIt, class T, class Compare>
+__device__
+ForwardIt lowerBound(ForwardIt first, ForwardIt last, const T& value, Compare comp)
+{
+    ForwardIt it;
+    auto count = last-first;
+ 
+    while (count > 0) {
+        it = first;
+        auto step = count / 2;
+        it+=step;
+        if (comp(*it, value)) {
+            first = ++it;
+            count -= step + 1;
+        }
+        else
+            count = step;
+    }
+    return first;
+}
+
+
+
+__global__
+void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) {
+
+  constexpr uint16_t InvId=9999; // must be > MaxNumModules
+  
+  auto const & dd = *ddp;
+  auto const & hh = *hhp;
+  auto const & sl = *slp;
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  
+  if (i>ndigis) return;
+
+  auto id = dd.moduleInd_d[i];
+  if (InvId==id) return;
+  assert(id<2000);
+
+  auto ch = pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]);
+  auto first = hh.hitsModuleStart_d[id];
+  auto cl = first + dd.clus_d[i];
+  assert(cl<256*2000);
+  
+  const std::array<uint32_t,4> me{{id,ch,0}};
+
+  auto less = [](std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+     return a[0]<b[0] || ( !(b[0]<a[0]) && a[1]<b[1]); // in this context we do not care of [2] 
+  };
+
+  auto equal = [](std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+     return a[0]==b[0] && a[1]==b[1]; // in this context we do not care of [2]
+  };
+
+  auto const * b = sl.links_d;
+  auto const * e = b+n;
+
+  // auto p = cuda_std::lower_bound(b,e,me,less);
+  auto p = lowerBound(b,e,me,less);
+  auto j = p-sl.links_d;
+  assert(j>=0);
+  j = std::min(int(j),int(n-1));
+  if (equal(me,sl.links_d[j])) {
+    //auto const & l = sl.links_d[j];
+    auto const tk = j; // l[2];
+    auto old = atomicCAS(&sl.tkId_d[cl],0,tk);
+    if (0==old ||tk==old) atomicAdd(&sl.n1_d[cl],1);
+    else {
+      auto old = atomicCAS(&sl.tkId2_d[cl],0,tk);
+      if (0==old ||tk==old) atomicAdd(&sl.n2_d[cl],1);
+    }    
+  } 
+  /*
+  else {
+    auto const & k=sl.links_d[j];
+    auto const & kk = j+1<n ? sl.links_d[j+1] : k;
+    printf("digi not found %d:%d closest %d:%d:%d, %d:%d:%d\n",id,ch, k[0],k[1],k[2], kk[0],kk[1],kk[2]);
+  }
+  */
+
+}
+
+
+__global__
+void dumpLink(int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>nhits) return;
+
+  auto const & hh = *hhp;
+  auto const & sl = *slp;
+
+  auto const & tk1 = sl.links_d[sl.tkId_d[i]];
+  auto const & tk2 = sl.links_d[sl.tkId2_d[i]];
+
+  printf("HIT: %d %d %d %d %f %f %f %f %d %d %d %d %d %d %d\n",ev, i, 
+         hh.detInd_d[i], hh.charge_d[i], 
+         hh.xg_d[i],hh.yg_d[i],hh.zg_d[i],hh.rg_d[i],hh.iphi_d[i], 
+         tk1[2],tk1[3],sl.n1_d[i],
+         tk2[2],tk2[3],sl.n2_d[i]
+        );
+
+}
+
+
+
+namespace clusterSLOnGPU {
+
+  struct CSVHeader {
+     CSVHeader() {
+      printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "ev", "ind",
+         "det", "charge",	
+         "xg","yg","zg","rg","iphi", 
+         "tkId","pt","n1","tkId2","pt2","n2" 
+        );
+     }
+
+  };
+  CSVHeader csvHeader;
+
+  std::atomic<int> evId(0);
+
+  void wrapper(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, ClusterSLGPU const & sl, uint32_t n, cuda::stream_t<>& stream) {
+    
+    int ev = ++evId;
+    int threadsPerBlock = 256;
+    int blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
+
+    assert(sl.me_d);
+    simLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n);
+    blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+    dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(ev, hh.gpu_d, nhits, sl.me_d);
+    cudaCheck(cudaGetLastError());
+
+  }
+
+}
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
new file mode 100644
index 0000000000000..8fa8d6cba0c7b
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -0,0 +1,33 @@
+// gpu
+#include <cuda_runtime.h>
+#include <cuda/api_wrappers.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h"
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+struct ClusterSLGPU {
+ ClusterSLGPU(){alloc();}
+ void alloc();
+ void zero(cudaStream_t stream);
+
+ ClusterSLGPU * me_d;
+ std::array<uint32_t,4> * links_d;
+ uint32_t * tkId_d;
+ uint32_t * tkId2_d;
+ uint32_t * n1_d;
+ uint32_t * n2_d;
+
+ static constexpr uint32_t MAX_DIGIS = 2000*150;
+ static constexpr uint32_t MaxNumModules = 2000;
+
+};
+
+namespace clusterSLOnGPU {
+
+  using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+  using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+  using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+  void wrapper(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, ClusterSLGPU const & sl, uint32_t n, cuda::stream_t<>& stream);
+
+}

From c26ca645a1ca26c4a191e4b37c0cbba271e50b80 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sun, 8 Jul 2018 16:43:15 +0200
Subject: [PATCH 03/19] Heterogeneous clTP

---
 .../ClusterTPAssociationHeterogeneous.cc      | 416 ++++++++++++++++++
 1 file changed, 416 insertions(+)
 create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
new file mode 100644
index 0000000000000..984c3e5ddd90d
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -0,0 +1,416 @@
+#include <memory>
+#include <vector>
+#include <utility>
+
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+
+
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "DataFormats/Common/interface/DetSetVector.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "DataFormats/DetId/interface/DetId.h"
+#include "DataFormats/SiPixelDetId/interface/PixelChannelIdentifier.h"
+#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/Phase2TrackerCluster/interface/Phase2TrackerCluster1D.h"
+
+#include "SimDataFormats/Track/interface/SimTrackContainer.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/StripDigiSimLink.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h"
+#include "DataFormats/Phase2TrackerDigi/interface/Phase2TrackerDigi.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticle.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticleFwd.h"
+#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+
+
+// gpu
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h"
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "ClusterSLOnGPU.h"
+
+void
+ClusterSLGPU::alloc() {
+   cudaCheck(cudaMalloc((void**) & links_d,(MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
+
+   cudaCheck(cudaMalloc((void**) & tkId_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & tkId2_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & n1_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & n2_d,(MaxNumModules*256)*sizeof(uint32_t)));
+
+
+   cudaCheck(cudaMalloc((void**) & me_d, sizeof(ClusterSLGPU)));
+   cudaCheck(cudaMemcpy(me_d, this, sizeof(ClusterSLGPU), cudaMemcpyDefault));
+   cudaCheck(cudaDeviceSynchronize());
+
+}
+
+void
+ClusterSLGPU::zero(cudaStream_t stream) {
+   cudaCheck(cudaMemsetAsync(tkId_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(tkId2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(n1_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(n2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+}
+
+class ClusterTPAssociationHeterogeneous : : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
+          heterogeneous::GPUCuda, heterogeneous::CPU>>
+{
+public:
+  typedef std::vector<OmniClusterRef> OmniClusterCollection;
+
+  using PixelDigiClustersH = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster;
+  using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
+
+  explicit ClusterTPAssociationHeterogeneous(const edm::ParameterSet&);
+  ~ClusterTPAssociationHeterogeneous() override;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+
+  void beginStreamGPUCuda(edm::StreamID streamId,
+                          cuda::stream_t<> &cudaStream) override;
+
+  void acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) override;
+  void produceGPUCuda(edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) override;
+  void produceCPU(edm::HeterogeneousEvent &iEvent,
+                  const edm::EventSetup &iSetup) override;
+
+
+  template <typename T>
+  std::vector<std::pair<uint32_t, EncodedEventId> >
+  getSimTrackId(const edm::Handle<edm::DetSetVector<T> >& simLinks, const DetId& detId, uint32_t channel) const;
+
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<StripDigiSimLink> > sistripSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinksToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiPixelCluster> > pixelClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiStripCluster> > stripClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClustersToken_;
+  edm::EDGetTokenT<TrackingParticleCollection> trackingParticleToken_;
+
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuDigis;
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
+
+  ClusterSLGPU slGPU;
+
+};
+
+ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg)
+  : HeterogeneousEDProducer(cfg),
+    sipixelSimLinksToken_(consumes<edm::DetSetVector<PixelDigiSimLink> >(cfg.getParameter<edm::InputTag>("pixelSimLinkSrc"))),
+    sistripSimLinksToken_(consumes<edm::DetSetVector<StripDigiSimLink> >(cfg.getParameter<edm::InputTag>("stripSimLinkSrc"))),
+    siphase2OTSimLinksToken_(consumes<edm::DetSetVector<PixelDigiSimLink> >(cfg.getParameter<edm::InputTag>("phase2OTSimLinkSrc"))),
+    pixelClustersToken_(consumes<edmNew::DetSetVector<SiPixelCluster> >(cfg.getParameter<edm::InputTag>("pixelClusterSrc"))),
+    stripClustersToken_(consumes<edmNew::DetSetVector<SiStripCluster> >(cfg.getParameter<edm::InputTag>("stripClusterSrc"))),
+    phase2OTClustersToken_(consumes<edmNew::DetSetVector<Phase2TrackerCluster1D> >(cfg.getParameter<edm::InputTag>("phase2OTClusterSrc"))),
+    trackingParticleToken_(consumes<TrackingParticleCollection>(cfg.getParameter<edm::InputTag>("trackingParticleSrc"))),
+    tGpuDigis(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
+    tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
+{
+  produces<ClusterTPAssociation>();
+}
+
+ClusterTPAssociationHeterogeneous::~ClusterTPAssociationHeterogeneous() {
+}
+
+void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("simTrackSrc",     edm::InputTag("g4SimHits"));
+  desc.add<edm::InputTag>("pixelSimLinkSrc", edm::InputTag("simSiPixelDigis"));
+  desc.add<edm::InputTag>("stripSimLinkSrc", edm::InputTag("simSiStripDigis"));
+  desc.add<edm::InputTag>("phase2OTSimLinkSrc", edm::InputTag("simSiPixelDigis","Tracker"));
+  desc.add<edm::InputTag>("pixelClusterSrc", edm::InputTag("siPixelClusters"));
+  desc.add<edm::InputTag>("stripClusterSrc", edm::InputTag("siStripClusters"));
+  desc.add<edm::InputTag>("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters"));
+  desc.add<edm::InputTag>("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth"));
+  desc.add<edm::InputTag>("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersHeterogeneous"));
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
+
+  HeterogeneousEDProducer::fillPSetDescription(desc);
+
+  descriptions.add("tpClusterProducerDefault", desc);
+}
+
+
+void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId,
+                          cuda::stream_t<> &cudaStream) {
+
+}
+
+void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
+
+}
+
+void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
+
+}
+ 
+		
+void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+
+    edm::ESHandle<TrackerGeometry> geom;
+    es.get<TrackerDigiGeometryRecord>().get( geom );
+
+
+
+  // Pixel DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
+  //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+  iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
+
+  // SiStrip DigiSimLink
+  edm::Handle<edm::DetSetVector<StripDigiSimLink> > sistripSimLinks;
+  iEvent.getByToken(sistripSimLinksToken_,sistripSimLinks);
+
+  // Phase2 OT DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinks;
+  iEvent.getByToken(siphase2OTSimLinksToken_, siphase2OTSimLinks);
+
+  // Pixel Cluster
+  edm::Handle<edmNew::DetSetVector<SiPixelCluster> > pixelClusters;
+  bool foundPixelClusters = iEvent.getByToken(pixelClustersToken_,pixelClusters);
+
+  // Strip Cluster
+  edm::Handle<edmNew::DetSetVector<SiStripCluster> > stripClusters;
+  bool foundStripClusters = iEvent.getByToken(stripClustersToken_,stripClusters);
+
+  // Phase2 Cluster
+  edm::Handle<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClusters;
+  bool foundPhase2OTClusters = iEvent.getByToken(phase2OTClustersToken_, phase2OTClusters);
+
+  // TrackingParticle
+  edm::Handle<TrackingParticleCollection>  TPCollectionH;
+  iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+
+  auto clusterTPList = std::make_unique<ClusterTPAssociation>(TPCollectionH);
+
+  // prepare temporary map between SimTrackId and TrackingParticle index
+  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
+  for (TrackingParticleCollection::size_type itp = 0;
+                                             itp < TPCollectionH.product()->size(); ++itp) {
+    TrackingParticleRef trackingParticle(TPCollectionH, itp);
+
+    // SimTracks inside TrackingParticle
+    EncodedEventId eid(trackingParticle->eventId());
+    //size_t index = 0;
+    for (std::vector<SimTrack>::const_iterator itrk  = trackingParticle->g4Track_begin(); 
+                                               itrk != trackingParticle->g4Track_end(); ++itrk) {
+      std::pair<uint32_t, EncodedEventId> trkid(itrk->trackId(), eid);
+      //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl;
+      mapping.insert(std::make_pair(trkid, trackingParticle));
+    }
+  }
+
+  //  gpu stuff ------------------------
+
+    std::cout << "In tpsimlink " << mapping.size() << std::endl;
+
+    edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
+    edm::Handle<siPixelRecHitHeterogeneousProduct::GPUProduct> gd;
+    iEvent.getByToken(tGpuDigis, gd);  
+    iEvent.getByToken(tGpuHits, gh);
+    auto const & gDigis = *gd;
+    auto const & gHits = *gh;
+    auto const & dcont = * gDigis.gpu_d;
+    auto const & hh = *gHits.gpu_d;
+    auto ndigis = gDigis.ndigis;
+    auto nhits = gHits.nhits;
+
+    uint32_t nn=0, ng=0, ng10=0;
+    std::vector<std::array<uint32_t,4>> digi2tp;
+    {std::array<uint32_t,4> a{{0,0,0,0}}; digi2tp.push_back(a);} // put at 0 0
+    for (auto const & links : *sipixelSimLinks) {
+      DetId detId(links.detId());
+      const GeomDetUnit * genericDet = geom->idToDetUnit(detId);
+      uint32_t gind = genericDet->index();
+      for (auto const & link : links) {
+        ++ng;
+             if (link.fraction() > 0.3f) ++ng10;
+        if (link.fraction() < 0.5f) continue;
+        auto tkid = std::make_pair(link.SimTrackId(), link.eventId());
+        auto ipos = mapping.find(tkid);
+          if (ipos != mapping.end()) {
+            uint32_t pt = 1000*(*ipos).second->pt();
+            ++nn;
+            std::array<uint32_t,4> a{{gind,uint32_t(link.channel()),(*ipos).second.key(),pt}};
+            digi2tp.push_back(a);
+          }
+      }
+    }
+    std::sort(digi2tp.begin(),digi2tp.end());
+
+    std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
+
+    cudaCheck(cudaMemcpyAsync(slGPU.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, dcont.stream));
+    slGPU.zero(dcont.stream);
+    clusterSLOnGPU::wrapper(dcont, ndigis, hh, nhits, slGPU, digi2tp.size());
+
+  //  end gpu stuff ---------------------
+
+
+
+
+  if ( foundPixelClusters ) {
+    // Pixel Clusters 
+    for (edmNew::DetSetVector<SiPixelCluster>::const_iterator iter  = pixelClusters->begin(); 
+                                                            iter != pixelClusters->end(); ++iter) {
+      uint32_t detid = iter->id(); 
+      DetId detId(detid);
+      edmNew::DetSet<SiPixelCluster> link_pixel = (*iter);
+      for (edmNew::DetSet<SiPixelCluster>::const_iterator di  = link_pixel.begin(); 
+	   di != link_pixel.end(); ++di) {
+	const SiPixelCluster& cluster = (*di);
+	edm::Ref<edmNew::DetSetVector<SiPixelCluster>, SiPixelCluster> c_ref = 
+	  edmNew::makeRefTo(pixelClusters, di);
+	
+	std::set<std::pair<uint32_t, EncodedEventId> > simTkIds; 
+	for (int irow = cluster.minPixelRow(); irow <= cluster.maxPixelRow(); ++irow) {
+	  for (int icol = cluster.minPixelCol(); icol <= cluster.maxPixelCol(); ++icol) {
+	    uint32_t channel = PixelChannelIdentifier::pixelToChannel(irow, icol);
+	    std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<PixelDigiSimLink>(sipixelSimLinks, detId, channel));
+	    if (trkid.empty()) continue; 
+	    simTkIds.insert(trkid.begin(),trkid.end());
+	  }
+	}
+	for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin(); 
+	     iset != simTkIds.end(); iset++) {
+	  auto ipos = mapping.find(*iset);
+	  if (ipos != mapping.end()) {
+	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
+	    clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+	  }
+	}
+      }
+    }
+  }
+
+  if ( foundStripClusters ) {
+    // Strip Clusters
+    for (edmNew::DetSetVector<SiStripCluster>::const_iterator iter  = stripClusters->begin(false), eter = stripClusters->end(false); 
+	 iter != eter; ++iter) {
+      if (!(*iter).isValid()) continue;
+      uint32_t detid = iter->id();  
+      DetId detId(detid);
+      edmNew::DetSet<SiStripCluster> link_strip = (*iter);
+      for (edmNew::DetSet<SiStripCluster>::const_iterator di  = link_strip.begin(); 
+	   di != link_strip.end(); di++) {
+	const SiStripCluster& cluster = (*di);
+	edm::Ref<edmNew::DetSetVector<SiStripCluster>, SiStripCluster> c_ref = 
+	  edmNew::makeRefTo(stripClusters, di);
+	
+	std::set<std::pair<uint32_t, EncodedEventId> > simTkIds; 
+	int first  = cluster.firstStrip();     
+	int last   = first + cluster.amplitudes().size();
+	
+	for (int istr = first; istr < last; ++istr) {
+	  std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<StripDigiSimLink>(sistripSimLinks, detId, istr));
+	  if (trkid.empty()) continue; 
+	  simTkIds.insert(trkid.begin(),trkid.end());
+	}
+	for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin(); 
+	     iset != simTkIds.end(); iset++) {
+	  auto ipos = mapping.find(*iset);
+	  if (ipos != mapping.end()) {
+	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
+	    clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+	  } 
+	}
+      } 
+    }
+  }
+
+  if ( foundPhase2OTClusters ) {
+
+    // Phase2 Clusters
+    if(phase2OTClusters.isValid()){
+      for (edmNew::DetSetVector<Phase2TrackerCluster1D>::const_iterator iter  = phase2OTClusters->begin(false), eter = phase2OTClusters->end(false);
+                                                                iter != eter; ++iter) {
+        if (!(*iter).isValid()) continue;
+        uint32_t detid = iter->id();
+        DetId detId(detid);
+        edmNew::DetSet<Phase2TrackerCluster1D> link_phase2 = (*iter);
+        for (edmNew::DetSet<Phase2TrackerCluster1D>::const_iterator di  = link_phase2.begin();
+             di != link_phase2.end(); di++) {
+          const Phase2TrackerCluster1D& cluster = (*di);
+          edm::Ref<edmNew::DetSetVector<Phase2TrackerCluster1D>, Phase2TrackerCluster1D> c_ref =
+            edmNew::makeRefTo(phase2OTClusters, di);
+    
+          std::set<std::pair<uint32_t, EncodedEventId> > simTkIds;
+    
+          for (unsigned int istr(0); istr < cluster.size(); ++istr) {
+            uint32_t channel = Phase2TrackerDigi::pixelToChannel(cluster.firstRow() + istr, cluster.column());
+            std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<PixelDigiSimLink>(siphase2OTSimLinks, detId, channel));
+            if (trkid.empty()) continue;
+            simTkIds.insert(trkid.begin(),trkid.end());
+          }
+    
+          for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin();
+                                                                              iset != simTkIds.end(); iset++) {
+            auto ipos = mapping.find(*iset);
+            if (ipos != mapping.end()) {
+              clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+            }
+          }
+        }
+      }
+    }
+
+  }
+  clusterTPList->sortAndUnique();
+  iEvent.put(std::move(clusterTPList));
+}
+
+template <typename T>
+std::vector<std::pair<uint32_t, EncodedEventId> >
+//std::pair<uint32_t, EncodedEventId>
+ClusterTPAssociationHeterogeneous::getSimTrackId(const edm::Handle<edm::DetSetVector<T> >& simLinks,
+                                            const DetId& detId, uint32_t channel) const 
+{
+  //std::pair<uint32_t, EncodedEventId> simTrkId;
+  std::vector<std::pair<uint32_t, EncodedEventId> > simTrkId;
+  auto isearch = simLinks->find(detId);
+  if (isearch != simLinks->end()) {
+    // Loop over DigiSimLink in this det unit
+    edm::DetSet<T> link_detset = (*isearch);
+    for (typename edm::DetSet<T>::const_iterator it  = link_detset.data.begin(); 
+                                                 it != link_detset.data.end(); ++it) {
+      if (channel == it->channel()) {
+        simTrkId.push_back(std::make_pair(it->SimTrackId(), it->eventId()));
+      } 
+    }
+  }
+  return simTrkId;
+}
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneous);

From 3e2f557b8e84631d626ac1cde4946ec78eaa8650 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sun, 8 Jul 2018 17:00:53 +0200
Subject: [PATCH 04/19] Heterogeneous clTP

---
 .../ClusterTPAssociationHeterogeneous.cc        | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 984c3e5ddd90d..e35935ef9fef2 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -72,7 +72,7 @@ ClusterSLGPU::zero(cudaStream_t stream) {
    cudaCheck(cudaMemsetAsync(n2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
 }
 
-class ClusterTPAssociationHeterogeneous : : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
+class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
           heterogeneous::GPUCuda, heterogeneous::CPU>>
 {
 public:
@@ -82,7 +82,7 @@ class ClusterTPAssociationHeterogeneous : : public HeterogeneousEDProducer<heter
   using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
 
   explicit ClusterTPAssociationHeterogeneous(const edm::ParameterSet&);
-  ~ClusterTPAssociationHeterogeneous() override;
+  ~ClusterTPAssociationHeterogeneous() = default;;
 
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
@@ -129,15 +129,12 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::
     stripClustersToken_(consumes<edmNew::DetSetVector<SiStripCluster> >(cfg.getParameter<edm::InputTag>("stripClusterSrc"))),
     phase2OTClustersToken_(consumes<edmNew::DetSetVector<Phase2TrackerCluster1D> >(cfg.getParameter<edm::InputTag>("phase2OTClusterSrc"))),
     trackingParticleToken_(consumes<TrackingParticleCollection>(cfg.getParameter<edm::InputTag>("trackingParticleSrc"))),
-    tGpuDigis(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
-    tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
+    tGpuDigis(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
+    tGpuHits(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc")))
 {
   produces<ClusterTPAssociation>();
 }
 
-ClusterTPAssociationHeterogeneous::~ClusterTPAssociationHeterogeneous() {
-}
-
 void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
   desc.add<edm::InputTag>("simTrackSrc",     edm::InputTag("g4SimHits"));
@@ -235,14 +232,14 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
     std::cout << "In tpsimlink " << mapping.size() << std::endl;
 
     edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
-    edm::Handle<siPixelRecHitHeterogeneousProduct::GPUProduct> gd;
+    edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
     iEvent.getByToken(tGpuDigis, gd);  
     iEvent.getByToken(tGpuHits, gh);
     auto const & gDigis = *gd;
     auto const & gHits = *gh;
-    auto const & dcont = * gDigis.gpu_d;
+    auto const & dcont = * gDigis.me_d;
     auto const & hh = *gHits.gpu_d;
-    auto ndigis = gDigis.ndigis;
+    auto ndigis = gDigis.nDigis;
     auto nhits = gHits.nhits;
 
     uint32_t nn=0, ng=0, ng10=0;

From c1890f8a51457febe59dd97363bc313b5f6c9ab4 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 10 Jul 2018 11:54:16 +0200
Subject: [PATCH 05/19] moving on

---
 .../siPixelRecHitsHeterogeneousProduct.h      |   6 +-
 .../plugins/BuildFile.xml                     |   7 +
 .../ClusterTPAssociationHeterogeneous.cc      | 160 ++++++++++--------
 3 files changed, 103 insertions(+), 70 deletions(-)

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index b48ba2352625c..79608c13713d7 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -41,14 +41,16 @@ namespace siPixelRecHitsHeterogeneousProduct {
   struct HitsOnCPU {
     HitsOnCPU() = default;
     explicit HitsOnCPU(uint32_t nhits) :
-      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits){}
+      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits),
+      nHits(nhits){}
     uint32_t hitsModuleStart[2001];
     std::vector<int32_t> charge;
     std::vector<float> xl, yl;
     std::vector<float> xe, ye;
     std::vector<uint16_t> mr;
     std::vector<uint16_t> mc;
-
+ 
+    uint32_t nHits;
     HitsOnGPU const * gpu_d=nullptr;
   };
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
index 3ef0459183c1d..c767b1e68936a 100644
--- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
@@ -1,5 +1,12 @@
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/Producer"/>
+<use name="HeterogeneousCore/Product"/>
 <use name="SimTracker/TrackerHitAssociation" />
 <use name="SimDataFormats/TrackingAnalysis" />
+
+<use name="Geometry/TrackerGeometryBuilder" />
+<use name="Geometry/Records" />
+
 <library file="*.cc *.cu" name="SimTrackerTrackerHitAssociationPlugins">
   <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index e35935ef9fef2..50d03c41b414e 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -100,6 +100,8 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
   void produceCPU(edm::HeterogeneousEvent &iEvent,
                   const edm::EventSetup &iSetup) override;
 
+  void makeMap(const edm::HeterogeneousEvent &iEvent);
+
 
   template <typename T>
   std::vector<std::pair<uint32_t, EncodedEventId> >
@@ -118,6 +120,8 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
 
   ClusterSLGPU slGPU;
 
+  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
+ 
 };
 
 ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg)
@@ -150,7 +154,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr
 
   HeterogeneousEDProducer::fillPSetDescription(desc);
 
-  descriptions.add("tpClusterProducerDefault", desc);
+  descriptions.add("tpClusterProducerHeterogeneousDefault", desc);
 }
 
 
@@ -159,88 +163,62 @@ void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamI
 
 }
 
-void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
-                      const edm::EventSetup &iSetup,
-                      cuda::stream_t<> &cudaStream) {
+void ClusterTPAssociationHeterogeneous::makeMap(const edm::HeterogeneousEvent &iEvent) {
+    // TrackingParticle
+    edm::Handle<TrackingParticleCollection>  TPCollectionH;
+    iEvent.getByToken(trackingParticleToken_,TPCollectionH);
 
-}
-
-void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
-                      const edm::EventSetup &iSetup,
-                      cuda::stream_t<> &cudaStream) {
-
-}
- 
-		
-void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
-
-    edm::ESHandle<TrackerGeometry> geom;
-    es.get<TrackerDigiGeometryRecord>().get( geom );
-
-
-
-  // Pixel DigiSimLink
-  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
-  //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
-  iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
 
-  // SiStrip DigiSimLink
-  edm::Handle<edm::DetSetVector<StripDigiSimLink> > sistripSimLinks;
-  iEvent.getByToken(sistripSimLinksToken_,sistripSimLinks);
+    // prepare temporary map between SimTrackId and TrackingParticle index
+    mapping.clear();
+    for (TrackingParticleCollection::size_type itp = 0;
+                                             itp < TPCollectionH.product()->size(); ++itp) {
+      TrackingParticleRef trackingParticle(TPCollectionH, itp);
+
+      // SimTracks inside TrackingParticle
+      EncodedEventId eid(trackingParticle->eventId());
+      for (auto itrk  = trackingParticle->g4Track_begin();
+                itrk != trackingParticle->g4Track_end(); ++itrk) {
+        std::pair<uint32_t, EncodedEventId> trkid(itrk->trackId(), eid);
+        //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl;
+        mapping.insert(std::make_pair(trkid, trackingParticle));
+      }
+    }
 
-  // Phase2 OT DigiSimLink
-  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinks;
-  iEvent.getByToken(siphase2OTSimLinksToken_, siphase2OTSimLinks);
 
-  // Pixel Cluster
-  edm::Handle<edmNew::DetSetVector<SiPixelCluster> > pixelClusters;
-  bool foundPixelClusters = iEvent.getByToken(pixelClustersToken_,pixelClusters);
+}
 
-  // Strip Cluster
-  edm::Handle<edmNew::DetSetVector<SiStripCluster> > stripClusters;
-  bool foundStripClusters = iEvent.getByToken(stripClustersToken_,stripClusters);
 
-  // Phase2 Cluster
-  edm::Handle<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClusters;
-  bool foundPhase2OTClusters = iEvent.getByToken(phase2OTClustersToken_, phase2OTClusters);
+void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
 
-  // TrackingParticle
-  edm::Handle<TrackingParticleCollection>  TPCollectionH;
-  iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+    edm::ESHandle<TrackerGeometry> geom;
+    iSetup.get<TrackerDigiGeometryRecord>().get( geom );
 
-  auto clusterTPList = std::make_unique<ClusterTPAssociation>(TPCollectionH);
+    // Pixel DigiSimLink
+    edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
+    //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+    iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
 
-  // prepare temporary map between SimTrackId and TrackingParticle index
-  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
-  for (TrackingParticleCollection::size_type itp = 0;
-                                             itp < TPCollectionH.product()->size(); ++itp) {
-    TrackingParticleRef trackingParticle(TPCollectionH, itp);
-
-    // SimTracks inside TrackingParticle
-    EncodedEventId eid(trackingParticle->eventId());
-    //size_t index = 0;
-    for (std::vector<SimTrack>::const_iterator itrk  = trackingParticle->g4Track_begin(); 
-                                               itrk != trackingParticle->g4Track_end(); ++itrk) {
-      std::pair<uint32_t, EncodedEventId> trkid(itrk->trackId(), eid);
-      //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl;
-      mapping.insert(std::make_pair(trkid, trackingParticle));
-    }
-  }
+    // TrackingParticle
+    edm::Handle<TrackingParticleCollection>  TPCollectionH;
+    iEvent.getByToken(trackingParticleToken_,TPCollectionH);
 
-  //  gpu stuff ------------------------
+    makeMap(iEvent);
+   
+    //  gpu stuff ------------------------
 
     std::cout << "In tpsimlink " << mapping.size() << std::endl;
 
     edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
     edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
-    iEvent.getByToken(tGpuDigis, gd);  
+    iEvent.getByToken(tGpuDigis, gd);
     iEvent.getByToken(tGpuHits, gh);
     auto const & gDigis = *gd;
     auto const & gHits = *gh;
-    auto const & dcont = * gDigis.me_d;
-    auto const & hh = *gHits.gpu_d;
     auto ndigis = gDigis.nDigis;
-    auto nhits = gHits.nhits;
+    auto nhits = gHits.nHits;
 
     uint32_t nn=0, ng=0, ng10=0;
     std::vector<std::array<uint32_t,4>> digi2tp;
@@ -251,7 +229,7 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
       uint32_t gind = genericDet->index();
       for (auto const & link : links) {
         ++ng;
-             if (link.fraction() > 0.3f) ++ng10;
+           if (link.fraction() > 0.3f) ++ng10;
         if (link.fraction() < 0.5f) continue;
         auto tkid = std::make_pair(link.SimTrackId(), link.eventId());
         auto ipos = mapping.find(tkid);
@@ -267,14 +245,57 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
 
     std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
 
-    cudaCheck(cudaMemcpyAsync(slGPU.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, dcont.stream));
-    slGPU.zero(dcont.stream);
-    clusterSLOnGPU::wrapper(dcont, ndigis, hh, nhits, slGPU, digi2tp.size());
+    cudaCheck(cudaMemcpyAsync(slGPU.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
+    slGPU.zero(cudaStream.id());
+    clusterSLOnGPU::wrapper(gDigis, ndigis, gHits, nhits, slGPU, digi2tp.size(),cudaStream);
 
   //  end gpu stuff ---------------------
 
 
+}
+
+void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
+
+}
+ 
+		
+void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+
+
+   makeMap(iEvent);
+
+  // Pixel DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
+  //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+  iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
+
+  // SiStrip DigiSimLink
+  edm::Handle<edm::DetSetVector<StripDigiSimLink> > sistripSimLinks;
+  iEvent.getByToken(sistripSimLinksToken_,sistripSimLinks);
+
+  // Phase2 OT DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinks;
+  iEvent.getByToken(siphase2OTSimLinksToken_, siphase2OTSimLinks);
 
+  // Pixel Cluster
+  edm::Handle<edmNew::DetSetVector<SiPixelCluster> > pixelClusters;
+  bool foundPixelClusters = iEvent.getByToken(pixelClustersToken_,pixelClusters);
+
+  // Strip Cluster
+  edm::Handle<edmNew::DetSetVector<SiStripCluster> > stripClusters;
+  bool foundStripClusters = iEvent.getByToken(stripClustersToken_,stripClusters);
+
+  // Phase2 Cluster
+  edm::Handle<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClusters;
+  bool foundPhase2OTClusters = iEvent.getByToken(phase2OTClustersToken_, phase2OTClusters);
+
+  // TrackingParticle
+  edm::Handle<TrackingParticleCollection>  TPCollectionH;
+  iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+
+  auto clusterTPList = std::make_unique<ClusterTPAssociation>(TPCollectionH);
 
   if ( foundPixelClusters ) {
     // Pixel Clusters 
@@ -382,8 +403,11 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
     }
 
   }
+
+
   clusterTPList->sortAndUnique();
   iEvent.put(std::move(clusterTPList));
+  mapping.clear();
 }
 
 template <typename T>

From 365897511eab59412070c7fbb756e6165f8dc302 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 10 Jul 2018 14:17:01 +0200
Subject: [PATCH 06/19] progressing??

---
 .../plugins/ClusterSLOnGPU.cu                 | 33 +++++++++++-
 .../plugins/ClusterSLOnGPU.h                  | 40 ++++++++------
 .../ClusterTPAssociationHeterogeneous.cc      | 52 ++++++++-----------
 ...rackerHitAssociationHeterogeneousProduct.h | 34 ++++++++++++
 4 files changed, 111 insertions(+), 48 deletions(-)
 create mode 100644 SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index e773887b484d3..31695c8ea9768 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -146,8 +146,37 @@ namespace clusterSLOnGPU {
 
   std::atomic<int> evId(0);
 
-  void wrapper(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, ClusterSLGPU const & sl, uint32_t n, cuda::stream_t<>& stream) {
-    
+
+  void
+  Kernel::alloc(cuda::stream_t<>& stream) {
+   cudaCheck(cudaMalloc((void**) & slgpu.links_d,(MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
+
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId2_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n1_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n2_d,(MaxNumModules*256)*sizeof(uint32_t)));
+
+
+   cudaCheck(cudaMalloc((void**) & slgpu.me_d, sizeof(ClusterSLGPU)));
+   cudaCheck(cudaMemcpyAsync(slgpu.me_d, slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id()));
+   cudaCheck(cudaDeviceSynchronize());
+
+  }
+
+  void
+  Kernel::zero(cudaStream_t stream) {
+   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+  }
+
+
+  void 
+  Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) {
+
+    ClusterSLGPU const & sl = slgpu;
+
     int ev = ++evId;
     int threadsPerBlock = 256;
     int blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 8fa8d6cba0c7b..9c695560a80fc 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -1,33 +1,41 @@
-// gpu
+#ifndef SimTrackerTrackerHitAssociationClusterSLOnGPU_H
+#define SimTrackerTrackerHitAssociationClusterSLOnGPU_H
+
 #include <cuda_runtime.h>
 #include <cuda/api_wrappers.h>
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
+
+#include "trackerHitAssociationHeterogeneousProduct.h"
+
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h"
 #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
 
-struct ClusterSLGPU {
- ClusterSLGPU(){alloc();}
- void alloc();
- void zero(cudaStream_t stream);
-
- ClusterSLGPU * me_d;
- std::array<uint32_t,4> * links_d;
- uint32_t * tkId_d;
- uint32_t * tkId2_d;
- uint32_t * n1_d;
- uint32_t * n2_d;
 
- static constexpr uint32_t MAX_DIGIS = 2000*150;
- static constexpr uint32_t MaxNumModules = 2000;
 
-};
 
 namespace clusterSLOnGPU {
 
+  using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
+  using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct;
+
   using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
   using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
   using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
-  void wrapper(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, ClusterSLGPU const & sl, uint32_t n, cuda::stream_t<>& stream);
 
+
+  class Kernel {
+  public:
+    void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
+    GPUProduct getProduct() { retun GPUProduct{slgpu.me_d};}
+    
+  private:
+     void alloc(cuda::stream_t<>& stream);
+     void zero(cudaStream_t stream);
+  private:
+     ClusterSLGPU slgpu; 
+
+  };
 }
+
+#endif
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 50d03c41b414e..d750750cf7daf 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -48,41 +48,22 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "ClusterSLOnGPU.h"
 
-void
-ClusterSLGPU::alloc() {
-   cudaCheck(cudaMalloc((void**) & links_d,(MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
-
-   cudaCheck(cudaMalloc((void**) & tkId_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & tkId2_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & n1_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & n2_d,(MaxNumModules*256)*sizeof(uint32_t)));
-
-
-   cudaCheck(cudaMalloc((void**) & me_d, sizeof(ClusterSLGPU)));
-   cudaCheck(cudaMemcpy(me_d, this, sizeof(ClusterSLGPU), cudaMemcpyDefault));
-   cudaCheck(cudaDeviceSynchronize());
-
-}
-
-void
-ClusterSLGPU::zero(cudaStream_t stream) {
-   cudaCheck(cudaMemsetAsync(tkId_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(tkId2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(n1_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(n2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-}
-
 class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
           heterogeneous::GPUCuda, heterogeneous::CPU>>
 {
 public:
   typedef std::vector<OmniClusterRef> OmniClusterCollection;
 
+  using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
+  using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct;
+  using CPUProduct = trackerHitAssociationHeterogeneousProduct::CPUProduct;
+  using Output = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct;
+
   using PixelDigiClustersH = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster;
   using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
 
   explicit ClusterTPAssociationHeterogeneous(const edm::ParameterSet&);
-  ~ClusterTPAssociationHeterogeneous() = default;;
+  ~ClusterTPAssociationHeterogeneous() = default;
 
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
@@ -118,7 +99,7 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
   edm::EDGetTokenT<HeterogeneousProduct> tGpuDigis;
   edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
 
-  ClusterSLGPU slGPU;
+  std::unique_ptr<clusterSLGPU::Kernel> gpuAlgo;
 
   std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
  
@@ -161,6 +142,8 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr
 void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId,
                           cuda::stream_t<> &cudaStream) {
 
+   gpuAlgo = std::make_unique<clusterSLGPU::Kernel>(cudaStream);
+
 }
 
 void ClusterTPAssociationHeterogeneous::makeMap(const edm::HeterogeneousEvent &iEvent) {
@@ -246,8 +229,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
     std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
 
     cudaCheck(cudaMemcpyAsync(slGPU.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
-    slGPU.zero(cudaStream.id());
-    clusterSLOnGPU::wrapper(gDigis, ndigis, gHits, nhits, slGPU, digi2tp.size(),cudaStream);
+    gpuAlgo->algo(gDigis, ndigis, gHits, nhits, digi2tp.size(),cudaStream);
 
   //  end gpu stuff ---------------------
 
@@ -257,6 +239,8 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
 void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
                       const edm::EventSetup &iSetup,
                       cuda::stream_t<> &cudaStream) {
+     auto output = std::make_unique<GPUProduct>(gpuAlgo->getProduct());
+
 
 }
  
@@ -265,6 +249,12 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
 
 
    makeMap(iEvent);
+   iEvent.put(std::move(produceLegacy(iEvent,es)));
+
+}
+
+std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+
 
   // Pixel DigiSimLink
   edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
@@ -295,7 +285,8 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
   edm::Handle<TrackingParticleCollection>  TPCollectionH;
   iEvent.getByToken(trackingParticleToken_,TPCollectionH);
 
-  auto clusterTPList = std::make_unique<ClusterTPAssociation>(TPCollectionH);
+  auto output = std::make_unique<CPUProduct>(TPCollectionH);
+  auto & clusterTPList = output.collection;
 
   if ( foundPixelClusters ) {
     // Pixel Clusters 
@@ -406,7 +397,8 @@ void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEve
 
 
   clusterTPList->sortAndUnique();
-  iEvent.put(std::move(clusterTPList));
+
+  return output;
   mapping.clear();
 }
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
new file mode 100644
index 0000000000000..a84d6bb93efbb
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
@@ -0,0 +1,34 @@
+#ifndef SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
+#define SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
+
+
+namespace trackerHitAssociationHeterogeneousProduct {
+
+  struct CPUProduct {
+    ClusterTPAssociation collection;
+  }
+
+  struct ClusterSLGPU {
+
+   ClusterSLGPU * me_d=nullptr;
+   std::array<uint32_t,4> * links_d;
+   uint32_t * tkId_d;
+   uint32_t * tkId2_d;
+   uint32_t * n1_d;
+   uint32_t * n2_d;
+
+   static constexpr uint32_t MAX_DIGIS = 2000*150;
+   static constexpr uint32_t MaxNumModules = 2000;
+
+  };
+
+  struct GPUProduct {
+     ClusterSLGPU *  gpu_d=nullptr;
+  };
+
+   using ClusterTPAHeterogeneousProduct = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
+                                                                   heterogeneous::GPUCudaProduct<GPUProduct> >;
+
+}
+
+#endif

From 5aa57d570ba549ad0e683652238ee7668dcbdb63 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 12 Jul 2018 18:52:05 +0200
Subject: [PATCH 07/19] compiles

---
 .../plugins/ClusterSLOnGPU.cu                 | 21 +++++++++---------
 .../plugins/ClusterSLOnGPU.h                  |  5 +++--
 .../ClusterTPAssociationHeterogeneous.cc      | 22 +++++++++----------
 ...rackerHitAssociationHeterogeneousProduct.h | 15 ++++++++++++-
 4 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index 31695c8ea9768..30db1a292bc26 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -47,6 +47,7 @@ ForwardIt lowerBound(ForwardIt first, ForwardIt last, const T& value, Compare co
 }
 
 
+using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
 
 __global__
 void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) {
@@ -149,26 +150,26 @@ namespace clusterSLOnGPU {
 
   void
   Kernel::alloc(cuda::stream_t<>& stream) {
-   cudaCheck(cudaMalloc((void**) & slgpu.links_d,(MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
+   cudaCheck(cudaMalloc((void**) & slgpu.links_d,(ClusterSLGPU::MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
 
-   cudaCheck(cudaMalloc((void**) & slgpu.tkId_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & slgpu.tkId2_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & slgpu.n1_d,(MaxNumModules*256)*sizeof(uint32_t)));
-   cudaCheck(cudaMalloc((void**) & slgpu.n2_d,(MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n1_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
 
 
    cudaCheck(cudaMalloc((void**) & slgpu.me_d, sizeof(ClusterSLGPU)));
-   cudaCheck(cudaMemcpyAsync(slgpu.me_d, slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id()));
+   cudaCheck(cudaMemcpyAsync(slgpu.me_d, &slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id()));
    cudaCheck(cudaDeviceSynchronize());
 
   }
 
   void
   Kernel::zero(cudaStream_t stream) {
-   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
   }
 
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 9c695560a80fc..27b78000aec56 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -26,13 +26,14 @@ namespace clusterSLOnGPU {
 
   class Kernel {
   public:
+    Kernel(cuda::stream_t<>& stream) {alloc(stream);}
     void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
-    GPUProduct getProduct() { retun GPUProduct{slgpu.me_d};}
+    GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
     
   private:
      void alloc(cuda::stream_t<>& stream);
      void zero(cudaStream_t stream);
-  private:
+  public:
      ClusterSLGPU slgpu; 
 
   };
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index d750750cf7daf..fbff2a0d354e7 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -82,7 +82,7 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
                   const edm::EventSetup &iSetup) override;
 
   void makeMap(const edm::HeterogeneousEvent &iEvent);
-
+  std::unique_ptr<CPUProduct> produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es);
 
   template <typename T>
   std::vector<std::pair<uint32_t, EncodedEventId> >
@@ -99,7 +99,7 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
   edm::EDGetTokenT<HeterogeneousProduct> tGpuDigis;
   edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
 
-  std::unique_ptr<clusterSLGPU::Kernel> gpuAlgo;
+  std::unique_ptr<clusterSLOnGPU::Kernel> gpuAlgo;
 
   std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
  
@@ -142,7 +142,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr
 void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId,
                           cuda::stream_t<> &cudaStream) {
 
-   gpuAlgo = std::make_unique<clusterSLGPU::Kernel>(cudaStream);
+   gpuAlgo = std::make_unique<clusterSLOnGPU::Kernel>(cudaStream);
 
 }
 
@@ -228,7 +228,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
 
     std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
 
-    cudaCheck(cudaMemcpyAsync(slGPU.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
+    cudaCheck(cudaMemcpyAsync(gpuAlgo->slgpu.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
     gpuAlgo->algo(gDigis, ndigis, gHits, nhits, digi2tp.size(),cudaStream);
 
   //  end gpu stuff ---------------------
@@ -247,13 +247,13 @@ void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &
 		
 void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
 
-
    makeMap(iEvent);
    iEvent.put(std::move(produceLegacy(iEvent,es)));
 
 }
 
-std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+std::unique_ptr<ClusterTPAssociationHeterogeneous::CPUProduct> 
+ClusterTPAssociationHeterogeneous::produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
 
 
   // Pixel DigiSimLink
@@ -286,7 +286,7 @@ std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm
   iEvent.getByToken(trackingParticleToken_,TPCollectionH);
 
   auto output = std::make_unique<CPUProduct>(TPCollectionH);
-  auto & clusterTPList = output.collection;
+  auto & clusterTPList = output->collection;
 
   if ( foundPixelClusters ) {
     // Pixel Clusters 
@@ -315,7 +315,7 @@ std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm
 	  auto ipos = mapping.find(*iset);
 	  if (ipos != mapping.end()) {
 	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
-	    clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+	    clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
 	  }
 	}
       }
@@ -350,7 +350,7 @@ std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm
 	  auto ipos = mapping.find(*iset);
 	  if (ipos != mapping.end()) {
 	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
-	    clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+	    clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
 	  } 
 	}
       } 
@@ -386,7 +386,7 @@ std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm
                                                                               iset != simTkIds.end(); iset++) {
             auto ipos = mapping.find(*iset);
             if (ipos != mapping.end()) {
-              clusterTPList->emplace_back(OmniClusterRef(c_ref), ipos->second);
+              clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
             }
           }
         }
@@ -396,7 +396,7 @@ std::unique_ptr<CPUProduct> ClusterTPAssociationHeterogeneous::produceLegacy(edm
   }
 
 
-  clusterTPList->sortAndUnique();
+  clusterTPList.sortAndUnique();
 
   return output;
   mapping.clear();
diff --git a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
index a84d6bb93efbb..4db318c5348f6 100644
--- a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
+++ b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
@@ -1,12 +1,23 @@
 #ifndef SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
 #define SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
 
+#ifndef	__NVCC__
+#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+#endif
+
+#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
+
 
 namespace trackerHitAssociationHeterogeneousProduct {
 
+#ifndef __NVCC__
   struct CPUProduct {
+    CPUProduct() = default;
+    template<typename T> 
+    explicit CPUProduct(T const & t) : collection(t){}
     ClusterTPAssociation collection;
-  }
+  };
+#endif
 
   struct ClusterSLGPU {
 
@@ -26,8 +37,10 @@ namespace trackerHitAssociationHeterogeneousProduct {
      ClusterSLGPU *  gpu_d=nullptr;
   };
 
+#ifndef	__NVCC__
    using ClusterTPAHeterogeneousProduct = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
                                                                    heterogeneous::GPUCudaProduct<GPUProduct> >;
+#endif
 
 }
 

From 4d3eea0ea2d5ee74ed73cebf20b880e5d1a1bbee Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 12 Jul 2018 19:08:13 +0200
Subject: [PATCH 08/19] compiles converter as well

---
 .../ClusterTPAssociationHeterogeneous.cc      |  6 +-
 ...sterTPAssociationHeterogeneousConverter.cc | 59 +++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index fbff2a0d354e7..949654073bd38 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -239,8 +239,12 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
 void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
                       const edm::EventSetup &iSetup,
                       cuda::stream_t<> &cudaStream) {
-     auto output = std::make_unique<GPUProduct>(gpuAlgo->getProduct());
 
+  auto output = std::make_unique<GPUProduct>(gpuAlgo->getProduct());
+
+  iEvent.put<Output>(std::move(output), [this, &iEvent, &iSetup](const GPUProduct& hits, CPUProduct& cpu) {
+                       cpu = *(this->produceLegacy(iEvent,iSetup));
+                     });
 
 }
  
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
new file mode 100644
index 0000000000000..84cd7fef424ea
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
@@ -0,0 +1,59 @@
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
+
+#include "trackerHitAssociationHeterogeneousProduct.h"
+
+
+class ClusterTPAssociationHeterogeneousConverter: public edm::global::EDProducer<> {
+public:
+
+  using Input = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct;
+  using Product = ClusterTPAssociation;
+
+  explicit ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig);
+  ~ClusterTPAssociationHeterogeneousConverter() = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  edm::EDGetTokenT<HeterogeneousProduct> token_;
+};
+
+ClusterTPAssociationHeterogeneousConverter::ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig):
+  token_(consumes<HeterogeneousProduct>(iConfig.getParameter<edm::InputTag>("src")))
+{
+  produces<Product>(); 
+}
+
+void ClusterTPAssociationHeterogeneousConverter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag("tpClusterProducerHeterogeneos"));
+
+  descriptions.addWithDefaultLabel(desc);
+}
+
+namespace {
+  template <typename T>
+  auto copy_unique(const T& t) {
+    return std::make_unique<T>(t);
+  }
+}
+
+void ClusterTPAssociationHeterogeneousConverter::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  edm::Handle<HeterogeneousProduct> hinput;
+  iEvent.getByToken(token_, hinput);
+
+  const auto& input = hinput->get<Input>().getProduct<HeterogeneousDevice::kCPU>();
+
+  iEvent.put(copy_unique(input.collection));
+}
+
+
+DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneousConverter);

From 731b0626c5706dc846ce6e2c8cc9b67b809759c9 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 12 Jul 2018 19:25:49 +0200
Subject: [PATCH 09/19] missing only sequence

---
 .../ClusterTPAssociationHeterogeneousConverter.cc      |  2 +-
 .../python/tpClusterProducer_cfi.py                    | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
index 84cd7fef424ea..d8b9e099c3474 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
@@ -36,7 +36,7 @@ void ClusterTPAssociationHeterogeneousConverter::fillDescriptions(edm::Configura
   edm::ParameterSetDescription desc;
   desc.add<edm::InputTag>("src", edm::InputTag("tpClusterProducerHeterogeneos"));
 
-  descriptions.addWithDefaultLabel(desc);
+  descriptions.add("tpClusterHeterogeneousConverter",desc);
 }
 
 namespace {
diff --git a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
index 8757a67226fb8..e8330c074dfac 100644
--- a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
+++ b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 from SimTracker.TrackerHitAssociation.tpClusterProducerDefault_cfi import tpClusterProducerDefault as _tpClusterProducerDefault
 
@@ -18,3 +19,12 @@
     stripSimLinkSrc = "mixData:StripDigiSimLink",
     phase2OTSimLinkSrc = "mixData:Phase2OTDigiSimLink",
 )
+
+
+from SimTracker.TrackerHitAssociation.tpClusterProducerHeterogeneousDefault_cfi import tpClusterProducerHeterogeneousDefault as _tpClusterProducerHeterogeneous
+tpClusterProducerHeterogeneous = _tpClusterProducerHeterogeneous.clone()
+
+from SimTracker.TrackerHitAssociation.tpClusterHeterogeneousConverter_cfi import tpClusterHeterogeneousConverter as _tpHeterogeneousConverter
+
+gpu.toReplaceWith(tpClusterProducer, _tpHeterogeneousConverter.clone())
+

From b65075bc96e838b7258f3f5a7672065be2f4d92c Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 18 Jul 2018 15:52:11 +0200
Subject: [PATCH 10/19] runs up to some point

---
 .../plugins/ClusterTPAssociationHeterogeneous.cc  |  2 +-
 .../RecoTrack/python/TrackValidation_cff.py       | 15 +++++++++++++--
 Validation/RecoTrack/python/associators_cff.py    |  3 ++-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 949654073bd38..c0bc3b66f0c7f 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -117,7 +117,7 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::
     tGpuDigis(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
     tGpuHits(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc")))
 {
-  produces<ClusterTPAssociation>();
+  produces<HeterogeneousProduct>();
 }
 
 void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index b6251473fcab4..dff571862bb46 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -709,9 +709,14 @@ def _uniqueFirstLayers(layerList):
 
 
 ### Pixel tracking only mode (placeholder for now)
-tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone(
-    pixelClusterSrc = "siPixelClustersPreSplitting"
+
+tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone(
+   pixelClusterSrc = "siPixelClustersPreSplitting"
 )
+tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone()
+# Need to use the modifier to customize because the exact EDProducer type depends on the modifier
+gpu.toModify(tpClusterProducerPixelTrackingOnly, src = "tpClusterProducerHeterogeneousPixelTrackingOnly")
+
 quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone(
     cluster2TPSrc = "tpClusterProducerPixelTrackingOnly"
 )
@@ -739,12 +744,18 @@ def _uniqueFirstLayers(layerList):
 tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly)
 tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation)
 tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks)
+
+_tracksValidationTruthPixelTrackingOnlyGPU = tracksValidationTruthPixelTrackingOnly.copy()
+_tracksValidationTruthPixelTrackingOnlyGPU.insert(0, tpClusterProducerHeterogeneousPixelTrackingOnly)
+gpu.toReplaceWith(tracksValidationTruthPixelTrackingOnly, _tracksValidationTruthPixelTrackingOnlyGPU)
+
 tracksValidationPixelTrackingOnly = cms.Sequence(
     tracksValidationTruthPixelTrackingOnly +
     trackValidatorPixelTrackingOnly
 )
 
 
+
 ### Lite mode (only generalTracks and HP)
 trackValidatorLite = trackValidator.clone(
     label = ["generalTracks", "cutsRecoTracksHp"]
diff --git a/Validation/RecoTrack/python/associators_cff.py b/Validation/RecoTrack/python/associators_cff.py
index 73c6f416e509c..8c2c8b7c7c85e 100644
--- a/Validation/RecoTrack/python/associators_cff.py
+++ b/Validation/RecoTrack/python/associators_cff.py
@@ -3,7 +3,8 @@
 #### TrackAssociation
 import SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi
 import SimTracker.TrackAssociatorProducers.trackAssociatorByPosition_cfi
-from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer
+# from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer
+from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducerHeterogeneous as _tpClusterProducer
 from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import trackingParticleRecoTrackAsssociation as _trackingParticleRecoTrackAsssociation
 
 hltTPClusterProducer = _tpClusterProducer.clone(

From ebe74fcbea745c2567e3c93d5ba088dbf16e02de Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 18 Jul 2018 16:53:39 +0200
Subject: [PATCH 11/19] works, dumps the hits

---
 .../plugins/ClusterTPAssociationHeterogeneous.cc       | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index c0bc3b66f0c7f..0ae0986428a5f 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -196,8 +196,8 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
 
     edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
     edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
-    iEvent.getByToken(tGpuDigis, gd);
-    iEvent.getByToken(tGpuHits, gh);
+    iEvent.getByToken<siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster>(tGpuDigis, gd);
+    iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
     auto const & gDigis = *gd;
     auto const & gHits = *gh;
     auto ndigis = gDigis.nDigis;
@@ -242,8 +242,10 @@ void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &
 
   auto output = std::make_unique<GPUProduct>(gpuAlgo->getProduct());
 
-  iEvent.put<Output>(std::move(output), [this, &iEvent, &iSetup](const GPUProduct& hits, CPUProduct& cpu) {
-                       cpu = *(this->produceLegacy(iEvent,iSetup));
+  auto legacy = produceLegacy(iEvent,iSetup).release();
+
+  iEvent.put<Output>(std::move(output), [legacy](const GPUProduct& hits, CPUProduct& cpu) {
+                       cpu = *legacy; delete legacy;
                      });
 
 }

From 17f1e4c9d0a1c0dfa9f5f149a3136896ee0e3af5 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Fri, 20 Jul 2018 11:37:51 +0200
Subject: [PATCH 12/19] fixed

---
 .../plugins/ClusterSLOnGPU.cu                 | 54 +++++++++----------
 .../ClusterTPAssociationHeterogeneous.cc      |  6 +--
 2 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index 30db1a292bc26..0610aa339ef87 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -3,28 +3,10 @@
 // for the "packing"
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
+#include <limits>
 #include<cassert>
 #include<atomic>
 
-/*
-struct ClusterSLGPU {
- ClusterSLGPU(){alloc();}
- void alloc();
-
- ClusterSLGPU * me_d;
- std::array<uint32_t,4> * links_d;
- uint32_t * tkId_d;
- uint32_t * tkId2_d;
- uint32_t * n1_d;
- uint32_t * n2_d;
-
- static constexpr uint32_t MAX_DIGIS = 2000*150;
- static constexpr uint32_t MaxNumModules = 2000;
-
-};
-*/
-
-
 template<class ForwardIt, class T, class Compare>
 __device__
 ForwardIt lowerBound(ForwardIt first, ForwardIt last, const T& value, Compare comp)
@@ -52,6 +34,9 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
 __global__
 void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) {
 
+
+  constexpr int32_t invTK = 0; // std::numeric_limits<int32_t>::max();
+ 
   constexpr uint16_t InvId=9999; // must be > MaxNumModules
   
   auto const & dd = *ddp;
@@ -70,7 +55,7 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
   auto cl = first + dd.clus_d[i];
   assert(cl<256*2000);
   
-  const std::array<uint32_t,4> me{{id,ch,0}};
+  const std::array<uint32_t,4> me{{id,ch,0,0}};
 
   auto less = [](std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
      return a[0]<b[0] || ( !(b[0]<a[0]) && a[1]<b[1]); // in this context we do not care of [2] 
@@ -87,16 +72,21 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
   auto p = lowerBound(b,e,me,less);
   auto j = p-sl.links_d;
   assert(j>=0);
+
+  auto getTK = [&](int i) { auto const & l = sl.links_d[i]; return l[2];};
+
   j = std::min(int(j),int(n-1));
   if (equal(me,sl.links_d[j])) {
-    //auto const & l = sl.links_d[j];
-    auto const tk = j; // l[2];
-    auto old = atomicCAS(&sl.tkId_d[cl],0,tk);
-    if (0==old ||tk==old) atomicAdd(&sl.n1_d[cl],1);
-    else {
-      auto old = atomicCAS(&sl.tkId2_d[cl],0,tk);
-      if (0==old ||tk==old) atomicAdd(&sl.n2_d[cl],1);
-    }    
+    auto const itk = j;
+    auto const tk = getTK(j);
+    auto old = atomicCAS(&sl.tkId_d[cl],invTK,itk);
+    if (invTK==old || tk==getTK(old)) { 
+       atomicAdd(&sl.n1_d[cl],1);
+    } else {
+      auto old = atomicCAS(&sl.tkId2_d[cl],invTK,itk);
+      if (invTK==old || tk==getTK(old)) atomicAdd(&sl.n2_d[cl],1);
+    }
+    // if (3==tk) printf("TK3: %d %d %d  %d: %d,%d\n",j,cl,id, i, dd.xx_d[i], dd.yy_d[i]);
   } 
   /*
   else {
@@ -133,6 +123,8 @@ void dumpLink(int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, Clu
 
 namespace clusterSLOnGPU {
 
+ constexpr uint32_t invTK = 0; // std::numeric_limits<int32_t>::max();
+
   struct CSVHeader {
      CSVHeader() {
       printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "ev", "ind",
@@ -166,8 +158,8 @@ namespace clusterSLOnGPU {
 
   void
   Kernel::zero(cudaStream_t stream) {
-   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
-   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
    cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
    cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
   }
@@ -176,6 +168,8 @@ namespace clusterSLOnGPU {
   void 
   Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) {
 
+    zero(stream.id());
+
     ClusterSLGPU const & sl = slgpu;
 
     int ev = ++evId;
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 0ae0986428a5f..64c379e0f106f 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -212,16 +212,16 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
       uint32_t gind = genericDet->index();
       for (auto const & link : links) {
         ++ng;
-           if (link.fraction() > 0.3f) ++ng10;
+        if (link.fraction() > 0.3f) ++ng10;
         if (link.fraction() < 0.5f) continue;
         auto tkid = std::make_pair(link.SimTrackId(), link.eventId());
         auto ipos = mapping.find(tkid);
-          if (ipos != mapping.end()) {
+        if (ipos != mapping.end()) {
             uint32_t pt = 1000*(*ipos).second->pt();
             ++nn;
             std::array<uint32_t,4> a{{gind,uint32_t(link.channel()),(*ipos).second.key(),pt}};
             digi2tp.push_back(a);
-          }
+        }
       }
     }
     std::sort(digi2tp.begin(),digi2tp.end());

From 56846273bc7a00f4df1e10e23a4eef61b51c44bd Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sat, 21 Jul 2018 16:15:12 +0200
Subject: [PATCH 13/19] keep in limit of buffer

---
 .../plugins/SiPixelRawToClusterGPUKernel.cu   |  1 +
 .../plugins/SiPixelRawToClusterGPUKernel.h    |  4 +-
 .../siPixelRawToClusterHeterogeneousProduct.h |  6 ++-
 .../plugins/ClusterSLOnGPU.cu                 | 52 ++++++++++++++++---
 .../ClusterTPAssociationHeterogeneous.cc      |  4 +-
 5 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 92ea71ae5ac05..932df3d2ee016 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -96,6 +96,7 @@ namespace pixelgpudetails {
       
     cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
 
+    std::cout << "xx_d " << xx_d <<' ' << gpuProduct.xx_d << std::endl;
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 2552f72584ee2..bb7768ebf5a76 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -181,8 +181,8 @@ namespace pixelgpudetails {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
         gpuProduct_d,
-        nDigis, nModulesActive,
-        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
+        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
+        nDigis, nModulesActive
       };
     }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
index bafe6fb2fed58..5c0d027fe57b5 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
@@ -48,8 +48,6 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     GPUProduct const * me_d = nullptr;
 
     // Needed for GPU rechits
-    uint32_t nDigis;
-    uint32_t nModules;
     uint16_t const * xx_d;
     uint16_t const * yy_d;
     uint16_t const * adc_d;
@@ -58,6 +56,10 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     int32_t const *  clus_d;
     uint32_t const * clusInModule_d;
     uint32_t const * moduleId_d;
+
+    uint32_t nDigis;
+    uint32_t nModules;
+
   };
 
   using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index 0610aa339ef87..b247f4045a8f4 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -34,6 +34,7 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
 __global__
 void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) {
 
+  assert(slp==slp->me_d);
 
   constexpr int32_t invTK = 0; // std::numeric_limits<int32_t>::max();
  
@@ -70,7 +71,7 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
 
   // auto p = cuda_std::lower_bound(b,e,me,less);
   auto p = lowerBound(b,e,me,less);
-  auto j = p-sl.links_d;
+  int32_t j = p-sl.links_d;
   assert(j>=0);
 
   auto getTK = [&](int i) { auto const & l = sl.links_d[i]; return l[2];};
@@ -82,11 +83,12 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
     auto old = atomicCAS(&sl.tkId_d[cl],invTK,itk);
     if (invTK==old || tk==getTK(old)) { 
        atomicAdd(&sl.n1_d[cl],1);
+//         sl.n1_d[cl] = tk;
     } else {
       auto old = atomicCAS(&sl.tkId2_d[cl],invTK,itk);
       if (invTK==old || tk==getTK(old)) atomicAdd(&sl.n2_d[cl],1);
     }
-    // if (3==tk) printf("TK3: %d %d %d  %d: %d,%d\n",j,cl,id, i, dd.xx_d[i], dd.yy_d[i]);
+//    if (92==tk) printf("TK3: %d %d %d  %d: %d,%d ?%d?%d\n", j, cl, id, i, dd.xx_d[i], dd.yy_d[i], hh.mr_d[cl], hh.mc_d[cl]);
   } 
   /*
   else {
@@ -98,12 +100,32 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
 
 }
 
-
 __global__
-void dumpLink(int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
+void verifyZero(int ev, clusterSLOnGPU::DigisOnGPU const * ddp, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
   auto i = blockIdx.x*blockDim.x + threadIdx.x;
   if (i>nhits) return;
 
+  auto const & dd = *ddp;
+  auto const & hh = *hhp;
+  auto const & sl = *slp;
+
+  assert(sl.tkId_d[i]==0);
+  auto const & tk = sl.links_d[0];
+  assert(tk[0]==0);
+  assert(tk[1]==0);
+  assert(tk[2]==0);
+  assert(tk[3]==0);
+
+  if (i==0) printf("xx_d gpu %x\n",dd.xx_d);
+
+}
+
+
+__global__
+void dumpLink(int first, int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
+  auto i = first + blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>nhits) return;
+
   auto const & hh = *hhp;
   auto const & sl = *slp;
 
@@ -167,6 +189,11 @@ namespace clusterSLOnGPU {
 
   void 
   Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) {
+    
+    size_t pfs = 16*1024*1024;
+    // cudaDeviceSetLimit(cudaLimitPrintfFifoSize,pfs);
+    cudaDeviceGetLimit(&pfs,cudaLimitPrintfFifoSize);
+    std::cout << "cudaLimitPrintfFifoSize " << pfs << std::endl;
 
     zero(stream.id());
 
@@ -174,12 +201,23 @@ namespace clusterSLOnGPU {
 
     int ev = ++evId;
     int threadsPerBlock = 256;
-    int blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
+
+    int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+    verifyZero<<<blocks, threadsPerBlock, 0, stream.id()>>>(ev, dd.me_d, hh.gpu_d, nhits, sl.me_d);
+
+
+    blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
 
     assert(sl.me_d);
     simLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n);
-    blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
-    dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(ev, hh.gpu_d, nhits, sl.me_d);
+    cudaStreamSynchronize(stream.id());
+
+    // one line == 200B so each kernel can print only 5K lines....
+    blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
+    for (int first=0; first<int(nhits); first+=blocks*threadsPerBlock) {
+      dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(first, ev, hh.gpu_d, nhits, sl.me_d);
+      cudaStreamSynchronize(stream.id());
+    }
     cudaCheck(cudaGetLastError());
 
   }
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 64c379e0f106f..3fc56610080b4 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -103,6 +103,8 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
 
   std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
  
+  std::vector<std::array<uint32_t,4>> digi2tp;
+
 };
 
 ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg)
@@ -204,7 +206,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
     auto nhits = gHits.nHits;
 
     uint32_t nn=0, ng=0, ng10=0;
-    std::vector<std::array<uint32_t,4>> digi2tp;
+    digi2tp.clear();
     {std::array<uint32_t,4> a{{0,0,0,0}}; digi2tp.push_back(a);} // put at 0 0
     for (auto const & links : *sipixelSimLinks) {
       DetId detId(links.detId());

From 4bfe9b7abf52c2aa33339ce6f98bf4325ec182c8 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Mon, 23 Jul 2018 12:29:50 +0200
Subject: [PATCH 14/19] remove debug, protect dump with flag

---
 .../plugins/SiPixelRawToClusterGPUKernel.cu        |  2 +-
 .../plugins/ClusterSLOnGPU.cu                      | 14 ++++++++------
 .../TrackerHitAssociation/plugins/ClusterSLOnGPU.h |  4 ++--
 .../plugins/ClusterTPAssociationHeterogeneous.cc   |  9 +++++++--
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 932df3d2ee016..88004245dddae 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -93,10 +93,10 @@ namespace pixelgpudetails {
 
     cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
     gpuProduct = getProduct();
+    assert(xx_d==gpuProduct.xx_d);
       
     cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
 
-    std::cout << "xx_d " << xx_d <<' ' << gpuProduct.xx_d << std::endl;
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index b247f4045a8f4..060a9056647ae 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -116,7 +116,7 @@ void verifyZero(int ev, clusterSLOnGPU::DigisOnGPU const * ddp, clusterSLOnGPU::
   assert(tk[2]==0);
   assert(tk[3]==0);
 
-  if (i==0) printf("xx_d gpu %x\n",dd.xx_d);
+  // if (i==0) printf("xx_d gpu %x\n",dd.xx_d);
 
 }
 
@@ -212,11 +212,13 @@ namespace clusterSLOnGPU {
     simLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n);
     cudaStreamSynchronize(stream.id());
 
-    // one line == 200B so each kernel can print only 5K lines....
-    blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
-    for (int first=0; first<int(nhits); first+=blocks*threadsPerBlock) {
-      dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(first, ev, hh.gpu_d, nhits, sl.me_d);
-      cudaStreamSynchronize(stream.id());
+    if (doDump) {
+      // one line == 200B so each kernel can print only 5K lines....
+      blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
+      for (int first=0; first<int(nhits); first+=blocks*threadsPerBlock) {
+        dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(first, ev, hh.gpu_d, nhits, sl.me_d);
+        cudaStreamSynchronize(stream.id());
+      }
     }
     cudaCheck(cudaGetLastError());
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 27b78000aec56..529b0ce3715e2 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -26,7 +26,7 @@ namespace clusterSLOnGPU {
 
   class Kernel {
   public:
-    Kernel(cuda::stream_t<>& stream) {alloc(stream);}
+    Kernel(cuda::stream_t<>& stream, bool doDump) {alloc(stream);}
     void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
     GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
     
@@ -35,7 +35,7 @@ namespace clusterSLOnGPU {
      void zero(cudaStream_t stream);
   public:
      ClusterSLGPU slgpu; 
-
+     bool doDump;
   };
 }
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index 3fc56610080b4..e2408a05289a9 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -105,6 +105,8 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterog
  
   std::vector<std::array<uint32_t,4>> digi2tp;
 
+  bool doDump;
+
 };
 
 ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg)
@@ -117,7 +119,8 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::
     phase2OTClustersToken_(consumes<edmNew::DetSetVector<Phase2TrackerCluster1D> >(cfg.getParameter<edm::InputTag>("phase2OTClusterSrc"))),
     trackingParticleToken_(consumes<TrackingParticleCollection>(cfg.getParameter<edm::InputTag>("trackingParticleSrc"))),
     tGpuDigis(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
-    tGpuHits(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc")))
+    tGpuHits(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
+    doDump(cfg.getParameter<bool>("dumpCSV"))
 {
   produces<HeterogeneousProduct>();
 }
@@ -135,6 +138,8 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr
   desc.add<edm::InputTag>("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersHeterogeneous"));
   desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
 
+  desc.add<bool>("dumpCSV",false);
+
   HeterogeneousEDProducer::fillPSetDescription(desc);
 
   descriptions.add("tpClusterProducerHeterogeneousDefault", desc);
@@ -144,7 +149,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr
 void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId,
                           cuda::stream_t<> &cudaStream) {
 
-   gpuAlgo = std::make_unique<clusterSLOnGPU::Kernel>(cudaStream);
+   gpuAlgo = std::make_unique<clusterSLOnGPU::Kernel>(cudaStream,doDump);
 
 }
 

From e13e054e0af91b338cbb4b640c93a36bf9ac5f4f Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Mon, 23 Jul 2018 15:37:58 +0200
Subject: [PATCH 15/19] consume hit on gpu

---
 .../CAHitNtupletHeterogeneousEDProducer.cc    |  23 +++
 .../plugins/CAHitQuadrupletGeneratorGPU.cu    |  17 ++
 .../plugins/CAHitQuadrupletGeneratorGPU.h     |  11 ++
 .../PixelTriplets/plugins/gpuPixelDoublets.h  | 161 ++++++++++++++++++
 4 files changed, 212 insertions(+)
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
index 186a84617e8d3..e9873247a119f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
@@ -18,6 +18,11 @@
 
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+// gpu
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
+
 namespace {
 void fillNtuplets(RegionsSeedingHitSets::RegionFiller &seedingHitSetsFiller,
                   const OrderedHitSeeds &quadruplets) {
@@ -31,6 +36,10 @@ class CAHitNtupletHeterogeneousEDProducer
     : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
           heterogeneous::GPUCuda, heterogeneous::CPU>> {
 public:
+
+  using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
+
+
   CAHitNtupletHeterogeneousEDProducer(const edm::ParameterSet &iConfig);
   ~CAHitNtupletHeterogeneousEDProducer() = default;
 
@@ -50,6 +59,9 @@ class CAHitNtupletHeterogeneousEDProducer
 private:
   edm::EDGetTokenT<IntermediateHitDoublets> doubletToken_;
 
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
+
+
   edm::RunningAverage localRA_;
   CAHitQuadrupletGeneratorGPU GPUGenerator_;
   CAHitQuadrupletGenerator CPUGenerator_;
@@ -63,6 +75,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer(
     : HeterogeneousEDProducer(iConfig),
       doubletToken_(consumes<IntermediateHitDoublets>(
           iConfig.getParameter<edm::InputTag>("doublets"))),
+      tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
       GPUGenerator_(iConfig, consumesCollector()),
       CPUGenerator_(iConfig, consumesCollector()) {
   produces<RegionsSeedingHitSets>();
@@ -73,6 +86,9 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions(
   edm::ParameterSetDescription desc;
 
   desc.add<edm::InputTag>("doublets", edm::InputTag("hitPairEDProducer"));
+
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
+
   CAHitQuadrupletGeneratorGPU::fillDescriptions(desc);
   HeterogeneousEDProducer::fillPSetDescription(desc);
   auto label = "caHitQuadrupletHeterogeneousEDProducer";
@@ -106,6 +122,13 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
 
   seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();
 
+  edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
+  iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
+  auto const & gHits = *gh;
+//  auto nhits = gHits.nHits;
+
+  GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id());
+
   if (regionDoublets.empty()) {
     emptyRegionDoublets = true;
   } else {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
index 4a7f3ce5aa5f1..b94aad94bf16b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -6,6 +6,8 @@
 #include "GPUCACell.h"
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+#include "gpuPixelDoublets.h"
+
 __global__ void
 kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_,
              const GPULayerDoublets *gpuDoublets,
@@ -490,3 +492,18 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cud
   }
   return quadsInterface;
 }
+
+
+
+
+void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) {
+   auto nhits = hh.nHits;
+
+  float phiCut=0.06;
+  int threadsPerBlock = 256;
+  int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+
+  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(hh.gpu_d,phiCut);
+
+
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
index a6e9db46e68e7..717164c9cff62 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
@@ -3,6 +3,10 @@
 
 #include <cuda_runtime.h>
 
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+
+
 #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
@@ -32,6 +36,11 @@ namespace edm {
 
 class CAHitQuadrupletGeneratorGPU {
 public:
+
+    using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+    using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+
+
     typedef LayerHitMapCache LayerCacheType;
 
     static constexpr unsigned int minLayers = 4;
@@ -49,6 +58,8 @@ class CAHitQuadrupletGeneratorGPU {
 
     void initEvent(const edm::Event& ev, const edm::EventSetup& es);
 
+    void buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream);
+
     void hitNtuplets(const IntermediateHitDoublets& regionDoublets,
                      const edm::EventSetup& es,
                      const SeedingLayerSetsHits& layers, cudaStream_t stream);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
new file mode 100644
index 0000000000000..019e775209f96
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -0,0 +1,161 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h
+#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+#include <cmath>
+#include<algorithm>
+
+#include "DataFormats/Math/interface/approx_atan2.h"
+
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+namespace gpuPixelDoublets {
+
+
+  __device__
+  std::pair<int,int> 
+  findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) {
+
+  assert(iphicut>0);
+  
+  // find extreemes in top
+  int16_t minPhi = phiMe-iphicut;
+  int16_t maxPhi = phiMe+iphicut;
+
+  // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl;
+
+  // guess and adjust
+  auto findLimit = [&](int16_t mPhi) { 
+    int jm = float(0.5f*size)*(1.f+float(mPhi)/float(std::numeric_limits<short>::max()));
+    // std::cout << "jm for " << mPhi << ' ' << jm << std::endl;
+    jm = std::min(size-1,std::max(0,jm));
+    bool notDone=true;
+    while(jm>0 && mPhi<iphi[index[--jm]]){notDone=false;}
+    if (notDone) while(jm<size && mPhi>iphi[index[++jm]]){}
+    jm = std::min(size-1,std::max(0,jm));
+    return jm;
+  };
+ 
+  auto jmin = findLimit(minPhi);  
+  auto jmax = findLimit(maxPhi);
+  
+
+  /*
+  std::cout << "j min/max " << jmin << ' ' << jmax << std::endl;
+  std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl;  
+  std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl;
+  std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl;
+  */
+
+  return std::make_pair(jmin,jmax);
+  }
+
+
+  __global__
+  void getDoubletsFromSorted(int16_t * iphi, uint16_t * index, uint32_t * offsets, float phiCut) {
+
+  auto iphicut = phi2short(phiCut);
+
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>=offsets[9]) return; // get rid of last layer
+
+  assert(0==offsets[0]);
+  int top = (i>offsets[5]) ? 5: 0;
+  while (i>=offsets[++top]){};
+  assert(top<10);
+  auto bottom = top-1;
+  if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already)
+  assert(i>=offsets[bottom]);
+  assert(i<offsets[top]);
+
+  if (index[i]>= (offsets[top]-offsets[bottom])) {
+    printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]);
+    return;
+  }   
+
+  assert(index[i]<offsets[top]-offsets[bottom]);
+
+  int16_t phiMe = iphi[offsets[bottom]+index[i]];
+  
+  auto size = offsets[top+1] - offsets[top]; 
+  assert(size<std::numeric_limits<uint16_t>::max());
+
+  auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut);
+
+  auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) {
+    auto topPhi = iphi+offsets[top];
+    uint16_t imax =  std::numeric_limits<uint16_t>::max();
+    uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0;
+    int n=0;
+    for (uint16_t i = mymin+offset; i!=mymax; i++) {
+      assert(i<=imax);
+      uint16_t k = (i>mymax) ? i-offset : i;
+      assert(k<mysize);
+      assert(k>=mymin || k<mymax);
+      if (int16_t(topPhi[k]-phiMe)>2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) 
+         printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut);
+      n++;
+    }
+    int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin;
+    assert(n==tot);
+  };
+
+  slidingWindow(size,jLimits.first,jLimits.second);
+ 
+  }
+
+  template<typename Hist>
+  __device__
+  void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) {
+
+  auto iphicut = phi2short(phiCut);
+
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>=offsets[9]) return; // get rid of last layer
+
+  assert(0==offsets[0]);
+  int top = (i>offsets[5]) ? 5: 0;
+  while (i>=offsets[++top]){};
+  assert(top<10);
+  auto bottom = top-1;
+  if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already)
+  assert(i>=offsets[bottom]);
+  assert(i<offsets[top]);
+
+  auto mep = iphi[i];
+
+  auto kl = hist[top].bin(mep-iphicut);
+  auto kh = hist[top].bin(mep+iphicut);
+
+  auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();};
+
+  int tot=0;
+  int nmin=0;
+  auto khh=kh;
+  incr(khh); 
+  for (auto kk=kl; kk!=khh; incr(kk)) {
+     if (kk!=kl && kk!=kh) nmin+=hist[top].size(kk);
+     for (auto p=hist[top].begin(kk); p<hist[top].end(kk); ++p) {
+        if ( std::min(std::abs(int16_t(iphi[*p]-mep)), std::abs(int16_t(mep-iphi[*p]))) > iphicut ) continue;
+        ++tot;
+     }
+  }
+  if (0==hist[top].nspills) assert(tot>=nmin);
+  // look in spill bin as well....
+
+  }
+
+  void 
+  __global__
+  getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) {
+    auto const & hh = *hhp;
+    doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut);
+
+  }
+
+} // namespace end
+
+#endif

From cd1088b0f8ae0e5cd10e1e6ae4320c56ec0885cd Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 25 Jul 2018 15:51:36 +0200
Subject: [PATCH 16/19] initialize

---
 SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 529b0ce3715e2..24ddd27450a11 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -26,7 +26,7 @@ namespace clusterSLOnGPU {
 
   class Kernel {
   public:
-    Kernel(cuda::stream_t<>& stream, bool doDump) {alloc(stream);}
+    Kernel(cuda::stream_t<>& stream, bool dump) : doDump(dump) {alloc(stream);}
     void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
     GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
     

From 46f5e22c55e4dde946bd32562b83601518a9ab4f Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 26 Jul 2018 11:46:57 +0200
Subject: [PATCH 17/19] remove debug

---
 .../plugins/ClusterSLOnGPU.cu                  | 18 ++++++++++++------
 .../plugins/ClusterSLOnGPU.h                   |  2 +-
 .../ClusterTPAssociationHeterogeneous.cc       |  4 ++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index 060a9056647ae..535388947fc84 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -6,6 +6,7 @@
 #include <limits>
 #include<cassert>
 #include<atomic>
+#include <mutex>
 
 template<class ForwardIt, class T, class Compare>
 __device__
@@ -105,8 +106,8 @@ void verifyZero(int ev, clusterSLOnGPU::DigisOnGPU const * ddp, clusterSLOnGPU::
   auto i = blockIdx.x*blockDim.x + threadIdx.x;
   if (i>nhits) return;
 
-  auto const & dd = *ddp;
-  auto const & hh = *hhp;
+//  auto const & dd = *ddp;
+//  auto const & hh = *hhp;
   auto const & sl = *slp;
 
   assert(sl.tkId_d[i]==0);
@@ -147,8 +148,7 @@ namespace clusterSLOnGPU {
 
  constexpr uint32_t invTK = 0; // std::numeric_limits<int32_t>::max();
 
-  struct CSVHeader {
-     CSVHeader() {
+ void printCSVHeader() {
       printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "ev", "ind",
          "det", "charge",	
          "xg","yg","zg","rg","iphi", 
@@ -156,10 +156,14 @@ namespace clusterSLOnGPU {
         );
      }
 
-  };
-  CSVHeader csvHeader;
 
   std::atomic<int> evId(0);
+  std::once_flag doneCSVHeader;
+
+  Kernel::Kernel(cuda::stream_t<>& stream, bool dump) : doDump(dump) {
+    if (doDump) std::call_once(doneCSVHeader,printCSVHeader);
+    alloc(stream);
+  }
 
 
   void
@@ -190,10 +194,12 @@ namespace clusterSLOnGPU {
   void 
   Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) {
     
+    /*
     size_t pfs = 16*1024*1024;
     // cudaDeviceSetLimit(cudaLimitPrintfFifoSize,pfs);
     cudaDeviceGetLimit(&pfs,cudaLimitPrintfFifoSize);
     std::cout << "cudaLimitPrintfFifoSize " << pfs << std::endl;
+    */
 
     zero(stream.id());
 
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 24ddd27450a11..621bfbcb367bb 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -26,7 +26,7 @@ namespace clusterSLOnGPU {
 
   class Kernel {
   public:
-    Kernel(cuda::stream_t<>& stream, bool dump) : doDump(dump) {alloc(stream);}
+    Kernel(cuda::stream_t<>& stream, bool dump);
     void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
     GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
     
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
index e2408a05289a9..6cffed8ad0cd5 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -199,7 +199,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
    
     //  gpu stuff ------------------------
 
-    std::cout << "In tpsimlink " << mapping.size() << std::endl;
+    // std::cout << "In tpsimlink " << mapping.size() << std::endl;
 
     edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
     edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
@@ -233,7 +233,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE
     }
     std::sort(digi2tp.begin(),digi2tp.end());
 
-    std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
+    // std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
 
     cudaCheck(cudaMemcpyAsync(gpuAlgo->slgpu.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
     gpuAlgo->algo(gDigis, ndigis, gHits, nhits, digi2tp.size(),cudaStream);

From 007795f772efb3ab0689edd670d410922bc27c84 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 26 Jul 2018 16:17:40 +0200
Subject: [PATCH 18/19] free memory, remove sync, use cudastd

---
 .../CUDAUtilities/interface/HistoContainer.h  |  8 ----
 .../interface/cudastdAlgorithm.h              | 14 ++++++
 .../plugins/ClusterSLOnGPU.cu                 | 43 +++++++------------
 .../plugins/ClusterSLOnGPU.h                  |  2 +
 4 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
index baa2e7f64c613..c398b29a64c69 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -8,14 +8,6 @@
 #include <atomic>
 #endif // __CUDA_ARCH__
 
-#ifdef __CUDACC__
-#include <cuda_runtime.h>
-#else
-#define __device__
-#define __global__
-#define __host__
-#endif // __CUDACC__
-
 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
 
 #ifdef __CUDACC__
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
index 4bfa4e694d9dc..c451ab7e4113d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -1,6 +1,15 @@
 #ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 #define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 
+#ifdef __CUDACC__
+#include <cuda_runtime.h>
+#else
+#define __device__
+#define __global__
+#define __host__
+#endif // __CUDACC__
+
+
 #include <utility>
 
 // reimplementation of std algorithms able to compile with CUDA and run on GPUs,
@@ -10,6 +19,7 @@ namespace cuda_std  {
 
   template<typename T = void>
   struct less {
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const T &rhs) const {
       return lhs < rhs;
     }
@@ -18,10 +28,12 @@ namespace cuda_std  {
   template<>
   struct less<void> {
     template<typename T, typename U>
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const U &rhs ) const { return lhs < rhs;}
   };
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt lower_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -43,6 +55,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt upper_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -64,6 +77,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = cuda_std::less<T>>
+  __device__ __host__
   constexpr
   RandomIt binary_find(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
index 535388947fc84..7a3e4eb83161d 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -8,27 +8,6 @@
 #include<atomic>
 #include <mutex>
 
-template<class ForwardIt, class T, class Compare>
-__device__
-ForwardIt lowerBound(ForwardIt first, ForwardIt last, const T& value, Compare comp)
-{
-    ForwardIt it;
-    auto count = last-first;
- 
-    while (count > 0) {
-        it = first;
-        auto step = count / 2;
-        it+=step;
-        if (comp(*it, value)) {
-            first = ++it;
-            count -= step + 1;
-        }
-        else
-            count = step;
-    }
-    return first;
-}
-
 
 using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
 
@@ -59,19 +38,18 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO
   
   const std::array<uint32_t,4> me{{id,ch,0,0}};
 
-  auto less = [](std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+  auto less = [] __device__ __host__ (std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
      return a[0]<b[0] || ( !(b[0]<a[0]) && a[1]<b[1]); // in this context we do not care of [2] 
   };
 
-  auto equal = [](std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+  auto equal = [] __device__ __host__ (std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
      return a[0]==b[0] && a[1]==b[1]; // in this context we do not care of [2]
   };
 
   auto const * b = sl.links_d;
   auto const * e = b+n;
 
-  // auto p = cuda_std::lower_bound(b,e,me,less);
-  auto p = lowerBound(b,e,me,less);
+  auto p = cuda_std::lower_bound(b,e,me,less);
   int32_t j = p-sl.links_d;
   assert(j>=0);
 
@@ -178,10 +156,19 @@ namespace clusterSLOnGPU {
 
    cudaCheck(cudaMalloc((void**) & slgpu.me_d, sizeof(ClusterSLGPU)));
    cudaCheck(cudaMemcpyAsync(slgpu.me_d, &slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id()));
-   cudaCheck(cudaDeviceSynchronize());
-
   }
 
+ void
+  Kernel::deAlloc() {
+   cudaCheck(cudaFree(slgpu.links_d));
+   cudaCheck(cudaFree(slgpu.tkId_d));
+   cudaCheck(cudaFree(slgpu.tkId2_d));
+   cudaCheck(cudaFree(slgpu.n1_d));
+   cudaCheck(cudaFree(slgpu.n2_d));
+   cudaCheck(cudaFree(slgpu.me_d));
+}
+
+
   void
   Kernel::zero(cudaStream_t stream) {
    cudaCheck(cudaMemsetAsync(slgpu.tkId_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
@@ -216,9 +203,9 @@ namespace clusterSLOnGPU {
 
     assert(sl.me_d);
     simLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n);
-    cudaStreamSynchronize(stream.id());
 
     if (doDump) {
+      cudaStreamSynchronize(stream.id());	// flush previous printf
       // one line == 200B so each kernel can print only 5K lines....
       blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
       for (int first=0; first<int(nhits); first+=blocks*threadsPerBlock) {
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
index 621bfbcb367bb..352337e783105 100644
--- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -27,11 +27,13 @@ namespace clusterSLOnGPU {
   class Kernel {
   public:
     Kernel(cuda::stream_t<>& stream, bool dump);
+    ~Kernel() {deAlloc();}
     void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
     GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
     
   private:
      void alloc(cuda::stream_t<>& stream);
+     void deAlloc(); 
      void zero(cudaStream_t stream);
   public:
      ClusterSLGPU slgpu; 

From 921e2979407e0561a917cda0536380c0792a53f0 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 31 Jul 2018 09:24:44 +0200
Subject: [PATCH 19/19] fix NVCC protections

---
 .../CUDAUtilities/interface/cudastdAlgorithm.h   |  6 ------
 .../CUDAUtilities/test/BuildFile.xml             | 16 ++++++++--------
 .../trackerHitAssociationHeterogeneousProduct.h  |  6 +++---
 3 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
index c451ab7e4113d..c107f4b228539 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -1,13 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 #define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 
-#ifdef __CUDACC__
 #include <cuda_runtime.h>
-#else
-#define __device__
-#define __global__
-#define __host__
-#endif // __CUDACC__
 
 
 #include <utility>
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index 1c1d28dbf3761..ab97c243c4385 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -1,31 +1,31 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
 <bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector"/>
 
-<bin file="cudastdAlgorithm_t.cpp"/>
+<bin file="cudastdAlgorithm_t.cpp">
+</bin>
+
 <bin file="cudastdAlgorithm_t.cu" name="gpuCudastdAlgorithm_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
+<bin file="HistoContainer_t.cpp">
+</bin>
+
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
diff --git a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
index 4db318c5348f6..c3f86ddf855c8 100644
--- a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
+++ b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
@@ -1,7 +1,7 @@
 #ifndef SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
 #define SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
 
-#ifndef	__NVCC__
+#ifndef	__CUDACC__
 #include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
 #endif
 
@@ -10,7 +10,7 @@
 
 namespace trackerHitAssociationHeterogeneousProduct {
 
-#ifndef __NVCC__
+#ifndef __CUDACC__
   struct CPUProduct {
     CPUProduct() = default;
     template<typename T> 
@@ -37,7 +37,7 @@ namespace trackerHitAssociationHeterogeneousProduct {
      ClusterSLGPU *  gpu_d=nullptr;
   };
 
-#ifndef	__NVCC__
+#ifndef	__CUDACC__
    using ClusterTPAHeterogeneousProduct = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
                                                                    heterogeneous::GPUCudaProduct<GPUProduct> >;
 #endif