cms-patatrack · fwyzard · Jul 31, 2018 · Jul 7, 2018 · Jul 7, 2018 · Jul 8, 2018
diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -8,14 +8,6 @@
 #include <atomic>
 #endif // __CUDA_ARCH__
 
-#ifdef __CUDACC__
-#include <cuda_runtime.h>
-#else
-#define __device__
-#define __global__
-#define __host__
-#endif // __CUDACC__
-
 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
 
 #ifdef __CUDACC__

diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -1,6 +1,9 @@
 #ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 #define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 
+#include <cuda_runtime.h>
+
+
 #include <utility>
 
 // reimplementation of std algorithms able to compile with CUDA and run on GPUs,
@@ -10,6 +13,7 @@ namespace cuda_std  {
 
   template<typename T = void>
   struct less {
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const T &rhs) const {
       return lhs < rhs;
     }
@@ -18,10 +22,12 @@ namespace cuda_std  {
   template<>
   struct less<void> {
     template<typename T, typename U>
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const U &rhs ) const { return lhs < rhs;}
   };
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt lower_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -43,6 +49,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt upper_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -64,6 +71,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = cuda_std::less<T>>
+  __device__ __host__
   constexpr
   RandomIt binary_find(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {

diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -1,31 +1,31 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
 <bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector"/>
 
-<bin file="cudastdAlgorithm_t.cpp"/>
+<bin file="cudastdAlgorithm_t.cpp">
+</bin>
+
 <bin file="cudastdAlgorithm_t.cu" name="gpuCudastdAlgorithm_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
+<bin file="HistoContainer_t.cpp">
+</bin>
+
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -38,7 +38,7 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
+  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
     int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
@@ -90,6 +90,13 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
+    gpuProduct = getProduct();
+    assert(xx_d==gpuProduct.xx_d);
+
+    cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
+
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -111,6 +118,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
+
+    cudaCheck(cudaFree(gpuProduct_d));
+
+
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -144,12 +144,22 @@ namespace pixelgpudetails {
            (adc << thePacking.adc_shift);
   }
 
+  constexpr
+  uint32_t pixelToChannel( int row, int col) {
+    constexpr Packing thePacking = packing();
+    return (row << thePacking.column_width) | col;
+  }
+
+
   using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;
 
 
   class SiPixelRawToClusterGPUKernel {
   public:
-    SiPixelRawToClusterGPUKernel();
+
+    using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+
+    SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
     ~SiPixelRawToClusterGPUKernel();
 
 
@@ -170,8 +180,9 @@ namespace pixelgpudetails {
     auto getProduct() const {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
-        nDigis, nModulesActive,
-        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
+        gpuProduct_d,
+        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
+        nDigis, nModulesActive
       };
     }
 
@@ -181,6 +192,11 @@ namespace pixelgpudetails {
     unsigned char *fedId_h = nullptr;    // to hold fed index for each word
 
     // output
+    GPUProduct gpuProduct;
+    GPUProduct * gpuProduct_d;
+
+    // FIXME cleanup all these are in the gpuProduct above...
+
     uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr;                   // host copy of output
     uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
     pixelgpudetails::error_obj *data_h = nullptr;

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
@@ -425,7 +425,7 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c
 // -----------------------------------------------------------------------------
 void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
   // Allocate GPU resources here
-  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>();
+  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>(cudaStream);
   gpuModulesToUnpack_ = std::make_unique<SiPixelFedCablingMapGPUWrapper::ModulesToUnpack>();
 }
 
@@ -523,6 +523,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv
 
 void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) {
   auto output = std::make_unique<GPUProduct>(gpuAlgo_->getProduct());
+  assert(output->me_d);
   ev.put<Output>(std::move(output), [this](const GPUProduct& gpu, CPUProduct& cpu) {
       this->convertGPUtoCPU(gpu, cpu);
     });

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
@@ -36,6 +36,7 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     { }
   };
 
+  // FIXME split in two 
   struct GPUProduct {
     // Needed for digi and cluster CPU output
     uint32_t const * pdigi_h = nullptr;
@@ -44,9 +45,9 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     uint16_t const * adc_h = nullptr;
     GPU::SimpleVector<error_obj> const * error_h = nullptr;
 
+    GPUProduct const * me_d = nullptr;
+
     // Needed for GPU rechits
-    uint32_t nDigis;
-    uint32_t nModules;
     uint16_t const * xx_d;
     uint16_t const * yy_d;
     uint16_t const * adc_d;
@@ -55,6 +56,10 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     int32_t const *  clus_d;
     uint32_t const * clusInModule_d;
     uint32_t const * moduleId_d;
+
+    uint32_t nDigis;
+    uint32_t nModules;
+
   };
 
   using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,9 +36,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
-//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
+    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
 
     cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
+    gpu_.me_d = gpu_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
 
   }
@@ -59,7 +60,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
     cudaCheck(cudaFree(gpu_.mc_d));
-    // cudaCheck(cudaFree(gpu_.hist_d));
+    cudaCheck(cudaFree(gpu_.hist_d));
 
     cudaCheck(cudaFree(gpu_d));
   }
@@ -119,7 +120,7 @@ namespace pixelgpudetails {
     // for timing test
     // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
 
-    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
+    cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
 
 
   }

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <vector>
 
-// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
 
 namespace siPixelRecHitsHeterogeneousProduct {
@@ -31,22 +31,26 @@ namespace siPixelRecHitsHeterogeneousProduct {
      uint16_t * mr_d;
      uint16_t * mc_d;
 
-     // using Hist = HistoContainer<int16_t,7,8>;
-     // Hist * hist_d;
+     using Hist = HistoContainer<int16_t,7,8>;
+     Hist * hist_d;
+
+     HitsOnGPU const * me_d=nullptr;
   };
 
 
   struct HitsOnCPU {
     HitsOnCPU() = default;
     explicit HitsOnCPU(uint32_t nhits) :
-      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits){}
+      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits),
+      nHits(nhits){}
     uint32_t hitsModuleStart[2001];
     std::vector<int32_t> charge;
     std::vector<float> xl, yl;
     std::vector<float> xe, ye;
     std::vector<uint16_t> mr;
     std::vector<uint16_t> mc;
-
+
+    uint32_t nHits;
     HitsOnGPU const * gpu_d=nullptr;
   };
 

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
@@ -18,6 +18,11 @@
 
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+// gpu
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
+
 namespace {
 void fillNtuplets(RegionsSeedingHitSets::RegionFiller &seedingHitSetsFiller,
                   const OrderedHitSeeds &quadruplets) {
@@ -31,6 +36,10 @@ class CAHitNtupletHeterogeneousEDProducer
     : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
           heterogeneous::GPUCuda, heterogeneous::CPU>> {
 public:
+
+  using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
+
+
   CAHitNtupletHeterogeneousEDProducer(const edm::ParameterSet &iConfig);
   ~CAHitNtupletHeterogeneousEDProducer() = default;
 
@@ -50,6 +59,9 @@ class CAHitNtupletHeterogeneousEDProducer
 private:
   edm::EDGetTokenT<IntermediateHitDoublets> doubletToken_;
 
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
+
+
   edm::RunningAverage localRA_;
   CAHitQuadrupletGeneratorGPU GPUGenerator_;
   CAHitQuadrupletGenerator CPUGenerator_;
@@ -63,6 +75,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer(
     : HeterogeneousEDProducer(iConfig),
       doubletToken_(consumes<IntermediateHitDoublets>(
           iConfig.getParameter<edm::InputTag>("doublets"))),
+      tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
       GPUGenerator_(iConfig, consumesCollector()),
       CPUGenerator_(iConfig, consumesCollector()) {
   produces<RegionsSeedingHitSets>();
@@ -73,6 +86,9 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions(
   edm::ParameterSetDescription desc;
 
   desc.add<edm::InputTag>("doublets", edm::InputTag("hitPairEDProducer"));
+
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
+
   CAHitQuadrupletGeneratorGPU::fillDescriptions(desc);
   HeterogeneousEDProducer::fillPSetDescription(desc);
   auto label = "caHitQuadrupletHeterogeneousEDProducer";
@@ -106,6 +122,13 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
 
   seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();
 
+  edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
+  iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
+  auto const & gHits = *gh;
+//  auto nhits = gHits.nHits;
+
+  GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id());
+
   if (regionDoublets.empty()) {
     emptyRegionDoublets = true;
   } else {

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -6,6 +6,8 @@
 #include "GPUCACell.h"
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+#include "gpuPixelDoublets.h"
+
 __global__ void
 kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_,
              const GPULayerDoublets *gpuDoublets,
@@ -490,3 +492,18 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cud
   }
   return quadsInterface;
 }
+
+
+
+
+void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) {
+   auto nhits = hh.nHits;
+
+  float phiCut=0.06;
+  int threadsPerBlock = 256;
+  int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+
+  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(hh.gpu_d,phiCut);
+
+
+}