Implement GPU vertex finder with a single kernel (#413)

cms-patatrack · Dec 3, 2019 · 459524a · 459524a
1 parent 77ceafd
commit 459524a
Show file tree

Hide file tree

Showing 9 changed files with 174 additions and 57 deletions.
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc
@@ -44,7 +44,8 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 
 PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
     : m_OnGPU(conf.getParameter<bool>("onGPU")),
-      m_gpuAlgo(conf.getParameter<bool>("useDensity"),
+      m_gpuAlgo(conf.getParameter<bool>("oneKernel"),
+                conf.getParameter<bool>("useDensity"),
                 conf.getParameter<bool>("useDBSCAN"),
                 conf.getParameter<bool>("useIterative"),
                 conf.getParameter<int>("minT"),
@@ -68,6 +69,7 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
   // Only one of these three algos can be used at once.
   // Maybe this should become a Plugin Factory
   desc.add<bool>("onGPU", true);
+  desc.add<bool>("oneKernel", true);
   desc.add<bool>("useDensity", true);
   desc.add<bool>("useDBSCAN", false);
   desc.add<bool>("useIterative", false);

diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h
@@ -17,7 +17,7 @@ namespace gpuVertexFinder {
   //
   // based on Rodrighez&Laio algo
   //
-  __global__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
                                          gpuVertexFinder::WorkSpace* pws,
                                          int minT,      // min number of neighbours to be "seed"
                                          float eps,     // max absolute distance to cluster
@@ -219,6 +219,16 @@ namespace gpuVertexFinder {
       printf("found %d proto vertices\n", foundClusters);
   }
 
+  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+                                         gpuVertexFinder::WorkSpace* pws,
+                                         int minT,    // min number of neighbours to be "seed"
+                                         float eps,     // max absolute distance to cluster
+                                         float errmax,  // max error to be "seed"
+                                         float chi2max  // max normalized distance to cluster
+  ) {
+   clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
+ }
+
 }  // namespace gpuVertexFinder
 
 #endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h
@@ -12,7 +12,7 @@
 
 namespace gpuVertexFinder {
 
-  __global__ void fitVertices(ZVertices* pdata,
+  __device__ __forceinline__ void fitVertices(ZVertices* pdata,
                               WorkSpace* pws,
                               float chi2Max  // for outlier rejection
   ) {
@@ -101,6 +101,14 @@ namespace gpuVertexFinder {
       printf("and %d noise\n", noise);
   }
 
+  __global__ void fitVerticesKernel(ZVertices* pdata,
+                              WorkSpace* pws,
+                              float chi2Max  // for outlier rejection
+  ) {
+
+   fitVertices(pdata,pws,chi2Max);
+ }
+
 }  // namespace gpuVertexFinder
 
 #endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h
@@ -15,7 +15,8 @@
 
 namespace gpuVertexFinder {
 
-  __global__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
+  __device__ __forceinline__ 
+  void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
     auto& __restrict__ data = *pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
@@ -66,6 +67,11 @@ namespace gpuVertexFinder {
 #endif
   }
 
+
+  __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) {
+     sortByPt2(pdata,pws);
+  }
+
 }  // namespace gpuVertexFinder
 
 #endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h
@@ -12,7 +12,7 @@
 
 namespace gpuVertexFinder {
 
-  __global__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+  __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
 
     auto& __restrict__ data = *pdata;
@@ -32,20 +32,20 @@ namespace gpuVertexFinder {
     assert(zt);
 
     // one vertex per block
-    auto kv = blockIdx.x;
+    for ( auto kv = blockIdx.x; kv<nvFinal; kv += gridDim.x) {
 
-    if (kv >= nvFinal)
-      return;
     if (nn[kv] < 4)
-      return;
+      continue;
     if (chi2[kv] < maxChi2 * float(nn[kv]))
-      return;
+      continue;
 
-    assert(nn[kv] < 1023);
-    __shared__ uint32_t it[1024];   // track index
-    __shared__ float zz[1024];      // z pos
-    __shared__ uint8_t newV[1024];  // 0 or 1
-    __shared__ float ww[1024];      // z weight
+    constexpr int MAXTK = 512;
+    assert(nn[kv] < MAXTK);
+    if (nn[kv] >= MAXTK) continue; // too bad FIXME
+    __shared__ uint32_t it[MAXTK];   // track index
+    __shared__ float zz[MAXTK];      // z pos
+    __shared__ uint8_t newV[MAXTK];  // 0 or 1
+    __shared__ float ww[MAXTK];      // z weight
 
     __shared__ uint32_t nq;  // number of track for this vertex
     nq = 0;
@@ -54,7 +54,7 @@ namespace gpuVertexFinder {
     // copy to local
     for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
       if (iv[k] == int(kv)) {
-        auto old = atomicInc(&nq, 1024);
+        auto old = atomicInc(&nq, MAXTK);
         zz[old] = zt[k] - zv[kv];
         newV[old] = zz[old] < 0 ? 0 : 1;
         ww[old] = 1.f / ezt2[k];
@@ -104,7 +104,7 @@ namespace gpuVertexFinder {
 
     // avoid empty vertices
     if (0 == wnew[0] || 0 == wnew[1])
-      return;
+      continue;
 
     // quality cut
     auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
@@ -115,7 +115,7 @@ namespace gpuVertexFinder {
       printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
 
     if (chi2Dist < 4)
-      return;
+      continue;
 
     // get a new global vertex
     __shared__ uint32_t igv;
@@ -126,6 +126,12 @@ namespace gpuVertexFinder {
       if (1 == newV[k])
         iv[it[k]] = igv;
     }
+
+  } // loop on vertices
+  }
+
+  __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+     splitVertices(pdata, pws, maxChi2);
   }
 
 }  // namespace gpuVertexFinder

diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h
@@ -43,28 +43,32 @@ namespace gpuVertexFinder {
     using WorkSpace = gpuVertexFinder::WorkSpace;
     using TkSoA = pixelTrack::TrackSoA;
 
-    Producer(bool useDensity,
+    Producer(bool oneKernel,
+             bool useDensity,
              bool useDBSCAN,
              bool useIterative,
              int iminT,      // min number of neighbours to be "core"
              float ieps,     // max absolute distance to cluster
              float ierrmax,  // max error to be "seed"
              float ichi2max  // max normalized distance to cluster
              )
-        : useDensity_(useDensity),
+        : oneKernel_(oneKernel && !(useDBSCAN||useIterative)),
+          useDensity_(useDensity),
           useDBSCAN_(useDBSCAN),
           useIterative_(useIterative),
           minT(iminT),
           eps(ieps),
           errmax(ierrmax),
-          chi2max(ichi2max) {}
+          chi2max(ichi2max) {
+          }
 
     ~Producer() = default;
 
     ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
     ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;
 
   private:
+    const bool oneKernel_;
     const bool useDensity_;
     const bool useDBSCAN_;
     const bool useIterative_;

diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h
@@ -45,6 +45,48 @@ namespace gpuVertexFinder {
     }
   }
 
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+                                        gpuVertexFinder::WorkSpace* pws,
+                                        int minT,    // min number of neighbours to be "seed"
+                                        float eps,     // max absolute distance to cluster
+                                        float errmax,  // max error to be "seed"
+                                        float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
+    __syncthreads();
+    fitVertices(pdata,pws, 50.);
+    __syncthreads();
+    splitVertices(pdata,pws, 9.f);
+    __syncthreads();
+    fitVertices(pdata,pws, 5000.);
+    __syncthreads();
+    sortByPt2(pdata,pws);
+  }
+#else
+  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
+                                           gpuVertexFinder::WorkSpace* pws,
+                                        int minT,    // min number of neighbours to be "seed"
+                                           float eps,     // max absolute distance to cluster
+                                        float errmax,  // max error to be "seed"
+                                           float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
+    __syncthreads();
+    fitVertices(pdata,pws, 50.);
+  }
+
+  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata,
+                                    gpuVertexFinder::WorkSpace* pws)
+  {
+    fitVertices(pdata,pws, 5000.);
+    __syncthreads();
+    sortByPt2(pdata,pws);
+  }
+#endif
+
+
 #ifdef __CUDACC__
   ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
     // std::cout << "producing Vertices on GPU" << std::endl;
@@ -77,17 +119,38 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-    if (useDensity_) {
-      clusterTracksByDensity<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
-    } else if (useDBSCAN_) {
-      clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
-    } else if (useIterative_) {
-      clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    if (oneKernel_) {
+      // implemented only for density clustesrs   
+#ifndef THREE_KERNELS
+      vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else 
+      vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+#endif
+    } else {   // five kernels
+      if (useDensity_) {
+        clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useDBSCAN_) {
+        clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useIterative_) {
+        clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      }
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
+      cudaCheck(cudaGetLastError());
+      sortByPt2Kernel<<<1, 1024-256, 0, stream>>>(soa, ws_d.get());
     }
     cudaCheck(cudaGetLastError());
-    fitVertices<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
-    cudaCheck(cudaGetLastError());
-#else
+#else  // __CUDACC__
     if (useDensity_) {
       clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
     } else if (useDBSCAN_) {
@@ -97,24 +160,11 @@ namespace gpuVertexFinder {
     }
     // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
     fitVertices(soa, ws_d.get(), 50.);
-#endif
-
-#ifdef __CUDACC__
-    // one block per vertex...
-    splitVertices<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
-    cudaCheck(cudaGetLastError());
-    fitVertices<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
-    cudaCheck(cudaGetLastError());
-
-    sortByPt2<<<1, 256, 0, stream>>>(soa, ws_d.get());
-    cudaCheck(cudaGetLastError());
-#else
-    for (blockIdx.x = 0; blockIdx.x < 1024; ++blockIdx.x) {
-      splitVertices(soa, ws_d.get(), 9.f);
-    }
-    blockIdx.x = 0;
+    // one block per vertex!
+    blockIdx.x = 0; gridDim.x=1;
+    splitVertices(soa, ws_d.get(), 9.f);
+    resetGrid();
     fitVertices(soa, ws_d.get(), 5000.);
-
     sortByPt2(soa, ws_d.get());
 #endif
 

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -19,6 +19,12 @@
 <use name="SimDataFormats/Track"/>
 <use name="TrackingTools/TransientTrack"/>
 
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
   <use name="cuda"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
@@ -44,4 +50,3 @@
 <bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
   <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
 </bin>
-