Full workflow from raw data to pixel tracks and vertices on GPUs (#216)

Port and optimise the full workflow from pixel raw data to pixel tracks and vertices to GPUs. Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs). Other changes: - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x; - speed up clustering and track fitting; - minor bug fix to avoid trivial regression with the optimized fit.
cms-patatrack · Oct 23, 2020 · 25e3e41 · 25e3e41
1 parent 4e9319a
commit 25e3e41
Show file tree

Hide file tree

Showing 10 changed files with 265 additions and 51 deletions.
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,6 +2,7 @@
 #define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
 
 #include <cstdint>
+#include <array>
 
 namespace phase1PixelTopology {
 
@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
                                           };
 
 
+  template<class Function, std::size_t... Indices>
+  constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
+  -> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
+  {
+    return {{ f(Indices)... }};
+  }
+
+  template<int N, class Function>
+  constexpr auto map_to_array(Function f)
+  -> std::array<typename std::result_of<Function(std::size_t)>::type, N>
+  {
+    return map_to_array_helper(f, std::make_index_sequence<N>{});
+  }
+
+
+  constexpr uint32_t findMaxModuleStride() {
+    bool go = true;
+    int n=2;
+    while (go) {
+      for  (uint8_t i=1; i<11; ++i) {
+        if (layerStart[i]%n !=0) {go=false; break;}
+      }
+      if(!go) break;
+      n*=2;
+    }
+    return n/2;
+  }
+
+  constexpr uint32_t maxModuleStride = findMaxModuleStride();
+
+
+  constexpr uint8_t findLayer(uint32_t detId) {
+    for  (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
+    return 11;
+  }
+
+  constexpr uint8_t findLayerFromCompact(uint32_t detId) {
+    detId*=maxModuleStride;
+    for  (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
+    return 11;
+  }
+
+
+  constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
+  constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);
+
+  constexpr bool validateLayerIndex() {
+    bool res=true;
+    for (auto i=0U; i<numberOfModules; ++i)  {
+      auto j = i/maxModuleStride;
+      res &=(layer[j]<10);
+      res &=(i>=layerStart[layer[j]]);
+      res &=(i<layerStart[layer[j]+1]);
+    }
+    return res;
+  }
+
+  static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");
+
+
   // this is for the ROC n<512 (upgrade 1024)
   constexpr inline
   uint16_t  divu52(uint16_t n) {

diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp
@@ -141,5 +141,13 @@ int main() {
     assert(std::get<1>(ori)==bp);
   }
 
+  using namespace phase1PixelTopology;
+  for (auto i=0U; i<numberOfModules; ++i)  {
+    assert(layer[i]<10);
+    assert(i>=layerStart[layer[i]]);
+    assert(i<layerStart[layer[i]+1]);
+  }
+
+
   return 0;
 }
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -81,10 +81,9 @@ namespace gpuClustering {
    constexpr uint32_t maxPixInModule = 4000;
    constexpr auto  nbins = phase1PixelTopology::numColsInModule + 2;   //2+2;
    using Hist = HistoContainer<uint16_t,nbins,maxPixInModule,9,uint16_t>;
-   constexpr auto wss = Hist::totbins();
     __shared__ Hist hist;
-    __shared__ typename Hist::Counter ws[wss];
-    for (auto j=threadIdx.x; j<Hist::totbins(); j+=blockDim.x) { hist.off[j]=0; ws[j]=0;}
+    __shared__ typename Hist::Counter ws[32];
+    for (auto j=threadIdx.x; j<Hist::totbins(); j+=blockDim.x) { hist.off[j]=0;}
     __syncthreads();
 
     assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
@@ -107,10 +106,10 @@ namespace gpuClustering {
 #endif
       }
     __syncthreads();
-    hist.finalize(ws);
-    __syncthreads();
     if (threadIdx.x<32) ws[threadIdx.x]=0;  // used by prefix scan...
     __syncthreads();
+    hist.finalize(ws);
+    __syncthreads();
 #ifdef GPU_DEBUG
     assert(hist.size()==totGood);
     if (thisModuleId % 100 == 1)
@@ -120,7 +119,7 @@ namespace gpuClustering {
       for (int i = first; i < msize; i += blockDim.x) {
         if (id[i] == InvId)                 // skip invalid pixels
           continue;
-        hist.fill(y[i],i-firstPixel,ws);
+        hist.fill(y[i],i-firstPixel);
       }
 
 #ifdef CLUS_LIMIT_LOOP
@@ -134,21 +133,45 @@ namespace gpuClustering {
       jmax[k] = hist.end();
 #endif
 
-#ifdef GPU_DEBUG
     __shared__ int nloops;
     nloops=0;
-#endif
 
 
     __syncthreads();  // for hit filling!
 
+#ifdef GPU_DEBUG
+    // look for anomalous high occupancy
+    __shared__ uint32_t n40,n60;
+    n40=n60=0;
+    __syncthreads();
+    for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) { 
+      if(hist.size(j)>60) atomicAdd(&n60,1);
+      if(hist.size(j)>40) atomicAdd(&n40,1);
+     }
+    __syncthreads();
+    if (0==threadIdx.x) {
+      if (n60>0) printf("columns with more than 60 px %d in %d\n",n60,thisModuleId);
+      else if (n40>0) printf("columns with more than 40 px %d in %d\n",n40,thisModuleId);
+    }
+    __syncthreads();
+#endif
+
     // for each pixel, look at all the pixels until the end of the module;
     // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
     // after the loop, all the pixel in each cluster should have the id equeal to the lowest
     // pixel in the cluster ( clus[i] == i ).
     bool more = true;
     while (__syncthreads_or(more)) {
-      more = false;
+      if (1==nloops%2) {
+        for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
+             auto p = hist.begin()+j;
+             auto i = *p + firstPixel;
+             auto m = clusterId[i];
+             while (m!=clusterId[m]) m=clusterId[m];
+             clusterId[i]=m;
+        }
+      } else {
+        more = false;
         for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
           auto p = hist.begin()+j;
           auto i = *p + firstPixel;
@@ -166,9 +189,7 @@ namespace gpuClustering {
           // loop to columns
           auto loop = [&](uint16_t const * kk) {
             auto m = (*kk)+firstPixel;
-#ifdef GPU_DEBUG
             assert(m!=i);
-#endif
             if (std::abs(int(x[m]) - int(x[i])) > 1) return;
             // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1
             auto old = atomicMin(&clusterId[m], clusterId[i]);
@@ -185,9 +206,8 @@ namespace gpuClustering {
           ++p;
           for (;p<e;++p) loop(p);
         } // pixel loop
-#ifdef GPU_DEBUG
+        }
         if (threadIdx.x==0) ++nloops;
-#endif
     }  // end while
 
 #ifdef GPU_DEBUG

diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h
@@ -80,11 +80,12 @@ class PixelCPEBase : public PixelClusterParameterEstimator
 
    struct ClusterParam
    {
+      ClusterParam(){}
       ClusterParam(const SiPixelCluster & cl) : theCluster(&cl) {}
 
       virtual ~ClusterParam() = default;
 
-      const SiPixelCluster * theCluster;
+      const SiPixelCluster * theCluster = nullptr;;
 
       //--- Cluster-level quantities (filled in computeAnglesFrom....)
       float cotalpha;

diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h
@@ -19,7 +19,9 @@ class PixelCPEFast final : public PixelCPEBase
 public:
    struct ClusterParamGeneric : ClusterParam
    {
+      ClusterParamGeneric() {}
       ClusterParamGeneric(const SiPixelCluster & cl) : ClusterParam(cl){}
+
       // The truncation value pix_maximum is an angle-dependent cutoff on the
       // individual pixel signals. It should be applied to all pixels in the
       // cluster [signal_i = fminf(signal_i, pixmax)] before the column and row
@@ -52,6 +54,8 @@ class PixelCPEFast final : public PixelCPEBase
 
    LocalPoint localPosition (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;
    LocalError localError   (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;
+
+   void errorFromTemplates(DetParam const & theDetParam, ClusterParamGeneric & theClusterParam, float qclus) const;
 
    static void
    collect_edge_charges(ClusterParam & theClusterParam,  //!< input, the cluster

diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -37,6 +37,8 @@ namespace pixelCPEforGPU {
 
     float x0,y0,z0;  // the vertex in the local coord of the detector
 
+    float sx[3], sy[3]; // the errors...
+
     Frame frame;
   };
 
@@ -208,31 +210,37 @@ namespace pixelCPEforGPU {
   }
 
   constexpr inline
-  void error(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
+  void errorFromSize(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
     // Edge cluster errors
     cp.xerr[ic]= 0.0050;
     cp.yerr[ic]= 0.0085;
 
     // FIXME these are errors form Run1
     constexpr float xerr_barrel_l1[] = { 0.00115, 0.00120, 0.00088 };
-    constexpr float xerr_barrel_l1_def = 0.01030;
+    constexpr float xerr_barrel_l1_def = 0.00200;           // 0.01030;
     constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
     constexpr float yerr_barrel_l1_def = 0.00210;
     constexpr float xerr_barrel_ln[] = { 0.00115, 0.00120, 0.00088 };
-    constexpr float xerr_barrel_ln_def = 0.01030;
+    constexpr float xerr_barrel_ln_def = 0.00200; // 0.01030;
     constexpr float yerr_barrel_ln[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
     constexpr float yerr_barrel_ln_def = 0.00210;
     constexpr float xerr_endcap[] = { 0.0020, 0.0020 };
     constexpr float xerr_endcap_def = 0.0020;
     constexpr float yerr_endcap[] = { 0.00210 };
     constexpr float yerr_endcap_def = 0.00210;
 
+    auto sx = cp.maxRow[ic] - cp.minRow[ic];
+    auto sy = cp.maxCol[ic] - cp.minCol[ic];
+
     // is edgy ?
     bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
     bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    // is one and big?
+    bool isBig1X = (0==sx)  && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
+    bool isBig1Y = (0==sy)  && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
 
-    if (not isEdgeX) {
-      auto sx = cp.maxRow[ic] - cp.minRow[ic];
+
+    if (!isEdgeX && !isBig1X ) {
       if (not detParams.isBarrel) {
         cp.xerr[ic] = sx <std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def;
       } else if (detParams.layer == 1) {
@@ -242,8 +250,7 @@ namespace pixelCPEforGPU {
       }
     }
 
-    if (not isEdgeY) {
-      auto sy = cp.maxCol[ic] - cp.minCol[ic];;
+    if (!isEdgeY && !isBig1Y) {
       if (not detParams.isBarrel) {
         cp.yerr[ic] = sy <std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def;
       } else if (detParams.layer == 1) {
@@ -254,6 +261,29 @@ namespace pixelCPEforGPU {
     }
   }
 
+
+  constexpr inline
+  void errorFromDB(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
+    // Edge cluster errors
+    cp.xerr[ic]= 0.0050f;
+    cp.yerr[ic]= 0.0085f;
+
+    auto sx = cp.maxRow[ic] - cp.minRow[ic];
+    auto sy = cp.maxCol[ic] - cp.minCol[ic];
+
+    // is edgy ?
+    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
+    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    // is one and big?
+    uint32_t ix = (0==sx);
+    uint32_t iy = (0==sy);
+    ix+= (0==sx)  && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
+    iy+= (0==sy)  && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
+
+    if (not isEdgeX) cp.xerr[ic] = detParams.sx[ix];
+    if (not isEdgeY) cp.yerr[ic] = detParams.sy[iy];
+  }
+
 }
 
 #endif  // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -71,16 +71,22 @@ namespace pixelgpudetails {
     gpu_.sortIndex_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);
 
     cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist)));
-    cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()));
+    cudaCheck(cudaMalloc((void **) & gpu_.hws_d, HitsOnGPU::Hist::wsSize()));
     cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
-    gpu_.me_d = gpu_d;
-    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));
 
     // Feels a bit dumb but constexpr arrays are not supported for device code
     // TODO: should be moved to EventSetup (or better ideas?)
     // Would it be better to use "constant memory"?
     cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayerStart_, 11 * sizeof(uint32_t)));
     cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));
+    cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayer_, phase1PixelTopology::layer.size() * sizeof(uint8_t)));
+    cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayer_, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size() * sizeof(uint8_t), cudaMemcpyDefault, cudaStream.id()));
+
+    gpu_.phase1TopologyLayerStart_d = d_phase1TopologyLayerStart_;
+    gpu_.phase1TopologyLayer_d = d_phase1TopologyLayer_;
+
+    gpu_.me_d = gpu_d;
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));
 
     cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));
 
@@ -113,6 +119,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.hws_d));
     cudaCheck(cudaFree(gpu_d));
     cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
+    cudaCheck(cudaFree(d_phase1TopologyLayer_));
 
     cudaCheck(cudaFreeHost(h_hitsModuleStart_));
     cudaCheck(cudaFreeHost(h_owner_32bit_));

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -50,6 +50,7 @@ namespace pixelgpudetails {
     HitsOnGPU gpu_;
     uint32_t nhits_ = 0;
     uint32_t *d_phase1TopologyLayerStart_ = nullptr;
+    uint8_t *d_phase1TopologyLayer_ = nullptr;
     uint32_t *h_hitsModuleStart_ = nullptr;
     uint16_t *h_detInd_ = nullptr;
     int32_t *h_charge_ = nullptr;

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -126,7 +126,7 @@ namespace gpuPixelRecHits {
     assert(h < 2000*256);
 
     pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-    pixelCPEforGPU::error(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+    pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
 
     chargeh[h] = clusParams.charge[ic];
 
@@ -135,8 +135,8 @@ namespace gpuPixelRecHits {
     xl[h]= clusParams.xpos[ic];   
     yl[h]= clusParams.ypos[ic]; 
 
-    xe[h]= clusParams.xerr[ic];
-    ye[h]= clusParams.yerr[ic];
+    xe[h]= clusParams.xerr[ic]*clusParams.xerr[ic];
+    ye[h]= clusParams.yerr[ic]*clusParams.yerr[ic];
     mr[h]= clusParams.minRow[ic];
     mc[h]= clusParams.minCol[ic];