diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 29e5e82049b5c..7eb90cffa2d77 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -38,7 +38,7 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
+  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
     int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
@@ -90,6 +90,12 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
+    gpuProduct = getProduct();
+    assert(xx_d==gpuProduct.xx_d);
+
+    cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -111,6 +117,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
+    cudaCheck(cudaFree(gpuProduct_d));
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
@@ -478,7 +485,7 @@ namespace pixelgpudetails {
           XX[gIndex]    = 0;  // 0 is an indicator of a noise/dead channel
           YY[gIndex]    = 0; // skip these pixels during clusterization
           ADC[gIndex]   = 0;
-          continue ; // 0: bad word
+          continue; // 0: bad word
         }
 
         uint32_t link  = getLink(ww);            // Extract link
@@ -521,9 +528,9 @@ namespace pixelgpudetails {
           // endcap ids
           layer = 0;
           panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
-          //disk  = (rawId >> diskStartBit_)  & diskMask_ ;
+          //disk  = (rawId >> diskStartBit_) & diskMask_;
           side  = (panel == 1)? -1 : 1;
-          //blade = (rawId>>bladeStartBit_) & bladeMask_;
+          //blade = (rawId >> bladeStartBit_) & bladeMask_;
         }
 
         // ***special case of layer to 1 be handled here
@@ -558,8 +565,8 @@ namespace pixelgpudetails {
         }
 
         pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
-        XX[gIndex]    = globalPix.row  ; // origin shifting by 1 0-159
-        YY[gIndex]    = globalPix.col ; // origin shifting by 1 0-415
+        XX[gIndex]    = globalPix.row;  // origin shifting by 1 0-159
+        YY[gIndex]    = globalPix.col;  // origin shifting by 1 0-415
         ADC[gIndex]   = getADC(ww);
         pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]);
         moduleId[gIndex] = detId.moduleId;
@@ -583,7 +590,6 @@ namespace pixelgpudetails {
     const int threadsPerBlock = 512;
     const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
 
-
     assert(0 == wordCounter%2);
     // wordCounter is the total no of words in each event to be trasfered on device
     cudaCheck(cudaMemcpyAsync(&word_d[0],     &word[0],     wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
@@ -630,7 +636,6 @@ namespace pixelgpudetails {
       int threadsPerBlock = 256;
       int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
 
-
       gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream.id()>>>(
           moduleInd_d,
           xx_d, yy_d, adc_d,
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 2b0b205c9f536..2f7436052902b 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -83,7 +83,7 @@ namespace pixelgpudetails {
   class Packing {
   public:
     using PackedDigiType = uint32_t;
-      
+
     // Constructor: pre-computes masks and shifts from field widths
     __host__ __device__
     inline
@@ -144,22 +144,32 @@ namespace pixelgpudetails {
            (adc << thePacking.adc_shift);
   }
 
+  constexpr
+  uint32_t pixelToChannel( int row, int col) {
+    constexpr Packing thePacking = packing();
+    return (row << thePacking.column_width) | col;
+  }
+
+
   using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;
 
 
   class SiPixelRawToClusterGPUKernel {
   public:
-    SiPixelRawToClusterGPUKernel();
+
+    using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+
+    SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
     ~SiPixelRawToClusterGPUKernel();
 
-    
+
     SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete;
     SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete;
     SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete;
     SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete;
 
     void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
-    
+
     // Not really very async yet...
     void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
                            const SiPixelGainForHLTonGPU *gains,
@@ -170,8 +180,9 @@ namespace pixelgpudetails {
     auto getProduct() const {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
-        nDigis, nModulesActive,
-        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
+        gpuProduct_d,
+        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
+        nDigis, nModulesActive
       };
     }
 
@@ -181,6 +192,11 @@ namespace pixelgpudetails {
     unsigned char *fedId_h = nullptr;    // to hold fed index for each word
 
     // output
+    GPUProduct gpuProduct;
+    GPUProduct * gpuProduct_d;
+
+    // FIXME cleanup all these are in the gpuProduct above...
+
     uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr;                   // host copy of output
     uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
     pixelgpudetails::error_obj *data_h = nullptr;
@@ -209,7 +225,7 @@ namespace pixelgpudetails {
     uint32_t * clusInModule_d;
     uint32_t * moduleId_d;
   };
-  
+
   // configuration and memory buffers alocated on the GPU
   struct context {
     uint32_t * word_d;
@@ -223,7 +239,7 @@ namespace pixelgpudetails {
 
     GPU::SimpleVector<error_obj> * error_d;
     error_obj * data_d;
-      
+
     // these are for the clusterizer (to be moved)
     uint32_t * moduleStart_d;
     int32_t *  clus_d;
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 999fdcd6eff19..62253ca9d7e1b 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,11 +36,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
-//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
-
+    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
     cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
+    gpu_.me_d = gpu_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
-
   }
 
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
@@ -59,8 +58,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
     cudaCheck(cudaFree(gpu_.mc_d));
-    // cudaCheck(cudaFree(gpu_.hist_d));
-
+    cudaCheck(cudaFree(gpu_.hist_d));
     cudaCheck(cudaFree(gpu_d));
   }
 
@@ -78,7 +76,7 @@ namespace pixelgpudetails {
                            input.clusInModule_d,
                            input.clusInModule_d + gpuClustering::MaxNumModules,
                            &gpu_.hitsModuleStart_d[1]);
-  
+
     int threadsPerBlock = 256;
     int blocks = input.nModules; // active modules (with digis)
     gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
@@ -96,20 +94,20 @@ namespace pixelgpudetails {
       gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
       gpu_.iphi_d,
       gpu_.xl_d, gpu_.yl_d,
-      gpu_.xerr_d, gpu_.yerr_d, 
+      gpu_.xerr_d, gpu_.yerr_d,
       gpu_.mr_d, gpu_.mc_d
     );
 
     // needed only if hits on CPU are required...
     cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-   
+
     // to be moved to gpu?
     auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
     for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
     hitsLayerStart_[10]=nhits;
 
 #ifdef GPU_DEBUG
-    std::cout << "hit layerStart "; 
+    std::cout << "hit layerStart ";
     for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
     std::cout << "end:" << hitsLayerStart_[10] << std::endl;
 #endif
@@ -119,9 +117,7 @@ namespace pixelgpudetails {
     // for timing test
     // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
 
-    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
-
-
+    cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
   }
 
   HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {