Skip to content

Commit

Permalink
Migrate cluster track associator (#409)
Browse files Browse the repository at this point in the history
Migrate ClusterTPAssociationHeterogeneous using the depreacted HeterogeneousEDProducer to
ClusterTPAssociationProducerCUDA, and implement a simple analyzer to consume its procuct.

To test it, add a dummy analyzer to an MC workflow:

    process.load("SimTracker.TrackerHitAssociation.clusterTPCUDAdump_cfi")
    process.validation_step = cms.EndPath(process.globalValidationPixelTrackingOnly + process.clusterTPCUDAdump)
    process.tpClusterProducerCUDAPreSplitting.dumpCSV = True
  • Loading branch information
VinInn authored and fwyzard committed Dec 29, 2020
1 parent 6b0c079 commit c6abb73
Show file tree
Hide file tree
Showing 11 changed files with 457 additions and 115 deletions.
2 changes: 2 additions & 0 deletions SimTracker/TrackerHitAssociation/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
<use name="DataFormats/TrackerRecHit2D"/>
<use name="TrackingTools/TransientTrackingRecHit"/>
<use name="DataFormats/SiPixelDetId"/>
<use name="CUDADataFormats/Common"/>
<use name="clhep"/>
<use name="boost"/>
<use name="root"/>
<use name="cuda"/>
<export>
<lib name="1"/>
</export>
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#ifndef SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h
#define SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h

#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

namespace trackerHitAssociationHeterogeneous {

struct ClusterSLView {
using Clus2TP = std::array<uint32_t, 7>;

Clus2TP* links_d;
uint32_t* tkId_d;
uint32_t* tkId2_d;
uint32_t* n1_d;
uint32_t* n2_d;
};

template <typename Traits>
class Product {
public:
template <typename T>
using unique_ptr = typename Traits::template unique_ptr<T>;

Product() = default;
~Product() = default;
Product(Product const&) = delete;
Product(Product&&) = default;

Product(int nlinks, int nhits, cudaStream_t stream);

ClusterSLView& view() { return m_view; }
ClusterSLView const& view() const { return m_view; }

int nLinks() const { return m_nLinks; }
int nHits() const { return m_nHits; }

private:
static constexpr uint32_t n32 = 4;

unique_ptr<uint32_t[]> m_storeTP; //!
unique_ptr<uint32_t[]> m_store32; //!

ClusterSLView m_view; //!

int m_nLinks;
int m_nHits;
};

template <typename Traits>
Product<Traits>::Product(int nlinks, int nhits, cudaStream_t stream) : m_nLinks(nlinks), m_nHits(nhits) {
m_storeTP = Traits::template make_device_unique<uint32_t[]>(m_nLinks * 7, stream);
m_store32 = Traits::template make_device_unique<uint32_t[]>(m_nHits * n32, stream);

auto get32 = [&](int i) { return m_store32.get() + i * m_nHits; };

m_view.links_d = (ClusterSLView::Clus2TP*)(m_storeTP.get());
m_view.tkId_d = get32(0);
m_view.tkId2_d = get32(1);
m_view.n1_d = get32(2);
m_view.n2_d = get32(3);
}

using ProductCUDA = Product<cudaCompat::GPUTraits>;

} // namespace trackerHitAssociationHeterogeneous

#endif // SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h
107 changes: 38 additions & 69 deletions SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,20 @@

#include "ClusterSLOnGPU.h"

using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
using Clus2TP = ClusterSLGPU::Clus2TP;
using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView;
using Clus2TP = ClusterSLView::Clus2TP;

// #define DUMP_TK2

__global__ void simLink(const SiPixelDigisCUDA::DeviceConstView* dd,
uint32_t ndigis,
clusterSLOnGPU::HitsOnGPU const* hhp,
ClusterSLGPU const* slp,
TrackingRecHit2DSOAView const* hhp,
ClusterSLView sl,
uint32_t n) {
assert(slp == slp->me_d);

constexpr int32_t invTK = 0; // std::numeric_limits<int32_t>::max();
constexpr uint32_t invTK = 0; // std::numeric_limits<int32_t>::max();
constexpr uint16_t InvId = 9999; // must be > MaxNumModules

auto const& hh = *hhp;
auto const& sl = *slp;
auto i = blockIdx.x * blockDim.x + threadIdx.x;

if (i >= ndigis)
Expand All @@ -41,7 +38,7 @@ __global__ void simLink(const SiPixelDigisCUDA::DeviceConstView* dd,
auto cl = first + dd->clus(i);
assert(cl < 2000 * blockDim.x);

const Clus2TP me{{id, ch, 0, 0, 0, 0}};
const Clus2TP me{{id, ch, 0, 0, 0, 0, 0}};

auto less = [] __host__ __device__(Clus2TP const& a, Clus2TP const& b) -> bool {
// in this context we do not care of [2]
Expand Down Expand Up @@ -80,38 +77,32 @@ __global__ void simLink(const SiPixelDigisCUDA::DeviceConstView* dd,
}
}

__global__ void verifyZero(uint32_t nhits, ClusterSLGPU const* slp) {
__global__ void doZero(uint32_t nhits, ClusterSLView sl) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if (i > nhits)
return;

auto const& sl = *slp;

assert(sl.tkId_d[i] == 0);
auto const& tk = sl.links_d[0];
assert(tk[0] == 0);
assert(tk[1] == 0);
assert(tk[2] == 0);
assert(tk[3] == 0);
sl.tkId_d[i] = 0;
sl.n1_d[i] = 0;
sl.tkId2_d[i] = 0;
sl.n2_d[i] = 0;
}

__global__ void dumpLink(
int first, int ev, clusterSLOnGPU::HitsOnGPU const* hhp, uint32_t nhits, ClusterSLGPU const* slp) {
__global__ void dumpLink(int first, int ev, TrackingRecHit2DSOAView const* hhp, uint32_t nhits, ClusterSLView sl) {
auto i = first + blockIdx.x * blockDim.x + threadIdx.x;
if (i > nhits)
return;

auto const& hh = *hhp;
auto const& sl = *slp;

auto const& tk1 = sl.links_d[sl.tkId_d[i]];

#ifdef DUMP_TK2
auto const& tk2 = sl.links_d[sl.tkId2_d[i]];

printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
#else
printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d\n",
printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d\n",
#endif
ev,
i,
Expand All @@ -128,27 +119,27 @@ __global__ void dumpLink(
tk1[3],
tk1[4],
tk1[5],
tk1[6],
sl.n1_d[i]
#ifdef DUMP_TK2
,
tk2[2],
tk2[3],
tk2[4],
tk2[5],
tk2[6],
sl.n2_d[i]
#endif
);
}

namespace clusterSLOnGPU {

constexpr uint32_t invTK = 0; // std::numeric_limits<int32_t>::max();

void printCSVHeader() {
#ifdef DUMP_TK2
printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
#else
printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
#endif
"ev",
"ind",
Expand All @@ -163,13 +154,15 @@ namespace clusterSLOnGPU {
"ysize",
"tkId",
"pt",
"eta",
"z0",
"r0",
"n1"
#ifdef DUMP_TK2
,
"tkId2",
"pt2",
"eta",
"z02",
"r02",
"n2"
Expand All @@ -180,74 +173,50 @@ namespace clusterSLOnGPU {
std::atomic<int> evId(0);
std::once_flag doneCSVHeader;

Kernel::Kernel(cudaStream_t stream, bool dump) : doDump(dump) {
Kernel::Kernel(bool dump) : doDump(dump) {
if (doDump)
std::call_once(doneCSVHeader, printCSVHeader);
alloc(stream);
}

void Kernel::alloc(cudaStream_t stream) {
cudaCheck(cudaMalloc((void**)&slgpu.links_d, (ClusterSLGPU::MAX_DIGIS) * sizeof(Clus2TP)));
cudaCheck(cudaMalloc((void**)&slgpu.tkId_d, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**)&slgpu.tkId2_d, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**)&slgpu.n1_d, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**)&slgpu.n2_d, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**)&slgpu.me_d, sizeof(ClusterSLGPU)));
cudaCheck(cudaMemcpyAsync(slgpu.me_d, &slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream));
}

void Kernel::deAlloc() {
cudaCheck(cudaFree(slgpu.links_d));
cudaCheck(cudaFree(slgpu.tkId_d));
cudaCheck(cudaFree(slgpu.tkId2_d));
cudaCheck(cudaFree(slgpu.n1_d));
cudaCheck(cudaFree(slgpu.n2_d));
cudaCheck(cudaFree(slgpu.me_d));
}

void Kernel::zero(cudaStream_t stream) {
cudaCheck(cudaMemsetAsync(slgpu.tkId_d, invTK, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t), stream));
cudaCheck(cudaMemsetAsync(slgpu.tkId2_d, invTK, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t), stream));
cudaCheck(cudaMemsetAsync(slgpu.n1_d, 0, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t), stream));
cudaCheck(cudaMemsetAsync(slgpu.n2_d, 0, (ClusterSLGPU::MaxNumModules * 256) * sizeof(uint32_t), stream));
}
trackerHitAssociationHeterogeneous::ProductCUDA Kernel::makeAsync(SiPixelDigisCUDA const& dd,
uint32_t ndigis,
HitsOnCPU const& hh,
Clus2TP const* digi2tp,
uint32_t nhits,
uint32_t nlinks,
cudaStream_t stream) const {
trackerHitAssociationHeterogeneous::ProductCUDA product(nlinks, nhits, stream);
auto& csl = product.view();

void Kernel::algo(SiPixelDigisCUDA const& dd,
uint32_t ndigis,
HitsOnCPU const& hh,
uint32_t nhits,
uint32_t n,
cudaStream_t stream) {
zero(stream);
cudaCheck(cudaMemcpyAsync(csl.links_d, digi2tp, sizeof(Clus2TP) * nlinks, cudaMemcpyDefault, stream));

if (0 == nhits)
return;
ClusterSLGPU const& sl = slgpu;
return product;

int ev = ++evId;
int threadsPerBlock = 256;

int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
verifyZero<<<blocks, threadsPerBlock, 0, stream>>>(nhits, sl.me_d);
doZero<<<blocks, threadsPerBlock, 0, stream>>>(nhits, csl);
cudaCheck(cudaGetLastError());

blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;

assert(sl.me_d);
simLink<<<blocks, threadsPerBlock, 0, stream>>>(dd.view(), ndigis, hh.view(), sl.me_d, n);
simLink<<<blocks, threadsPerBlock, 0, stream>>>(dd.view(), ndigis, hh.view(), csl, nlinks);
cudaCheck(cudaGetLastError());

if (doDump) {
cudaStreamSynchronize(stream); // flush previous printf
// one line == 200B so each kernel can print only 5K lines....
blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
blocks = 16;
for (int first = 0; first < int(nhits); first += blocks * threadsPerBlock) {
dumpLink<<<blocks, threadsPerBlock, 0, stream>>>(first, ev, hh.view(), nhits, sl.me_d);
dumpLink<<<blocks, threadsPerBlock, 0, stream>>>(first, ev, hh.view(), nhits, csl);
cudaCheck(cudaGetLastError());
cudaStreamSynchronize(stream);
}
}
cudaCheck(cudaGetLastError());

return product;
}

} // namespace clusterSLOnGPU
35 changes: 13 additions & 22 deletions SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,31 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"

#include "trackerHitAssociationHeterogeneousProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h"

namespace clusterSLOnGPU {

using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct;

using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView;
using Clus2TP = ClusterSLView::Clus2TP;
using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DCUDA;

using Clus2TP = ClusterSLGPU::Clus2TP;

class Kernel {
public:
Kernel(cudaStream_t stream, bool dump);
~Kernel() { deAlloc(); }
void algo(SiPixelDigisCUDA const& dd,
uint32_t ndigis,
HitsOnCPU const& hh,
uint32_t nhits,
uint32_t n,
cudaStream_t stream);
GPUProduct getProduct() { return GPUProduct{slgpu.me_d}; }
explicit Kernel(bool dump);
~Kernel() {}
trackerHitAssociationHeterogeneous::ProductCUDA makeAsync(SiPixelDigisCUDA const& dd,
uint32_t ndigis,
HitsOnCPU const& hh,
Clus2TP const* digi2tp,
uint32_t nhits,
uint32_t nlinks,
cudaStream_t stream) const;

private:
void alloc(cudaStream_t stream);
void deAlloc();
void zero(cudaStream_t stream);

public:
ClusterSLGPU slgpu;
bool doDump;
};
} // namespace clusterSLOnGPU
Expand Down
Loading

0 comments on commit c6abb73

Please sign in to comment.