Skip to content

Commit

Permalink
Implement GPU vertex finder with a single kernel (#413)
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn authored and fwyzard committed Dec 3, 2019
1 parent 77ceafd commit 459524a
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {

PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
: m_OnGPU(conf.getParameter<bool>("onGPU")),
m_gpuAlgo(conf.getParameter<bool>("useDensity"),
m_gpuAlgo(conf.getParameter<bool>("oneKernel"),
conf.getParameter<bool>("useDensity"),
conf.getParameter<bool>("useDBSCAN"),
conf.getParameter<bool>("useIterative"),
conf.getParameter<int>("minT"),
Expand All @@ -68,6 +69,7 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
// Only one of these three algos can be used at once.
// Maybe this should become a Plugin Factory
desc.add<bool>("onGPU", true);
desc.add<bool>("oneKernel", true);
desc.add<bool>("useDensity", true);
desc.add<bool>("useDBSCAN", false);
desc.add<bool>("useIterative", false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace gpuVertexFinder {
//
// based on Rodrighez&Laio algo
//
__global__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
__device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
Expand Down Expand Up @@ -219,6 +219,16 @@ namespace gpuVertexFinder {
printf("found %d proto vertices\n", foundClusters);
}

__global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster
) {
clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
}

} // namespace gpuVertexFinder

#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
10 changes: 9 additions & 1 deletion RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace gpuVertexFinder {

__global__ void fitVertices(ZVertices* pdata,
__device__ __forceinline__ void fitVertices(ZVertices* pdata,
WorkSpace* pws,
float chi2Max // for outlier rejection
) {
Expand Down Expand Up @@ -101,6 +101,14 @@ namespace gpuVertexFinder {
printf("and %d noise\n", noise);
}

__global__ void fitVerticesKernel(ZVertices* pdata,
WorkSpace* pws,
float chi2Max // for outlier rejection
) {

fitVertices(pdata,pws,chi2Max);
}

} // namespace gpuVertexFinder

#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
8 changes: 7 additions & 1 deletion RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

namespace gpuVertexFinder {

__global__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
__device__ __forceinline__
void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
auto& __restrict__ data = *pdata;
auto& __restrict__ ws = *pws;
auto nt = ws.ntrks;
Expand Down Expand Up @@ -66,6 +67,11 @@ namespace gpuVertexFinder {
#endif
}


__global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) {
sortByPt2(pdata,pws);
}

} // namespace gpuVertexFinder

#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
34 changes: 20 additions & 14 deletions RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace gpuVertexFinder {

__global__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
__device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
constexpr bool verbose = false; // in principle the compiler should optmize out if false

auto& __restrict__ data = *pdata;
Expand All @@ -32,20 +32,20 @@ namespace gpuVertexFinder {
assert(zt);

// one vertex per block
auto kv = blockIdx.x;
for ( auto kv = blockIdx.x; kv<nvFinal; kv += gridDim.x) {

if (kv >= nvFinal)
return;
if (nn[kv] < 4)
return;
continue;
if (chi2[kv] < maxChi2 * float(nn[kv]))
return;
continue;

assert(nn[kv] < 1023);
__shared__ uint32_t it[1024]; // track index
__shared__ float zz[1024]; // z pos
__shared__ uint8_t newV[1024]; // 0 or 1
__shared__ float ww[1024]; // z weight
constexpr int MAXTK = 512;
assert(nn[kv] < MAXTK);
if (nn[kv] >= MAXTK) continue; // too bad FIXME
__shared__ uint32_t it[MAXTK]; // track index
__shared__ float zz[MAXTK]; // z pos
__shared__ uint8_t newV[MAXTK]; // 0 or 1
__shared__ float ww[MAXTK]; // z weight

__shared__ uint32_t nq; // number of track for this vertex
nq = 0;
Expand All @@ -54,7 +54,7 @@ namespace gpuVertexFinder {
// copy to local
for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
if (iv[k] == int(kv)) {
auto old = atomicInc(&nq, 1024);
auto old = atomicInc(&nq, MAXTK);
zz[old] = zt[k] - zv[kv];
newV[old] = zz[old] < 0 ? 0 : 1;
ww[old] = 1.f / ezt2[k];
Expand Down Expand Up @@ -104,7 +104,7 @@ namespace gpuVertexFinder {

// avoid empty vertices
if (0 == wnew[0] || 0 == wnew[1])
return;
continue;

// quality cut
auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
Expand All @@ -115,7 +115,7 @@ namespace gpuVertexFinder {
printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);

if (chi2Dist < 4)
return;
continue;

// get a new global vertex
__shared__ uint32_t igv;
Expand All @@ -126,6 +126,12 @@ namespace gpuVertexFinder {
if (1 == newV[k])
iv[it[k]] = igv;
}

} // loop on vertices
}

__global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
splitVertices(pdata, pws, maxChi2);
}

} // namespace gpuVertexFinder
Expand Down
10 changes: 7 additions & 3 deletions RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,32 @@ namespace gpuVertexFinder {
using WorkSpace = gpuVertexFinder::WorkSpace;
using TkSoA = pixelTrack::TrackSoA;

Producer(bool useDensity,
Producer(bool oneKernel,
bool useDensity,
bool useDBSCAN,
bool useIterative,
int iminT, // min number of neighbours to be "core"
float ieps, // max absolute distance to cluster
float ierrmax, // max error to be "seed"
float ichi2max // max normalized distance to cluster
)
: useDensity_(useDensity),
: oneKernel_(oneKernel && !(useDBSCAN||useIterative)),
useDensity_(useDensity),
useDBSCAN_(useDBSCAN),
useIterative_(useIterative),
minT(iminT),
eps(ieps),
errmax(ierrmax),
chi2max(ichi2max) {}
chi2max(ichi2max) {
}

~Producer() = default;

ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;

private:
const bool oneKernel_;
const bool useDensity_;
const bool useDBSCAN_;
const bool useIterative_;
Expand Down
102 changes: 76 additions & 26 deletions RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,48 @@ namespace gpuVertexFinder {
}
}

// #define THREE_KERNELS
#ifndef THREE_KERNELS
__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster,
) {
clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
__syncthreads();
fitVertices(pdata,pws, 50.);
__syncthreads();
splitVertices(pdata,pws, 9.f);
__syncthreads();
fitVertices(pdata,pws, 5000.);
__syncthreads();
sortByPt2(pdata,pws);
}
#else
__global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster,
) {
clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
__syncthreads();
fitVertices(pdata,pws, 50.);
}

__global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws)
{
fitVertices(pdata,pws, 5000.);
__syncthreads();
sortByPt2(pdata,pws);
}
#endif


#ifdef __CUDACC__
ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
// std::cout << "producing Vertices on GPU" << std::endl;
Expand Down Expand Up @@ -77,17 +119,38 @@ namespace gpuVertexFinder {
#endif

#ifdef __CUDACC__
if (useDensity_) {
clusterTracksByDensity<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useDBSCAN_) {
clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useIterative_) {
clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
if (oneKernel_) {
// implemented only for density clustesrs
#ifndef THREE_KERNELS
vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
#else
vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
cudaCheck(cudaGetLastError());
// one block per vertex...
splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
cudaCheck(cudaGetLastError());
vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
#endif
} else { // five kernels
if (useDensity_) {
clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useDBSCAN_) {
clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useIterative_) {
clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
}
cudaCheck(cudaGetLastError());
fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
cudaCheck(cudaGetLastError());
// one block per vertex...
splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
cudaCheck(cudaGetLastError());
fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
cudaCheck(cudaGetLastError());
sortByPt2Kernel<<<1, 1024-256, 0, stream>>>(soa, ws_d.get());
}
cudaCheck(cudaGetLastError());
fitVertices<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
cudaCheck(cudaGetLastError());
#else
#else // __CUDACC__
if (useDensity_) {
clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useDBSCAN_) {
Expand All @@ -97,24 +160,11 @@ namespace gpuVertexFinder {
}
// std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
fitVertices(soa, ws_d.get(), 50.);
#endif

#ifdef __CUDACC__
// one block per vertex...
splitVertices<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
cudaCheck(cudaGetLastError());
fitVertices<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
cudaCheck(cudaGetLastError());

sortByPt2<<<1, 256, 0, stream>>>(soa, ws_d.get());
cudaCheck(cudaGetLastError());
#else
for (blockIdx.x = 0; blockIdx.x < 1024; ++blockIdx.x) {
splitVertices(soa, ws_d.get(), 9.f);
}
blockIdx.x = 0;
// one block per vertex!
blockIdx.x = 0; gridDim.x=1;
splitVertices(soa, ws_d.get(), 9.f);
resetGrid();
fitVertices(soa, ws_d.get(), 5000.);

sortByPt2(soa, ws_d.get());
#endif

Expand Down
7 changes: 6 additions & 1 deletion RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
<use name="SimDataFormats/Track"/>
<use name="TrackingTools/TransientTrack"/>

<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
<use name="cuda"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
<flags CXXFLAGS="-g"/>
</bin>

<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
<use name="cuda"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
Expand All @@ -44,4 +50,3 @@
<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
<flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
</bin>

Loading

0 comments on commit 459524a

Please sign in to comment.