Skip to content

Commit

Permalink
Implement GPU vertex finder with a single kernel (#413)
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn authored and fwyzard committed Oct 23, 2020
1 parent ff2d3d9 commit b61907d
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 10 deletions.
7 changes: 6 additions & 1 deletion RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
<use name="SimDataFormats/Track"/>
<use name="TrackingTools/TransientTrack"/>

<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
<use name="cuda"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
<flags CXXFLAGS="-g"/>
</bin>

<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
<use name="cuda"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
Expand All @@ -44,4 +50,3 @@
<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
<flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
</bin>

44 changes: 35 additions & 9 deletions RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,34 @@
#define CLUSTERIZE clusterTracksIterative
#else
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h"
#define CLUSTERIZE clusterTracksByDensity
#define CLUSTERIZE clusterTracksByDensityKernel
#endif
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"

#ifdef ONE_KERNEL
#ifdef __CUDACC__
__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster,
) {
clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
__syncthreads();
fitVertices(pdata,pws, 50.);
__syncthreads();
splitVertices(pdata,pws, 9.f);
__syncthreads();
fitVertices(pdata,pws, 5000.);
__syncthreads();
sortByPt2(pdata,pws);
}
#endif
#endif

using namespace gpuVertexFinder;

struct Event {
Expand Down Expand Up @@ -151,13 +173,17 @@ int main() {
cudaCheck(cudaGetLastError());
cudaDeviceSynchronize();

#ifdef ONE_KERNEL
cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#else
cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#endif
print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());

cudaCheck(cudaGetLastError());
cudaDeviceSynchronize();

cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));

Expand Down Expand Up @@ -219,7 +245,7 @@ int main() {
}

#ifdef __CUDACC__
cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
Expand All @@ -239,23 +265,23 @@ int main() {

#ifdef __CUDACC__
// one vertex per block!!!
cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
// cudautils::launch(splitVerticesKernel, {1, 256}, onGPU_d.get(), ws_d.get(), 9.f);
cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
gridDim.x = 1024; // nv ????
gridDim.x = 1;
assert(blockIdx.x == 0);
for (; blockIdx.x < gridDim.x; ++blockIdx.x)
splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
resetGrid();
nv = ws_d->nvIntermediate;
#endif
std::cout << "after split " << nv << std::endl;

#ifdef __CUDACC__
cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cudaCheck(cudaGetLastError());

cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
Expand Down

0 comments on commit b61907d

Please sign in to comment.