From b61907d2b33df72693d07d001549586880648688 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 3 Dec 2019 17:43:04 +0100 Subject: [PATCH] Implement GPU vertex finder with a single kernel (#413) --- .../PixelVertexFinding/test/BuildFile.xml | 7 ++- .../PixelVertexFinding/test/VertexFinder_t.h | 44 +++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml index 119bd5f04b4a9..95a572e68ce5e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml @@ -19,6 +19,12 @@ + + + + + + @@ -44,4 +50,3 @@ - diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 9b55ba59daab2..d32a611402e61 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -15,12 +15,34 @@ #define CLUSTERIZE clusterTracksIterative #else #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h" -#define CLUSTERIZE clusterTracksByDensity +#define CLUSTERIZE clusterTracksByDensityKernel #endif #include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h" #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h" #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h" +#ifdef ONE_KERNEL +#ifdef __CUDACC__ + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max); + __syncthreads(); + fitVertices(pdata,pws, 50.); + __syncthreads(); + splitVertices(pdata,pws, 9.f); + __syncthreads(); + fitVertices(pdata,pws, 5000.); + __syncthreads(); + sortByPt2(pdata,pws); + } +#endif +#endif + using namespace gpuVertexFinder; struct Event { @@ -151,13 +173,17 @@ int main() { cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); +#ifdef ONE_KERNEL + cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#else cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#endif print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -219,7 +245,7 @@ int main() { } #ifdef __CUDACC__ - cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); @@ -239,23 +265,23 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! - cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + // cudautils::launch(splitVerticesKernel, {1, 256}, onGPU_d.get(), ws_d.get(), 9.f); + cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - gridDim.x = 1024; // nv ???? + gridDim.x = 1; assert(blockIdx.x == 0); - for (; blockIdx.x < gridDim.x; ++blockIdx.x) - splitVertices(onGPU_d.get(), ws_d.get(), 9.f); + splitVertices(onGPU_d.get(), ws_d.get(), 9.f); resetGrid(); nv = ws_d->nvIntermediate; #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); + cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else