From d02f4bee794a7c04d7710e6e019fb6f3a004ecc7 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 3 Dec 2019 18:13:21 +0100 Subject: [PATCH] Apply code-format fixes (#427) --- .../src/gpuClusterTracksByDensity.h | 24 +-- .../PixelVertexFinding/src/gpuFitVertices.h | 13 +- .../PixelVertexFinding/src/gpuSortByPt2.h | 8 +- .../PixelVertexFinding/src/gpuSplitVertices.h | 172 +++++++++--------- .../PixelVertexFinding/src/gpuVertexFinder.h | 5 +- .../src/gpuVertexFinderImpl.h | 46 +++-- .../PixelVertexFinding/test/VertexFinder_t.h | 35 ++-- 7 files changed, 147 insertions(+), 156 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h index d69156214bfe0..871b09599c903 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h @@ -18,11 +18,11 @@ namespace gpuVertexFinder { // based on Rodrighez&Laio algo // __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster ) { using namespace gpuVertexFinder; constexpr bool verbose = false; // in principle the compiler should optmize out if false @@ -220,14 +220,14 @@ namespace gpuVertexFinder { } __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster ) { - clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max); - } + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h index 1c3641df709ec..4487cb12ea17b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h @@ -13,8 +13,8 @@ namespace gpuVertexFinder { __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, - float chi2Max // for outlier rejection + WorkSpace* pws, + float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false @@ -102,12 +102,11 @@ namespace gpuVertexFinder { } __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, - float chi2Max // for outlier rejection + WorkSpace* pws, + float chi2Max // for outlier rejection ) { - - fitVertices(pdata,pws,chi2Max); - } + fitVertices(pdata, pws, chi2Max); + } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h index 93a34c826bd1f..89cc9a3844f76 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h @@ -15,8 +15,7 @@ namespace gpuVertexFinder { - __device__ __forceinline__ - void sortByPt2(ZVertices* pdata, WorkSpace* pws) { + __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { auto& __restrict__ data = *pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; @@ -67,10 +66,7 @@ namespace gpuVertexFinder { #endif } - - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { - sortByPt2(pdata,pws); - } + __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h index 48ed9290ab06c..694915ab02157 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h @@ -32,106 +32,106 @@ namespace gpuVertexFinder { assert(zt); // one vertex per block - for ( auto kv = blockIdx.x; kv= MAXTK) continue; // too bad FIXME - __shared__ uint32_t it[MAXTK]; // track index - __shared__ float zz[MAXTK]; // z pos - __shared__ uint8_t newV[MAXTK]; // 0 or 1 - __shared__ float ww[MAXTK]; // z weight - - __shared__ uint32_t nq; // number of track for this vertex - nq = 0; - __syncthreads(); - - // copy to local - for (auto k = threadIdx.x; k < nt; k += blockDim.x) { - if (iv[k] == int(kv)) { - auto old = atomicInc(&nq, MAXTK); - zz[old] = zt[k] - zv[kv]; - newV[old] = zz[old] < 0 ? 0 : 1; - ww[old] = 1.f / ezt2[k]; - it[old] = k; - } - } - - __shared__ float znew[2], wnew[2]; // the new vertices - - __syncthreads(); - assert(int(nq) == nn[kv] + 1); - - int maxiter = 20; - // kt-min.... - bool more = true; - while (__syncthreads_or(more)) { - more = false; - if (0 == threadIdx.x) { - znew[0] = 0; - znew[1] = 0; - wnew[0] = 0; - wnew[1] = 0; - } - __syncthreads(); - for (auto k = threadIdx.x; k < nq; k += blockDim.x) { - auto i = newV[k]; - atomicAdd(&znew[i], zz[k] * ww[k]); - atomicAdd(&wnew[i], ww[k]); - } + for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { + if (nn[kv] < 4) + continue; + if (chi2[kv] < maxChi2 * float(nn[kv])) + continue; + + constexpr int MAXTK = 512; + assert(nn[kv] < MAXTK); + if (nn[kv] >= MAXTK) + continue; // too bad FIXME + __shared__ uint32_t it[MAXTK]; // track index + __shared__ float zz[MAXTK]; // z pos + __shared__ uint8_t newV[MAXTK]; // 0 or 1 + __shared__ float ww[MAXTK]; // z weight + + __shared__ uint32_t nq; // number of track for this vertex + nq = 0; __syncthreads(); - if (0 == threadIdx.x) { - znew[0] /= wnew[0]; - znew[1] /= wnew[1]; + + // copy to local + for (auto k = threadIdx.x; k < nt; k += blockDim.x) { + if (iv[k] == int(kv)) { + auto old = atomicInc(&nq, MAXTK); + zz[old] = zt[k] - zv[kv]; + newV[old] = zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } } + + __shared__ float znew[2], wnew[2]; // the new vertices + __syncthreads(); - for (auto k = threadIdx.x; k < nq; k += blockDim.x) { - auto d0 = fabs(zz[k] - znew[0]); - auto d1 = fabs(zz[k] - znew[1]); - auto newer = d0 < d1 ? 0 : 1; - more |= newer != newV[k]; - newV[k] = newer; - } - --maxiter; - if (maxiter <= 0) + assert(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... + bool more = true; + while (__syncthreads_or(more)) { more = false; - } + if (0 == threadIdx.x) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto i = newV[k]; + atomicAdd(&znew[i], zz[k] * ww[k]); + atomicAdd(&wnew[i], ww[k]); + } + __syncthreads(); + if (0 == threadIdx.x) { + znew[0] /= wnew[0]; + znew[1] /= wnew[1]; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 0 : 1; + more |= newer != newV[k]; + newV[k] = newer; + } + --maxiter; + if (maxiter <= 0) + more = false; + } - // avoid empty vertices - if (0 == wnew[0] || 0 == wnew[1]) - continue; + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; - // quality cut - auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); - auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); - if (verbose && 0 == threadIdx.x) - printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + if (verbose && 0 == threadIdx.x) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); - if (chi2Dist < 4) - continue; + if (chi2Dist < 4) + continue; - // get a new global vertex - __shared__ uint32_t igv; - if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); - __syncthreads(); - for (auto k = threadIdx.x; k < nq; k += blockDim.x) { - if (1 == newV[k]) - iv[it[k]] = igv; - } + // get a new global vertex + __shared__ uint32_t igv; + if (0 == threadIdx.x) + igv = atomicAdd(&ws.nvIntermediate, 1); + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + if (1 == newV[k]) + iv[it[k]] = igv; + } - } // loop on vertices + } // loop on vertices } __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { - splitVertices(pdata, pws, maxChi2); + splitVertices(pdata, pws, maxChi2); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index fec0308a1154d..6cd86c93a6737 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -52,15 +52,14 @@ namespace gpuVertexFinder { float ierrmax, // max error to be "seed" float ichi2max // max normalized distance to cluster ) - : oneKernel_(oneKernel && !(useDBSCAN||useIterative)), + : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), useDensity_(useDensity), useDBSCAN_(useDBSCAN), useIterative_(useIterative), minT(iminT), eps(ieps), errmax(ierrmax), - chi2max(ichi2max) { - } + chi2max(ichi2max) {} ~Producer() = default; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 4dd839adc5315..d6e63227ccf85 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -49,44 +49,41 @@ namespace gpuVertexFinder { #ifndef THREE_KERNELS __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" + int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max); + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata,pws, 50.); + fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata,pws, 9.f); + splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata,pws, 5000.); + fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata,pws); + sortByPt2(pdata, pws); } #else __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max); + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata,pws, 50.); + fitVertices(pdata, pws, 50.); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws) - { - fitVertices(pdata,pws, 5000.); + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata,pws); + sortByPt2(pdata, pws); } #endif - #ifdef __CUDACC__ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { // std::cout << "producing Vertices on GPU" << std::endl; @@ -120,10 +117,10 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ if (oneKernel_) { - // implemented only for density clustesrs + // implemented only for density clustesrs #ifndef THREE_KERNELS vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); -#else +#else vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... @@ -131,7 +128,7 @@ namespace gpuVertexFinder { cudaCheck(cudaGetLastError()); vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); #endif - } else { // five kernels + } else { // five kernels if (useDensity_) { clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { @@ -147,7 +144,7 @@ namespace gpuVertexFinder { cudaCheck(cudaGetLastError()); fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024-256, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ @@ -161,7 +158,8 @@ namespace gpuVertexFinder { // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; fitVertices(soa, ws_d.get(), 50.); // one block per vertex! - blockIdx.x = 0; gridDim.x=1; + blockIdx.x = 0; + gridDim.x = 1; splitVertices(soa, ws_d.get(), 9.f); resetGrid(); fitVertices(soa, ws_d.get(), 5000.); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index d32a611402e61..5261069a6b283 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -23,23 +23,23 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, - ) { - clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max); - __syncthreads(); - fitVertices(pdata,pws, 50.); - __syncthreads(); - splitVertices(pdata,pws, 9.f); - __syncthreads(); - fitVertices(pdata,pws, 5000.); - __syncthreads(); - sortByPt2(pdata,pws); - } +__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, +) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); +} #endif #endif @@ -265,7 +265,6 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! - // cudautils::launch(splitVerticesKernel, {1, 256}, onGPU_d.get(), ws_d.get(), 9.f); cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else