From 6bfe94f9d231057fe853c9b82c7ae210ffbe6f48 Mon Sep 17 00:00:00 2001 From: waredjeb <39335169+waredjeb@users.noreply.github.com> Date: Tue, 29 Oct 2019 07:09:04 +0100 Subject: [PATCH] Replace CUDA API wrapper memory operations with native CUDA calls (#395) --- CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 3 +- .../src/TrackingRecHit2DCUDA.cc | 2 +- .../test/gpuFrameTransformTest.cpp | 18 +++--- DataFormats/Math/test/CholeskyInvert_t.cu | 9 +-- DataFormats/Math/test/cudaAtan2Test.cu | 5 +- DataFormats/Math/test/cudaMathTest.cu | 13 ++-- .../CUDACore/test/testStreamEvent.cu | 7 ++- .../CUDACore/test/test_CUDAScopedContext.cc | 10 +-- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 5 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 8 ++- .../plugins/TestCUDAProducerGPUKernel.cu | 6 +- .../plugins/TestCUDAProducerGPUtoCPU.cc | 8 ++- .../test/test_TestCUDAProducerGPUFirst.cc | 2 +- .../CUDAUtilities/interface/HistoContainer.h | 2 +- .../CUDAUtilities/interface/copyAsync.h | 9 +-- .../CUDAUtilities/interface/memsetAsync.h | 5 +- .../CUDAUtilities/interface/prefixScan.h | 2 +- .../CUDAUtilities/test/AtomicPairCounter_t.cu | 11 ++-- .../CUDAUtilities/test/HistoContainer_t.cu | 10 +-- .../CUDAUtilities/test/OneHistoContainer_t.cu | 3 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 15 ++--- .../CUDAUtilities/test/copyAsync_t.cpp | 12 ++-- .../CUDAUtilities/test/prefixScan_t.cu | 13 ++-- .../CUDAUtilities/test/radixSort_t.cu | 11 ++-- .../TestHeterogeneousEDProducerGPUHelpers.cu | 14 ++--- .../src/SiPixelFedCablingMapGPUWrapper.cc | 7 ++- .../SiPixelClusterizer/test/gpuClustering_t.h | 62 +++++++++---------- .../PixelTrackFitting/test/testEigenGPU.cu | 13 ++-- .../test/testEigenGPUNoFit.cu | 52 ++++++++-------- .../PixelVertexFinding/test/VertexFinder_t.h | 37 +++++------ 30 files changed, 200 insertions(+), 174 deletions(-) diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc index cdc49aa46bfa4..a297ae11dc327 100644 --- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -1,8 +1,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { data_d_ = cudautils::make_device_unique(stream); - cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream); + cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); } diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index c4cb13e3a0bd8..e6f223bfec4e3 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -14,6 +14,6 @@ cudautils::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync template <> cudautils::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { auto ret = cudautils::make_host_unique(2001, stream); - cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); return ret; } diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index 4f3c11212e2e9..d02672c08d5d1 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -10,6 +10,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" #include "DataFormats/GeometrySurface/interface/SOARotation.h" #include "DataFormats/GeometrySurface/interface/TkRotation.h" @@ -73,7 +74,7 @@ int main(void) { // auto d_sf = cuda::memory::device::make_unique(current_device, 1); auto d_sf = cuda::memory::device::make_unique(current_device, sizeof(SFrame)); - cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame)); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); for (auto i = 0U; i < size; ++i) { xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); @@ -84,9 +85,9 @@ int main(void) { std::random_shuffle(xl, xl + size); std::random_shuffle(yl, yl + size); - cuda::memory::copy(d_xl.get(), xl, size32); - cuda::memory::copy(d_yl.get(), yl, size32); - cuda::memory::copy(d_le.get(), le, 3 * size32); + cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice)); toGlobalWrapper((SFrame const *)(d_sf.get()), d_xl.get(), @@ -97,11 +98,10 @@ int main(void) { d_le.get(), d_ge.get(), size); - - cuda::memory::copy(x, d_x.get(), size32); - cuda::memory::copy(y, d_y.get(), size32); - cuda::memory::copy(z, d_z.get(), size32); - cuda::memory::copy(ge, d_ge.get(), 6 * size32); + cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost)); float eps = 0.; for (auto i = 0U; i < size; ++i) { diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 3e2cf041bae16..dca89682113fe 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -16,6 +16,7 @@ #include #include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -132,7 +133,7 @@ void go(bool soa) { std::cout << mm[SIZE / 2](1, 1) << std::endl; auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); + cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice)); constexpr int NKK = #ifdef DOPROF @@ -151,7 +152,8 @@ void go(bool soa) { else cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); + delta += (std::chrono::high_resolution_clock::now() - start); if (0 == kk) @@ -162,8 +164,7 @@ void go(bool soa) { #ifndef DOPROF cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); #endif delta1 += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index c436801640b09..298d8b784f322 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -29,6 +29,7 @@ end #include "cuda/api_wrappers.h" #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -72,7 +73,7 @@ void go() { auto diff_d = cuda::memory::device::make_unique(current_device, 3); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -83,7 +84,7 @@ void go() { cudautils::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 1a77456a1c061..46aae5a64f217 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -39,6 +39,7 @@ end #include "DataFormats/Math/interface/approx_log.h" #include "DataFormats/Math/interface/approx_exp.h" #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -106,8 +107,8 @@ void go() { auto d_B = cuda::memory::device::make_unique(current_device, numElements); auto d_C = cuda::memory::device::make_unique(current_device, numElements); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -118,19 +119,21 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu index 1c889f8f75a04..bd9ce4f29fba3 100644 --- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -12,6 +12,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" namespace { @@ -39,8 +40,8 @@ int main() { cudaStream_t stream1, stream2; cudaEvent_t event1, event2; - cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)); - cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)); + cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float))); + cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float))); cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); cudaEventCreate(&event1); @@ -50,7 +51,7 @@ int main() { host_points1[j] = static_cast(j); } - cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1); + cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1)); kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); if (debug) std::cout << "Kernel launched on stream1" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 5e6d67b24e479..3e06ed15d7594 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -31,8 +31,7 @@ namespace cudatest { namespace { std::unique_ptr> produce(int device, int* d, int* h) { auto ctx = cudatest::TestCUDAScopedContext::make(device, true); - - cuda::memory::async::copy(d, h, sizeof(int), ctx.stream()); + cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); testCUDAScopedContextKernels_single(d, ctx.stream()); return ctx.wrap(d); } @@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { h_a1 = 0; h_a2 = 0; int h_a3 = 0; - cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream()); - cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream()); - cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream()); + + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); REQUIRE(h_a1 == 2); REQUIRE(h_a2 == 4); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index abded61899096..74e5af7c46baf 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -7,6 +7,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -67,8 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); - + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index 2b628ae93051e..0c8aad0931f15 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -10,6 +10,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -75,8 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); - + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); // Push a task to run addSimpleWork() after the asynchronous work // (and acquire()) has finished instead of produce() ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) { @@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID << " 10th element " << *hostData_ << " not satisfied, queueing more work"; - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream()); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 73a6615259ebc..aaa6b9148c74c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -69,7 +69,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const // First make the sanity check if (d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float)); + cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost)); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -89,8 +89,8 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const auto d_a = cudautils::make_device_unique(NUM_VALUES, stream); auto d_b = cudautils::make_device_unique(NUM_VALUES, stream); - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 06f099073e1ea..168ac1daa14b9 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -6,6 +6,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -58,8 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished - cuda::memory::async::copy( - buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream()); + cudaCheck(cudaMemcpyAsync(buffer_.get(), + device.get(), + TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), + cudaMemcpyDeviceToHost, + ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 3ab110806ee78..2b137d3483c5a 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -74,7 +74,7 @@ process.moduleToTest(process.toTest) REQUIRE(data != nullptr); float firstElements[10]; - cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream()); + cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); std::cout << "Synchronizing with CUDA stream" << std::endl; auto stream = prod->stream(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 4501f98e39376..ca7053740f6f7 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -62,7 +62,7 @@ namespace cudautils { ) { uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); #ifdef __CUDACC__ - cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream); + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); #else ::memset(off, 0, 4 * Histo::totbins()); #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index a327402b2f3d3..bfa1bdee9a03d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h #define HeterogeneousCore_CUDAUtilities_copyAsync_h +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -17,7 +18,7 @@ namespace cudautils { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); } template @@ -25,7 +26,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream)); } // Multiple elements @@ -34,7 +35,7 @@ namespace cudautils { const cudautils::host::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } template @@ -42,7 +43,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index b0167dcb9ed25..b9ce5a001d41f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h #define HeterogeneousCore_CUDAUtilities_memsetAsync_h +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include @@ -13,7 +14,7 @@ namespace cudautils { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream); + cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } /** @@ -24,7 +25,7 @@ namespace cudautils { */ template inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { - cuda::memory::device::async::set(ptr.get(), value, nelements * sizeof(T), stream); + cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 9e591f2be4d69..8b784bdd61bfe 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -151,7 +151,7 @@ __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ c // let's get the partial sums from each block __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i int main() { AtomicPairCounter *dc_d; - cudaMalloc(&dc_d, sizeof(AtomicPairCounter)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); std::cout << "size " << sizeof(AtomicPairCounter) << std::endl; constexpr uint32_t N = 20000; constexpr uint32_t M = N * 6; uint32_t *n_d, *m_d; - cudaMalloc(&n_d, N * sizeof(int)); + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); // cudaMemset(n_d, 0, N*sizeof(int)); - cudaMalloc(&m_d, M * sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); AtomicPairCounter dc; - cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); std::cout << dc.get().n << ' ' << dc.get().m << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index e3ac77f1556a4..f9e5fa28c3ee9 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" @@ -26,7 +27,7 @@ void go() { T v[N]; auto v_d = cuda::memory::device::make_unique(current_device, N); - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); constexpr uint32_t nParts = 10; constexpr uint32_t partSize = N / nParts; @@ -65,7 +66,7 @@ void go() { offsets[10] = 3297 + offsets[9]; } - cuda::memory::copy(off_d.get(), offsets, 4 * (nParts + 1)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); for (long long j = 0; j < N; j++) v[j] = rgen(eng); @@ -75,11 +76,10 @@ void go() { v[j] = sizeof(T) == 1 ? 22 : 3456; } - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); - - cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 4514ddd55e20b..03a969102ee1b 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -131,7 +132,7 @@ void go() { assert(v_d.get()); assert(v); - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); assert(v_d.get()); cudautils::launch(mykernel, {1, 256}, v_d.get(), N); } diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 75763952d782b..8782d6db07e3a 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -8,6 +8,7 @@ #ifdef __CUDACC__ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #endif @@ -170,7 +171,7 @@ int main() { auto sa_d = cuda::memory::device::make_unique(current_device, 1); auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); - cuda::memory::copy(v_d.get(), tr.data(), N * sizeof(std::array)); + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else auto a_d = std::make_unique(); auto sa_d = std::make_unique(); @@ -198,7 +199,7 @@ int main() { Assoc la; #ifdef __CUDACC__ - cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); #else memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif @@ -224,17 +225,17 @@ int main() { AtomicPairCounter dc(0); #ifdef __CUDACC__ - cudaMalloc(&dc_d, sizeof(AtomicPairCounter)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); nBlocks = (N + nThreads - 1) / nThreads; fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); cudautils::finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); - cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); - cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); cudautils::finalizeBulk<<>>(dc_d, sa_d.get()); verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index cec4df12ca99b..750551161a161 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -21,7 +21,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(stream); cudautils::copyAsync(device, host_orig, stream); - cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 42); @@ -40,7 +40,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Copy all") { cudautils::copyAsync(device, host_orig, N, stream); - cuda::memory::async::copy(host.get(), device.get(), N * sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); @@ -53,7 +53,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Copy some") { cudautils::copyAsync(device, host_orig, 42, stream); - cuda::memory::async::copy(host.get(), device.get(), 42 * sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); @@ -70,7 +70,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); @@ -89,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cuda::memory::async::copy(device.get(), host_orig.get(), N * sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { @@ -102,7 +102,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cuda::memory::async::copy(device.get(), host_orig.get(), 42 * sizeof(int), stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index f8266898323cd..b83c03f710012 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -2,6 +2,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" @@ -107,9 +108,9 @@ int main() { uint32_t *d_out1; uint32_t *d_out2; - cudaMalloc(&d_in, num_items * sizeof(uint32_t)); - cudaMalloc(&d_out1, num_items * sizeof(uint32_t)); - cudaMalloc(&d_out2, num_items * sizeof(uint32_t)); + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); auto nthreads = 256; auto nblocks = (num_items + nthreads - 1) / nthreads; @@ -118,8 +119,8 @@ int main() { // the block counter int32_t *d_pc; - cudaMalloc(&d_pc, sizeof(int32_t)); - cudaMemset(d_pc, 0, 4); + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, 4)); nthreads = 1024; nblocks = (num_items + nthreads - 1) / nthreads; @@ -139,7 +140,7 @@ int main() { // Allocate temporary storage for inclusive prefix sum // fake larger ws already available temp_storage_bytes *= 8; - cudaMalloc(&d_temp_storage, temp_storage_bytes); + cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes)); std::cout << "temp storage " << temp_storage_bytes << std::endl; // Run inclusive prefix sum CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 9be80547837d4..bc042cc012185 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -10,6 +10,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" @@ -102,8 +103,8 @@ void go(bool useShared) { auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks + 1); - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); - cuda::memory::copy(off_d.get(), offsets, 4 * (blocks + 1)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "lauch for " << offsets[blocks] << std::endl; @@ -116,13 +117,13 @@ void go(bool useShared) { cudautils::launch( radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); else - cudautils::launch(radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + cudautils::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); if (i == 0) std::cout << "done for " << offsets[blocks] << std::endl; - // cuda::memory::copy(v, v_d.get(), 2*N); - cuda::memory::copy(ind, ind_d.get(), 2 * N); + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu index 7e531a48b6b82..14e9245e19e2e 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu @@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { auto d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), stream.id()); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), stream.id()); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{256}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -91,7 +91,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); */ - cuda::memory::async::copy(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), stream.id()); + cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); stream.synchronize(); @@ -124,7 +124,7 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas // First make the sanity check if (inputArrays.first != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cuda::memory::copy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float)); + cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost)); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas } // Create stream - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream.id()); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream.id()); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -195,7 +195,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda: int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) { auto h_c = cuda::memory::host::make_unique(NUM_VALUES); - cuda::memory::async::copy(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), stream.id()); + cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); stream.synchronize(); float ret = 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index 4a554b47bbf63..d4b8e40dea76b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -156,8 +156,11 @@ cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::g } } - cuda::memory::async::copy( - modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream); + cudaCheck(cudaMemcpyAsync(modToUnpDevice.get(), + modToUnpHost.get(), + pixelgpudetails::MAX_SIZE * sizeof(unsigned char), + cudaMemcpyHostToDevice, + cudaStream)); return modToUnpDevice; } diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 4f7ced9b7e309..bb86c1392cdf9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -12,6 +12,7 @@ #ifdef __CUDACC__ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #endif @@ -252,12 +253,12 @@ int main(void) { size_t size16 = n * sizeof(unsigned short); // size_t size8 = n * sizeof(uint8_t); - cuda::memory::copy(d_moduleStart.get(), &nModules, sizeof(uint32_t)); + cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cuda::memory::copy(d_id.get(), h_id.get(), size16); - cuda::memory::copy(d_x.get(), h_x.get(), size16); - cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size16); + cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice)); // Launch CUDA Kernels int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -270,26 +271,23 @@ int main(void) { std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; - - cuda::memory::device::zero(d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); + cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t))); cudautils::launch(findClus, - {blocksPerGrid, threadsPerBlock}, - d_id.get(), - d_x.get(), - d_y.get(), - d_moduleStart.get(), - d_clusInModule.get(), - d_moduleId.get(), - d_clus.get(), - n); + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_x.get(), + d_y.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); cudaDeviceSynchronize(); - - cuda::memory::copy(&nModules, d_moduleStart.get(), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); uint32_t nclus[MaxNumModules], moduleId[nModules]; - - cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" << std::endl; @@ -302,14 +300,14 @@ int main(void) { std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; cudautils::launch(clusterChargeCut, - {blocksPerGrid, threadsPerBlock}, - d_id.get(), - d_adc.get(), - d_moduleStart.get(), - d_clusInModule.get(), - d_moduleId.get(), - d_clus.get(), - n); + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); cudaDeviceSynchronize(); #else @@ -354,10 +352,10 @@ int main(void) { std::cout << "found " << nModules << " Modules active" << std::endl; #ifdef __CUDACC__ - cuda::memory::copy(h_id.get(), d_id.get(), size16); - cuda::memory::copy(h_clus.get(), d_clus.get(), size32); - cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); - cuda::memory::copy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t)); + cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); #endif std::set clids; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index a206feca83b52..7b02a23c41dca 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -261,10 +261,10 @@ void testFit() { kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); - cudaMemcpy(fast_fit_resultsGPUret, - fast_fit_resultsGPU, - Rfit::maxNumberOfTracks() * sizeof(Vector4d), - cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, + fast_fit_resultsGPU, + Rfit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost)); Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); @@ -311,13 +311,14 @@ void testFit() { std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + cudaCheck( + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; // LINE_FIT GPU - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index ebaea2037eb2a..e16ac3dbbcbc3 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -73,18 +73,19 @@ void testMultiply() { Eigen::Matrix *multiply_resultGPU = nullptr; Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); - cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix)); - cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy( + multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy( - multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost)); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); } @@ -104,14 +105,14 @@ void testInverse3x3() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix3d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -133,14 +134,14 @@ void testInverse4x4() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix4d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice)); kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -162,14 +163,14 @@ void testInverse5x5() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix5d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice)); kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -195,15 +196,16 @@ void testEigenvalues() { std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; std::cout << "*************************\n\n" << std::endl; #endif - cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)); - cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); - cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType))); + cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernel<<<1, 1>>>(m_gpu, ret_gpu); cudaDeviceSynchronize(); - cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); - cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 0df7af362ac0d..14263ed7b3d18 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #ifdef USE_DBSCAN @@ -126,10 +127,10 @@ int main() { std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -162,7 +163,7 @@ int main() { cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else print(onGPU_d.get(), ws_d.get()); @@ -207,8 +208,8 @@ int main() { #endif #ifdef __CUDACC__ - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif @@ -223,9 +224,9 @@ int main() { #ifdef __CUDACC__ cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 50.f); nv = onGPU_d->nvFinal; @@ -243,7 +244,7 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else gridDim.x = 1024; // nv ???? assert(blockIdx.x == 0); @@ -260,7 +261,7 @@ int main() { cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); sortByPt2(onGPU_d.get(), ws_d.get()); @@ -274,12 +275,12 @@ int main() { } #ifdef __CUDACC__ - cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float)); - cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); - cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t)); + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0)