From 6bfe94f9d231057fe853c9b82c7ae210ffbe6f48 Mon Sep 17 00:00:00 2001
From: waredjeb <39335169+waredjeb@users.noreply.github.com>
Date: Tue, 29 Oct 2019 07:09:04 +0100
Subject: [PATCH] Replace CUDA API wrapper memory operations with native CUDA
 calls (#395)

---
 CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc  |  3 +-
 .../src/TrackingRecHit2DCUDA.cc               |  2 +-
 .../test/gpuFrameTransformTest.cpp            | 18 +++---
 DataFormats/Math/test/CholeskyInvert_t.cu     |  9 +--
 DataFormats/Math/test/cudaAtan2Test.cu        |  5 +-
 DataFormats/Math/test/cudaMathTest.cu         | 13 ++--
 .../CUDACore/test/testStreamEvent.cu          |  7 ++-
 .../CUDACore/test/test_CUDAScopedContext.cc   | 10 +--
 .../CUDATest/plugins/TestCUDAProducerGPUEW.cc |  5 +-
 .../plugins/TestCUDAProducerGPUEWTask.cc      |  8 ++-
 .../plugins/TestCUDAProducerGPUKernel.cu      |  6 +-
 .../plugins/TestCUDAProducerGPUtoCPU.cc       |  8 ++-
 .../test/test_TestCUDAProducerGPUFirst.cc     |  2 +-
 .../CUDAUtilities/interface/HistoContainer.h  |  2 +-
 .../CUDAUtilities/interface/copyAsync.h       |  9 +--
 .../CUDAUtilities/interface/memsetAsync.h     |  5 +-
 .../CUDAUtilities/interface/prefixScan.h      |  2 +-
 .../CUDAUtilities/test/AtomicPairCounter_t.cu | 11 ++--
 .../CUDAUtilities/test/HistoContainer_t.cu    | 10 +--
 .../CUDAUtilities/test/OneHistoContainer_t.cu |  3 +-
 .../CUDAUtilities/test/OneToManyAssoc_t.h     | 15 ++---
 .../CUDAUtilities/test/copyAsync_t.cpp        | 12 ++--
 .../CUDAUtilities/test/prefixScan_t.cu        | 13 ++--
 .../CUDAUtilities/test/radixSort_t.cu         | 11 ++--
 .../TestHeterogeneousEDProducerGPUHelpers.cu  | 14 ++---
 .../src/SiPixelFedCablingMapGPUWrapper.cc     |  7 ++-
 .../SiPixelClusterizer/test/gpuClustering_t.h | 62 +++++++++----------
 .../PixelTrackFitting/test/testEigenGPU.cu    | 13 ++--
 .../test/testEigenGPUNoFit.cu                 | 52 ++++++++--------
 .../PixelVertexFinding/test/VertexFinder_t.h  | 37 +++++------
 30 files changed, 200 insertions(+), 174 deletions(-)
diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
index cdc49aa46bfa4..a297ae11dc327 100644
--- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -1,8 +1,9 @@
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
   data_d_ = cudautils::make_device_unique<Data>(stream);
-  cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream);
+  cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
 }
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
index c4cb13e3a0bd8..e6f223bfec4e3 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -14,6 +14,6 @@ cudautils::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync
 template <>
 cudautils::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
   auto ret = cudautils::make_host_unique<uint32_t[]>(2001, stream);
-  cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
   return ret;
 }
diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
index 4f3c11212e2e9..d02672c08d5d1 100644
--- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -10,6 +10,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "DataFormats/GeometrySurface/interface/GloballyPositioned.h"
 #include "DataFormats/GeometrySurface/interface/SOARotation.h"
 #include "DataFormats/GeometrySurface/interface/TkRotation.h"
@@ -73,7 +74,7 @@ int main(void) {
 
   // auto d_sf = cuda::memory::device::make_unique<SFrame[]>(current_device, 1);
   auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
-  cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame));
+  cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));
 
   for (auto i = 0U; i < size; ++i) {
     xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
@@ -84,9 +85,9 @@ int main(void) {
   std::random_shuffle(xl, xl + size);
   std::random_shuffle(yl, yl + size);
 
-  cuda::memory::copy(d_xl.get(), xl, size32);
-  cuda::memory::copy(d_yl.get(), yl, size32);
-  cuda::memory::copy(d_le.get(), le, 3 * size32);
+  cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));
 
   toGlobalWrapper((SFrame const *)(d_sf.get()),
                   d_xl.get(),
@@ -97,11 +98,10 @@ int main(void) {
                   d_le.get(),
                   d_ge.get(),
                   size);
-
-  cuda::memory::copy(x, d_x.get(), size32);
-  cuda::memory::copy(y, d_y.get(), size32);
-  cuda::memory::copy(z, d_z.get(), size32);
-  cuda::memory::copy(ge, d_ge.get(), 6 * size32);
+  cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));
 
   float eps = 0.;
   for (auto i = 0U; i < size; ++i) {
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
index 3e2cf041bae16..dca89682113fe 100644
--- a/DataFormats/Math/test/CholeskyInvert_t.cu
+++ b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -16,6 +16,7 @@
 #include <cuda/api_wrappers.h>
 
 #include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -132,7 +133,7 @@ void go(bool soa) {
   std::cout << mm[SIZE / 2](1, 1) << std::endl;
 
   auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
+  cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
 
   constexpr int NKK =
 #ifdef DOPROF
@@ -151,7 +152,8 @@ void go(bool soa) {
     else
       cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
 
-    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+    cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+
     delta += (std::chrono::high_resolution_clock::now() - start);
 
     if (0 == kk)
@@ -162,8 +164,7 @@ void go(bool soa) {
 
 #ifndef DOPROF
       cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-
-      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+      cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
 #endif
       delta1 += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu
index c436801640b09..298d8b784f322 100644
--- a/DataFormats/Math/test/cudaAtan2Test.cu
+++ b/DataFormats/Math/test/cudaAtan2Test.cu
@@ -29,6 +29,7 @@ end
 #include "cuda/api_wrappers.h"
 
 #include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -72,7 +73,7 @@ void go() {
   auto diff_d = cuda::memory::device::make_unique<int[]>(current_device, 3);
 
   int diffs[3];
-  cuda::memory::device::zero(diff_d.get(), 3 * 4);
+  cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));
 
   // Launch the diff CUDA Kernel
   dim3 threadsPerBlock(32, 32, 1);
@@ -83,7 +84,7 @@ void go() {
 
   cudautils::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());
 
-  cuda::memory::copy(diffs, diff_d.get(), 3 * 4);
+  cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
   delta += (std::chrono::high_resolution_clock::now() - start);
 
   float mdiff = diffs[0] * 1.e-7;
diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu
index 1a77456a1c061..46aae5a64f217 100644
--- a/DataFormats/Math/test/cudaMathTest.cu
+++ b/DataFormats/Math/test/cudaMathTest.cu
@@ -39,6 +39,7 @@ end
 #include "DataFormats/Math/interface/approx_log.h"
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -106,8 +107,8 @@ void go() {
   auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);
 
-  cuda::memory::copy(d_A.get(), h_A.get(), size);
-  cuda::memory::copy(d_B.get(), h_B.get(), size);
+  cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
@@ -118,19 +119,21 @@ void go() {
   std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
+  cudautils::launch(
+      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
+  cudautils::launch(
+      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cuda::memory::copy(h_C.get(), d_C.get(), size);
+  cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
index 1c889f8f75a04..bd9ce4f29fba3 100644
--- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
+++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
@@ -12,6 +12,7 @@
 
 #include <cuda_runtime.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
 namespace {
@@ -39,8 +40,8 @@ int main() {
   cudaStream_t stream1, stream2;
   cudaEvent_t event1, event2;
 
-  cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float));
-  cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float));
+  cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
+  cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
   cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
   cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
   cudaEventCreate(&event1);
@@ -50,7 +51,7 @@ int main() {
     host_points1[j] = static_cast<float>(j);
   }
 
-  cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1);
+  cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
   kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
   if (debug)
     std::cout << "Kernel launched on stream1" << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
index 5e6d67b24e479..3e06ed15d7594 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -31,8 +31,7 @@ namespace cudatest {
 namespace {
   std::unique_ptr<CUDAProduct<int*>> produce(int device, int* d, int* h) {
     auto ctx = cudatest::TestCUDAScopedContext::make(device, true);
-
-    cuda::memory::async::copy(d, h, sizeof(int), ctx.stream());
+    cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
     testCUDAScopedContextKernels_single(d, ctx.stream());
     return ctx.wrap(d);
   }
@@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       h_a1 = 0;
       h_a2 = 0;
       int h_a3 = 0;
-      cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream());
-      cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream());
-      cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream());
+
+      cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+      cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+      cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
 
       REQUIRE(h_a1 == 2);
       REQUIRE(h_a2 == 4);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
index abded61899096..74e5af7c46baf 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -7,6 +7,7 @@
 #include "FWCore/ServiceRegistry/interface/Service.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -67,8 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
-
+  cudaCheck(
+      cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
                                             << iEvent.id().event() << " stream " << iEvent.streamID();
 }
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
index 2b628ae93051e..0c8aad0931f15 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
@@ -10,6 +10,7 @@
 #include "FWCore/ServiceRegistry/interface/Service.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -75,8 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
-
+  cudaCheck(
+      cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   // Push a task to run addSimpleWork() after the asynchronous work
   // (and acquire()) has finished instead of produce()
   ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
@@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
     edm::LogVerbatim("TestCUDAProducerGPUEWTask")
         << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
         << " 10th element " << *hostData_ << " not satisfied, queueing more work";
-    cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
+    cudaCheck(
+        cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
 
     ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
     gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
index 73a6615259ebc..aaa6b9148c74c 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   // First make the sanity check
   if (d_input != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float));
+    cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -89,8 +89,8 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
   auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
 
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream);
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream);
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
index 06f099073e1ea..168ac1daa14b9 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
@@ -6,6 +6,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -58,8 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,
 
   buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
   // Enqueue async copy, continue in produce once finished
-  cuda::memory::async::copy(
-      buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream());
+  cudaCheck(cudaMemcpyAsync(buffer_.get(),
+                            device.get(),
+                            TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
 
   edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
                                                << iEvent.id().event() << " stream " << iEvent.streamID();
diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
index 3ab110806ee78..2b137d3483c5a 100644
--- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
+++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
@@ -74,7 +74,7 @@ process.moduleToTest(process.toTest)
     REQUIRE(data != nullptr);
 
     float firstElements[10];
-    cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream());
+    cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()));
 
     std::cout << "Synchronizing with CUDA stream" << std::endl;
     auto stream = prod->stream();
diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
index 4501f98e39376..ca7053740f6f7 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -62,7 +62,7 @@ namespace cudautils {
   ) {
     uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
 #ifdef __CUDACC__
-    cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
+    cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
 #else
     ::memset(off, 0, 4 * Histo::totbins());
 #endif
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index a327402b2f3d3..bfa1bdee9a03d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h
 #define HeterogeneousCore_CUDAUtilities_copyAsync_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
@@ -17,7 +18,7 @@ namespace cudautils {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 
   template <typename T>
@@ -25,7 +26,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T>& src,
                         cudaStream_t stream) {
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream));
   }
 
   // Multiple elements
@@ -34,7 +35,7 @@ namespace cudautils {
                         const cudautils::host::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-    cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 
   template <typename T>
@@ -42,7 +43,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-    cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream));
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
index b0167dcb9ed25..b9ce5a001d41f 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h
 #define HeterogeneousCore_CUDAUtilities_memsetAsync_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 #include <cuda/api_wrappers.h>
@@ -13,7 +14,7 @@ namespace cudautils {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream);
+    cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream));
   }
 
   /**
@@ -24,7 +25,7 @@ namespace cudautils {
    */
   template <typename T>
   inline void memsetAsync(cudautils::device::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t stream) {
-    cuda::memory::device::async::set(ptr.get(), value, nelements * sizeof(T), stream);
+    cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream));
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
index 9e591f2be4d69..8b784bdd61bfe 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -151,7 +151,7 @@ __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ c
 
   // let's get the partial sums from each block
   __shared__ T psum[1024];
-  for (int i = threadIdx.x, ni = gridDim.x; i<ni; i += blockDim.x) {
+  for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
     auto j = 1024 * i + 1023;
     psum[i] = (j < size) ? co[j] : T(0);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
index 1b9669aa88368..1c28758fb970e 100644
--- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
@@ -1,3 +1,4 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
@@ -39,24 +40,24 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_
 #include <iostream>
 int main() {
   AtomicPairCounter *dc_d;
-  cudaMalloc(&dc_d, sizeof(AtomicPairCounter));
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
 
   std::cout << "size " << sizeof(AtomicPairCounter) << std::endl;
 
   constexpr uint32_t N = 20000;
   constexpr uint32_t M = N * 6;
   uint32_t *n_d, *m_d;
-  cudaMalloc(&n_d, N * sizeof(int));
+  cudaCheck(cudaMalloc(&n_d, N * sizeof(int)));
   // cudaMemset(n_d, 0, N*sizeof(int));
-  cudaMalloc(&m_d, M * sizeof(int));
+  cudaCheck(cudaMalloc(&m_d, M * sizeof(int)));
 
   update<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
   finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000);
   verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
 
   AtomicPairCounter dc;
-  cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
 
   std::cout << dc.get().n << ' ' << dc.get().m << std::endl;
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
index e3ac77f1556a4..f9e5fa28c3ee9 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
@@ -26,7 +27,7 @@ void go() {
   T v[N];
   auto v_d = cuda::memory::device::make_unique<T[]>(current_device, N);
 
-  cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+  cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
   constexpr uint32_t nParts = 10;
   constexpr uint32_t partSize = N / nParts;
@@ -65,7 +66,7 @@ void go() {
       offsets[10] = 3297 + offsets[9];
     }
 
-    cuda::memory::copy(off_d.get(), offsets, 4 * (nParts + 1));
+    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice));
 
     for (long long j = 0; j < N; j++)
       v[j] = rgen(eng);
@@ -75,11 +76,10 @@ void go() {
         v[j] = sizeof(T) == 1 ? 22 : 3456;
     }
 
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
     cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
-
-    cuda::memory::copy(&h, h_d.get(), sizeof(Hist));
+    cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 4514ddd55e20b..03a969102ee1b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
@@ -131,7 +132,7 @@ void go() {
 
     assert(v_d.get());
     assert(v);
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
     cudautils::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 75763952d782b..8782d6db07e3a 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -8,6 +8,7 @@
 
 #ifdef __CUDACC__
 #include <cuda/api_wrappers.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #endif
 
@@ -170,7 +171,7 @@ int main() {
   auto sa_d = cuda::memory::device::make_unique<SmallAssoc[]>(current_device, 1);
   auto ws_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, Assoc::wsSize());
 
-  cuda::memory::copy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>));
+  cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
   auto a_d = std::make_unique<Assoc>();
   auto sa_d = std::make_unique<SmallAssoc>();
@@ -198,7 +199,7 @@ int main() {
   Assoc la;
 
 #ifdef __CUDACC__
-  cuda::memory::copy(&la, a_d.get(), sizeof(Assoc));
+  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
 #else
   memcpy(&la, a_d.get(), sizeof(Assoc));  // not required, easier
 #endif
@@ -224,17 +225,17 @@ int main() {
   AtomicPairCounter dc(0);
 
 #ifdef __CUDACC__
-  cudaMalloc(&dc_d, sizeof(AtomicPairCounter));
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
   cudautils::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
-  cuda::memory::copy(&la, a_d.get(), sizeof(Assoc));
-  cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
 
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
   cudautils::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index cec4df12ca99b..750551161a161 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -21,7 +21,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int>(stream);
 
       cudautils::copyAsync(device, host_orig, stream);
-      cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream);
+      cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
       cudaCheck(cudaStreamSynchronize(stream));
 
       REQUIRE(*host == 42);
@@ -40,7 +40,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
 
       SECTION("Copy all") {
         cudautils::copyAsync(device, host_orig, N, stream);
-        cuda::memory::async::copy(host.get(), device.get(), N * sizeof(int), stream);
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
@@ -53,7 +53,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
 
       SECTION("Copy some") {
         cudautils::copyAsync(device, host_orig, 42, stream);
-        cuda::memory::async::copy(host.get(), device.get(), 42 * sizeof(int), stream);
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);
@@ -70,7 +70,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream);
+      cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream));
       cudautils::copyAsync(host, device, stream);
       cudaCheck(cudaStreamSynchronize(stream));
 
@@ -89,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-        cuda::memory::async::copy(device.get(), host_orig.get(), N * sizeof(int), stream);
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
@@ -102,7 +102,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cuda::memory::async::copy(device.get(), host_orig.get(), 42 * sizeof(int), stream);
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
index f8266898323cd..b83c03f710012 100644
--- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
@@ -2,6 +2,7 @@
 
 #include <cub/cub.cuh>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
@@ -107,9 +108,9 @@ int main() {
     uint32_t *d_out1;
     uint32_t *d_out2;
 
-    cudaMalloc(&d_in, num_items * sizeof(uint32_t));
-    cudaMalloc(&d_out1, num_items * sizeof(uint32_t));
-    cudaMalloc(&d_out2, num_items * sizeof(uint32_t));
+    cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t)));
 
     auto nthreads = 256;
     auto nblocks = (num_items + nthreads - 1) / nthreads;
@@ -118,8 +119,8 @@ int main() {
 
     // the block counter
     int32_t *d_pc;
-    cudaMalloc(&d_pc, sizeof(int32_t));
-    cudaMemset(d_pc, 0, 4);
+    cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t)));
+    cudaCheck(cudaMemset(d_pc, 0, 4));
 
     nthreads = 1024;
     nblocks = (num_items + nthreads - 1) / nthreads;
@@ -139,7 +140,7 @@ int main() {
     // Allocate temporary storage for inclusive prefix sum
     // fake larger ws already available
     temp_storage_bytes *= 8;
-    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+    cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes));
     std::cout << "temp storage " << temp_storage_bytes << std::endl;
     // Run inclusive prefix sum
     CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items));
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index 9be80547837d4..bc042cc012185 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -10,6 +10,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
@@ -102,8 +103,8 @@ void go(bool useShared) {
     auto ws_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto off_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, blocks + 1);
 
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
-    cuda::memory::copy(off_d.get(), offsets, 4 * (blocks + 1));
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
       std::cout << "lauch for " << offsets[blocks] << std::endl;
@@ -116,13 +117,13 @@ void go(bool useShared) {
       cudautils::launch(
           radixSortMultiWrapper<U, NS>, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr);
     else
-      cudautils::launch(radixSortMultiWrapper2<U, NS>, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());
+      cudautils::launch(
+          radixSortMultiWrapper2<U, NS>, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());
 
     if (i == 0)
       std::cout << "done for " << offsets[blocks] << std::endl;
 
-    //  cuda::memory::copy(v, v_d.get(), 2*N);
-    cuda::memory::copy(ind, ind_d.get(), 2 * N);
+    cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost));
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
index 7e531a48b6b82..14e9245e19e2e 100644
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
@@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   auto d_b = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
 
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), stream.id());
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), stream.id());
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{256};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -91,7 +91,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
                d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
   */
 
-  cuda::memory::async::copy(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), stream.id());
+  cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
 
   stream.synchronize();
 
@@ -124,7 +124,7 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   // First make the sanity check
   if (inputArrays.first != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cuda::memory::copy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float));
+    cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   }
 
   // Create stream
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream.id());
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream.id());
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -195,7 +195,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda:
 
 int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) {
   auto h_c = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
-  cuda::memory::async::copy(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), stream.id());
+  cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
   stream.synchronize();
 
   float ret = 0;
diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
index 4a554b47bbf63..d4b8e40dea76b 100644
--- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -156,8 +156,11 @@ cudautils::device::unique_ptr<unsigned char[]> SiPixelFedCablingMapGPUWrapper::g
     }
   }
 
-  cuda::memory::async::copy(
-      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream);
+  cudaCheck(cudaMemcpyAsync(modToUnpDevice.get(),
+                            modToUnpHost.get(),
+                            pixelgpudetails::MAX_SIZE * sizeof(unsigned char),
+                            cudaMemcpyHostToDevice,
+                            cudaStream));
   return modToUnpDevice;
 }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index 4f7ced9b7e309..bb86c1392cdf9 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -12,6 +12,7 @@
 #ifdef __CUDACC__
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #endif
@@ -252,12 +253,12 @@ int main(void) {
     size_t size16 = n * sizeof(unsigned short);
     // size_t size8 = n * sizeof(uint8_t);
 
-    cuda::memory::copy(d_moduleStart.get(), &nModules, sizeof(uint32_t));
+    cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
 
-    cuda::memory::copy(d_id.get(), h_id.get(), size16);
-    cuda::memory::copy(d_x.get(), h_x.get(), size16);
-    cuda::memory::copy(d_y.get(), h_y.get(), size16);
-    cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
+    cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
     // Launch CUDA Kernels
     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
@@ -270,26 +271,23 @@ int main(void) {
 
     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
               << " threads\n";
-
-    cuda::memory::device::zero(d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+    cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)));
 
     cudautils::launch(findClus,
-                 {blocksPerGrid, threadsPerBlock},
-                 d_id.get(),
-                 d_x.get(),
-                 d_y.get(),
-                 d_moduleStart.get(),
-                 d_clusInModule.get(),
-                 d_moduleId.get(),
-                 d_clus.get(),
-                 n);
+                      {blocksPerGrid, threadsPerBlock},
+                      d_id.get(),
+                      d_x.get(),
+                      d_y.get(),
+                      d_moduleStart.get(),
+                      d_clusInModule.get(),
+                      d_moduleId.get(),
+                      d_clus.get(),
+                      n);
     cudaDeviceSynchronize();
-
-    cuda::memory::copy(&nModules, d_moduleStart.get(), sizeof(uint32_t));
+    cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
     uint32_t nclus[MaxNumModules], moduleId[nModules];
-
-    cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
               << std::endl;
@@ -302,14 +300,14 @@ int main(void) {
       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
 
     cudautils::launch(clusterChargeCut,
-                 {blocksPerGrid, threadsPerBlock},
-                 d_id.get(),
-                 d_adc.get(),
-                 d_moduleStart.get(),
-                 d_clusInModule.get(),
-                 d_moduleId.get(),
-                 d_clus.get(),
-                 n);
+                      {blocksPerGrid, threadsPerBlock},
+                      d_id.get(),
+                      d_adc.get(),
+                      d_moduleStart.get(),
+                      d_clusInModule.get(),
+                      d_moduleId.get(),
+                      d_clus.get(),
+                      n);
 
     cudaDeviceSynchronize();
 #else
@@ -354,10 +352,10 @@ int main(void) {
     std::cout << "found " << nModules << " Modules active" << std::endl;
 
 #ifdef __CUDACC__
-    cuda::memory::copy(h_id.get(), d_id.get(), size16);
-    cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
-    cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
-    cuda::memory::copy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t));
+    cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #endif
 
     std::set<unsigned int> clids;
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
index a206feca83b52..7b02a23c41dca 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
@@ -261,10 +261,10 @@ void testFit() {
   kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(fast_fit_resultsGPUret,
-             fast_fit_resultsGPU,
-             Rfit::maxNumberOfTracks() * sizeof(Vector4d),
-             cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
+                       fast_fit_resultsGPU,
+                       Rfit::maxNumberOfTracks() * sizeof(Vector4d),
+                       cudaMemcpyDeviceToHost));
   Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
   assert(isEqualFuzzy(fast_fit_results, fast_fit));
@@ -311,13 +311,14 @@ void testFit() {
 
   std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
 
-  cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost);
+  cudaCheck(
+      cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
 
   std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
   // LINE_FIT GPU
-  cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6));  // requires fma on CPU
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
index ebaea2037eb2a..e16ac3dbbcbc3 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
@@ -73,18 +73,19 @@ void testMultiply() {
   Eigen::Matrix<double, row1, col2> *multiply_resultGPU = nullptr;
   Eigen::Matrix<double, row1, col2> *multiply_resultGPUret = new Eigen::Matrix<double, row1, col2>();
 
-  cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>));
-  cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>));
-  cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>));
-  cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice);
-  cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice);
-  cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>)));
+  cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>)));
+  cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>)));
+  cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(
+      multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice));
 
   kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(
-      multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(
+      multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost));
   printIt(multiply_resultGPUret);
   assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret)));
 }
@@ -104,14 +105,14 @@ void testInverse3x3() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix3d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix3d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
 
   kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -133,14 +134,14 @@ void testInverse4x4() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix4d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix4d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice));
 
   kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -162,14 +163,14 @@ void testInverse5x5() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix5d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix5d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice));
 
   kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -195,15 +196,16 @@ void testEigenvalues() {
   std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl;
   std::cout << "*************************\n\n" << std::endl;
 #endif
-  cudaMalloc((void **)&m_gpu, sizeof(Matrix3d));
-  cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType));
-  cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType)));
+  cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
 
   kernel<<<1, 1>>>(m_gpu, ret_gpu);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
-  cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(
+      ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl;
   std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl;
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 0df7af362ac0d..14263ed7b3d18 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
@@ -126,10 +127,10 @@ int main() {
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-      cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-      cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-      cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-      cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+      cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
@@ -162,7 +163,7 @@ int main() {
 
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 #else
       print(onGPU_d.get(), ws_d.get());
@@ -207,8 +208,8 @@ int main() {
 #endif
 
 #ifdef __CUDACC__
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
@@ -223,9 +224,9 @@ int main() {
 
 #ifdef __CUDACC__
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
       nv = onGPU_d->nvFinal;
@@ -243,7 +244,7 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
       cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       gridDim.x = 1024;  // nv ????
       assert(blockIdx.x == 0);
@@ -260,7 +261,7 @@ int main() {
 
       cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
       sortByPt2(onGPU_d.get(), ws_d.get());
@@ -274,12 +275,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float));
-      cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
-      cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t));
+      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)