Replace CUDA API wrapper memory operations with native CUDA calls (#395)
waredjeb authored and fwyzard committed Oct 29, 2019
1 parent 617f9a0 commit 6bfe94f
Showing 30 changed files with 200 additions and 174 deletions.
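The change follows one pattern throughout the commit: calls into the cuda-api-wrappers memory helpers (cuda::memory::copy, cuda::memory::async::copy, cuda::memory::device::zero) are replaced by the corresponding native runtime calls (cudaMemcpy, cudaMemcpyAsync, cudaMemset), with an explicit cudaMemcpyKind where the direction is known, and the return code is checked through cudaCheck. A minimal sketch of the pattern; the function and variable names below are illustrative and not taken from the diff:

    // Illustrative sketch of the replacement pattern; copyToDevice and its
    // arguments are made up for this example.
    #include <cuda_runtime.h>
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

    void copyToDevice(float* d_data, const float* h_data, size_t n, cudaStream_t stream) {
      // before: cuda::memory::async::copy(d_data, h_data, n * sizeof(float), stream);
      // after: native runtime call with explicit direction, wrapped in cudaCheck
      cudaCheck(cudaMemcpyAsync(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice, stream));
    }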
3 changes: 2 additions & 1 deletion CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -1,8 +1,9 @@
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
data_d_ = cudautils::make_device_unique<Data>(stream);
cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream);
cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
}
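Every touched file now includes HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h. The sketch below shows what a cudaCheck-style helper typically looks like (report the failing call with file and line, then abort); it is an assumption for illustration and the actual header in the repository may differ.

    // Hypothetical sketch of a cudaCheck-style helper, not the repository's
    // actual implementation.
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    inline void cudaCheck_(const char* file, int line, const char* cmd, cudaError_t result) {
      if (result == cudaSuccess)
        return;
      // Print the failing expression and the CUDA error string, then abort.
      std::fprintf(stderr, "%s:%d: CUDA error in '%s': %s\n", file, line, cmd, cudaGetErrorString(result));
      std::abort();
    }

    #define cudaCheck(ARG) cudaCheck_(__FILE__, __LINE__, #ARG, (ARG))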
2 changes: 1 addition & 1 deletion CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -14,6 +14,6 @@ cudautils::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync
template <>
cudautils::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint32_t[]>(2001, stream);
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream);
cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
return ret;
}
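This copy keeps cudaMemcpyDefault, which lets the runtime infer the transfer direction from the pointer addresses (relying on unified virtual addressing). Since hitsModuleStartToHostAsync copies device data into a host buffer, an equivalent explicit-direction form, shown only as a sketch, would be:

    // Sketch of the equivalent explicit-direction call; the committed code keeps cudaMemcpyDefault.
    cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * 2001, cudaMemcpyDeviceToHost, stream));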
18 changes: 9 additions & 9 deletions DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -10,6 +10,7 @@

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h"
#include "DataFormats/GeometrySurface/interface/SOARotation.h"
#include "DataFormats/GeometrySurface/interface/TkRotation.h"
@@ -73,7 +74,7 @@ int main(void) {

// auto d_sf = cuda::memory::device::make_unique<SFrame[]>(current_device, 1);
auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame));
cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));

for (auto i = 0U; i < size; ++i) {
xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
@@ -84,9 +85,9 @@
std::random_shuffle(xl, xl + size);
std::random_shuffle(yl, yl + size);

cuda::memory::copy(d_xl.get(), xl, size32);
cuda::memory::copy(d_yl.get(), yl, size32);
cuda::memory::copy(d_le.get(), le, 3 * size32);
cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));

toGlobalWrapper((SFrame const *)(d_sf.get()),
d_xl.get(),
@@ -97,11 +98,10 @@
d_le.get(),
d_ge.get(),
size);

cuda::memory::copy(x, d_x.get(), size32);
cuda::memory::copy(y, d_y.get(), size32);
cuda::memory::copy(z, d_z.get(), size32);
cuda::memory::copy(ge, d_ge.get(), 6 * size32);
cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));

float eps = 0.;
for (auto i = 0U; i < size; ++i) {
9 changes: 5 additions & 4 deletions DataFormats/Math/test/CholeskyInvert_t.cu
@@ -16,6 +16,7 @@
#include <cuda/api_wrappers.h>

#include "DataFormats/Math/interface/choleskyInversion.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

@@ -132,7 +133,7 @@ void go(bool soa) {
std::cout << mm[SIZE / 2](1, 1) << std::endl;

auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));

constexpr int NKK =
#ifdef DOPROF
@@ -151,7 +152,8 @@
else
cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);

cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));

delta += (std::chrono::high_resolution_clock::now() - start);

if (0 == kk)
@@ -162,8 +164,7 @@

#ifndef DOPROF
cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);

cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
#endif
delta1 += (std::chrono::high_resolution_clock::now() - start);

5 changes: 3 additions & 2 deletions DataFormats/Math/test/cudaAtan2Test.cu
@@ -29,6 +29,7 @@ end
#include "cuda/api_wrappers.h"

#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

@@ -72,7 +73,7 @@ void go() {
auto diff_d = cuda::memory::device::make_unique<int[]>(current_device, 3);

int diffs[3];
cuda::memory::device::zero(diff_d.get(), 3 * 4);
cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));

// Launch the diff CUDA Kernel
dim3 threadsPerBlock(32, 32, 1);
@@ -83,7 +84,7 @@

cudautils::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());

cuda::memory::copy(diffs, diff_d.get(), 3 * 4);
cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);

float mdiff = diffs[0] * 1.e-7;
13 changes: 8 additions & 5 deletions DataFormats/Math/test/cudaMathTest.cu
@@ -39,6 +39,7 @@ end
#include "DataFormats/Math/interface/approx_log.h"
#include "DataFormats/Math/interface/approx_exp.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

@@ -106,8 +107,8 @@ void go() {
auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
@@ -118,19 +119,21 @@
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";

delta -= (std::chrono::high_resolution_clock::now() - start);
cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
cudautils::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;

delta -= (std::chrono::high_resolution_clock::now() - start);
cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
cudautils::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;

delta -= (std::chrono::high_resolution_clock::now() - start);
cuda::memory::copy(h_C.get(), d_C.get(), size);
cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
7 changes: 4 additions & 3 deletions HeterogeneousCore/CUDACore/test/testStreamEvent.cu
@@ -12,6 +12,7 @@

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"

namespace {
@@ -39,8 +40,8 @@ int main() {
cudaStream_t stream1, stream2;
cudaEvent_t event1, event2;

cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float));
cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float));
cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaEventCreate(&event1);
@@ -50,7 +51,7 @@
host_points1[j] = static_cast<float>(j);
}

cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1);
cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
if (debug)
std::cout << "Kernel launched on stream1" << std::endl;
10 changes: 5 additions & 5 deletions HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -31,8 +31,7 @@ namespace cudatest {
namespace {
std::unique_ptr<CUDAProduct<int*>> produce(int device, int* d, int* h) {
auto ctx = cudatest::TestCUDAScopedContext::make(device, true);

cuda::memory::async::copy(d, h, sizeof(int), ctx.stream());
cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
testCUDAScopedContextKernels_single(d, ctx.stream());
return ctx.wrap(d);
}
@@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
h_a1 = 0;
h_a2 = 0;
int h_a3 = 0;
cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream());
cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream());
cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream());

cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));

REQUIRE(h_a1 == 2);
REQUIRE(h_a2 == 4);
5 changes: 3 additions & 2 deletions HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -7,6 +7,7 @@
#include "FWCore/ServiceRegistry/interface/Service.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -67,8 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());

cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
<< iEvent.id().event() << " stream " << iEvent.streamID();
}
@@ -10,6 +10,7 @@
#include "FWCore/ServiceRegistry/interface/Service.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -75,8 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());

cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
// Push a task to run addSimpleWork() after the asynchronous work
// (and acquire()) has finished instead of produce()
ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
@@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
edm::LogVerbatim("TestCUDAProducerGPUEWTask")
<< label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
<< " 10th element " << *hostData_ << " not satisfied, queueing more work";
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));

ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
@@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
// First make the sanity check
if (d_input != nullptr) {
auto h_check = std::make_unique<float[]>(NUM_VALUES);
cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float));
cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < NUM_VALUES; ++i) {
if (h_check[i] != i) {
throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -89,8 +89,8 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);

cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream);
cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream);
cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));

int threadsPerBlock{32};
int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -6,6 +6,7 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -58,8 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,

buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
// Enqueue async copy, continue in produce once finished
cuda::memory::async::copy(
buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream());
cudaCheck(cudaMemcpyAsync(buffer_.get(),
device.get(),
TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float),
cudaMemcpyDeviceToHost,
ctx.stream()));

edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
<< iEvent.id().event() << " stream " << iEvent.streamID();
@@ -74,7 +74,7 @@ process.moduleToTest(process.toTest)
REQUIRE(data != nullptr);

float firstElements[10];
cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream());
cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()));

std::cout << "Synchronizing with CUDA stream" << std::endl;
auto stream = prod->stream();
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -62,7 +62,7 @@ namespace cudautils {
) {
uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
#ifdef __CUDACC__
cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
#else
::memset(off, 0, 4 * Histo::totbins());
#endif
9 changes: 5 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h
#define HeterogeneousCore_CUDAUtilities_copyAsync_h

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

@@ -17,15 +18,15 @@ namespace cudautils {
// Shouldn't compile for array types because of sizeof(T), but
// let's add an assert with a more helpful message
static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(cudautils::host::unique_ptr<T>& dst,
const cudautils::device::unique_ptr<T>& src,
cudaStream_t stream) {
static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream));
}

// Multiple elements
@@ -34,15 +35,15 @@ namespace cudautils {
const cudautils::host::unique_ptr<T[]>& src,
size_t nelements,
cudaStream_t stream) {
cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(cudautils::host::unique_ptr<T[]>& dst,
const cudautils::device::unique_ptr<T[]>& src,
size_t nelements,
cudaStream_t stream) {
cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream));
}
} // namespace cudautils

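For context, a usage sketch of these copyAsync helpers (buffer names and the element count are illustrative): the overload set encodes the transfer direction in the unique_ptr types, so callers pick host-to-device or device-to-host by the argument order and never pass a cudaMemcpyKind themselves.

    // Illustrative use of cudautils::copyAsync; names and sizes are made up.
    #include <cuda_runtime.h>
    #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

    void roundTrip(cudaStream_t stream) {
      constexpr size_t n = 256;
      auto h_buf = cudautils::make_host_unique<float[]>(n, stream);
      auto d_buf = cudautils::make_device_unique<float[]>(n, stream);

      // host -> device overload, selected by the argument types
      cudautils::copyAsync(d_buf, h_buf, n, stream);
      // device -> host overload
      cudautils::copyAsync(h_buf, d_buf, n, stream);
    }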
