Merge pull request #1711 from hdelan/minimize-vector-allocations
[HIP][CUDA] Several changes to kernel launch
kbenzie committed Jun 12, 2024
2 parents 8788bd1 + 61b42a3 commit b13c5e1
Showing 16 changed files with 532 additions and 514 deletions.
211 changes: 49 additions & 162 deletions source/adapters/cuda/enqueue.cpp

Large diffs are not rendered by default.

73 changes: 41 additions & 32 deletions source/adapters/cuda/memory.cpp
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
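For illustration, the same enqueue-then-synchronize pattern used in the hunk above, written as a standalone CUDA driver API program. This is not adapter code; the device index, size, and names are made up for the sketch.

  #include <cstdio>
  #include <cstdlib>
  #include <cuda.h>
  #include <vector>

  // Abort-on-error helper for this sketch only.
  static void check(CUresult Res, const char *Msg) {
    if (Res != CUDA_SUCCESS) {
      std::fprintf(stderr, "%s failed with CUresult %d\n", Msg, static_cast<int>(Res));
      std::exit(1);
    }
  }

  int main() {
    check(cuInit(0), "cuInit");
    CUdevice Dev;
    check(cuDeviceGet(&Dev, 0), "cuDeviceGet");
    CUcontext Ctx;
    check(cuCtxCreate(&Ctx, 0, Dev), "cuCtxCreate");

    const size_t Size = 1024;
    std::vector<char> Host(Size, 42);
    CUdeviceptr DevPtr;
    check(cuMemAlloc(&DevPtr, Size), "cuMemAlloc");

    CUstream Stream{0}; // default stream, as in the hunk above
    check(cuMemcpyHtoDAsync(DevPtr, Host.data(), Size, Stream), "cuMemcpyHtoDAsync");
    // Block until the copy completes, so the overall effect is synchronous
    // even though an *Async entry point was used.
    check(cuStreamSynchronize(Stream), "cuStreamSynchronize");

    check(cuMemFree(DevPtr), "cuMemFree");
    check(cuCtxDestroy(Ctx), "cuCtxDestroy");
    return 0;
  }
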
@@ -496,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
-        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size,
+        Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -547,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
     CpyDesc3D.Depth = Image.ImageDesc.depth;
   }
 
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed
+      UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
      UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
-          Image.getArray(
-              Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()),
           0 /*srcOffset*/, ImageSizeBytes));
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
-      CpyDesc2D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc2D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
-      CpyDesc3D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc3D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
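The 2D and 3D branches above drive the copies through CUDA_MEMCPY2D/CUDA_MEMCPY3D descriptors on a stream. As a reference, a minimal standalone sketch of an array-to-array 2D copy with the same driver API mechanism; this helper is not part of the adapter and its parameter names are made up.

  #include <cstring>
  #include <cuda.h>

  // Copies Height rows of WidthInBytes bytes between two CUDA arrays of the
  // same format and extent, ordered on Stream.
  CUresult copyArrayToArray2DAsync(CUarray Dst, CUarray Src,
                                   size_t WidthInBytes, size_t Height,
                                   CUstream Stream) {
    CUDA_MEMCPY2D Desc;
    std::memset(&Desc, 0, sizeof(Desc)); // zero offsets and pitches
    Desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
    Desc.srcArray = Src;
    Desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    Desc.dstArray = Dst;
    Desc.WidthInBytes = WidthInBytes;
    Desc.Height = Height;
    // Asynchronous with respect to the host; ordered within Stream.
    return cuMemcpy2DAsync(&Desc, Stream);
  }
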
@@ -589,8 +598,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
@@ -601,9 +610,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
   Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
85 changes: 16 additions & 69 deletions source/adapters/cuda/memory.hpp
@@ -20,6 +20,12 @@
 #include "device.hpp"
 #include "event.hpp"
 
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
+
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
 
@@ -288,7 +294,7 @@ struct SurfaceMem {
 ///
 /// The ur_mem_handle_t is responsible for memory allocation and migration
 /// across devices in the same ur_context_handle_t. If a kernel writes to a
-/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all
+/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all
 /// subsequent operations that want to read from the ur_mem_handle_t must wait
 /// on the event referring to the last write.
 ///
@@ -308,61 +314,7 @@ struct SurfaceMem {
 ///
 /// Migrations will occur in both cases if the most recent version of data
 /// is on a different device, marked by
-/// LastEventWritingToMemObj->getQueue()->getDevice()
-///
-/// Example trace:
-/// ~~~~~~~~~~~~~~
-///
-/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
-///         -> OUT: hContext
-///
-/// =====> urMemBufferCreate(hContext,...);
-///         -> No native allocations made
-///         -> OUT: hBuffer
-///
-/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
-///         -> Allocation made on q0 ie device0
-///         -> New allocation initialized with host data.
-///
-/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
-///         -> ur_kernel_handle_t associated with a ur_program_handle_t,
-///            which is in turn unique to a device. So we can set the kernel
-///            arg with the ptr of the device specific allocation.
-///         -> hKernel0->getProgram()->getDevice() == device0
-///         -> allocateMemObjOnDeviceIfNeeded(device0);
-///         -> Native allocation already made on device0, continue.
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///         -> Suppose that hKernel0 writes to hBuffer.
-///         -> Call hBuffer->setLastEventWritingToMemObj with return event
-///            from this operation
-///         -> Enqueue native kernel launch
-///
-/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
-///         -> hKernel1->getProgram()->getDevice() == device1
-///         -> New allocation will be made on device1 when calling
-///            getPtr(device1)
-///         -> No native allocation on device1
-///         -> Make native allocation on device1
-///
-/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
-///         -> Suppose hKernel1 wants to read from hBuffer and not write.
-///         -> migrateMemoryToDeviceIfNeeded(device1);
-///         -> hBuffer->LastEventWritingToMemObj is not nullptr
-///         -> Check if memory has been migrated to device1 since the
-///            last write
-///         -> Hasn't been migrated
-///         -> Wait on LastEventWritingToMemObj.
-///         -> Migrate memory from device0's native allocation to
-///            device1's native allocation.
-///         -> Enqueue native kernel launch
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///         -> migrateMemoryToDeviceIfNeeded(device0);
-///         -> hBuffer->LastEventWritingToMemObj refers to an event
-///            from q0
-///         -> Migration not necessary
-///         -> Enqueue native kernel launch
+/// LastQueueWritingToMemObj->getDevice()
 ///
 struct ur_mem_handle_t_ {
   // Context where the memory object is accessible
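As a simplified illustration of the rule the doc comment above states, the cross-device migration decision in isolation. This helper is not adapter code; it only assumes the members shown in this struct.

  // A device-to-device migration is only needed when some queue has written
  // the mem object, that queue targets a different device, and nothing has
  // been migrated to the target device since that write.
  static bool needsCrossDeviceMigration(ur_mem_handle_t_ &Mem,
                                        ur_device_handle_t hDevice) {
    if (Mem.LastQueueWritingToMemObj == nullptr)
      return false; // no device-side writer yet; at most a host-data init copy
    if (Mem.LastQueueWritingToMemObj->getDevice() == hDevice)
      return false; // freshest copy already lives on hDevice
    auto Index = Mem.getContext()->getDeviceIndex(hDevice);
    return !Mem.HaveMigratedToDeviceSinceLastWrite[Index];
  }
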
Expand All @@ -381,15 +333,13 @@ struct ur_mem_handle_t_ {
// Has the memory been migrated to a device since the last write?
std::vector<bool> HaveMigratedToDeviceSinceLastWrite;

// We should wait on this event prior to migrating memory across allocations
// in this ur_mem_handle_t_
ur_event_handle_t LastEventWritingToMemObj{nullptr};
// Queue with most up to date data of ur_mem_handle_t_
ur_queue_handle_t LastQueueWritingToMemObj{nullptr};

// Enumerates all possible types of accesses.
enum access_mode_t { unknown, read_write, read_only, write_only };

ur_mutex MemoryAllocationMutex; // A mutex for allocations
ur_mutex MemoryMigrationMutex; // A mutex for memory transfers

/// A UR Memory object represents either plain memory allocations ("Buffers"
/// in OpenCL) or typed allocations ("Images" in OpenCL).
@@ -478,20 +428,17 @@ struct ur_mem_handle_t_ {
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
-  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
-    assert(NewEvent && "Invalid event!");
-    // This entry point should only ever be called when using multi device ctx
-    assert(Context->Devices.size() > 1);
-    urEventRetain(NewEvent);
-    if (LastEventWritingToMemObj != nullptr) {
-      urEventRelease(LastEventWritingToMemObj);
+  void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) {
+    urQueueRetain(WritingQueue);
+    if (LastQueueWritingToMemObj != nullptr) {
+      urQueueRelease(LastQueueWritingToMemObj);
     }
-    LastEventWritingToMemObj = NewEvent;
+    LastQueueWritingToMemObj = WritingQueue;
     for (const auto &Device : Context->getDevices()) {
       // This event is never an interop event so will always have an associated
       // queue
       HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
-          Device == NewEvent->getQueue()->getDevice();
+          Device == WritingQueue->getDevice();
     }
   }
 };
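The enqueue.cpp changes are not rendered above, so the calling side does not appear here. As an illustration only, a guess at the shape of a caller rather than the adapter's actual code, an enqueue path that makes a mem object usable on a queue's device and records a write would combine the pieces declared above roughly like this. The "queue.hpp" include and the helper name are assumptions for the sketch.

  #include "memory.hpp"
  #include "queue.hpp" // assumed to provide ur_queue_handle_t_::getDevice()

  // Illustrative helper, not part of the adapter.
  static ur_result_t prepareMemObjOnQueue(ur_mem_handle_t Mem,
                                          ur_queue_handle_t hQueue,
                                          CUstream Stream, bool WillWrite) {
    ur_device_handle_t Device = hQueue->getDevice();
    // Create the per-device allocation lazily, then enqueue any migration of
    // the most recent data onto the same stream the new work will use.
    UR_CHECK_ERROR(allocateMemObjOnDeviceIfNeeded(Mem, Device));
    UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(Mem, Device, Stream));
    if (WillWrite) {
      // Record which queue now holds the freshest copy; this retains hQueue
      // and releases the previously recorded writer.
      Mem->setLastQueueWritingToMemObj(hQueue);
    }
    return UR_RESULT_SUCCESS;
  }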