diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index ac839510f2..906fd49d1d 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -414,37 +414,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -477,15 +446,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -496,20 +468,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], @@ -523,7 +481,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ 
-603,37 +560,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } } - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -666,15 +592,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -685,20 +614,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); CUlaunchConfig launch_config; @@ -725,7 +640,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -820,28 +734,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ur_event_handle_t *phEvent) { std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note 
that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -886,6 +791,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -916,7 +822,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -1054,20 +959,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t *phEvent) { UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -1093,8 +998,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( break; } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -1103,13 +1008,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...)
{ return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { @@ -1210,28 +1114,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the Image, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1834,28 +1729,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1897,17 +1784,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t Result = UR_RESULT_SUCCESS; CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + 
UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -1930,9 +1817,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 15039d6f70..9ea62a2c1b 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -12,6 +12,7 @@ #include "common.hpp" #include "context.hpp" +#include "enqueue.hpp" #include "memory.hpp" /// Creates a UR Memory object using a CUDA memory allocation. @@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( try { if (PerformInitialCopy) { for (const auto &Device : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device)); + // Synchronous behaviour is best in this case + ScopedContext Active(Device); + CUstream Stream{0}; // Use default stream + UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), + Device, Stream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } } @@ -496,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(cuMemcpyDtoD( + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(cuMemcpyDtoDAsync( Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -547,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.HostPtr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr, + ImageSizeBytes, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == 
UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc3D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from DtoD going through the host. UR_CHECK_ERROR(cuMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -589,8 +598,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Device allocation has already been initialized with most up to date // data in buffer @@ -601,9 +610,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, ScopedContext Active(hDevice); if (Mem->isBuffer()) { - UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex( diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index a084a55c07..a67e9295cc 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -20,6 +20,12 @@ #include "device.hpp" #include "event.hpp" +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + CUstream); + // Handler for plain, pointer-based CUDA allocations struct BufferMem { @@ -288,7 +294,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. 
Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -308,61 +314,7 @@ struct SurfaceMem { /// /// Migrations will occur in both cases if the most recent version of data /// is on a different device, marked by -/// LastEventWritingToMemObj->getQueue()->getDevice() -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. -/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// LastQueueWritingToMemObj->getDevice() /// struct ur_mem_handle_t_ { // Context where the memory object is accessible @@ -381,15 +333,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). 
@@ -478,20 +428,17 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - urEventRetain(NewEvent); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + urQueueRetain(WritingQueue); + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - LastEventWritingToMemObj = NewEvent; + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { // This event is never an interop event so will always have an associated // queue HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = - Device == NewEvent->getQueue()->getDevice(); + Device == WritingQueue->getDevice(); } } }; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index aa3af35040..9092964a75 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -160,8 +160,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -193,9 +193,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( @@ -210,29 +210,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); // Use the default stream if copying from another device UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -280,44 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, 
UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size range kernel if (*pGlobalWorkSize == 0) { - if (DepEvents.size()) { - return urEnqueueEventsWaitWithBarrier(hQueue, DepEvents.size(), - phEventWaitList, phEvent); - } - return UR_RESULT_SUCCESS; + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); } // Set the number of threads per block to the number of threads per warp @@ -325,7 +280,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { @@ -343,20 +297,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + HIPStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } auto ArgIndices = hKernel->getArgIndices(); + // If migration of mem across buffer is needed, an event must be associated + // with this command, implicitly if phEvent is nullptr if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -364,20 +323,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can 
release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], @@ -390,9 +335,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - Result = err; + return err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( @@ -578,30 +523,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( std::unique_ptr RetImplEvent{nullptr}; - ur_result_t Result = UR_RESULT_SUCCESS; - ur_lock MemoryMigrationLock(hBuffer->MemoryMigrationMutex); - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); @@ -632,9 +565,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -644,27 +577,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; void *DevPtr = std::get(hBuffer->Mem).getVoid(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, HIPStream)); UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( HIPStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, hostSlicePitch, &DevPtr, hipMemoryTypeDevice, bufferOrigin, - bufferRowPitch, bufferSlicePitch); + bufferRowPitch, bufferSlicePitch)); if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record()); @@ -677,11 +610,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -803,8 +735,8 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, // HIP has no memset functions that allow setting values more than 4 bytes. UR // API lets you pass an arbitrary "pattern" to the buffer fill, which can be // more than 4 bytes. We must break up the pattern into 1 byte values, and set -// the buffer using multiple strided calls. The first 4 patterns are set using -// hipMemsetD32Async then all subsequent 1 byte patterns are set using +// the buffer using multiple strided calls. The first 4 patterns are set +// using hipMemsetD32Async then all subsequent 1 byte patterns are set using // hipMemset2DAsync which is called for each pattern. ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, @@ -823,8 +755,8 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr)); // The hostPointer attribute is non-null also for shared memory allocations. - // To make sure that this workaround only executes for host pinned memory, we - // need to check that isManaged attribute is false. + // To make sure that this workaround only executes for host pinned memory, + // we need to check that isManaged attribute is false. if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) { const auto NumOfCopySteps = Size / PatternSize; const auto Offset = sizeof(uint32_t); @@ -874,21 +806,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( std::ignore = PatternSizeIsValid; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = UR_RESULT_SUCCESS; if (phEventWaitList) { - Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); } if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -915,8 +847,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -925,13 +857,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } /// General ND memory copy operation for images (where N > 1).
@@ -1015,28 +946,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the - // last queue to write to the MemBuffer, meaning we must perform the copy + // last queue to write to the MemImage, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -1299,7 +1220,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( if (!IsPinned && (Map->getMapFlags() & (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. + // Pinned host memory is only on host so it doesn't need to be written + // to. UR_CHECK_ERROR(urEnqueueMemBufferWrite( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent)); @@ -1475,10 +1397,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( hipPointerAttribute_t attribs; // TODO: hipPointerGetAttributes will fail if pMem is non-HIP allocated - // memory, as it is neither registered as host memory, nor into the address - // space for the current device, meaning the pMem ptr points to a - // system-allocated memory. This means we may need to check system-alloacted - // memory and handle the failure more gracefully. + // memory, as it is neither registered as host memory, nor into the + // address space for the current device, meaning the pMem ptr points to a + // system-allocated memory. This means we may need to check + // system-alloacted memory and handle the failure more gracefully. UR_CHECK_ERROR(hipPointerGetAttributes(&attribs, pMem)); // async prefetch requires USM pointer (or hip SVM) to work. if (!attribs.isManaged) { @@ -1507,8 +1429,9 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_device_handle_t Device = hQueue->getDevice(); #if HIP_VERSION_MAJOR >= 5 - // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while this - // is feature complete, it is still open to changes and outstanding issues. + // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while + // this is feature complete, it is still open to changes and outstanding + // issues. 
size_t PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute( &PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, @@ -1548,9 +1471,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, } // Passing MEM_ADVICE_SET/MEM_ADVICE_CLEAR_PREFERRED_LOCATION to - // hipMemAdvise on a GPU device requires the GPU device to report a non-zero - // value for hipDeviceAttributeConcurrentManagedAccess. Therefore, ignore - // the mem advice if concurrent managed memory access is not available. + // hipMemAdvise on a GPU device requires the GPU device to report a + // non-zero value for hipDeviceAttributeConcurrentManagedAccess. + // Therefore, ignore the mem advice if concurrent managed memory access is + // not available. if (advice & (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE | @@ -1585,9 +1509,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, #endif } else { Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID); - // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but - // currently unmapped advice arguments as not supported by this platform. - // Therefore, warn the user instead of throwing and aborting the runtime. + // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid + // but currently unmapped advice arguments as not supported by this + // platform. Therefore, warn the user instead of throwing and aborting + // the runtime. if (Result == UR_RESULT_ERROR_INVALID_ENUMERATION) { releaseEvent(); setErrorMessage("mem_advise is ignored as the advice argument is not " @@ -1648,15 +1573,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( UR_CHECK_ERROR(RetImplEvent->start()); } - // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, which - // makes the HIP runtime not correctly derive the copy kind (direction) for - // the copies since ROCm 5.6.0+. See: https://github.com/ROCm/clr/issues/40 + // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, + // which makes the HIP runtime not correctly derive the copy kind + // (direction) for the copies since ROCm 5.6.0+. See: + // https://github.com/ROCm/clr/issues/40 // TODO: Add maximum HIP_VERSION when bug has been fixed. #if HIP_VERSION >= 50600000 hipPointerAttribute_t srcAttribs{}; hipPointerAttribute_t dstAttribs{}; - // Determine if pSrc and/or pDst are system allocated pageable host memory. + // Determine if pSrc and/or pDst are system allocated pageable host + // memory. bool srcIsSystemAlloc{false}; bool dstIsSystemAlloc{false}; @@ -1851,9 +1778,9 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim], UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. + // Checks that local work sizes are a divisor of the global work + // sizes which includes that the local work sizes are neither larger + // than the global work sizes and not 0. 
UR_ASSERT(LocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT((GlobalWorkSize[dim] % LocalWorkSize[dim]) == 0, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 026b840f5b..eb91f1620a 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -10,6 +10,7 @@ #include "memory.hpp" #include "context.hpp" +#include "enqueue.hpp" #include #include @@ -390,7 +391,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( if (PerformInitialCopy) { for (const auto &Dev : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev)); + ScopedContext Active(Dev); + hipStream_t Stream{0}; // Use default stream + UR_CHECK_ERROR( + enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev, Stream)); + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); } } *phMem = URMemObj.release(); @@ -541,27 +546,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(hipMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(hipMemcpyDtoD( + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(hipMemcpyDtoDAsync( Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -592,36 +598,40 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcHost = Image.HostPtr; + CpyDesc3D.srcMemoryType = hipMemoryTypeHost; UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from 
DtoD going through the host. UR_CHECK_ERROR(hipMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -630,8 +640,8 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, hipStream_t Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); // Device allocation has already been initialized with most up to date @@ -641,9 +651,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, ScopedContext Active(hDevice); if (Mem->isBuffer()) { - UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx] = true; diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index cd8b8f1330..425c2e7f53 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -17,6 +17,12 @@ #include #include +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + hipStream_t); + // Handler for plain, pointer-based HIP allocations struct BufferMem { struct BufferMap { @@ -280,7 +286,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -299,61 +305,7 @@ struct SurfaceMem { /// 2. urEnqueueMem(Buffer|Image)Read(Rect) /// /// Migrations will occur in both cases if the most recent version of data -/// is on a different device, marked by LastEventWritingToMemObj->getDevice(). -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) 
// associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. -/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// is on a different device, marked by LastQueueWritingToMemObj->getDevice(). /// struct ur_mem_handle_t_ { @@ -377,15 +329,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). 
@@ -474,18 +424,15 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - urEventRetain(NewEvent); - LastEventWritingToMemObj = NewEvent; + urQueueRetain(WritingQueue); + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = - Device == NewEvent->getQueue()->getDevice(); + Device == WritingQueue->getDevice(); } } }; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 79cefdd06f..edb67e1e0d 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -5,6 +5,10 @@ set(UR_CONFORMANCE_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(UR_CONFORMANCE_DEVICE_BINARIES_DIR + "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL + "Internal cache variable for device binaries directory") + function(add_test_adapter name adapter) set(TEST_TARGET_NAME test-${name}) set(TEST_NAME ${name}-${adapter}) diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 1419604b9d..24c437e853 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -135,6 +135,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) diff --git a/test/conformance/device_code/inc.cpp b/test/conformance/device_code/inc.cpp new file mode 100644 index 0000000000..d41c07366d --- /dev/null +++ b/test/conformance/device_code/inc.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <sycl/sycl.hpp> + +class inc; + +int main() { + uint32_t *ptr; + sycl::buffer<uint32_t> buf{ptr, 1}; + sycl::queue{}.submit([&](sycl::handler &cgh) { + sycl::accessor acc{buf, cgh}; + auto kernel = [acc](sycl::item<1> it) { acc[it]++; }; + cgh.parallel_for<inc>(sycl::range<1>{1}, kernel); + }); +} diff --git a/test/conformance/memory/CMakeLists.txt b/test/conformance/memory/CMakeLists.txt index 041f73a079..1ac78fa26a 100644 --- a/test/conformance/memory/CMakeLists.txt +++ b/test/conformance/memory/CMakeLists.txt @@ -15,3 +15,8 @@ add_conformance_test_with_devices_environment(memory urMemImageGetInfo.cpp urMemRelease.cpp urMemRetain.cpp) + +if (UR_DPCXX) + add_conformance_test_with_kernels_environment(memory-migrate + urMemBufferMigrateAcrossDevices.cpp) +endif() diff --git a/test/conformance/memory/memory-migrate_adapter_cuda.match b/test/conformance/memory/memory-migrate_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_hip.match b/test/conformance/memory/memory-migrate_adapter_hip.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_level_zero.match b/test/conformance/memory/memory-migrate_adapter_level_zero.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_native_cpu.match b/test/conformance/memory/memory-migrate_adapter_native_cpu.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_opencl.match b/test/conformance/memory/memory-migrate_adapter_opencl.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp new file mode 100644 index 0000000000..2e8856ac97 --- /dev/null +++ b/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Some tests to ensure implicit memory migration of buffers across devices +// in the same context.
+ +#include "uur/fixtures.h" + +using T = uint32_t; + +struct urMultiDeviceContextTest : uur::urPlatformTest { + void SetUp() { + uur::urPlatformTest::SetUp(); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, + &num_devices)); + ASSERT_NE(num_devices, 0); + if (num_devices == 1) { + return; + } + + devices = std::vector<ur_device_handle_t>(num_devices); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, num_devices, + devices.data(), nullptr)); + ASSERT_SUCCESS( + urContextCreate(num_devices, devices.data(), nullptr, &context)); + + queues = std::vector<ur_queue_handle_t>(num_devices); + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS( + urQueueCreate(context, devices[i], nullptr, &queues[i])); + } + } + + void TearDown() { + uur::urPlatformTest::TearDown(); + if (num_devices == 1) { + return; + } + for (auto i = 0u; i < num_devices; ++i) { + urDeviceRelease(devices[i]); + urQueueRelease(queues[i]); + } + urContextRelease(context); + } + + uint32_t num_devices = 0; + ur_context_handle_t context; + std::vector<ur_device_handle_t> devices; + std::vector<ur_queue_handle_t> queues; +}; + +struct urMultiDeviceContextMemBufferTest : urMultiDeviceContextTest { + void SetUp() { + urMultiDeviceContextTest::SetUp(); + if (num_devices == 1) { + return; + } + ASSERT_SUCCESS(urMemBufferCreate(context, 0 /*flags=*/, + buffer_size_bytes, + nullptr /*pProperties=*/, &buffer)); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource(program_name, + il_binary)); + + programs = std::vector<ur_program_handle_t>(num_devices); + kernels = std::vector<ur_kernel_handle_t>(num_devices); + + const ur_program_properties_t properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast<uint32_t>(metadatas.size()), + metadatas.empty() ? nullptr : metadatas.data()}; + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + platform, context, devices[i], *il_binary, &properties, + &programs[i])); + ASSERT_SUCCESS(urProgramBuild(context, programs[i], nullptr)); + auto kernel_names = + uur::KernelsEnvironment::instance->GetEntryPointNames( + program_name); + kernel_name = kernel_names[0]; + ASSERT_FALSE(kernel_name.empty()); + ASSERT_SUCCESS( + urKernelCreate(programs[i], kernel_name.data(), &kernels[i])); + } + } + + // Adds a kernel arg representing a sycl buffer constructed with a 1D range. + void AddBuffer1DArg(ur_kernel_handle_t kernel, size_t current_arg_index, + ur_mem_handle_t buffer) { + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, current_arg_index, nullptr, buffer)); + + // SYCL device kernels have different interfaces depending on the + // backend being used. Typically a kernel which takes a buffer argument + // will take a pointer to the start of the buffer and a sycl::id param + // which is a struct that encodes the accessor to the buffer. However + // the AMD backend handles this differently and uses three separate + // arguments for each of the three dimensions of the accessor. + + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + // this emulates the three offset params for buffer accessor on AMD.
+ size_t val = 0; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, + sizeof(size_t), nullptr, &val)); + current_arg_index += 4; + } else { + // This emulates the offset struct sycl adds for a 1D buffer accessor. + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(accessor), nullptr, + &accessor)); + current_arg_index += 2; + } + } + + void TearDown() { + if (num_devices > 1) { + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(urKernelRelease(kernels[i])); + ASSERT_SUCCESS(urProgramRelease(programs[i])); + } + urMemRelease(buffer); + } + urMultiDeviceContextTest::TearDown(); + } + + size_t buffer_size = 4096; + size_t buffer_size_bytes = 4096 * sizeof(T); + ur_mem_handle_t buffer; + + // Program stuff so we can launch kernels + std::shared_ptr<std::vector<char>> il_binary; + std::string program_name = "inc"; + std::string kernel_name; + std::vector<ur_program_handle_t> programs; + std::vector<ur_kernel_handle_t> kernels; + std::vector<ur_program_metadata_t> metadatas{}; +}; + +TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size, 0); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fill_val, + sizeof(fill_val), 0, + buffer_size_bytes, 0, nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + + // Kernel to run on queues[1] + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] = {0, 0, 0}; + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e2, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 1); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + + AddBuffer1DArg(kernels[0], 0, buffer); + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2, e3; + +
ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] = {0, 0, 0}; + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernels[0], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e2, + &e3)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e3, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 2); + } +}
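
A note on the design the patch adopts: the ur_mem_handle_t_ changes above replace per-event tracking with a record of the last queue that wrote the buffer plus a per-device "up to date" flag vector. The self-contained model below is only an illustrative sketch of that bookkeeping; QueueModel, MemObjModel, and their members are hypothetical names, not the adapter's actual types or API.

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of the coherence tracking used by the patch: a memory
// object remembers which queue wrote it last and which device allocations
// currently hold the latest data.
struct QueueModel {
  std::size_t DeviceIndex; // index of the device this queue targets
};

struct MemObjModel {
  std::vector<bool> UpToDateOnDevice;           // ~ HaveMigratedToDeviceSinceLastWrite
  const QueueModel *LastWritingQueue = nullptr; // ~ LastQueueWritingToMemObj

  explicit MemObjModel(std::size_t NumDevices)
      : UpToDateOnDevice(NumDevices, false) {}

  // Record that a command submitted to Q writes the buffer: only Q's device
  // now holds the latest data.
  void recordWrite(const QueueModel &Q) {
    LastWritingQueue = &Q;
    for (std::size_t I = 0; I < UpToDateOnDevice.size(); ++I)
      UpToDateOnDevice[I] = (I == Q.DeviceIndex);
  }

  // Before a command on device DeviceIndex touches the buffer: must the data
  // first be copied over from the last writer's device?
  bool needsMigrationTo(std::size_t DeviceIndex) const {
    return LastWritingQueue != nullptr && !UpToDateOnDevice[DeviceIndex];
  }
};

int main() {
  MemObjModel Buf(2);               // one buffer, two devices in the same context
  QueueModel Q0{0}, Q1{1};
  Buf.recordWrite(Q0);              // e.g. urEnqueueMemBufferWrite on queues[0]
  assert(Buf.needsMigrationTo(1));  // a kernel on queues[1] must migrate first
  assert(!Buf.needsMigrationTo(0)); // queues[0]'s device already has the data
  Buf.recordWrite(Q1);              // the kernel on queues[1] writes the buffer
  assert(Buf.needsMigrationTo(0));  // reading back on queues[0] migrates again
  return 0;
}

The WriteRead, FillRead, WriteKernelRead, and WriteKernelKernelRead tests above exercise this pattern purely through UR entry points, relying on the adapter to enqueue the actual cross-device copy implicitly on the destination queue's stream.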