Merge pull request #1711 from hdelan/minimize-vector-allocations
[HIP][CUDA] Several changes to kernel launch
kbenzie committed Jun 12, 2024
2 parents 8788bd1 + 61b42a3 commit b13c5e1
Showing 16 changed files with 532 additions and 514 deletions.
211 changes: 49 additions & 162 deletions source/adapters/cuda/enqueue.cpp

Large diffs are not rendered by default.

73 changes: 41 additions & 32 deletions source/adapters/cuda/memory.cpp
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
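For illustration, the same enqueue-then-synchronize pattern used in the hunk above, written as a standalone CUDA driver API program. This is not adapter code; the device index, size, and names are made up for the sketch.

  #include <cstdio>
  #include <cstdlib>
  #include <cuda.h>
  #include <vector>

  // Abort-on-error helper for this sketch only.
  static void check(CUresult Res, const char *Msg) {
    if (Res != CUDA_SUCCESS) {
      std::fprintf(stderr, "%s failed with CUresult %d\n", Msg, static_cast<int>(Res));
      std::exit(1);
    }
  }

  int main() {
    check(cuInit(0), "cuInit");
    CUdevice Dev;
    check(cuDeviceGet(&Dev, 0), "cuDeviceGet");
    CUcontext Ctx;
    check(cuCtxCreate(&Ctx, 0, Dev), "cuCtxCreate");

    const size_t Size = 1024;
    std::vector<char> Host(Size, 42);
    CUdeviceptr DevPtr;
    check(cuMemAlloc(&DevPtr, Size), "cuMemAlloc");

    CUstream Stream{0}; // default stream, as in the hunk above
    check(cuMemcpyHtoDAsync(DevPtr, Host.data(), Size, Stream), "cuMemcpyHtoDAsync");
    // Block until the copy completes, so the overall effect is synchronous
    // even though an *Async entry point was used.
    check(cuStreamSynchronize(Stream), "cuStreamSynchronize");

    check(cuMemFree(DevPtr), "cuMemFree");
    check(cuCtxDestroy(Ctx), "cuCtxDestroy");
    return 0;
  }
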
@@ -496,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
-        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size,
+        Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -547,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
     CpyDesc3D.Depth = Image.ImageDesc.depth;
   }
 
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed
+      UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
      UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
-          Image.getArray(
-              Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()),
           0 /*srcOffset*/, ImageSizeBytes));
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
-      CpyDesc2D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc2D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
-      CpyDesc3D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc3D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
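The 2D and 3D branches above drive the copies through CUDA_MEMCPY2D/CUDA_MEMCPY3D descriptors on a stream. As a reference, a minimal standalone sketch of an array-to-array 2D copy with the same driver API mechanism; this helper is not part of the adapter and its parameter names are made up.

  #include <cstring>
  #include <cuda.h>

  // Copies Height rows of WidthInBytes bytes between two CUDA arrays of the
  // same format and extent, ordered on Stream.
  CUresult copyArrayToArray2DAsync(CUarray Dst, CUarray Src,
                                   size_t WidthInBytes, size_t Height,
                                   CUstream Stream) {
    CUDA_MEMCPY2D Desc;
    std::memset(&Desc, 0, sizeof(Desc)); // zero offsets and pitches
    Desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
    Desc.srcArray = Src;
    Desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    Desc.dstArray = Dst;
    Desc.WidthInBytes = WidthInBytes;
    Desc.Height = Height;
    // Asynchronous with respect to the host; ordered within Stream.
    return cuMemcpy2DAsync(&Desc, Stream);
  }
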
@@ -589,8 +598,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
@@ -601,9 +610,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
   Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
85 changes: 16 additions & 69 deletions source/adapters/cuda/memory.hpp
@@ -20,6 +20,12 @@
 #include "device.hpp"
 #include "event.hpp"
 
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
+
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
 
@@ -288,7 +294,7 @@ struct SurfaceMem {
 ///
 /// The ur_mem_handle_t is responsible for memory allocation and migration
 /// across devices in the same ur_context_handle_t. If a kernel writes to a
-/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all
+/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all
 /// subsequent operations that want to read from the ur_mem_handle_t must wait
 /// on the event referring to the last write.
 ///
@@ -308,61 +314,7 @@ struct SurfaceMem {
 ///
 /// Migrations will occur in both cases if the most recent version of data
 /// is on a different device, marked by
-/// LastEventWritingToMemObj->getQueue()->getDevice()
-///
-/// Example trace:
-/// ~~~~~~~~~~~~~~
-///
-/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
-///         -> OUT: hContext
-///
-/// =====> urMemBufferCreate(hContext,...);
-///         -> No native allocations made
-///         -> OUT: hBuffer
-///
-/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
-///         -> Allocation made on q0 ie device0
-///         -> New allocation initialized with host data.
-///
-/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
-///         -> ur_kernel_handle_t associated with a ur_program_handle_t,
-///            which is in turn unique to a device. So we can set the kernel
-///            arg with the ptr of the device specific allocation.
-///         -> hKernel0->getProgram()->getDevice() == device0
-///         -> allocateMemObjOnDeviceIfNeeded(device0);
-///         -> Native allocation already made on device0, continue.
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///         -> Suppose that hKernel0 writes to hBuffer.
-///         -> Call hBuffer->setLastEventWritingToMemObj with return event
-///            from this operation
-///         -> Enqueue native kernel launch
-///
-/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
-///         -> hKernel1->getProgram()->getDevice() == device1
-///         -> New allocation will be made on device1 when calling
-///            getPtr(device1)
-///         -> No native allocation on device1
-///         -> Make native allocation on device1
-///
-/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
-///         -> Suppose hKernel1 wants to read from hBuffer and not write.
-///         -> migrateMemoryToDeviceIfNeeded(device1);
-///         -> hBuffer->LastEventWritingToMemObj is not nullptr
-///         -> Check if memory has been migrated to device1 since the
-///            last write
-///         -> Hasn't been migrated
-///         -> Wait on LastEventWritingToMemObj.
-///         -> Migrate memory from device0's native allocation to
-///            device1's native allocation.
-///         -> Enqueue native kernel launch
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///         -> migrateMemoryToDeviceIfNeeded(device0);
-///         -> hBuffer->LastEventWritingToMemObj refers to an event
-///            from q0
-///         -> Migration not necessary
-///         -> Enqueue native kernel launch
+/// LastQueueWritingToMemObj->getDevice()
 ///
 struct ur_mem_handle_t_ {
   // Context where the memory object is accessible
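As a simplified illustration of the rule the doc comment above states, the cross-device migration decision in isolation. This helper is not adapter code; it only assumes the members shown in this struct.

  // A device-to-device migration is only needed when some queue has written
  // the mem object, that queue targets a different device, and nothing has
  // been migrated to the target device since that write.
  static bool needsCrossDeviceMigration(ur_mem_handle_t_ &Mem,
                                        ur_device_handle_t hDevice) {
    if (Mem.LastQueueWritingToMemObj == nullptr)
      return false; // no device-side writer yet; at most a host-data init copy
    if (Mem.LastQueueWritingToMemObj->getDevice() == hDevice)
      return false; // freshest copy already lives on hDevice
    auto Index = Mem.getContext()->getDeviceIndex(hDevice);
    return !Mem.HaveMigratedToDeviceSinceLastWrite[Index];
  }
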
Expand All @@ -381,15 +333,13 @@ struct ur_mem_handle_t_ {
// Has the memory been migrated to a device since the last write?
std::vector<bool> HaveMigratedToDeviceSinceLastWrite;

// We should wait on this event prior to migrating memory across allocations
// in this ur_mem_handle_t_
ur_event_handle_t LastEventWritingToMemObj{nullptr};
// Queue with most up to date data of ur_mem_handle_t_
ur_queue_handle_t LastQueueWritingToMemObj{nullptr};

// Enumerates all possible types of accesses.
enum access_mode_t { unknown, read_write, read_only, write_only };

ur_mutex MemoryAllocationMutex; // A mutex for allocations
ur_mutex MemoryMigrationMutex; // A mutex for memory transfers

/// A UR Memory object represents either plain memory allocations ("Buffers"
/// in OpenCL) or typed allocations ("Images" in OpenCL).
@@ -478,20 +428,17 @@ struct ur_mem_handle_t_ {
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
-  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
-    assert(NewEvent && "Invalid event!");
-    // This entry point should only ever be called when using multi device ctx
-    assert(Context->Devices.size() > 1);
-    urEventRetain(NewEvent);
-    if (LastEventWritingToMemObj != nullptr) {
-      urEventRelease(LastEventWritingToMemObj);
+  void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) {
+    urQueueRetain(WritingQueue);
+    if (LastQueueWritingToMemObj != nullptr) {
+      urQueueRelease(LastQueueWritingToMemObj);
     }
-    LastEventWritingToMemObj = NewEvent;
+    LastQueueWritingToMemObj = WritingQueue;
     for (const auto &Device : Context->getDevices()) {
       // This event is never an interop event so will always have an associated
       // queue
       HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
-          Device == NewEvent->getQueue()->getDevice();
+          Device == WritingQueue->getDevice();
     }
   }
 };
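The enqueue.cpp changes are not rendered above, so the calling side does not appear here. As an illustration only, a guess at the shape of a caller rather than the adapter's actual code, an enqueue path that makes a mem object usable on a queue's device and records a write would combine the pieces declared above roughly like this. The "queue.hpp" include and the helper name are assumptions for the sketch.

  #include "memory.hpp"
  #include "queue.hpp" // assumed to provide ur_queue_handle_t_::getDevice()

  // Illustrative helper, not part of the adapter.
  static ur_result_t prepareMemObjOnQueue(ur_mem_handle_t Mem,
                                          ur_queue_handle_t hQueue,
                                          CUstream Stream, bool WillWrite) {
    ur_device_handle_t Device = hQueue->getDevice();
    // Create the per-device allocation lazily, then enqueue any migration of
    // the most recent data onto the same stream the new work will use.
    UR_CHECK_ERROR(allocateMemObjOnDeviceIfNeeded(Mem, Device));
    UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(Mem, Device, Stream));
    if (WillWrite) {
      // Record which queue now holds the freshest copy; this retains hQueue
      // and releases the previously recorded writer.
      Mem->setLastQueueWritingToMemObj(hQueue);
    }
    return UR_RESULT_SUCCESS;
  }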