diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index ac839510f2..906fd49d1d 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -414,37 +414,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -477,15 +446,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -496,20 +468,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], @@ -523,7 +481,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ 
-603,37 +560,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } } - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -666,15 +592,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -685,20 +614,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); CUlaunchConfig launch_config; @@ -725,7 +640,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -820,28 +734,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ur_event_handle_t *phEvent) { std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note 
that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -886,6 +791,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -916,7 +822,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -1054,20 +959,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t *phEvent) { UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -1093,8 +998,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( break; } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -1103,13 +1008,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...)
{ return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { @@ -1210,28 +1114,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the Image, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1834,28 +1729,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1897,17 +1784,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t Result = UR_RESULT_SUCCESS; CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + 
UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -1930,9 +1817,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 15039d6f70..9ea62a2c1b 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -12,6 +12,7 @@ #include "common.hpp" #include "context.hpp" +#include "enqueue.hpp" #include "memory.hpp" /// Creates a UR Memory object using a CUDA memory allocation. @@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( try { if (PerformInitialCopy) { for (const auto &Device : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device)); + // Synchronous behaviour is best in this case + ScopedContext Active(Device); + CUstream Stream{0}; // Use default stream + UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), + Device, Stream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } } @@ -496,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(cuMemcpyDtoD( + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(cuMemcpyDtoDAsync( Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -547,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.HostPtr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr, + ImageSizeBytes, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == 
UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc3D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from DtoD going through the host. UR_CHECK_ERROR(cuMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -589,8 +598,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Device allocation has already been initialized with most up to date // data in buffer @@ -601,9 +610,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, ScopedContext Active(hDevice); if (Mem->isBuffer()) { - UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex( diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index a084a55c07..a67e9295cc 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -20,6 +20,12 @@ #include "device.hpp" #include "event.hpp" +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + CUstream); + // Handler for plain, pointer-based CUDA allocations struct BufferMem { @@ -288,7 +294,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. 
Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -308,61 +314,7 @@ struct SurfaceMem { /// /// Migrations will occur in both cases if the most recent version of data /// is on a different device, marked by -/// LastEventWritingToMemObj->getQueue()->getDevice() -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. -/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// LastQueueWritingToMemObj->getDevice() /// struct ur_mem_handle_t_ { // Context where the memory object is accessible @@ -381,15 +333,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). 
@@ -478,20 +428,17 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - urEventRetain(NewEvent); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + urQueueRetain(WritingQueue); + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - LastEventWritingToMemObj = NewEvent; + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { // This event is never an interop event so will always have an associated // queue HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = - Device == NewEvent->getQueue()->getDevice(); + Device == WritingQueue->getDevice(); } } }; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index aa3af35040..9092964a75 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -160,8 +160,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -193,9 +193,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( @@ -210,29 +210,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); // Use the default stream if copying from another device UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -280,44 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, 
UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size range kernel if (*pGlobalWorkSize == 0) { - if (DepEvents.size()) { - return urEnqueueEventsWaitWithBarrier(hQueue, DepEvents.size(), - phEventWaitList, phEvent); - } - return UR_RESULT_SUCCESS; + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); } // Set the number of threads per block to the number of threads per warp @@ -325,7 +280,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { @@ -343,20 +297,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + HIPStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } auto ArgIndices = hKernel->getArgIndices(); + // If migration of mem across buffer is needed, an event must be associated + // with this command, implicitly if phEvent is nullptr if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -364,20 +323,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can 
release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], @@ -390,9 +335,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - Result = err; + return err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( @@ -578,30 +523,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( std::unique_ptr RetImplEvent{nullptr}; - ur_result_t Result = UR_RESULT_SUCCESS; - ur_lock MemoryMigrationLock(hBuffer->MemoryMigrationMutex); - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); @@ -632,9 +565,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -644,27 +577,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; void *DevPtr = std::get(hBuffer->Mem).getVoid(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, HIPStream)); UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( HIPStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, hostSlicePitch, &DevPtr, hipMemoryTypeDevice, bufferOrigin, - bufferRowPitch, bufferSlicePitch); + bufferRowPitch, bufferSlicePitch)); if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record()); @@ -677,11 +610,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -803,8 +735,8 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, // HIP has no memset functions that allow setting values more than 4 bytes. UR // API lets you pass an arbitrary "pattern" to the buffer fill, which can be // more than 4 bytes. We must break up the pattern into 1 byte values, and set -// the buffer using multiple strided calls. The first 4 patterns are set using -// hipMemsetD32Async then all subsequent 1 byte patterns are set using +// the buffer using multiple strided calls. The first 4 patterns are set +// using hipMemsetD32Async then all subsequent 1 byte patterns are set using // hipMemset2DAsync which is called for each pattern. ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, @@ -823,8 +755,8 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr)); // The hostPointer attribute is non-null also for shared memory allocations. - // To make sure that this workaround only executes for host pinned memory, we - // need to check that isManaged attribute is false. + // To make sure that this workaround only executes for host pinned memory, + // we need to check that isManaged attribute is false. if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) { const auto NumOfCopySteps = Size / PatternSize; const auto Offset = sizeof(uint32_t); @@ -874,21 +806,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( std::ignore = PatternSizeIsValid; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = UR_RESULT_SUCCESS; if (phEventWaitList) { - Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); } if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -915,8 +847,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -925,13 +857,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } /// General ND memory copy operation for images (where N > 1).
@@ -1015,28 +946,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the - // last queue to write to the MemBuffer, meaning we must perform the copy + // last queue to write to the MemImage, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -1299,7 +1220,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( if (!IsPinned && (Map->getMapFlags() & (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. + // Pinned host memory is only on host so it doesn't need to be written + // to. UR_CHECK_ERROR(urEnqueueMemBufferWrite( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent)); @@ -1475,10 +1397,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( hipPointerAttribute_t attribs; // TODO: hipPointerGetAttributes will fail if pMem is non-HIP allocated - // memory, as it is neither registered as host memory, nor into the address - // space for the current device, meaning the pMem ptr points to a - // system-allocated memory. This means we may need to check system-alloacted - // memory and handle the failure more gracefully. + // memory, as it is neither registered as host memory, nor into the + // address space for the current device, meaning the pMem ptr points to a + // system-allocated memory. This means we may need to check + // system-alloacted memory and handle the failure more gracefully. UR_CHECK_ERROR(hipPointerGetAttributes(&attribs, pMem)); // async prefetch requires USM pointer (or hip SVM) to work. if (!attribs.isManaged) { @@ -1507,8 +1429,9 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_device_handle_t Device = hQueue->getDevice(); #if HIP_VERSION_MAJOR >= 5 - // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while this - // is feature complete, it is still open to changes and outstanding issues. + // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while + // this is feature complete, it is still open to changes and outstanding + // issues. 
size_t PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute( &PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, @@ -1548,9 +1471,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, } // Passing MEM_ADVICE_SET/MEM_ADVICE_CLEAR_PREFERRED_LOCATION to - // hipMemAdvise on a GPU device requires the GPU device to report a non-zero - // value for hipDeviceAttributeConcurrentManagedAccess. Therefore, ignore - // the mem advice if concurrent managed memory access is not available. + // hipMemAdvise on a GPU device requires the GPU device to report a + // non-zero value for hipDeviceAttributeConcurrentManagedAccess. + // Therefore, ignore the mem advice if concurrent managed memory access is + // not available. if (advice & (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE | @@ -1585,9 +1509,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, #endif } else { Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID); - // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but - // currently unmapped advice arguments as not supported by this platform. - // Therefore, warn the user instead of throwing and aborting the runtime. + // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid + // but currently unmapped advice arguments as not supported by this + // platform. Therefore, warn the user instead of throwing and aborting + // the runtime. if (Result == UR_RESULT_ERROR_INVALID_ENUMERATION) { releaseEvent(); setErrorMessage("mem_advise is ignored as the advice argument is not " @@ -1648,15 +1573,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( UR_CHECK_ERROR(RetImplEvent->start()); } - // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, which - // makes the HIP runtime not correctly derive the copy kind (direction) for - // the copies since ROCm 5.6.0+. See: https://github.com/ROCm/clr/issues/40 + // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, + // which makes the HIP runtime not correctly derive the copy kind + // (direction) for the copies since ROCm 5.6.0+. See: + // https://github.com/ROCm/clr/issues/40 // TODO: Add maximum HIP_VERSION when bug has been fixed. #if HIP_VERSION >= 50600000 hipPointerAttribute_t srcAttribs{}; hipPointerAttribute_t dstAttribs{}; - // Determine if pSrc and/or pDst are system allocated pageable host memory. + // Determine if pSrc and/or pDst are system allocated pageable host + // memory. bool srcIsSystemAlloc{false}; bool dstIsSystemAlloc{false}; @@ -1851,9 +1778,9 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim], UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. + // Checks that local work sizes are a divisor of the global work + // sizes which includes that the local work sizes are neither larger + // than the global work sizes and not 0. 
UR_ASSERT(LocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT((GlobalWorkSize[dim] % LocalWorkSize[dim]) == 0, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 026b840f5b..eb91f1620a 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -10,6 +10,7 @@ #include "memory.hpp" #include "context.hpp" +#include "enqueue.hpp" #include #include @@ -390,7 +391,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( if (PerformInitialCopy) { for (const auto &Dev : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev)); + ScopedContext Active(Dev); + hipStream_t Stream{0}; // Use default stream + UR_CHECK_ERROR( + enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev, Stream)); + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); } } *phMem = URMemObj.release(); @@ -541,27 +546,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(hipMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(hipMemcpyDtoD( + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(hipMemcpyDtoDAsync( Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -592,36 +598,40 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcHost = Image.HostPtr; + CpyDesc3D.srcMemoryType = hipMemoryTypeHost; UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from 
DtoD going through the host. UR_CHECK_ERROR(hipMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -630,8 +640,8 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, hipStream_t Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); // Device allocation has already been initialized with most up to date @@ -641,9 +651,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, ScopedContext Active(hDevice); if (Mem->isBuffer()) { - UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx] = true; diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index cd8b8f1330..425c2e7f53 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -17,6 +17,12 @@ #include #include +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + hipStream_t); + // Handler for plain, pointer-based HIP allocations struct BufferMem { struct BufferMap { @@ -280,7 +286,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -299,61 +305,7 @@ struct SurfaceMem { /// 2. urEnqueueMem(Buffer|Image)Read(Rect) /// /// Migrations will occur in both cases if the most recent version of data -/// is on a different device, marked by LastEventWritingToMemObj->getDevice(). -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) 
// associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. -/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// is on a different device, marked by LastQueueWritingToMemObj->getDevice(). /// struct ur_mem_handle_t_ { @@ -377,15 +329,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). 
@@ -474,18 +424,15 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - urEventRetain(NewEvent); - LastEventWritingToMemObj = NewEvent; + urQueueRetain(WritingQueue); + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = - Device == NewEvent->getQueue()->getDevice(); + Device == WritingQueue->getDevice(); } } }; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 79cefdd06f..edb67e1e0d 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -5,6 +5,10 @@ set(UR_CONFORMANCE_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(UR_CONFORMANCE_DEVICE_BINARIES_DIR + "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL + "Internal cache variable for device binaries directory") + function(add_test_adapter name adapter) set(TEST_TARGET_NAME test-${name}) set(TEST_NAME ${name}-${adapter}) diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 1419604b9d..24c437e853 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -135,6 +135,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) diff --git a/test/conformance/device_code/inc.cpp b/test/conformance/device_code/inc.cpp new file mode 100644 index 0000000000..d41c07366d --- /dev/null +++ b/test/conformance/device_code/inc.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <sycl/sycl.hpp> + +class inc; + +int main() { + uint32_t *ptr; + sycl::buffer<uint32_t> buf{ptr, 1}; + sycl::queue{}.submit([&](sycl::handler &cgh) { + sycl::accessor acc{buf, cgh}; + auto kernel = [acc](sycl::item<1> it) { acc[it]++; }; + cgh.parallel_for<inc>(sycl::range<1>{1}, kernel); + }); +} diff --git a/test/conformance/memory/CMakeLists.txt b/test/conformance/memory/CMakeLists.txt index 041f73a079..1ac78fa26a 100644 --- a/test/conformance/memory/CMakeLists.txt +++ b/test/conformance/memory/CMakeLists.txt @@ -15,3 +15,8 @@ add_conformance_test_with_devices_environment(memory urMemImageGetInfo.cpp urMemRelease.cpp urMemRetain.cpp) + +if (UR_DPCXX) + add_conformance_test_with_kernels_environment(memory-migrate + urMemBufferMigrateAcrossDevices.cpp) +endif() diff --git a/test/conformance/memory/memory-migrate_adapter_cuda.match b/test/conformance/memory/memory-migrate_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_hip.match b/test/conformance/memory/memory-migrate_adapter_hip.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_level_zero.match b/test/conformance/memory/memory-migrate_adapter_level_zero.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_native_cpu.match b/test/conformance/memory/memory-migrate_adapter_native_cpu.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/memory-migrate_adapter_opencl.match b/test/conformance/memory/memory-migrate_adapter_opencl.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp new file mode 100644 index 0000000000..2e8856ac97 --- /dev/null +++ b/test/conformance/memory/urMemBufferMigrateAcrossDevices.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Some tests to ensure implicit memory migration of buffers across devices +// in the same context.
+ +#include "uur/fixtures.h" + +using T = uint32_t; + +struct urMultiDeviceContextTest : uur::urPlatformTest { + void SetUp() { + uur::urPlatformTest::SetUp(); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, + &num_devices)); + ASSERT_NE(num_devices, 0); + if (num_devices == 1) { + return; + } + + devices = std::vector<ur_device_handle_t>(num_devices); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, num_devices, + devices.data(), nullptr)); + ASSERT_SUCCESS( + urContextCreate(num_devices, devices.data(), nullptr, &context)); + + queues = std::vector<ur_queue_handle_t>(num_devices); + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS( + urQueueCreate(context, devices[i], nullptr, &queues[i])); + } + } + + void TearDown() { + uur::urPlatformTest::TearDown(); + if (num_devices == 1) { + return; + } + for (auto i = 0u; i < num_devices; ++i) { + urDeviceRelease(devices[i]); + urQueueRelease(queues[i]); + } + urContextRelease(context); + } + + uint32_t num_devices = 0; + ur_context_handle_t context; + std::vector<ur_device_handle_t> devices; + std::vector<ur_queue_handle_t> queues; +}; + +struct urMultiDeviceContextMemBufferTest : urMultiDeviceContextTest { + void SetUp() { + urMultiDeviceContextTest::SetUp(); + if (num_devices == 1) { + return; + } + ASSERT_SUCCESS(urMemBufferCreate(context, 0 /*flags=*/, + buffer_size_bytes, + nullptr /*pProperties=*/, &buffer)); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource(program_name, + il_binary)); + + programs = std::vector<ur_program_handle_t>(num_devices); + kernels = std::vector<ur_kernel_handle_t>(num_devices); + + const ur_program_properties_t properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast<uint32_t>(metadatas.size()), + metadatas.empty() ? nullptr : metadatas.data()}; + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + platform, context, devices[i], *il_binary, &properties, + &programs[i])); + ASSERT_SUCCESS(urProgramBuild(context, programs[i], nullptr)); + auto kernel_names = + uur::KernelsEnvironment::instance->GetEntryPointNames( + program_name); + kernel_name = kernel_names[0]; + ASSERT_FALSE(kernel_name.empty()); + ASSERT_SUCCESS( + urKernelCreate(programs[i], kernel_name.data(), &kernels[i])); + } + } + + // Adds a kernel arg representing a sycl buffer constructed with a 1D range. + void AddBuffer1DArg(ur_kernel_handle_t kernel, size_t current_arg_index, + ur_mem_handle_t buffer) { + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, current_arg_index, nullptr, buffer)); + + // SYCL device kernels have different interfaces depending on the + // backend being used. Typically a kernel which takes a buffer argument + // will take a pointer to the start of the buffer and a sycl::id param + // which is a struct that encodes the accessor to the buffer. However + // the AMD backend handles this differently and uses three separate + // arguments for each of the three dimensions of the accessor. + + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + // this emulates the three offset params for buffer accessor on AMD.
+ size_t val = 0; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, + sizeof(size_t), nullptr, &val)); + current_arg_index += 4; + } else { + // This emulates the offset struct sycl adds for a 1D buffer accessor. + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(accessor), nullptr, + &accessor)); + current_arg_index += 2; + } + } + + void TearDown() { + if (num_devices > 1) { + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(urKernelRelease(kernels[i])); + ASSERT_SUCCESS(urProgramRelease(programs[i])); + } + urMemRelease(buffer); + } + urMultiDeviceContextTest::TearDown(); + } + + size_t buffer_size = 4096; + size_t buffer_size_bytes = 4096 * sizeof(T); + ur_mem_handle_t buffer; + + // Program stuff so we can launch kernels + std::shared_ptr<std::vector<char>> il_binary; + std::string program_name = "inc"; + std::string kernel_name; + std::vector<ur_program_handle_t> programs; + std::vector<ur_kernel_handle_t> kernels; + std::vector<ur_program_metadata_t> metadatas{}; +}; + +TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size, 0); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fill_val, + sizeof(fill_val), 0, + buffer_size_bytes, 0, nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + + // Kernel to run on queues[1] + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] = {0, 0, 0}; + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e2, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 1); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + + AddBuffer1DArg(kernels[0], 0, buffer); + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2, e3; + +
ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] = {0, 0, 0}; + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + // Kernel increments the fill val by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernels[0], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e2, + &e3)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e3, nullptr)); + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 2); + } +}
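
A note on the design the patch adopts: the ur_mem_handle_t_ changes above replace per-event tracking with a record of the last queue that wrote the buffer plus a per-device "up to date" flag vector. The self-contained model below is only an illustrative sketch of that bookkeeping; QueueModel, MemObjModel, and their members are hypothetical names, not the adapter's actual types or API.

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of the coherence tracking used by the patch: a memory
// object remembers which queue wrote it last and which device allocations
// currently hold the latest data.
struct QueueModel {
  std::size_t DeviceIndex; // index of the device this queue targets
};

struct MemObjModel {
  std::vector<bool> UpToDateOnDevice;           // ~ HaveMigratedToDeviceSinceLastWrite
  const QueueModel *LastWritingQueue = nullptr; // ~ LastQueueWritingToMemObj

  explicit MemObjModel(std::size_t NumDevices)
      : UpToDateOnDevice(NumDevices, false) {}

  // Record that a command submitted to Q writes the buffer: only Q's device
  // now holds the latest data.
  void recordWrite(const QueueModel &Q) {
    LastWritingQueue = &Q;
    for (std::size_t I = 0; I < UpToDateOnDevice.size(); ++I)
      UpToDateOnDevice[I] = (I == Q.DeviceIndex);
  }

  // Before a command on device DeviceIndex touches the buffer: must the data
  // first be copied over from the last writer's device?
  bool needsMigrationTo(std::size_t DeviceIndex) const {
    return LastWritingQueue != nullptr && !UpToDateOnDevice[DeviceIndex];
  }
};

int main() {
  MemObjModel Buf(2);               // one buffer, two devices in the same context
  QueueModel Q0{0}, Q1{1};
  Buf.recordWrite(Q0);              // e.g. urEnqueueMemBufferWrite on queues[0]
  assert(Buf.needsMigrationTo(1));  // a kernel on queues[1] must migrate first
  assert(!Buf.needsMigrationTo(0)); // queues[0]'s device already has the data
  Buf.recordWrite(Q1);              // the kernel on queues[1] writes the buffer
  assert(Buf.needsMigrationTo(0));  // reading back on queues[0] migrates again
  return 0;
}

The WriteRead, FillRead, WriteKernelRead, and WriteKernelKernelRead tests above exercise this pattern purely through UR entry points, relying on the adapter to enqueue the actual cross-device copy implicitly on the destination queue's stream.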