[GraphBolt] TorchBasedFeature.read_async. [2] (#7547)

dmlc · Jul 20, 2024 · 4290174 · 4290174
1 parent 8d770b6
commit 4290174
Showing 1 changed file with 99 additions and 5 deletions.
diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py
@@ -7,7 +7,11 @@
 import numpy as np
 import torch
 
-from ..base import index_select
+from ..base import (
+    get_device_to_host_uva_stream,
+    get_host_to_device_uva_stream,
+    index_select,
+)
 from ..feature_store import Feature
 from ..internal_utils import gb_warning, is_wsl
 from .basic_feature_store import BasicFeatureStore
@@ -145,7 +149,60 @@ def read_async(self, ids: torch.Tensor):
         ...     future = next(async_handle)
         >>> result = future.wait()  # result contains the read values.
         """
-        raise NotImplementedError
+        assert self._tensor.device.type == "cpu"
+        if ids.is_cuda and self.is_pinned():
+            current_stream = torch.cuda.current_stream()
+            host_to_device_stream = get_host_to_device_uva_stream()
+            host_to_device_stream.wait_stream(current_stream)
+            with torch.cuda.stream(host_to_device_stream):
+                ids.record_stream(torch.cuda.current_stream())
+                values = index_select(self._tensor, ids)
+                values.record_stream(current_stream)
+                values_copy_event = torch.cuda.Event()
+                values_copy_event.record()
+
+            class _Waiter:
+                @staticmethod
+                def wait():
+                    """Returns the stored value when invoked."""
+                    values_copy_event.wait()
+                    return values
+
+            yield _Waiter()
+        elif ids.is_cuda:
+            ids_device = ids.device
+            current_stream = torch.cuda.current_stream()
+            device_to_host_stream = get_device_to_host_uva_stream()
+            device_to_host_stream.wait_stream(current_stream)
+            with torch.cuda.stream(device_to_host_stream):
+                ids.record_stream(torch.cuda.current_stream())
+                ids = ids.to(self._tensor.device, non_blocking=True)
+                ids_copy_event = torch.cuda.Event()
+                ids_copy_event.record()
+
+            yield  # first stage is done.
+
+            ids_copy_event.synchronize()
+            values = torch.ops.graphbolt.index_select_async(self._tensor, ids)
+            yield
+
+            host_to_device_stream = get_host_to_device_uva_stream()
+            with torch.cuda.stream(host_to_device_stream):
+                values_cuda = values.wait().to(ids_device, non_blocking=True)
+                values_cuda.record_stream(current_stream)
+                values_copy_event = torch.cuda.Event()
+                values_copy_event.record()
+
+            class _Waiter:
+                @staticmethod
+                def wait():
+                    """Returns the stored value when invoked."""
+                    values_copy_event.wait()
+                    return values_cuda
+
+            yield _Waiter()
+        else:
+            yield torch.ops.graphbolt.index_select_async(self._tensor, ids)
 
     def read_async_num_stages(self, ids_device: torch.device):
         """The number of stages of the read_async operation. See read_async
@@ -159,7 +216,10 @@ def read_async_num_stages(self, ids_device: torch.device):
         int
             The number of stages of the read_async operation.
         """
-        raise NotImplementedError
+        if ids_device.type == "cuda":
+            return 1 if self.is_pinned() else 3
+        else:
+            return 1
 
     def size(self):
         """Get the size of the feature.
@@ -367,7 +427,41 @@ def read_async(self, ids: torch.Tensor):
         ...     future = next(async_handle)
         >>> result = future.wait()  # result contains the read values.
         """
-        raise NotImplementedError
+        assert torch.ops.graphbolt.detect_io_uring()
+        if ids.is_cuda:
+            ids_device = ids.device
+            current_stream = torch.cuda.current_stream()
+            device_to_host_stream = get_device_to_host_uva_stream()
+            device_to_host_stream.wait_stream(current_stream)
+            with torch.cuda.stream(device_to_host_stream):
+                ids.record_stream(torch.cuda.current_stream())
+                ids = ids.to(self._tensor.device, non_blocking=True)
+                ids_copy_event = torch.cuda.Event()
+                ids_copy_event.record()
+
+            yield  # first stage is done.
+
+            ids_copy_event.synchronize()
+            values = self._ondisk_npy_array.index_select(ids)
+            yield
+
+            host_to_device_stream = get_host_to_device_uva_stream()
+            with torch.cuda.stream(host_to_device_stream):
+                values_cuda = values.wait().to(ids_device, non_blocking=True)
+                values_cuda.record_stream(current_stream)
+                values_copy_event = torch.cuda.Event()
+                values_copy_event.record()
+
+            class _Waiter:
+                @staticmethod
+                def wait():
+                    """Returns the stored value when invoked."""
+                    values_copy_event.wait()
+                    return values_cuda
+
+            yield _Waiter()
+        else:
+            yield self._ondisk_npy_array.index_select(ids)
 
     def read_async_num_stages(self, ids_device: torch.device):
         """The number of stages of the read_async operation. See read_async
@@ -381,7 +475,7 @@ def read_async_num_stages(self, ids_device: torch.device):
         int
             The number of stages of the read_async operation.
         """
-        raise NotImplementedError
+        return 3 if ids_device.type == "cuda" else 1
 
     def size(self):
         """Get the size of the feature.