Skip to content

Commit

Permalink
[GraphBolt] TorchBasedFeature.read_async. [2] (#7547)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfbalin committed Jul 20, 2024
1 parent 8d770b6 commit 4290174
Showing 1 changed file with 99 additions and 5 deletions.
104 changes: 99 additions & 5 deletions python/dgl/graphbolt/impl/torch_based_feature_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import numpy as np
import torch

from ..base import index_select
from ..base import (
get_device_to_host_uva_stream,
get_host_to_device_uva_stream,
index_select,
)
from ..feature_store import Feature
from ..internal_utils import gb_warning, is_wsl
from .basic_feature_store import BasicFeatureStore
Expand Down Expand Up @@ -145,7 +149,60 @@ def read_async(self, ids: torch.Tensor):
... future = next(async_handle)
>>> result = future.wait() # result contains the read values.
"""
raise NotImplementedError
assert self._tensor.device.type == "cpu"
if ids.is_cuda and self.is_pinned():
current_stream = torch.cuda.current_stream()
host_to_device_stream = get_host_to_device_uva_stream()
host_to_device_stream.wait_stream(current_stream)
with torch.cuda.stream(host_to_device_stream):
ids.record_stream(torch.cuda.current_stream())
values = index_select(self._tensor, ids)
values.record_stream(current_stream)
values_copy_event = torch.cuda.Event()
values_copy_event.record()

class _Waiter:
@staticmethod
def wait():
"""Returns the stored value when invoked."""
values_copy_event.wait()
return values

yield _Waiter()
elif ids.is_cuda:
ids_device = ids.device
current_stream = torch.cuda.current_stream()
device_to_host_stream = get_device_to_host_uva_stream()
device_to_host_stream.wait_stream(current_stream)
with torch.cuda.stream(device_to_host_stream):
ids.record_stream(torch.cuda.current_stream())
ids = ids.to(self._tensor.device, non_blocking=True)
ids_copy_event = torch.cuda.Event()
ids_copy_event.record()

yield # first stage is done.

ids_copy_event.synchronize()
values = torch.ops.graphbolt.index_select_async(self._tensor, ids)
yield

host_to_device_stream = get_host_to_device_uva_stream()
with torch.cuda.stream(host_to_device_stream):
values_cuda = values.wait().to(ids_device, non_blocking=True)
values_cuda.record_stream(current_stream)
values_copy_event = torch.cuda.Event()
values_copy_event.record()

class _Waiter:
@staticmethod
def wait():
"""Returns the stored value when invoked."""
values_copy_event.wait()
return values_cuda

yield _Waiter()
else:
yield torch.ops.graphbolt.index_select_async(self._tensor, ids)

def read_async_num_stages(self, ids_device: torch.device):
"""The number of stages of the read_async operation. See read_async
Expand All @@ -159,7 +216,10 @@ def read_async_num_stages(self, ids_device: torch.device):
int
The number of stages of the read_async operation.
"""
raise NotImplementedError
if ids_device.type == "cuda":
return 1 if self.is_pinned() else 3
else:
return 1

def size(self):
"""Get the size of the feature.
Expand Down Expand Up @@ -367,7 +427,41 @@ def read_async(self, ids: torch.Tensor):
... future = next(async_handle)
>>> result = future.wait() # result contains the read values.
"""
raise NotImplementedError
assert torch.ops.graphbolt.detect_io_uring()
if ids.is_cuda:
ids_device = ids.device
current_stream = torch.cuda.current_stream()
device_to_host_stream = get_device_to_host_uva_stream()
device_to_host_stream.wait_stream(current_stream)
with torch.cuda.stream(device_to_host_stream):
ids.record_stream(torch.cuda.current_stream())
ids = ids.to(self._tensor.device, non_blocking=True)
ids_copy_event = torch.cuda.Event()
ids_copy_event.record()

yield # first stage is done.

ids_copy_event.synchronize()
values = self._ondisk_npy_array.index_select(ids)
yield

host_to_device_stream = get_host_to_device_uva_stream()
with torch.cuda.stream(host_to_device_stream):
values_cuda = values.wait().to(ids_device, non_blocking=True)
values_cuda.record_stream(current_stream)
values_copy_event = torch.cuda.Event()
values_copy_event.record()

class _Waiter:
@staticmethod
def wait():
"""Returns the stored value when invoked."""
values_copy_event.wait()
return values_cuda

yield _Waiter()
else:
yield self._ondisk_npy_array.index_select(ids)

def read_async_num_stages(self, ids_device: torch.device):
"""The number of stages of the read_async operation. See read_async
Expand All @@ -381,7 +475,7 @@ def read_async_num_stages(self, ids_device: torch.device):
int
The number of stages of the read_async operation.
"""
raise NotImplementedError
return 3 if ids_device.type == "cuda" else 1

def size(self):
"""Get the size of the feature.
Expand Down

0 comments on commit 4290174

Please sign in to comment.