[GraphBolt][CUDA] Pipelined sampling optimization #7039

Merged

72 commits
b4c045f
prototyping
mfbalin Jan 19, 2024
a26e354
fix bug
mfbalin Jan 20, 2024
58a1190
remove print expressions, works now
mfbalin Jan 20, 2024
eedb8d1
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 23, 2024
3d2ed98
add tests
mfbalin Jan 24, 2024
105924a
use seeds_timestamp in preprocess
mfbalin Jan 24, 2024
bfb28ec
add docstring for linting
mfbalin Jan 24, 2024
e4becc9
fix linting
mfbalin Jan 24, 2024
428ff24
fix argument bug
mfbalin Jan 24, 2024
85b0601
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 24, 2024
866316e
fix the bug
mfbalin Jan 24, 2024
e2793fd
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 24, 2024
2473722
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 27, 2024
2d1dda9
address reviews
mfbalin Jan 29, 2024
fad7c50
add docstring to the new `MinibatchTransformer`.
mfbalin Jan 29, 2024
a8fdfc6
address review properly.
mfbalin Jan 29, 2024
933246f
remove unused `Mapper` import for linting.
mfbalin Jan 29, 2024
cd68728
NeighborSampler2 now derives from `MinibatchTransformer`.
mfbalin Jan 30, 2024
c3a903d
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 30, 2024
dcbfb4e
Final refactoring of NeighborSampler.
mfbalin Jan 30, 2024
21fe633
Fix not only preprocess but also postprocess issue.
mfbalin Jan 30, 2024
29861f1
take back test changes.
mfbalin Jan 30, 2024
232f2f3
fix in_subgraph_sampler
mfbalin Jan 30, 2024
03bea25
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 30, 2024
86d9c43
add docstring for `append_sampling_step`.
mfbalin Jan 30, 2024
f995d20
Address reviews, minimize changes, keep API exactly the same.
mfbalin Jan 31, 2024
a64d34e
remove leftover changes.
mfbalin Jan 31, 2024
e46b8c7
minor change.
mfbalin Jan 31, 2024
8cc858c
Make the function into a proper one so that it can be pickled.
mfbalin Jan 31, 2024
02ca357
make the lambda into a proper function so that it can be pickled.
mfbalin Jan 31, 2024
19b4367
linting.
mfbalin Jan 31, 2024
67d6f71
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 31, 2024
144134c
final linting.
mfbalin Jan 31, 2024
96bac52
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Jan 31, 2024
718cab8
Cleanup NeighborSampler as it does not need to store anything itself.
mfbalin Jan 31, 2024
ee3a7d7
linting
mfbalin Jan 31, 2024
1d906e7
address reviews by not passing sampler as string argument.
mfbalin Feb 1, 2024
6ab2e75
Merge branch 'master' into gb_refactor_neighbor_sampler
mfbalin Feb 1, 2024
6f880c0
Talk about `sampling_stages` in the SubgraphSampler API.
mfbalin Feb 1, 2024
5d907ee
add more documentation for `sampling_stages`.
mfbalin Feb 1, 2024
e24f4d7
add pipelined sampling optimization datapipes
mfbalin Jan 27, 2024
57def7f
Fix and get the fetch and sample datapipes working.
mfbalin Jan 27, 2024
ebde732
more progress
mfbalin Jan 28, 2024
822abc9
get the implementation working
mfbalin Jan 28, 2024
15b9951
set back the previous default thread count.
mfbalin Jan 28, 2024
4b0bd60
share the thread used for fetching insubgraph.
mfbalin Jan 29, 2024
bca8172
rebase onto refactor neighbor sampler branch
mfbalin Jan 29, 2024
866a74e
add test for the new feature.
mfbalin Jan 29, 2024
bc9a46c
add tests for the graph sampling pipelining components.
mfbalin Jan 29, 2024
6718d83
add buffer_size parameter to pipelined sampling
mfbalin Jan 30, 2024
c5e811e
add docstring for linting.
mfbalin Jan 30, 2024
c99dd5a
remove unused import after rebase
mfbalin Jan 30, 2024
5a500e1
fix the test
mfbalin Jan 30, 2024
0875f3b
Use maximum BLOCK_SIZE for UVA kernels.
mfbalin Jan 30, 2024
18ad8e3
fix the test after rebase.
mfbalin Jan 30, 2024
4f4bae5
fix errors after rebase.
mfbalin Jan 31, 2024
a0a787c
Merge branch 'master' into gb_cuda_pipelined_sampling_optimization
mfbalin Feb 1, 2024
4b29657
linting after other PR merged.
mfbalin Feb 1, 2024
a370010
remove duplicate changes due to merging of the prev PR.
mfbalin Feb 1, 2024
4be1860
remove FetcherAndSampler to address reviews.
mfbalin Feb 1, 2024
f7e90c4
linting
mfbalin Feb 1, 2024
d788b95
Address reviews, move FetcherAndSampler to neighbor_sampler.py
mfbalin Feb 1, 2024
1e8615a
linting
mfbalin Feb 1, 2024
626bded
refactor `MiniBatchTransformer`.
mfbalin Feb 1, 2024
16414e6
fix dataloader text after renaming Awaiter to Waiter.
mfbalin Feb 1, 2024
10c24df
remove unused function from FetcherAndSampler.
mfbalin Feb 1, 2024
91f84ad
Merge branch 'master' into gb_cuda_pipelined_sampling_optimization
mfbalin Feb 2, 2024
05c7ab0
address reviews, add unit test.
mfbalin Feb 4, 2024
23ade71
Merge branch 'master' into gb_cuda_pipelined_sampling_optimization
mfbalin Feb 4, 2024
6b21742
change to `index_select_csc_with_indptr`.
mfbalin Feb 4, 2024
0a21186
Merge branch 'master' into gb_cuda_pipelined_sampling_optimization
mfbalin Feb 4, 2024
e0fa30b
Minor change to trigger CI again.
mfbalin Feb 4, 2024
75 changes: 75 additions & 0 deletions python/dgl/graphbolt/base.py
@@ -1,5 +1,6 @@
"""Base types and utilities for Graph Bolt."""

from collections import deque
from dataclasses import dataclass

import torch
@@ -14,6 +15,10 @@
"etype_str_to_tuple",
"etype_tuple_to_str",
"CopyTo",
"Waiter",
"Bufferer",
"FutureWaiter",
"EndMarker",
"isin",
"expand_indptr",
"CSCFormatBase",
@@ -219,6 +224,76 @@ def __iter__(self):
yield data


@functional_datapipe("mark_end")
class EndMarker(IterDataPipe):
"""Used to mark the end of a datapipe and is a no-op."""

def __init__(self, datapipe):
self.datapipe = datapipe

def __iter__(self):
yield from self.datapipe
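
`EndMarker` is a pure pass-through; it exists so the graph-traversal utilities (such as the `dp_utils.traverse_dps` call in `dataloader.py` below) have a stable sink node to start from. A minimal sketch, assuming `dgl.graphbolt` has been imported so the `mark_end` functional form is registered:

```python
from torchdata.datapipes.iter import IterableWrapper

import dgl.graphbolt  # noqa: F401  # side effect: registers .mark_end()

dp = IterableWrapper(range(3)).mark_end()
assert list(dp) == [0, 1, 2]  # items pass through unchanged
```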


@functional_datapipe("buffer")
class Bufferer(IterDataPipe):
"""Buffers items before yielding them.

Parameters
----------
datapipe : DataPipe
The data pipeline.
buffer_size : int, optional
The size of the buffer which stores the fetched samples. If data coming
from datapipe has latency spikes, consider setting to a higher value.
Default is 1.
"""

def __init__(self, datapipe, buffer_size=1):
self.datapipe = datapipe
if buffer_size <= 0:
raise ValueError(
"'buffer_size' is required to be a positive integer."
)
self.buffer = deque(maxlen=buffer_size)

def __iter__(self):
for data in self.datapipe:
if len(self.buffer) < self.buffer.maxlen:
self.buffer.append(data)
else:
return_data = self.buffer.popleft()
self.buffer.append(data)
yield return_data
while len(self.buffer) > 0:
yield self.buffer.popleft()
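
A standalone sketch of `Bufferer` (again assuming `dgl.graphbolt` is imported so `.buffer()` is registered): with `buffer_size=2`, two items are pulled from the source before the first one is yielded, so the producer can run ahead of the consumer while order is preserved:

```python
from torchdata.datapipes.iter import IterableWrapper

import dgl.graphbolt  # noqa: F401  # side effect: registers .buffer()

buffered = IterableWrapper(range(5)).buffer(buffer_size=2)
# The deque fills to 2 items before the first yield; output order is unchanged.
assert list(buffered) == [0, 1, 2, 3, 4]
```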


@functional_datapipe("wait")
class Waiter(IterDataPipe):
"""Calls the wait function of all items."""

def __init__(self, datapipe):
self.datapipe = datapipe

def __iter__(self):
for data in self.datapipe:
data.wait()
yield data
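
`Waiter` only assumes each item exposes a `wait()` method, as the minibatches produced by the asynchronous fetchers do. A sketch with a hypothetical `_SlowItem` type:

```python
from torchdata.datapipes.iter import IterableWrapper

import dgl.graphbolt  # noqa: F401  # side effect: registers .buffer()/.wait()


class _SlowItem:
    """Hypothetical item; a real one would block on a CUDA event or thread."""

    def __init__(self, value):
        self.value = value
        self.ready = False

    def wait(self):
        self.ready = True  # stand-in for synchronizing asynchronous work


items = IterableWrapper([_SlowItem(i) for i in range(3)])
# buffer(1) keeps the next item in flight while the current one is consumed;
# wait() guarantees each item is ready before it reaches the training loop.
for item in items.buffer(1).wait():
    assert item.ready
```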


@functional_datapipe("wait_future")
class FutureWaiter(IterDataPipe):
"""Calls the result function of all items and returns their results."""

def __init__(self, datapipe):
self.datapipe = datapipe

def __iter__(self):
for data in self.datapipe:
yield data.result()
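
`FutureWaiter` plays the same role for `concurrent.futures.Future` items: an upstream stage can submit work to an executor and a downstream `wait_future()` collects the results in order. A minimal sketch under the same import assumption:

```python
from concurrent.futures import ThreadPoolExecutor

from torchdata.datapipes.iter import IterableWrapper

import dgl.graphbolt  # noqa: F401  # side effect: registers .wait_future()

executor = ThreadPoolExecutor(max_workers=1)
# Each element of the pipe is a Future; wait_future() resolves them in order.
futures = IterableWrapper([executor.submit(lambda x=x: x * x) for x in range(4)])
assert list(futures.wait_future()) == [0, 1, 4, 9]
```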


@dataclass
class CSCFormatBase:
r"""Basic class representing data in Compressed Sparse Column (CSC) format.
87 changes: 27 additions & 60 deletions python/dgl/graphbolt/dataloader.py
@@ -1,6 +1,6 @@
"""Graph Bolt DataLoaders"""

from collections import deque
from concurrent.futures import ThreadPoolExecutor

import torch
import torch.utils.data
@@ -9,15 +9,14 @@

from .base import CopyTo
from .feature_fetcher import FeatureFetcher
from .impl.neighbor_sampler import SamplePerLayer

from .internal import datapipe_graph_to_adjlist
from .item_sampler import ItemSampler


__all__ = [
"DataLoader",
"Awaiter",
"Bufferer",
]


@@ -40,61 +39,6 @@ def _find_and_wrap_parent(datapipe_graph, target_datapipe, wrapper, **kwargs):
return datapipe_graph


class EndMarker(dp.iter.IterDataPipe):
"""Used to mark the end of a datapipe and is a no-op."""

def __init__(self, datapipe):
self.datapipe = datapipe

def __iter__(self):
yield from self.datapipe


class Bufferer(dp.iter.IterDataPipe):
"""Buffers items before yielding them.

Parameters
----------
datapipe : DataPipe
The data pipeline.
buffer_size : int, optional
The size of the buffer which stores the fetched samples. If data coming
from datapipe has latency spikes, consider setting to a higher value.
Default is 1.
"""

def __init__(self, datapipe, buffer_size=1):
self.datapipe = datapipe
if buffer_size <= 0:
raise ValueError(
"'buffer_size' is required to be a positive integer."
)
self.buffer = deque(maxlen=buffer_size)

def __iter__(self):
for data in self.datapipe:
if len(self.buffer) < self.buffer.maxlen:
self.buffer.append(data)
else:
return_data = self.buffer.popleft()
self.buffer.append(data)
yield return_data
while len(self.buffer) > 0:
yield self.buffer.popleft()


class Awaiter(dp.iter.IterDataPipe):
"""Calls the wait function of all items."""

def __init__(self, datapipe):
self.datapipe = datapipe

def __iter__(self):
for data in self.datapipe:
data.wait()
yield data


class MultiprocessingWrapper(dp.iter.IterDataPipe):
"""Wraps a datapipe with multiprocessing.

@@ -156,6 +100,10 @@ class DataLoader(torch.utils.data.DataLoader):
If True, the data loader will overlap the UVA feature fetcher operations
with the rest of operations by using an alternative CUDA stream. Default
is True.
overlap_graph_fetch : bool, optional
If True, the data loader will overlap the UVA graph fetching operations
with the rest of operations by using an alternative CUDA stream. Default
is False.
max_uva_threads : int, optional
Limits the number of CUDA threads used for UVA copies so that the rest
of the computations can run simultaneously with it. Setting it to a too
@@ -170,6 +118,7 @@ def __init__(
num_workers=0,
persistent_workers=True,
overlap_feature_fetch=True,
overlap_graph_fetch=False,
max_uva_threads=6144,
):
# Multiprocessing requires two modifications to the datapipe:
@@ -179,7 +128,7 @@
# 2. Cut the datapipe at FeatureFetcher, and wrap the inner datapipe
# of the FeatureFetcher with a multiprocessing PyTorch DataLoader.

datapipe = EndMarker(datapipe)
datapipe = datapipe.mark_end()
datapipe_graph = dp_utils.traverse_dps(datapipe)

# (1) Insert minibatch distribution.
@@ -223,7 +172,25 @@ def __init__(
datapipe_graph = dp_utils.replace_dp(
datapipe_graph,
feature_fetcher,
Awaiter(Bufferer(feature_fetcher, buffer_size=1)),
feature_fetcher.buffer(1).wait(),
)

if (
overlap_graph_fetch
and num_workers == 0
and torch.cuda.is_available()
):
torch.ops.graphbolt.set_max_uva_threads(max_uva_threads)
samplers = dp_utils.find_dps(
datapipe_graph,
SamplePerLayer,
)
executor = ThreadPoolExecutor(max_workers=1)
for sampler in samplers:
datapipe_graph = dp_utils.replace_dp(
datapipe_graph,
sampler,
sampler.fetch_and_sample(_get_uva_stream(), executor, 1),
)

# (4) Cut datapipe at CopyTo and wrap with prefetcher. This enables the
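
To round out the diff, a usage sketch of the new flag. The `itemset`, `graph` (a pinned, UVA-accessible sampling graph), and `features` objects are placeholders for a real dataset's components, and the exact placement of `copy_to` depends on whether sampling runs on the GPU:

```python
import torch
import dgl.graphbolt as gb

# Placeholders: itemset, graph, and features come from a real GraphBolt dataset.
datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=True)
datapipe = datapipe.sample_neighbor(graph, [10, 10])
datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
datapipe = datapipe.copy_to(torch.device("cuda"))
dataloader = gb.DataLoader(
    datapipe,
    overlap_graph_fetch=True,  # overlap UVA graph access with sampling
    max_uva_threads=6144,      # cap on CUDA threads for UVA copies (the default)
)
for minibatch in dataloader:
    ...  # train on minibatch
```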