Skip to content

Commit

Permalink
[GraphBolt][CUDA] Optimize CopyTo performance. (#7634)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfbalin committed Aug 1, 2024
1 parent 5c902cd commit dfd4915
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 8 deletions.
3 changes: 0 additions & 3 deletions python/dgl/graphbolt/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,9 +369,6 @@ def __init__(self, datapipe, device, non_blocking=False):

def __iter__(self):
for data in self.datapipe:
if self.non_blocking:
# The copy is non-blocking only if the contents of data are pinned.
assert data.is_pinned(), f"{data} should be pinned."
yield recursive_apply(
data, apply_to, self.device, self.non_blocking
)
Expand Down
9 changes: 4 additions & 5 deletions python/dgl/graphbolt/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,11 +231,10 @@ def __init__(
datapipe_graph = dp_utils.replace_dp(
datapipe_graph,
copier,
copier.datapipe.transform(
lambda x: x.pin_memory()
).prefetch(2)
# After the data gets pinned, we can copy non_blocking.
.copy_to(copier.device, non_blocking=True),
# Add prefetch so that CPU and GPU can run concurrently.
copier.datapipe.prefetch(2).copy_to(
copier.device, non_blocking=True
),
)

# The stages after feature fetching are still done in the main process.
Expand Down

0 comments on commit dfd4915

Please sign in to comment.