Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GraphBolt][CUDA] Eliminate GPUCache synchronization. #7705

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions graphbolt/src/cuda/extension/gpu_cache.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,14 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> GpuCache::Query(
return std::make_tuple(values, missing_index, missing_keys);
}

// Asynchronous counterpart of Query().
//
// Wraps a call to Query(keys) in a task handed to async() and returns the
// Future produced by async(). The task's result is a three-element vector
// holding, in order, the values, missing_index and missing_keys tensors that
// Query() returns as a tuple.
//
// NOTE(review): the task captures `this` as a raw pointer, so this GpuCache
// presumably has to stay alive until the returned future has completed --
// confirm against the async() implementation in graphbolt/async.h.
c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> GpuCache::QueryAsync(
    torch::Tensor keys) {
  auto task = [this, keys] {
    auto result = Query(keys);
    return std::vector<torch::Tensor>{
        std::get<0>(result), std::get<1>(result), std::get<2>(result)};
  };
  return async(task);
}

void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) {
TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device.");
TORCH_CHECK(
Expand Down
4 changes: 4 additions & 0 deletions graphbolt/src/cuda/extension/gpu_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#ifndef GRAPHBOLT_GPU_CACHE_H_
#define GRAPHBOLT_GPU_CACHE_H_

#include <graphbolt/async.h>
#include <torch/custom_class.h>
#include <torch/torch.h>

Expand Down Expand Up @@ -53,6 +54,9 @@ class GpuCache : public torch::CustomClassHolder {
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> Query(
torch::Tensor keys);

c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> QueryAsync(
torch::Tensor keys);

void Replace(torch::Tensor keys, torch::Tensor values);

static c10::intrusive_ptr<GpuCache> Create(
Expand Down
1 change: 1 addition & 0 deletions graphbolt/src/python_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ TORCH_LIBRARY(graphbolt, m) {
#ifdef GRAPHBOLT_USE_CUDA
m.class_<cuda::GpuCache>("GpuCache")
.def("query", &cuda::GpuCache::Query)
.def("query_async", &cuda::GpuCache::QueryAsync)
.def("replace", &cuda::GpuCache::Replace);
m.def("gpu_cache", &cuda::GpuCache::Create);
m.class_<cuda::GpuGraphCache>("GpuGraphCache")
Expand Down
32 changes: 27 additions & 5 deletions python/dgl/graphbolt/impl/gpu_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ def __init__(self, cache_shape, dtype):
self.total_miss = 0
self.total_queries = 0

def query(self, keys):
def query(self, keys, async_op=False):
"""Queries the GPU cache.

Parameters
----------
keys : Tensor
The keys to query the GPU cache with.
async_op : bool
Boolean indicating whether the call is asynchronous. If so, the
result can be obtained by calling wait on the returned future.

Returns
-------
Expand All @@ -29,10 +32,29 @@ def query(self, keys):
values[missing_indices] corresponds to cache misses that should be
filled by querying another source with missing_keys.
"""
self.total_queries += keys.shape[0]
values, missing_index, missing_keys = self._cache.query(keys)
self.total_miss += missing_keys.shape[0]
return values, missing_index, missing_keys

# Small handle wrapping the outcome of a cache query. Calling wait() yields
# the (values, missing_index, missing_keys) triple and adds to the
# total_queries / total_miss counters of the cache object it was built with.
# It reads `async_op` from the enclosing query() call, so it is only
# meaningful inside that call.
class _Waiter:
def __init__(self, gpu_cache, future):
# gpu_cache: object whose total_queries and total_miss attributes are
# incremented by wait().
# future: when async_op is true, an object whose wait() returns the
# result triple; otherwise the result triple itself.
self.gpu_cache = gpu_cache
self.future = future

def wait(self):
"""Returns the stored value when invoked."""
# Take a local reference first because the attributes are cleared below.
gpu_cache = self.gpu_cache
values, missing_index, missing_keys = (
self.future.wait() if async_op else self.future
)
# Ensure there is no leak: drop the references held by this object.
# After this point self.future is None, so wait() is single-use.
self.gpu_cache = self.future = None

# Statistics are updated only once the result is available.
gpu_cache.total_queries += values.shape[0]
gpu_cache.total_miss += missing_keys.shape[0]
return values, missing_index, missing_keys

if async_op:
return _Waiter(self, self._cache.query_async(keys))
else:
return _Waiter(self, self._cache.query(keys)).wait()

def replace(self, keys, values):
"""Inserts key-value pairs into the GPU cache using the Least-Recently
Expand Down
8 changes: 6 additions & 2 deletions python/dgl/graphbolt/impl/gpu_cached_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,11 @@ def read_async(self, ids: torch.Tensor):
>>> assert stage + 1 == feature.read_async_num_stages(ids.device)
>>> result = future.wait() # result contains the read values.
"""
values, missing_index, missing_keys = self._feature.query(ids)
future = self._feature.query(ids, async_op=True)

yield

values, missing_index, missing_keys = future.wait()

fallback_reader = self._fallback_feature.read_async(missing_keys)
fallback_num_stages = self._fallback_feature.read_async_num_stages(
Expand Down Expand Up @@ -175,7 +179,7 @@ def read_async_num_stages(self, ids_device: torch.device):
The number of stages of the read_async operation.
"""
assert ids_device.type == "cuda"
return self._fallback_feature.read_async_num_stages(ids_device)
return 1 + self._fallback_feature.read_async_num_stages(ids_device)

def size(self):
"""Get the size of the feature.
Expand Down
Loading