Merge branch 'master' into DiskBasedFeature_dglexample
Liu-rj committed Aug 2, 2024
2 parents: d177173 + 56a1e64 · commit: 20e24df
Showing 6 changed files with 63 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -37,3 +37,6 @@
 [submodule "third_party/taskflow"]
 	path = third_party/taskflow
 	url = https://github.com/taskflow/taskflow.git
+[submodule "third_party/tsl_robin_map"]
+	path = third_party/tsl_robin_map
+	url = https://github.com/Tessil/robin-map.git
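Note for anyone building from source: the new dependency is a git submodule, so it has to be fetched before configuring the build, e.g. with git submodule update --init third_party/tsl_robin_map (or git submodule update --init --recursive on a fresh clone).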
2 changes: 1 addition & 1 deletion graphbolt/CMakeLists.txt
@@ -78,7 +78,7 @@ include_directories(BEFORE ${BOLT_DIR}
   # `std::atomic_ref`, `std::counting_semaphore`
   "../third_party/cccl/libcudacxx/include"
   "../third_party/pcg/include"
-  "../third_party/phmap")
+  "../third_party/tsl_robin_map/include")
 target_link_libraries(${LIB_GRAPHBOLT_NAME} "${TORCH_LIBRARIES}")
 if(BUILD_WITH_TASKFLOW)
   target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE "../third_party/taskflow")
23 changes: 14 additions & 9 deletions graphbolt/src/cache_policy.h
@@ -20,9 +20,10 @@
 #ifndef GRAPHBOLT_CACHE_POLICY_H_
 #define GRAPHBOLT_CACHE_POLICY_H_
 
-#include <parallel_hashmap/phmap.h>
 #include <torch/custom_class.h>
 #include <torch/torch.h>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
 
 #include <limits>
 #include <mutex>
@@ -178,9 +179,13 @@ class BaseCachePolicy {
 
 protected:
   template <typename K, typename V>
-  using map_t = phmap::flat_hash_map<K, V>;
+  using map_t = tsl::robin_map<K, V>;
   template <typename K>
-  using set_t = phmap::flat_hash_set<K>;
+  using set_t = tsl::robin_set<K>;
+  template <typename iterator>
+  static auto& mutable_value_ref(iterator it) {
+    return it.value();
+  }
   static constexpr int kCapacityFactor = 2;
 
   template <typename CachePolicy>
@@ -298,7 +303,7 @@ class S3FifoCachePolicy : public BaseCachePolicy {
     const auto in_ghost_queue = ghost_set_.erase(key);
     auto& queue = in_ghost_queue ? main_queue_ : small_queue_;
     auto cache_key_ptr = queue.Push(CacheKey(key));
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
@@ -318,7 +323,7 @@
     auto it = key_to_cache_key_.find(evicted.getKey());
     if (evicted.getFreq() > 0 || evicted.InUse()) {
       evicted.Decrement();
-      it->second = main_queue_.Push(evicted);
+      mutable_value_ref(it) = main_queue_.Push(evicted);
     } else {
       key_to_cache_key_.erase(it);
       return evicted.getPos();
@@ -332,7 +337,7 @@
     auto evicted = small_queue_.Pop();
     auto it = key_to_cache_key_.find(evicted.getKey());
     if (evicted.getFreq() > 0 || evicted.InUse()) {
-      it->second = main_queue_.Push(evicted.ResetFreq());
+      mutable_value_ref(it) = main_queue_.Push(evicted.ResetFreq());
     } else {
       key_to_cache_key_.erase(it);
       const auto evicted_key = evicted.getKey();
@@ -449,7 +454,7 @@ class SieveCachePolicy : public BaseCachePolicy {
     const auto key = it->first;
     queue_.push_front(CacheKey(key));
     auto cache_key_ptr = &queue_.front();
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
@@ -589,7 +594,7 @@ class LruCachePolicy : public BaseCachePolicy {
   CacheKey* Insert(map_iterator it) {
     const auto key = it->first;
     queue_.push_front(CacheKey(key));
-    it->second = queue_.begin();
+    mutable_value_ref(it) = queue_.begin();
     auto cache_key_ptr = &*queue_.begin();
     return &cache_key_ptr->setPos(Evict());
   }
@@ -718,7 +723,7 @@ class ClockCachePolicy : public BaseCachePolicy {
   CacheKey* Insert(map_iterator it) {
     const auto key = it->first;
     auto cache_key_ptr = queue_.Push(CacheKey(key));
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
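Why the mutable_value_ref indirection: phmap::flat_hash_map iterators allow writing through it->second, but tsl::robin_map deliberately returns const references from its iterators' operator* and operator-> (the map stores a plain std::pair, so a mutable pair would let the key be corrupted in place) and instead exposes the mapped value for mutation through the iterator's value() method. The new helper wraps that accessor so the policy code stays agnostic to the hashmap backend.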
37 changes: 36 additions & 1 deletion python/dgl/graphbolt/__init__.py
@@ -2,6 +2,42 @@
 import os
 import sys
 
+from .internal_utils import *
+
+CUDA_ALLOCATOR_ENV_WARNING_STR = """
+An experimental feature for CUDA allocations is turned on for better allocation
+pattern resulting in better memory usage for minibatch GNN training workloads.
+See https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf,
+and set the environment variable `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False`
+if you want to disable it.
+"""
+cuda_allocator_env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+if cuda_allocator_env is None:
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+    gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+else:
+    configs = {
+        kv_pair.split(":")[0]: kv_pair.split(":")[1]
+        for kv_pair in cuda_allocator_env.split(",")
+    }
+    if "expandable_segments" in configs:
+        if configs["expandable_segments"] != "True":
+            gb_warning(
+                "You should consider `expandable_segments:True` in the"
+                " environment variable `PYTORCH_CUDA_ALLOC_CONF` for lower"
+                " memory usage. See "
+                "https://pytorch.org/docs/stable/notes/cuda.html"
+                "#optimizing-memory-usage-with-pytorch-cuda-alloc-conf"
+            )
+    else:
+        configs["expandable_segments"] = "True"
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(
+            [k + ":" + v for k, v in configs.items()]
+        )
+        gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+
+
+# pylint: disable=wrong-import-position, wrong-import-order
 import torch
 
 ### FROM DGL @todo
@@ -47,7 +83,6 @@ def load_graphbolt():
 from .itemset import *
 from .item_sampler import *
 from .minibatch_transformer import *
-from .internal_utils import *
 from .negative_sampler import *
 from .sampled_subgraph import *
 from .subgraph_sampler import *
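To see what the new __init__.py logic does to a pre-existing PYTORCH_CUDA_ALLOC_CONF value, here is a minimal standalone sketch of the same key:value merge (with_expandable_segments is an illustrative name, not a graphbolt API, and the branch that only warns when expandable_segments is already set is omitted):

    def with_expandable_segments(env: str) -> str:
        """Force expandable_segments:True while preserving any other options."""
        configs = {kv.split(":")[0]: kv.split(":")[1] for kv in env.split(",")}
        configs["expandable_segments"] = "True"
        return ",".join(k + ":" + v for k, v in configs.items())

    # A user-provided option survives the rewrite; the new key is appended:
    assert (
        with_expandable_segments("max_split_size_mb:128")
        == "max_split_size_mb:128,expandable_segments:True"
    )

Because dicts preserve insertion order, existing options keep their positions and only the expandable_segments entry is added or overwritten.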
8 changes: 8 additions & 0 deletions tests/python/pytorch/graphbolt/test_base.py
@@ -1,3 +1,4 @@
+import os
 import re
 import unittest
 from collections.abc import Iterable, Mapping

@@ -12,6 +13,13 @@
 from . import gb_test_utils
 
 
+def test_pytorch_cuda_allocator_conf():
+    env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+    assert env is not None
+    config_list = env.split(",")
+    assert "expandable_segments:True" in config_list
+
+
 @unittest.skipIf(F._default_context_str != "gpu", "CopyTo needs GPU to test")
 @pytest.mark.parametrize("non_blocking", [False, True])
 def test_CopyTo(non_blocking):
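The new test passes purely as an import side effect: the test module's imports pull in dgl.graphbolt, which runs the __init__.py block above before any test body executes. A quick manual check of the same invariant (assuming dgl is installed and the variable was not already pinned to a conflicting value):

    import os

    import dgl.graphbolt  # noqa: F401  (importing it configures the allocator env)

    print(os.environ["PYTORCH_CUDA_ALLOC_CONF"])
    # expected to contain: expandable_segments:True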
1 change: 1 addition & 0 deletions third_party/tsl_robin_map
Submodule tsl_robin_map added at 1115da
