Merge branch 'master' into DiskBasedFeature_dglexample
Liu-rj committed Aug 2, 2024
2 parents: d177173 + 56a1e64 · commit: 20e24df
Showing 6 changed files with 63 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -37,3 +37,6 @@
 [submodule "third_party/taskflow"]
 	path = third_party/taskflow
 	url = https://github.com/taskflow/taskflow.git
+[submodule "third_party/tsl_robin_map"]
+	path = third_party/tsl_robin_map
+	url = https://github.com/Tessil/robin-map.git
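Note for anyone building from source: the new dependency is a git submodule, so it has to be fetched before configuring the build, e.g. with git submodule update --init third_party/tsl_robin_map (or git submodule update --init --recursive on a fresh clone).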
2 changes: 1 addition & 1 deletion graphbolt/CMakeLists.txt
@@ -78,7 +78,7 @@ include_directories(BEFORE ${BOLT_DIR}
   # `std::atomic_ref`, `std::counting_semaphore`
   "../third_party/cccl/libcudacxx/include"
   "../third_party/pcg/include"
-  "../third_party/phmap")
+  "../third_party/tsl_robin_map/include")
 target_link_libraries(${LIB_GRAPHBOLT_NAME} "${TORCH_LIBRARIES}")
 if(BUILD_WITH_TASKFLOW)
   target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE "../third_party/taskflow")
23 changes: 14 additions & 9 deletions graphbolt/src/cache_policy.h
@@ -20,9 +20,10 @@
 #ifndef GRAPHBOLT_CACHE_POLICY_H_
 #define GRAPHBOLT_CACHE_POLICY_H_
 
-#include <parallel_hashmap/phmap.h>
 #include <torch/custom_class.h>
 #include <torch/torch.h>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
 
 #include <limits>
 #include <mutex>
@@ -178,9 +179,13 @@ class BaseCachePolicy {
 
 protected:
   template <typename K, typename V>
-  using map_t = phmap::flat_hash_map<K, V>;
+  using map_t = tsl::robin_map<K, V>;
   template <typename K>
-  using set_t = phmap::flat_hash_set<K>;
+  using set_t = tsl::robin_set<K>;
+  template <typename iterator>
+  static auto& mutable_value_ref(iterator it) {
+    return it.value();
+  }
   static constexpr int kCapacityFactor = 2;
 
   template <typename CachePolicy>
@@ -298,7 +303,7 @@ class S3FifoCachePolicy : public BaseCachePolicy {
     const auto in_ghost_queue = ghost_set_.erase(key);
     auto& queue = in_ghost_queue ? main_queue_ : small_queue_;
     auto cache_key_ptr = queue.Push(CacheKey(key));
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
@@ -318,7 +323,7 @@
     auto it = key_to_cache_key_.find(evicted.getKey());
     if (evicted.getFreq() > 0 || evicted.InUse()) {
       evicted.Decrement();
-      it->second = main_queue_.Push(evicted);
+      mutable_value_ref(it) = main_queue_.Push(evicted);
     } else {
       key_to_cache_key_.erase(it);
       return evicted.getPos();
@@ -332,7 +337,7 @@
     auto evicted = small_queue_.Pop();
     auto it = key_to_cache_key_.find(evicted.getKey());
     if (evicted.getFreq() > 0 || evicted.InUse()) {
-      it->second = main_queue_.Push(evicted.ResetFreq());
+      mutable_value_ref(it) = main_queue_.Push(evicted.ResetFreq());
     } else {
       key_to_cache_key_.erase(it);
       const auto evicted_key = evicted.getKey();
@@ -449,7 +454,7 @@ class SieveCachePolicy : public BaseCachePolicy {
     const auto key = it->first;
     queue_.push_front(CacheKey(key));
     auto cache_key_ptr = &queue_.front();
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
@@ -589,7 +594,7 @@ class LruCachePolicy : public BaseCachePolicy {
   CacheKey* Insert(map_iterator it) {
     const auto key = it->first;
     queue_.push_front(CacheKey(key));
-    it->second = queue_.begin();
+    mutable_value_ref(it) = queue_.begin();
     auto cache_key_ptr = &*queue_.begin();
     return &cache_key_ptr->setPos(Evict());
   }
@@ -718,7 +723,7 @@ class ClockCachePolicy : public BaseCachePolicy {
   CacheKey* Insert(map_iterator it) {
     const auto key = it->first;
     auto cache_key_ptr = queue_.Push(CacheKey(key));
-    it->second = cache_key_ptr;
+    mutable_value_ref(it) = cache_key_ptr;
     return &cache_key_ptr->setPos(Evict());
   }
 
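Why the mutable_value_ref indirection: phmap::flat_hash_map iterators allow writing through it->second, but tsl::robin_map deliberately returns const references from its iterators' operator* and operator-> (the map stores a plain std::pair, so a mutable pair would let the key be corrupted in place) and instead exposes the mapped value for mutation through the iterator's value() method. The new helper wraps that accessor so the policy code stays agnostic to the hashmap backend.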
37 changes: 36 additions & 1 deletion python/dgl/graphbolt/__init__.py
@@ -2,6 +2,42 @@
 import os
 import sys
 
+from .internal_utils import *
+
+CUDA_ALLOCATOR_ENV_WARNING_STR = """
+An experimental feature for CUDA allocations is turned on for better allocation
+pattern resulting in better memory usage for minibatch GNN training workloads.
+See https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf,
+and set the environment variable `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False`
+if you want to disable it.
+"""
+cuda_allocator_env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+if cuda_allocator_env is None:
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+    gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+else:
+    configs = {
+        kv_pair.split(":")[0]: kv_pair.split(":")[1]
+        for kv_pair in cuda_allocator_env.split(",")
+    }
+    if "expandable_segments" in configs:
+        if configs["expandable_segments"] != "True":
+            gb_warning(
+                "You should consider `expandable_segments:True` in the"
+                " environment variable `PYTORCH_CUDA_ALLOC_CONF` for lower"
+                " memory usage. See "
+                "https://pytorch.org/docs/stable/notes/cuda.html"
+                "#optimizing-memory-usage-with-pytorch-cuda-alloc-conf"
+            )
+    else:
+        configs["expandable_segments"] = "True"
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(
+            [k + ":" + v for k, v in configs.items()]
+        )
+        gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+
+
+# pylint: disable=wrong-import-position, wrong-import-order
 import torch
 
 ### FROM DGL @todo
@@ -47,7 +83,6 @@ def load_graphbolt():
 from .itemset import *
 from .item_sampler import *
 from .minibatch_transformer import *
-from .internal_utils import *
 from .negative_sampler import *
 from .sampled_subgraph import *
 from .subgraph_sampler import *
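To see what the new __init__.py logic does to a pre-existing PYTORCH_CUDA_ALLOC_CONF value, here is a minimal standalone sketch of the same key:value merge (with_expandable_segments is an illustrative name, not a graphbolt API, and the branch that only warns when expandable_segments is already set is omitted):

    def with_expandable_segments(env: str) -> str:
        """Force expandable_segments:True while preserving any other options."""
        configs = {kv.split(":")[0]: kv.split(":")[1] for kv in env.split(",")}
        configs["expandable_segments"] = "True"
        return ",".join(k + ":" + v for k, v in configs.items())

    # A user-provided option survives the rewrite; the new key is appended:
    assert (
        with_expandable_segments("max_split_size_mb:128")
        == "max_split_size_mb:128,expandable_segments:True"
    )

Because dicts preserve insertion order, existing options keep their positions and only the expandable_segments entry is added or overwritten.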
8 changes: 8 additions & 0 deletions tests/python/pytorch/graphbolt/test_base.py
@@ -1,3 +1,4 @@
+import os
 import re
 import unittest
 from collections.abc import Iterable, Mapping

@@ -12,6 +13,13 @@
 from . import gb_test_utils
 
 
+def test_pytorch_cuda_allocator_conf():
+    env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+    assert env is not None
+    config_list = env.split(",")
+    assert "expandable_segments:True" in config_list
+
+
 @unittest.skipIf(F._default_context_str != "gpu", "CopyTo needs GPU to test")
 @pytest.mark.parametrize("non_blocking", [False, True])
 def test_CopyTo(non_blocking):
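The new test passes purely as an import side effect: the test module's imports pull in dgl.graphbolt, which runs the __init__.py block above before any test body executes. A quick manual check of the same invariant (assuming dgl is installed and the variable was not already pinned to a conflicting value):

    import os

    import dgl.graphbolt  # noqa: F401  (importing it configures the allocator env)

    print(os.environ["PYTORCH_CUDA_ALLOC_CONF"])
    # expected to contain: expandable_segments:True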
1 change: 1 addition & 0 deletions third_party/tsl_robin_map
Submodule tsl_robin_map added at 1115da
