Merge branch 'dmlc:master' into add-igb-to-sage

dmlc · Aug 17, 2024 · c5285d1 · c5285d1
2 parents c5e10fd + 09ea319
commit c5285d1
Show file tree

Hide file tree

Showing 28 changed files with 663 additions and 190 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -129,11 +129,11 @@ if (${BUILD_TYPE} STREQUAL "dev")
   endif()
 else()
   if (MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /DNDEBUG")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /DNDEBUG")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2")
   else()
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -DNDEBUG")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
   endif()
 endif()
 
@@ -186,11 +186,48 @@ else(MSVC)
   endif(NOT APPLE)
 endif(MSVC)
 
+if(USE_OPENMP)
+  include(FindOpenMP)
+  if(OPENMP_FOUND)
+    set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
+  endif(OPENMP_FOUND)
+  message(STATUS "Build with OpenMP.")
+endif(USE_OPENMP)
+
 if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
   message(STATUS "Disabling LIBXSMM on ${CMAKE_SYSTEM_PROCESSOR}.")
   set(USE_LIBXSMM OFF)
 endif()
 
+if(USE_LIBXSMM)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0")
+  message(STATUS "Build with LIBXSMM optimization.")
+endif(USE_LIBXSMM)
+
+if ((NOT MSVC) AND USE_EPOLL)
+  INCLUDE(CheckIncludeFile)
+  check_include_file("sys/epoll.h" EPOLL_AVAILABLE)
+  if (EPOLL_AVAILABLE)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_EPOLL")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_EPOLL")
+  else()
+    message(WARNING "EPOLL is not available on this platform...")
+  endif()
+endif ()
+
+# To compile METIS correct for DGL.
+if(MSVC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32")
+else(MSVC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32")
+endif(MSVC)
+
+# configure minigun
+add_definitions(-DENABLE_PARTIAL_FRONTIER=0)  # disable minigun partial frontier compile
 # Source file lists
 file(GLOB DGL_SRC
   src/*.cc
@@ -219,12 +256,6 @@ else()
 endif()
 list(APPEND DGL_SRC ${DGL_RPC_SRC})
 
-if(USE_OPENMP)
-  find_package(OpenMP REQUIRED)
-  list(APPEND DGL_LINKER_LIBS OpenMP::OpenMP_CXX)
-  message(STATUS "Build with OpenMP.")
-endif(USE_OPENMP)
-
 # Configure cuda
 if(USE_CUDA)
   file(GLOB_RECURSE DGL_CUDA_SRC
@@ -248,16 +279,6 @@ else(USE_CUDA)
   add_library(dgl SHARED ${DGL_SRC})
 endif(USE_CUDA)
 
-if ((NOT MSVC) AND USE_EPOLL)
-  INCLUDE(CheckIncludeFile)
-  check_include_file("sys/epoll.h" EPOLL_AVAILABLE)
-  if (EPOLL_AVAILABLE)
-    target_compile_definitions(dgl PRIVATE USE_EPOLL)
-  else()
-    message(WARNING "EPOLL is not available on this platform...")
-  endif()
-endif ()
-
 # include directories
 target_include_directories(dgl PRIVATE "include")
 # check for conda includes
@@ -330,26 +351,18 @@ else(EXTERNAL_NANOFLANN_PATH)
 endif(EXTERNAL_NANOFLANN_PATH)
 
 if (USE_LIBXSMM)
-  target_compile_definitions(dgl PRIVATE USE_LIBXSMM DGL_CPU_LLC_SIZE=40000000 __BLAS=0)
   target_include_directories(dgl PRIVATE "third_party/libxsmm/include")
-  message(STATUS "Build with LIBXSMM optimization.")
 endif()
 
-# To compile METIS correct for DGL.
-add_compile_definitions(IDXTYPEWIDTH=64 REALTYPEWIDTH=32)
 if (EXTERNAL_METIS_PATH)
-  # To compile METIS correct for DGL.
-  if(MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32")
-  else(MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32")
-  endif(MSVC)
   find_package(METIS REQUIRED)
-  message(STATUS "Found METIS library")
-  target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR})
-  list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES})
+  if (NOT METIS_FOUND)
+    message(FATAL_ERROR "Failed to find METIS library")
+  else()
+    message(STATUS "Found METIS library")
+    target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR})
+    list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES})
+  endif()
 else(EXTERNAL_METIS_PATH)
   target_include_directories(dgl PRIVATE "third_party/METIS/include")
   # Compile METIS
@@ -378,6 +391,8 @@ endif()
 
 # Compile gpu_cache
 if(USE_CUDA)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE")
   # Manually build gpu_cache because CMake always builds it as shared
   file(GLOB gpu_cache_src
     third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu
@@ -391,7 +406,7 @@ endif(USE_CUDA)
 
 # support PARALLEL_ALGORITHMS
 if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
-  target_compile_definitions(dgl PRIVATE PARALLEL_ALGORITHMS)
+  add_definitions(-DPARALLEL_ALGORITHMS)
 endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
 
 target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS})

diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
@@ -230,10 +230,6 @@ macro(dgl_config_cuda linker_libs)
     string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
   endif()
   list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${CXX_HOST_FLAGS}")
-  if(USE_OPENMP)
-    # Needed by CUDA disjoint union source file.
-    list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${OpenMP_CXX_FLAGS}")
-  endif(USE_OPENMP)
 
   # 1. Add arch flags
   dgl_select_nvcc_arch_flags(NVCC_FLAGS_ARCH)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
@@ -135,7 +135,10 @@ def create_dataloader(
     if args.storage_device != "cpu":
         datapipe = datapipe.copy_to(device)
     datapipe = datapipe.sample_neighbor(
-        graph, args.fanout, overlap_fetch=args.storage_device == "pinned"
+        graph,
+        args.fanout,
+        overlap_fetch=args.storage_device == "pinned",
+        asynchronous=args.storage_device != "cpu",
     )
     datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
     if args.storage_device == "cpu":

diff --git a/graphbolt/include/graphbolt/async.h b/graphbolt/include/graphbolt/async.h
@@ -26,6 +26,7 @@
 #include <future>
 #include <memory>
 #include <mutex>
+#include <variant>
 
 #ifdef BUILD_WITH_TASKFLOW
 #include <taskflow/algorithm/for_each.hpp>
@@ -37,6 +38,7 @@
 #endif
 
 #ifdef GRAPHBOLT_USE_CUDA
+#include <ATen/cuda/CUDAEvent.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/csrc/api/include/torch/cuda.h>
@@ -92,15 +94,50 @@ inline int get_num_interop_threads() {
 
 template <typename T>
 class Future : public torch::CustomClassHolder {
+#ifdef GRAPHBOLT_USE_CUDA
+  using T_no_event = std::conditional_t<std::is_void_v<T>, std::monostate, T>;
+  using T_with_event = std::conditional_t<
+      std::is_void_v<T>, at::cuda::CUDAEvent,
+      std::pair<T, at::cuda::CUDAEvent>>;
+  using future_type = std::future<std::variant<T_no_event, T_with_event>>;
+#else
+  using future_type = std::future<T>;
+#endif
+
  public:
-  Future(std::future<T>&& future) : future_(std::move(future)) {}
+#ifdef GRAPHBOLT_USE_CUDA
+  using return_type = std::variant<T_no_event, T_with_event>;
+#else
+  using return_type = T;
+#endif
+
+  Future(future_type&& future) : future_(std::move(future)) {}
 
   Future() = default;
 
-  T Wait() { return future_.get(); }
+  T Wait() {
+#ifdef GRAPHBOLT_USE_CUDA
+    auto result = future_.get();
+    if constexpr (std::is_void_v<T>) {
+      if (std::holds_alternative<T_with_event>(result)) {
+        auto&& event = std::get<T_with_event>(result);
+        event.block(c10::cuda::getCurrentCUDAStream());
+      }
+      return;
+    } else if (std::holds_alternative<T_with_event>(result)) {
+      auto&& [value, event] = std::get<T_with_event>(result);
+      event.block(c10::cuda::getCurrentCUDAStream());
+      return value;
+    } else {
+      return std::get<T_no_event>(result);
+    }
+#else
+    return future_.get();
+#endif
+  }
 
  private:
-  std::future<T> future_;
+  future_type future_;
 };
 
 /**
@@ -109,36 +146,55 @@ class Future : public torch::CustomClassHolder {
  * task to avoid spawning a new OpenMP threadpool on each interop thread.
  */
 template <typename F>
-inline auto async(F&& function) {
+inline auto async(F&& function, bool is_cuda = false) {
   using T = decltype(function());
 #ifdef GRAPHBOLT_USE_CUDA
-  const auto is_cuda_available = torch::cuda::is_available();
   struct c10::StreamData3 stream_data;
-  if (is_cuda_available) {
+  if (is_cuda) {
     stream_data = c10::cuda::getCurrentCUDAStream().pack3();
   }
 #endif
-  auto fn = [=, func = std::move(function)] {
+  using return_type = typename Future<T>::return_type;
+  auto fn = [=, func = std::move(function)]() -> return_type {
 #ifdef GRAPHBOLT_USE_CUDA
     // We make sure to use the same CUDA stream as the thread launching the
     // async operation.
-    if (is_cuda_available) {
+    if (is_cuda) {
       auto stream = c10::cuda::CUDAStream::unpack3(
           stream_data.stream_id, stream_data.device_index,
           stream_data.device_type);
       c10::cuda::CUDAStreamGuard guard(stream);
+      at::cuda::CUDAEvent event;
+      // Might be executed on the GPU so we record an event to be able to
+      // synchronize with it later, in case it is executed on an alternative
+      // CUDA stream.
+      if constexpr (std::is_void_v<T>) {
+        func();
+        event.record();
+        return event;
+      } else {
+        auto result = func();
+        event.record();
+        return std::make_pair(std::move(result), std::move(event));
+      }
+    }
+    if constexpr (std::is_void_v<T>) {
+      func();
+      return std::monostate{};
+    } else {
       return func();
     }
-#endif
+#else
     return func();
+#endif
   };
 #ifdef BUILD_WITH_TASKFLOW
   auto future = interop_pool().async(std::move(fn));
 #else
-  auto promise = std::make_shared<std::promise<T>>();
+  auto promise = std::make_shared<std::promise<return_type>>();
   auto future = promise->get_future();
   at::launch([promise, func = std::move(fn)]() {
-    if constexpr (std::is_void_v<T>) {
+    if constexpr (std::is_void_v<return_type>) {
       func();
       promise->set_value();
     } else

diff --git a/graphbolt/include/graphbolt/fused_sampled_subgraph.h b/graphbolt/include/graphbolt/fused_sampled_subgraph.h
@@ -49,7 +49,8 @@ struct FusedSampledSubgraph : torch::CustomClassHolder {
    * graph.
    * @param original_row_node_ids Column's reverse node ids in the original
    * graph.
-   * @param original_edge_ids Reverse edge ids in the original graph.
+   * @param original_edge_ids Mapping of subgraph edge IDs to original
+   * FusedCSCSamplingGraph edge IDs.
    * @param type_per_edge Type id of each edge.
    * @param etype_offsets Edge offsets for the sampled edges for the sampled
    * edges that are sorted w.r.t. edge types.
@@ -91,10 +92,17 @@ struct FusedSampledSubgraph : torch::CustomClassHolder {
   torch::optional<torch::Tensor> indices;
 
   /**
-   * @brief Reverse edge ids in the original graph, the edge with id
-   * `original_edge_ids[i]` in the original graph is mapped to `i` in this
-   * subgraph. This is useful when edge features are needed. The edges are
-   * sorted w.r.t. their edge types for the heterogenous case.
+   * @brief Mapping of subgraph edge IDs to original FusedCSCSamplingGraph
+   * edge IDs.
+   *
+   * In this subgraph, the edge at index i corresponds to the edge with ID
+   * original_edge_ids[i] in the original FusedCSCSamplingGraph. Edges are
+   * sorted by type for heterogeneous graphs.
+   *
+   * Note: To retrieve the actual original edge IDs for feature fetching, use
+   * the `_ORIGINAL_EDGE_ID` edge attribute in FusedCSCSamplingGraph to map the
+   * `original_edge_ids` agin, as IDs may have been remapped during conversion
+   * to FusedCSCSamplingGraph.
    */
   torch::Tensor original_edge_ids;
 

diff --git a/graphbolt/src/cuda/extension/gpu_cache.cu b/graphbolt/src/cuda/extension/gpu_cache.cu
@@ -76,6 +76,16 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> GpuCache::Query(
   return std::make_tuple(values, missing_index, missing_keys);
 }
 
+c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> GpuCache::QueryAsync(
+    torch::Tensor keys) {
+  return async(
+      [=] {
+        auto [values, missing_index, missing_keys] = Query(keys);
+        return std::vector{values, missing_index, missing_keys};
+      },
+      true);
+}
+
 void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) {
   TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device.");
   TORCH_CHECK(

diff --git a/graphbolt/src/cuda/extension/gpu_cache.h b/graphbolt/src/cuda/extension/gpu_cache.h
@@ -21,6 +21,7 @@
 #ifndef GRAPHBOLT_GPU_CACHE_H_
 #define GRAPHBOLT_GPU_CACHE_H_
 
+#include <graphbolt/async.h>
 #include <torch/custom_class.h>
 #include <torch/torch.h>
 
@@ -53,6 +54,9 @@ class GpuCache : public torch::CustomClassHolder {
   std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> Query(
       torch::Tensor keys);
 
+  c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> QueryAsync(
+      torch::Tensor keys);
+
   void Replace(torch::Tensor keys, torch::Tensor values);
 
   static c10::intrusive_ptr<GpuCache> Create(