[GraphBolt] Add optimized unique_and_compact_batched. #7239

Merged on Apr 7, 2024 (42 commits)
Changes from 37 commits

Commits
a2d8d39
[GraphBolt][CUDA] Add batched unique_and_compact API.
mfbalin Mar 24, 2024
8a7610b
add python binding
mfbalin Mar 24, 2024
1e9c1cb
take back debug dispatch removal.
mfbalin Mar 24, 2024
84f65fe
use the batched API from python.
mfbalin Mar 24, 2024
eee35d6
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Mar 26, 2024
1c19c06
add actual map based batched implementation.
mfbalin Mar 28, 2024
82d52eb
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Mar 28, 2024
cbea9e6
implement feature as it was added in torch 2.1 release.
mfbalin Mar 28, 2024
1ae18e1
avoid using `torch::Tensor::to` as it synchronizes.
mfbalin Mar 28, 2024
3fd7efd
Make the CI pass by dropping support for old CUDA architectures from …
mfbalin Mar 31, 2024
100238f
Properly filter cuda architectures.
mfbalin Mar 31, 2024
96582da
Make it so that users can compile graphbolt for older CUDA architectu…
mfbalin Mar 31, 2024
aa7053d
use CCCL macros instead of libcudacxx
mfbalin Mar 31, 2024
e858af3
use most reliable way to check msvc, `_MSC_VER`.
mfbalin Mar 31, 2024
6ad3044
better way of handling filtering.
mfbalin Mar 31, 2024
b00ede2
seperate map implementation to a different file for better code organ…
mfbalin Apr 1, 2024
6f8d7e6
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 1, 2024
926d41e
add missing `;`.
mfbalin Apr 1, 2024
b5d9696
refactor the common stuff into a separate header.
mfbalin Apr 1, 2024
36e570d
address reviews.
mfbalin Apr 1, 2024
c864ef9
fix the linker bug.
mfbalin Apr 1, 2024
520e547
Solve the old architecture problem by creating a cuda extensions libr…
mfbalin Apr 2, 2024
21dc5c5
make the newly added library static.
mfbalin Apr 2, 2024
e0bda01
suppress warning on cmake.
mfbalin Apr 2, 2024
ab2f2c5
add diagnostic compiler messages.
mfbalin Apr 2, 2024
6d9fddc
print single valued defines.
mfbalin Apr 2, 2024
7de401b
fix the issue in the CI about `TORCH_CUDA_ARCH_LIST`
mfbalin Apr 2, 2024
a4946ab
Update CCCL to 2.4.0 as 2.3.0 has a bug.
mfbalin Apr 2, 2024
0d40df9
Update CCCL to 2.3.2 instead.
mfbalin Apr 2, 2024
be951a8
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 2, 2024
b6d4c73
add comments in CMake and build script.
mfbalin Apr 2, 2024
5f51a15
add comment about node_id_bits.
mfbalin Apr 2, 2024
4589fea
add more comments.
mfbalin Apr 2, 2024
49a55ec
clarify comment.
mfbalin Apr 2, 2024
1bdf3ac
fix minor typo.
mfbalin Apr 2, 2024
da15251
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 2, 2024
b930b0f
minor code style change in how map constructed.
mfbalin Apr 2, 2024
7886f4b
Add explanation on the difference between map based and sort based al…
mfbalin Apr 3, 2024
899d30e
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 3, 2024
85a1cf2
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 7, 2024
2b22578
Merge branch 'master' into gb_batched_unique_and_compact
mfbalin Apr 7, 2024
8a547ca
address reviews.
mfbalin Apr 7, 2024
3 changes: 3 additions & 0 deletions .gitmodules
@@ -28,3 +28,6 @@
[submodule "third_party/liburing"]
path = third_party/liburing
url = https://github.com/axboe/liburing.git
[submodule "third_party/cuco"]
path = third_party/cuco
url = https://github.com/NVIDIA/cuCollections.git
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -590,5 +590,5 @@ if(BUILD_GRAPHBOLT)
endif(USE_CUDA)
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
add_dependencies(graphbolt liburing)
endif(USE_CUDA)
endif()
endif(BUILD_GRAPHBOLT)
37 changes: 32 additions & 5 deletions graphbolt/CMakeLists.txt
@@ -58,10 +58,19 @@ if(USE_CUDA)
if(DEFINED ENV{CUDAARCHS})
set(CMAKE_CUDA_ARCHITECTURES $ENV{CUDAARCHS})
endif()
set(CMAKE_CUDA_ARCHITECTURES_FILTERED ${CMAKE_CUDA_ARCHITECTURES})
# CUDA extension supports only sm_70 and up (Volta+).
list(FILTER CMAKE_CUDA_ARCHITECTURES_FILTERED EXCLUDE REGEX "[2-6][0-9]")
list(LENGTH CMAKE_CUDA_ARCHITECTURES_FILTERED CMAKE_CUDA_ARCHITECTURES_FILTERED_LEN)
if(CMAKE_CUDA_ARCHITECTURES_FILTERED_LEN EQUAL 0)
# Build the CUDA extension at least for Volta.
set(CMAKE_CUDA_ARCHITECTURES_FILTERED "70")
endif()
set(LIB_GRAPHBOLT_CUDA_NAME "${LIB_GRAPHBOLT_NAME}_cuda")
endif()

add_library(${LIB_GRAPHBOLT_NAME} SHARED ${BOLT_SRC} ${BOLT_HEADERS})
target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE ${BOLT_DIR}
include_directories(BEFORE ${BOLT_DIR}
${BOLT_HEADERS}
"../third_party/dmlc-core/include"
"../third_party/pcg/include")
@@ -73,12 +82,25 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
endif()

if(USE_CUDA)
file(GLOB BOLT_CUDA_EXTENSION_SRC
${BOLT_DIR}/cuda/extension/*.cu
${BOLT_DIR}/cuda/extension/*.cc
)
# Until https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to
# compile the cuda/extension folder with Volta+ CUDA architectures.
add_library(${LIB_GRAPHBOLT_CUDA_NAME} STATIC ${BOLT_CUDA_EXTENSION_SRC} ${BOLT_HEADERS})
target_link_libraries(${LIB_GRAPHBOLT_CUDA_NAME} "${TORCH_LIBRARIES}")

set_target_properties(${LIB_GRAPHBOLT_NAME} PROPERTIES CUDA_STANDARD 17)
set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_STANDARD 17)
set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_FILTERED}")
set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.")
target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE
"../third_party/cccl/thrust"
"../third_party/cccl/cub"
"../third_party/cccl/libcudacxx/include")
include_directories(BEFORE
"../third_party/cccl/thrust"
"../third_party/cccl/cub"
"../third_party/cccl/libcudacxx/include"
"../third_party/cuco/include")

message(STATUS "Use HugeCTR gpu_cache for graphbolt with INCLUDE_DIRS $ENV{GPU_CACHE_INCLUDE_DIRS}.")
target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE $ENV{GPU_CACHE_INCLUDE_DIRS})
@@ -87,6 +109,11 @@ if(USE_CUDA)

get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES)
message(STATUS "CUDA_ARCHITECTURES for graphbolt: ${archs}")

get_property(archs TARGET ${LIB_GRAPHBOLT_CUDA_NAME} PROPERTY CUDA_ARCHITECTURES)
message(STATUS "CUDA_ARCHITECTURES for graphbolt extension: ${archs}")

target_link_libraries(${LIB_GRAPHBOLT_NAME} ${LIB_GRAPHBOLT_CUDA_NAME})
endif()

# The Torch CMake configuration only sets up the path for the MKL library when
4 changes: 2 additions & 2 deletions graphbolt/build.bat
@@ -11,7 +11,7 @@ IF x%1x == xx GOTO single

FOR %%X IN (%*) DO (
DEL /S /Q *
"%CMAKE_COMMAND%" -DGPU_CACHE_BUILD_DIR=%BINDIR% -DCMAKE_CONFIGURATION_TYPES=Release -DPYTHON_INTERP=%%X .. -G "Visual Studio 16 2019" || EXIT /B 1
"%CMAKE_COMMAND%" -DGPU_CACHE_BUILD_DIR=%BINDIR% -DCMAKE_CONFIGURATION_TYPES=Release -DPYTHON_INTERP=%%X -DTORCH_CUDA_ARCH_LIST=Volta .. -G "Visual Studio 16 2019" || EXIT /B 1
msbuild graphbolt.sln /m /nr:false || EXIT /B 1
COPY /Y Release\*.dll "%BINDIR%\graphbolt" || EXIT /B 1
)
@@ -21,7 +21,7 @@ GOTO end
:single

DEL /S /Q *
"%CMAKE_COMMAND%" -DGPU_CACHE_BUILD_DIR=%BINDIR% -DCMAKE_CONFIGURATION_TYPES=Release .. -G "Visual Studio 16 2019" || EXIT /B 1
"%CMAKE_COMMAND%" -DGPU_CACHE_BUILD_DIR=%BINDIR% -DCMAKE_CONFIGURATION_TYPES=Release -DTORCH_CUDA_ARCH_LIST=Volta .. -G "Visual Studio 16 2019" || EXIT /B 1
msbuild graphbolt.sln /m /nr:false || EXIT /B 1
COPY /Y Release\*.dll "%BINDIR%\graphbolt" || EXIT /B 1

6 changes: 5 additions & 1 deletion graphbolt/build.sh
@@ -12,7 +12,11 @@ else
CPSOURCE=*.so
fi

CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR"
# We build for the same architectures as DGL, thus we hardcode
# TORCH_CUDA_ARCH_LIST and we need to at least compile for Volta. Until
# https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to compile the
# cuda/extension folder with Volta+ CUDA architectures.
CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR -DTORCH_CUDA_ARCH_LIST=Volta"
echo $CMAKE_FLAGS

if [ $# -eq 0 ]; then
11 changes: 11 additions & 0 deletions graphbolt/include/graphbolt/cuda_ops.h
@@ -221,6 +221,17 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
const torch::Tensor src_ids, const torch::Tensor dst_ids,
const torch::Tensor unique_dst_ids, int num_bits = 0);

/**
* @brief Batched version of UniqueAndCompact. The ith element of the return
* value is equal to the result of passing the ith elements of the input
* arguments to UniqueAndCompact.
*/
std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
UniqueAndCompactBatched(
const std::vector<torch::Tensor>& src_ids,
const std::vector<torch::Tensor>& dst_ids,
const std::vector<torch::Tensor>& unique_dst_ids, int num_bits = 0);

} // namespace ops
} // namespace graphbolt

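For intuition, the following host-side sketch spells out the contract stated in the doc comment above: the batched entry point should return, element by element, what the per-pair UniqueAndCompact returns. The helper name ReferenceBatched and the loop are illustrative only; the optimized implementation added by this PR processes all batches together on the GPU.

// Illustrative reference only, not the implementation from this PR.
#include <graphbolt/cuda_ops.h>

#include <cstddef>
#include <tuple>
#include <vector>

std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
ReferenceBatched(
    const std::vector<torch::Tensor>& src_ids,
    const std::vector<torch::Tensor>& dst_ids,
    const std::vector<torch::Tensor>& unique_dst_ids, int num_bits = 0) {
  std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> results;
  results.reserve(src_ids.size());
  for (std::size_t i = 0; i < src_ids.size(); ++i) {
    // The ith element of the batched result matches the single-pair call.
    results.push_back(graphbolt::ops::UniqueAndCompact(
        src_ids[i], dst_ids[i], unique_dst_ids[i], num_bits));
  }
  return results;
}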
6 changes: 6 additions & 0 deletions graphbolt/include/graphbolt/unique_and_compact.h
@@ -50,6 +50,12 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
const torch::Tensor& src_ids, const torch::Tensor& dst_ids,
const torch::Tensor unique_dst_ids);

std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
UniqueAndCompactBatched(
const std::vector<torch::Tensor>& src_ids,
const std::vector<torch::Tensor>& dst_ids,
const std::vector<torch::Tensor> unique_dst_ids);

} // namespace sampling
} // namespace graphbolt

29 changes: 26 additions & 3 deletions graphbolt/src/cuda/common.h
@@ -11,6 +11,7 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/script.h>

@@ -38,12 +39,17 @@ namespace cuda {
*
* int_array.get() gives the raw pointer.
*/
template <typename value_t = char>
struct CUDAWorkspaceAllocator {
static_assert(sizeof(char) == 1, "sizeof(char) == 1 should hold.");
// Required by thrust to satisfy allocator requirements.
using value_type = char;
using value_type = value_t;

explicit CUDAWorkspaceAllocator() { at::globalContext().lazyInitCUDA(); }

template <class U>
CUDAWorkspaceAllocator(CUDAWorkspaceAllocator<U> const&) noexcept {}

CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;

void operator()(void* ptr) const {
@@ -53,7 +59,7 @@
// Required by thrust to satisfy allocator requirements.
value_type* allocate(std::ptrdiff_t size) const {
return reinterpret_cast<value_type*>(
c10::cuda::CUDACachingAllocator::raw_alloc(size));
c10::cuda::CUDACachingAllocator::raw_alloc(size * sizeof(value_type)));
}

// Required by thrust to satisfy allocator requirements.
@@ -63,7 +69,9 @@
std::unique_ptr<T, CUDAWorkspaceAllocator> AllocateStorage(
std::size_t size) const {
return std::unique_ptr<T, CUDAWorkspaceAllocator>(
reinterpret_cast<T*>(allocate(sizeof(T) * size)), *this);
reinterpret_cast<T*>(
c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T) * size)),
*this);
}
};
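A sketch of what the newly templated allocator enables: it can now act as a Thrust execution-policy allocator, which is what the value_t parameter and the rebinding copy constructor are for. The function name, include path, and the int64_t key type below are assumptions for illustration, not code from this PR.

// Hypothetical usage sketch: temporary storage requested by Thrust is served
// from PyTorch's caching allocator through CUDAWorkspaceAllocator instead of
// ad-hoc cudaMalloc calls.
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

#include "common.h"

void SortKeysOnCurrentStream(torch::Tensor keys) {
  graphbolt::cuda::CUDAWorkspaceAllocator<> allocator;
  auto stream = c10::cuda::getCurrentCUDAStream();
  // Thrust rebinds the allocator as needed via the templated copy constructor.
  auto policy = thrust::cuda::par(allocator).on(stream);
  thrust::sort(
      policy, keys.data_ptr<int64_t>(),
      keys.data_ptr<int64_t>() + keys.numel());
}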

@@ -81,6 +89,21 @@
return size.x == 0 || size.y == 0 || size.z == 0;
}

#define CUDA_DRIVER_CHECK(EXPR) \
do { \
CUresult __err = EXPR; \
if (__err != CUDA_SUCCESS) { \
const char* err_str; \
CUresult get_error_str_err C10_UNUSED = \
cuGetErrorString(__err, &err_str); \
if (get_error_str_err != CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: unknown error"); \
} else { \
AT_ERROR("CUDA driver error: ", err_str); \
} \
} \
} while (0)

#define CUDA_CALL(func) C10_CUDA_CHECK((func))

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
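A small usage sketch for the new CUDA_DRIVER_CHECK macro. The helper function is hypothetical and assumes common.h is included; cuMemGetInfo is a standard CUresult-returning driver-API call.

// Any CUresult-returning driver call can be wrapped; on failure the macro
// raises a c10 error carrying the driver's error string.
#include <cstddef>
#include <cuda.h>

size_t QueryFreeDeviceMemory() {
  size_t free_bytes = 0, total_bytes = 0;
  CUDA_DRIVER_CHECK(cuMemGetInfo(&free_bytes, &total_bytes));
  return free_bytes;
}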
26 changes: 26 additions & 0 deletions graphbolt/src/cuda/extension/unique_and_compact.h
@@ -0,0 +1,26 @@
/**
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/unique_and_compact.h
* @brief Unique and compact operator utilities on CUDA using hash table.
*/

#ifndef GRAPHBOLT_CUDA_UNIQUE_AND_COMPACT_H_
#define GRAPHBOLT_CUDA_UNIQUE_AND_COMPACT_H_

#include <torch/script.h>

#include <vector>

namespace graphbolt {
namespace ops {

std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> >
UniqueAndCompactBatchedMap(
const std::vector<torch::Tensor>& src_ids,
const std::vector<torch::Tensor>& dst_ids,
const std::vector<torch::Tensor>& unique_dst_ids);

} // namespace ops
} // namespace graphbolt

#endif // GRAPHBOLT_CUDA_UNIQUE_AND_COMPACT_H_
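UniqueAndCompactBatchedMap, declared above, is the hash-map based path this PR adds (backed by a GPU hash table from cuco). As a rough illustration of the idea only, here is a host-side, single-batch sketch using std::unordered_map. It assumes the usual relabeling semantics, where unique_dst_ids keep their positions at the front of the unique id set and dst_ids already appear among unique_dst_ids; it is not the CUDA implementation.

#include <cstdint>
#include <tuple>
#include <unordered_map>
#include <vector>

// Host-side sketch of map-based unique-and-compact for one batch. The real
// kernel builds a GPU hash table and handles all batches concurrently.
std::tuple<std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>>
UniqueAndCompactWithMapSketch(
    const std::vector<int64_t>& src_ids, const std::vector<int64_t>& dst_ids,
    const std::vector<int64_t>& unique_dst_ids) {
  std::unordered_map<int64_t, int64_t> to_new_id;
  std::vector<int64_t> unique_ids;
  auto assign = [&](int64_t id) {
    auto [it, inserted] = to_new_id.emplace(id, unique_ids.size());
    if (inserted) unique_ids.push_back(id);
    return it->second;
  };
  // Destination ids are assigned first so they keep their positions.
  for (auto id : unique_dst_ids) assign(id);
  std::vector<int64_t> compacted_src, compacted_dst;
  for (auto id : src_ids) compacted_src.push_back(assign(id));
  // dst_ids are assumed to already be present among unique_dst_ids.
  for (auto id : dst_ids) compacted_dst.push_back(to_new_id.at(id));
  return {unique_ids, compacted_src, compacted_dst};
}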