Commit 53bcf5d

Merge branch 'branch-24.10' into hnsw-python-api

cjnolet committed Sep 11, 2024
2 parents: b47f92f + 68480c9
Showing 13 changed files with 99 additions and 95 deletions.
ci/release/update-version.sh (1 addition, 0 deletions)

@@ -45,6 +45,7 @@ DEPENDENCIES=(
   dask-cuda
   cuvs
   pylibraft
+  librmm
   rmm
   rapids-dask-dependency
 )
conda/environments/all_cuda-118_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-118_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-125_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-125_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/bench_ann_cuda-118_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
conda/environments/bench_ann_cuda-118_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
conda/environments/bench_ann_cuda-125_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-125_arch-aarch64
conda/environments/bench_ann_cuda-125_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-125_arch-x86_64
cpp/src/neighbors/detail/faiss_distance_utils.h (21 additions, 21 deletions)

@@ -14,39 +14,39 @@ inline void chooseTileSize(size_t numQueries,
                            size_t numCentroids,
                            size_t dim,
                            size_t elementSize,
-                           size_t totalMem,
                            size_t& tileRows,
                            size_t& tileCols)
 {
+  // 512 seems to be a batch size sweetspot for float32.
+  // If we are on float16, increase to 512.
+  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
+  // increase to 1024.
+  size_t preferredTileRows = 512;
+  if (dim <= 32) { preferredTileRows = 1024; }
+
+  tileRows = std::min(preferredTileRows, numQueries);
+
   // The matrix multiplication should be large enough to be efficient, but if
   // it is too large, we seem to lose efficiency as opposed to
   // double-streaming. Each tile size here defines 1/2 of the memory use due
   // to double streaming. We ignore available temporary memory, as that is
   // adjusted independently by the user and can thus meet these requirements
   // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs,
   // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage.
-  size_t targetUsage = 0;
-
-  if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) {
-    targetUsage = 512 * 1024 * 1024;
-  } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) {
-    targetUsage = 768 * 1024 * 1024;
+  size_t targetUsage = 512 * 1024 * 1024;
+  if (tileRows * numCentroids * elementSize * 2 <= targetUsage) {
+    tileCols = numCentroids;
   } else {
-    targetUsage = 1024 * 1024 * 1024;
-  }
-
-  targetUsage /= 2 * elementSize;
+    // only query total memory in case it potentially impacts tilesize
+    size_t totalMem = rmm::available_device_memory().second;
 
-  // 512 seems to be a batch size sweetspot for float32.
-  // If we are on float16, increase to 512.
-  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
-  // increase to 1024.
-  size_t preferredTileRows = 512;
-  if (dim <= 32) { preferredTileRows = 1024; }
-
-  tileRows = std::min(preferredTileRows, numQueries);
-
-  // tileCols is the remainder size
-  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
+    if (totalMem > ((size_t)8) * 1024 * 1024 * 1024) {
+      targetUsage = 1024 * 1024 * 1024;
+    } else if (totalMem > ((size_t)4) * 1024 * 1024 * 1024) {
+      targetUsage = 768 * 1024 * 1024;
+    }
+
+    // tileCols is the remainder size
+    tileCols = std::min(targetUsage / (2 * elementSize * tileRows), numCentroids);
+  }
 }
 } // namespace cuvs::neighbors::detail::faiss_select
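Read as a whole, the change makes the device-memory query lazy: when a double-buffered tile spanning all numCentroids columns already fits in the 512 MB budget, rmm::available_device_memory() is never called. The divisor also switches from preferredTileRows to the actual tileRows = min(preferredTileRows, numQueries), which effectively yields wider column tiles when there are fewer queries than the preferred row count. Below is a consolidated sketch of the post-merge routine, reconstructed from the added lines above; the enclosing namespace is elided, and the includes are assumptions about what the file's surrounding headers already provide.

#include <algorithm>            // std::min
#include <cstddef>              // size_t
#include <rmm/cuda_device.hpp>  // rmm::available_device_memory (assumed header)

inline void chooseTileSize(size_t numQueries,
                           size_t numCentroids,
                           size_t dim,
                           size_t elementSize,
                           size_t& tileRows,
                           size_t& tileCols)
{
  // Prefer 512 rows per tile for float32; 1024 when the GEMM k-dim is small.
  size_t preferredTileRows = (dim <= 32) ? 1024 : 512;
  tileRows                 = std::min(preferredTileRows, numQueries);

  // Fast path: two in-flight tiles covering every centroid fit in 512 MB,
  // so the device-memory query can be skipped entirely.
  size_t targetUsage = 512 * 1024 * 1024;
  if (tileRows * numCentroids * elementSize * 2 <= targetUsage) {
    tileCols = numCentroids;
  } else {
    // Only now pay for the driver query; .second is the total device memory.
    size_t totalMem = rmm::available_device_memory().second;
    if (totalMem > ((size_t)8) * 1024 * 1024 * 1024) {
      targetUsage = 1024 * 1024 * 1024;
    } else if (totalMem > ((size_t)4) * 1024 * 1024 * 1024) {
      targetUsage = 768 * 1024 * 1024;
    }
    // Columns that fit the budget across two double-buffered tiles.
    tileCols = std::min(targetUsage / (2 * elementSize * tileRows), numCentroids);
  }
}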
cpp/src/neighbors/detail/knn_brute_force.cuh (15 additions, 20 deletions)

@@ -81,14 +81,12 @@ void tiled_brute_force_knn(const raft::resources& handle,
                            const uint32_t* filter_bitmap = nullptr)
 {
   // Figure out the number of rows/cols to tile for
-  size_t tile_rows   = 0;
-  size_t tile_cols   = 0;
-  auto stream        = raft::resource::get_cuda_stream(handle);
-  auto device_memory = raft::resource::get_workspace_resource(handle);
-  auto total_mem     = rmm::available_device_memory().second;
+  size_t tile_rows = 0;
+  size_t tile_cols = 0;
+  auto stream      = raft::resource::get_cuda_stream(handle);
 
   cuvs::neighbors::detail::faiss_select::chooseTileSize(
-    m, n, d, sizeof(DistanceT), total_mem, tile_rows, tile_cols);
+    m, n, d, sizeof(DistanceT), tile_rows, tile_cols);
 
   // for unittesting, its convenient to be able to put a max size on the tiles
   // so we can test the tiling logic without having to use huge inputs.
@@ -356,27 +354,26 @@ void brute_force_knn_impl(
 
   ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
 
-  std::vector<IdxType>* id_ranges;
-  if (translations == nullptr) {
+  std::vector<IdxType> id_ranges;
+  if (translations != nullptr) {
+    // use the given translations
+    id_ranges.insert(id_ranges.end(), translations->begin(), translations->end());
+  } else if (input.size() > 1) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges       = new std::vector<IdxType>();
     IdxType total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
-      id_ranges->push_back(total_n);
+      id_ranges.push_back(total_n);
       total_n += sizes[i];
     }
-  } else {
-    // otherwise, use the given translations
-    id_ranges = translations;
   }
 
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  rmm::device_uvector<IdxType> trans(id_ranges->size(), userStream);
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
+  rmm::device_uvector<IdxType> trans(0, userStream);
+  if (id_ranges.size() > 0) {
+    trans.resize(id_ranges.size(), userStream);
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), userStream);
+  }
 
   rmm::device_uvector<DistType> all_D(0, userStream);
   rmm::device_uvector<IdxType> all_I(0, userStream);
@@ -513,8 +510,6 @@
     // no translations or partitions to combine, it can be skipped.
     knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
   }
-
-  if (translations == nullptr) delete id_ranges;
 };
 
 template <typename T,
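The id_ranges rework in brute_force_knn_impl is the memory-safety half of this merge: the old code aliased an owning raw pointer to either a fresh heap vector or the caller's translations, had to remember the trailing conditional delete, issued a host-to-device copy even for a single untranslated partition, and carried an unused cudaGetDevice call. The new code uses a plain value vector, so RAII handles cleanup and an empty vector doubles as the "skip the upload" signal. A self-contained sketch of the same pattern follows, under hypothetical names; build_offsets is an illustration, not a function in this file.

#include <cstddef>
#include <cstdint>
#include <vector>

// Value semantics instead of an owning raw pointer: nothing to delete, and an
// empty result tells the caller no device-side copy is needed.
std::vector<int64_t> build_offsets(const std::vector<int64_t>* translations,
                                   const std::vector<std::size_t>& sizes)
{
  std::vector<int64_t> id_ranges;
  if (translations != nullptr) {
    // use the caller-provided translations
    id_ranges.insert(id_ranges.end(), translations->begin(), translations->end());
  } else if (sizes.size() > 1) {
    // otherwise derive prefix-sum offsets from the local partition sizes
    int64_t total_n = 0;
    for (std::size_t size : sizes) {
      id_ranges.push_back(total_n);
      total_n += static_cast<int64_t>(size);
    }
  }
  return id_ranges;  // empty => single partition, skip the upload
}

With this shape, the caller copies to the device only when the result is non-empty, which matches the resize-then-update_device guard in the diff above.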