From 21247890038f004435691c93acc3e2b3c770913e Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 11 Sep 2024 00:11:13 -0500
Subject: [PATCH 1/3] Refactor dependencies.yaml to use depends-on pattern.
 (#321)

This refactors cuvs's `dependencies.yaml` to follow a convention we've established in other repos. For RAPIDS packages we declare a `depends_on_X` dependency list, and use that rather than specifying packages multiple times. This enables more reuse and easier automation for version migrations.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/321
---
 ci/release/update-version.sh                  |  1 +
 .../all_cuda-118_arch-aarch64.yaml            |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-aarch64.yaml            |  2 +-
 .../all_cuda-125_arch-x86_64.yaml             |  2 +-
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  2 +-
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  2 +-
 .../bench_ann_cuda-125_arch-aarch64.yaml      |  2 +-
 .../bench_ann_cuda-125_arch-x86_64.yaml       |  2 +-
 dependencies.yaml                             | 98 ++++++++++---------
 python/cuvs/pyproject.toml                    |  2 -
 11 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 6d7d022c2..feb0a400c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -45,6 +45,7 @@ DEPENDENCIES=(
   dask-cuda
   cuvs
   pylibraft
+  librmm
   rmm
   rapids-dask-dependency
 )
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 4bbdc3650..cfcb56225 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 908421d08..dc519d1b5 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 3131d0b77..b32650e44 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 2f107c4fb..d40fc3b99 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 9b23faa67..c6e8b05a2 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index e73efd60c..d6c023ae9 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index dec41007a..4d0ca9496 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-125_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index f106644cd..7dd67ab5e 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index a53aef0f0..9fcbeaae2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -11,6 +11,8 @@ files:
       - build_py_cuvs
       - cuda
       - cuda_version
+      - depends_on_pylibraft
+      - depends_on_librmm
       - develop
       - checks
       - build_wheels
@@ -31,6 +33,8 @@ files:
       - build_py_cuvs
       - cuda
       - cuda_version
+      - depends_on_pylibraft
+      - depends_on_librmm
       - develop
       - bench
       - bench_python
@@ -93,6 +97,7 @@ files:
     includes:
       - cuda_wheels
       - run_py_cuvs
+      - depends_on_pylibraft
   py_test_py_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
@@ -193,15 +198,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0
-          - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0
           - dlpack>=0.8,<1.0
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -216,21 +213,6 @@ dependencies:
           - matrix:
             packages:
               - &cuda_python cuda-python
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0
-              - &pylibraft_cu12 pylibraft-cu12==24.10.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0
-              - &pylibraft_cu11 pylibraft-cu11==24.10.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_unsuffixed, *pylibraft_unsuffixed] }
   checks:
     common:
       - output_types: [conda, requirements]
@@ -448,19 +430,9 @@ dependencies:
               - python>=3.10,<3.13
   run_py_cuvs:
     common:
-      - output_types: [conda, pyproject]
+      - output_types: [conda, requirements, pyproject]
         packages:
           - &numpy numpy>=1.23,<3.0a0
-      - output_types: [conda]
-        packages:
-          - *rmm_unsuffixed
-          - *pylibraft_unsuffixed
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for cudf and rmm.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -475,15 +447,6 @@ dependencies:
           - matrix:
             packages:
               - *cuda_python
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix: {cuda: "12.*"}
-            packages:
-              - *pylibraft_cu12
-          - matrix: {cuda: "11.*"}
-            packages:
-              - *pylibraft_cu11
-          - {matrix: null, packages: [*pylibraft_unsuffixed]}
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -505,7 +468,6 @@ dependencies:
           - h5py>=3.8.0
           - benchmark>=1.8.2
           - openblas
-          - *rmm_unsuffixed
   bench_python:
     common:
       - output_types: [conda]
@@ -515,3 +477,51 @@ dependencies:
           - pyyaml
           - pandas
           - click
+  depends_on_librmm:
+    common:
+      - output_types: conda
+        packages:
+          - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*librmm_unsuffixed]}
+  depends_on_pylibraft:
+    common:
+      - output_types: conda
+        packages:
+          - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibraft-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibraft-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibraft_unsuffixed]}
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index 0eb98d601..68bd9a868 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -128,8 +128,6 @@ requires = [
     "cuda-python",
     "cython>=3.0.0",
     "ninja",
-    "pylibraft==24.10.*,>=0.0.0a0",
-    "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"

From 68480c92cec6da2d7289482146a85725e6f1235a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Malte=20F=C3=B6rster?=
 <97973773+mfoerste4@users.noreply.github.com>
Date: Wed, 11 Sep 2024 16:16:55 +0200
Subject: [PATCH 2/3] Brute force knn tile size heuristic (#316)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR modifies the tile size heuristic for brute-force kNN, as discussed in #277.

It also removes some unneeded CUDA calls to save a couple of microseconds, which might be relevant when running smaller batches.

CC @tfeher

Authors:
  - Malte Förster (https://github.com/mfoerste4)
  - Ben Frederickson (https://github.com/benfred)
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/cuvs/pull/316
---
 .../neighbors/detail/faiss_distance_utils.h   | 42 +++++++++----------
 cpp/src/neighbors/detail/knn_brute_force.cuh  | 35 +++++++---------
 2 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/cpp/src/neighbors/detail/faiss_distance_utils.h b/cpp/src/neighbors/detail/faiss_distance_utils.h
index e8a41c1aa..63f0c88c2 100644
--- a/cpp/src/neighbors/detail/faiss_distance_utils.h
+++ b/cpp/src/neighbors/detail/faiss_distance_utils.h
@@ -14,10 +14,18 @@ inline void chooseTileSize(size_t numQueries,
                            size_t numCentroids,
                            size_t dim,
                            size_t elementSize,
-                           size_t totalMem,
                            size_t& tileRows,
                            size_t& tileCols)
 {
+  // 512 rows per tile seems to be a batch size sweet spot; the same value is
+  // used for every element size (there is no separate float16 case here).
+  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
+  // increase to 1024.
+  size_t preferredTileRows = 512;
+  if (dim <= 32) { preferredTileRows = 1024; }
+
+  tileRows = std::min(preferredTileRows, numQueries);
+
   // The matrix multiplication should be large enough to be efficient, but if
   // it is too large, we seem to lose efficiency as opposed to
   // double-streaming. Each tile size here defines 1/2 of the memory use due
@@ -25,28 +33,20 @@ inline void chooseTileSize(size_t numQueries,
   // adjusted independently by the user and can thus meet these requirements
   // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs,
   // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage.
-  size_t targetUsage = 0;
-
-  if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) {
-    targetUsage = 512 * 1024 * 1024;
-  } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) {
-    targetUsage = 768 * 1024 * 1024;
+  size_t targetUsage = 512 * 1024 * 1024;
+  if (tileRows * numCentroids * elementSize * 2 <= targetUsage) {
+    tileCols = numCentroids;
   } else {
-    targetUsage = 1024 * 1024 * 1024;
-  }
+    // only query total memory in case it potentially impacts tilesize
+    size_t totalMem = rmm::available_device_memory().second;
 
-  targetUsage /= 2 * elementSize;
+    if (totalMem > ((size_t)8) * 1024 * 1024 * 1024) {
+      targetUsage = 1024 * 1024 * 1024;
+    } else if (totalMem > ((size_t)4) * 1024 * 1024 * 1024) {
+      targetUsage = 768 * 1024 * 1024;
+    }
 
-  // 512 seems to be a batch size sweetspot for float32.
-  // If we are on float16, increase to 512.
-  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
-  // increase to 1024.
-  size_t preferredTileRows = 512;
-  if (dim <= 32) { preferredTileRows = 1024; }
-
-  tileRows = std::min(preferredTileRows, numQueries);
-
-  // tileCols is the remainder size
-  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
+    tileCols = std::min(targetUsage / (2 * elementSize * tileRows), numCentroids);
+  }
 }
 }  // namespace cuvs::neighbors::detail::faiss_select
diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh
index e3f7acc96..88986af7d 100644
--- a/cpp/src/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/src/neighbors/detail/knn_brute_force.cuh
@@ -81,14 +81,12 @@ void tiled_brute_force_knn(const raft::resources& handle,
                            const uint32_t* filter_bitmap             = nullptr)
 {
   // Figure out the number of rows/cols to tile for
-  size_t tile_rows   = 0;
-  size_t tile_cols   = 0;
-  auto stream        = raft::resource::get_cuda_stream(handle);
-  auto device_memory = raft::resource::get_workspace_resource(handle);
-  auto total_mem     = rmm::available_device_memory().second;
+  size_t tile_rows = 0;
+  size_t tile_cols = 0;
+  auto stream      = raft::resource::get_cuda_stream(handle);
 
   cuvs::neighbors::detail::faiss_select::chooseTileSize(
-    m, n, d, sizeof(DistanceT), total_mem, tile_rows, tile_cols);
+    m, n, d, sizeof(DistanceT), tile_rows, tile_cols);
 
   // for unittesting, its convenient to be able to put a max size on the tiles
   // so we can test the tiling logic without having to use huge inputs.
@@ -356,27 +354,26 @@ void brute_force_knn_impl(
 
   ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
 
-  std::vector<IdxType>* id_ranges;
-  if (translations == nullptr) {
+  std::vector<IdxType> id_ranges;
+  if (translations != nullptr) {
+    // use the given translations
+    id_ranges.insert(id_ranges.end(), translations->begin(), translations->end());
+  } else if (input.size() > 1) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges       = new std::vector<IdxType>();
     IdxType total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
-      id_ranges->push_back(total_n);
+      id_ranges.push_back(total_n);
       total_n += sizes[i];
     }
-  } else {
-    // otherwise, use the given translations
-    id_ranges = translations;
   }
 
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  rmm::device_uvector<IdxType> trans(id_ranges->size(), userStream);
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
+  rmm::device_uvector<IdxType> trans(0, userStream);
+  if (id_ranges.size() > 0) {
+    trans.resize(id_ranges.size(), userStream);
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), userStream);
+  }
 
   rmm::device_uvector<DistType> all_D(0, userStream);
   rmm::device_uvector<IdxType> all_I(0, userStream);
@@ -513,8 +510,6 @@ void brute_force_knn_impl(
     // no translations or partitions to combine, it can be skipped.
     knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
   }
-
-  if (translations == nullptr) delete id_ranges;
 };
 
 template <typename T,

From a6b71d72ed349a118887ff730475eaeda43ef2aa Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 11 Sep 2024 12:57:52 -0500
Subject: [PATCH 3/3] Simplify libcuvs conda recipe. (#322)

This PR refactors the `libcuvs` conda recipe to use `libraft-headers` (together with `librmm`) as host dependencies instead of `libraft`, and drops `libraft` from the run requirements.

I removed dependencies on `rmm` from the `cuvs` build -- it is not imported.

I included a couple of small fixes in CMake and the `cuvs` recipe.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/322
---
 conda/recipes/cuvs/meta.yaml    |  6 +-----
 conda/recipes/libcuvs/meta.yaml | 33 ++++++++++++++++++++++++---------
 cpp/CMakeLists.txt              |  4 ++--
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index 2633b3db8..e7e2daf0c 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -1,7 +1,7 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 # Usage:
-#   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
+#   conda build . -c rapidsai -c conda-forge -c nvidia
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -54,10 +54,8 @@ requirements:
     - pylibraft {{ minor_version }}
     - libcuvs {{ version }}
     - python x.x
-    - rmm ={{ minor_version }}
     - rapids-build-backend>=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
-    - setuptools
   run:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
@@ -68,7 +66,6 @@ requirements:
     - pylibraft {{ minor_version }}
     - libcuvs {{ version }}
     - python x.x
-    - rmm ={{ minor_version }}
     - cuda-python
     - numpy >=1.23,<3.0a0
 
@@ -81,5 +78,4 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
   summary: cuvs python library
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml
index 4ffdc91e4..e154ccf41 100644
--- a/conda/recipes/libcuvs/meta.yaml
+++ b/conda/recipes/libcuvs/meta.yaml
@@ -1,7 +1,7 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 # Usage:
-#   conda build . -c conda-forge -c nvidia -c rapidsai
+#   conda build . -c rapidsai -c conda-forge -c nvidia
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
@@ -63,7 +63,8 @@ outputs:
         - ninja
         - {{ stdlib("c") }}
       host:
-        - libraft ={{ minor_version }}
+        - librmm ={{ minor_version }}
+        - libraft-headers ={{ minor_version }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -84,7 +85,6 @@ outputs:
         - libcusparse-dev
         {% endif %}
       run:
-        - libraft ={{ minor_version }}
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         {% if cuda_major != "11" %}
         - cuda-cudart
@@ -129,7 +129,8 @@ outputs:
         - ninja
         - {{ stdlib("c") }}
       host:
-        - libraft ={{ minor_version }}
+        - librmm ={{ minor_version }}
+        - libraft-headers ={{ minor_version }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -150,7 +151,6 @@ outputs:
         - libcusparse-dev
         {% endif %}
       run:
-        - libraft ={{ minor_version }}
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         {% if cuda_major != "11" %}
         - cuda-cudart
@@ -195,7 +195,8 @@ outputs:
         - ninja
         - {{ stdlib("c") }}
       host:
-        - libraft ={{ minor_version }}
+        - librmm ={{ minor_version }}
+        - libraft-headers ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
         - openblas # required by some CPU algos in benchmarks
@@ -228,7 +229,6 @@ outputs:
         - libcusolver
         - libcusparse
         {% endif %}
-        - libraft ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
     about:
       home: https://rapids.ai/
@@ -248,6 +248,9 @@ outputs:
         - {{ compiler('cuda') }}
         - cuda-cudart-dev
         - libcublas-dev
+        - libcurand-dev
+        - libcusolver-dev
+        - libcusparse-dev
         {% endif %}
     requirements:
       build:
@@ -263,17 +266,27 @@ outputs:
         - ninja
         - {{ stdlib("c") }}
       host:
-        - libraft ={{ minor_version }}
+        - librmm ={{ minor_version }}
+        - libraft-headers ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }}
         - libcublas {{ cuda11_libcublas_host_version }}
         - libcublas-dev {{ cuda11_libcublas_host_version }}
+        - libcurand {{ cuda11_libcurand_host_version }}
+        - libcurand-dev {{ cuda11_libcurand_host_version }}
+        - libcusolver {{ cuda11_libcusolver_host_version }}
+        - libcusolver-dev {{ cuda11_libcusolver_host_version }}
+        - libcusparse {{ cuda11_libcusparse_host_version }}
+        - libcusparse-dev {{ cuda11_libcusparse_host_version }}
         {% else %}
         - cuda-cudart-dev
         - cuda-profiler-api
         - libcublas-dev
+        - libcurand-dev
+        - libcusolver-dev
+        - libcusparse-dev
         {% endif %}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
@@ -282,8 +295,10 @@ outputs:
         {% else %}
         - cuda-cudart
         - libcublas
+        - libcurand
+        - libcusolver
+        - libcusparse
         {% endif %}
-        - libraft ={{ minor_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
     about:
       home: https://rapids.ai/
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ba46e60b4..fec1248bb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -53,8 +53,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
 option(BUILD_TESTS "Build cuvs unit-tests" ON)
-option(BUILD_C_LIBRARY "Build raft C API library" OFF)
-option(BUILD_C_TESTS "Build raft C API tests" OFF)
+option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
+option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
 option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF)
 option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)