Commit 53bcf5d

Merge branch 'branch-24.10' into hnsw-python-api

cjnolet committed Sep 11, 2024
2 parents: b47f92f + 68480c9
Showing 13 changed files with 99 additions and 95 deletions.
ci/release/update-version.sh (1 addition, 0 deletions)

@@ -45,6 +45,7 @@ DEPENDENCIES=(
   dask-cuda
   cuvs
   pylibraft
+  librmm
   rmm
   rapids-dask-dependency
 )
conda/environments/all_cuda-118_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-118_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -35,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -49,7 +50,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-125_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/all_cuda-125_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -32,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +46,6 @@ dependencies:
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - recommonmark
-- rmm==24.10.*,>=0.0.0a0
 - rust
 - scikit-build-core>=0.10.0
 - scikit-learn
conda/environments/bench_ann_cuda-118_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
conda/environments/bench_ann_cuda-118_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -33,6 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -42,6 +43,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
conda/environments/bench_ann_cuda-125_arch-aarch64.yaml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-125_arch-aarch64
conda/environments/bench_ann_cuda-125_arch-x86_64.yaml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- librmm==24.10.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -38,6 +39,5 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
-- rmm==24.10.*,>=0.0.0a0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-125_arch-x86_64
cpp/src/neighbors/detail/faiss_distance_utils.h (21 additions, 21 deletions)

@@ -14,39 +14,39 @@ inline void chooseTileSize(size_t numQueries,
                            size_t numCentroids,
                            size_t dim,
                            size_t elementSize,
-                           size_t totalMem,
                            size_t& tileRows,
                            size_t& tileCols)
 {
+  // 512 seems to be a batch size sweetspot for float32.
+  // If we are on float16, increase to 512.
+  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
+  // increase to 1024.
+  size_t preferredTileRows = 512;
+  if (dim <= 32) { preferredTileRows = 1024; }
+
+  tileRows = std::min(preferredTileRows, numQueries);
+
   // The matrix multiplication should be large enough to be efficient, but if
   // it is too large, we seem to lose efficiency as opposed to
   // double-streaming. Each tile size here defines 1/2 of the memory use due
   // to double streaming. We ignore available temporary memory, as that is
   // adjusted independently by the user and can thus meet these requirements
   // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs,
   // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage.
-  size_t targetUsage = 0;
-
-  if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) {
-    targetUsage = 512 * 1024 * 1024;
-  } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) {
-    targetUsage = 768 * 1024 * 1024;
+  size_t targetUsage = 512 * 1024 * 1024;
+  if (tileRows * numCentroids * elementSize * 2 <= targetUsage) {
+    tileCols = numCentroids;
   } else {
-    targetUsage = 1024 * 1024 * 1024;
-  }
-
-  targetUsage /= 2 * elementSize;
+    // only query total memory in case it potentially impacts tilesize
+    size_t totalMem = rmm::available_device_memory().second;
 
-  // 512 seems to be a batch size sweetspot for float32.
-  // If we are on float16, increase to 512.
-  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
-  // increase to 1024.
-  size_t preferredTileRows = 512;
-  if (dim <= 32) { preferredTileRows = 1024; }
-
-  tileRows = std::min(preferredTileRows, numQueries);
-
-  // tileCols is the remainder size
-  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
+    if (totalMem > ((size_t)8) * 1024 * 1024 * 1024) {
+      targetUsage = 1024 * 1024 * 1024;
+    } else if (totalMem > ((size_t)4) * 1024 * 1024 * 1024) {
+      targetUsage = 768 * 1024 * 1024;
+    }
+
+    // tileCols is the remainder size
+    tileCols = std::min(targetUsage / (2 * elementSize * tileRows), numCentroids);
+  }
 }
 } // namespace cuvs::neighbors::detail::faiss_select
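Read as a whole, the change makes the device-memory query lazy: when a double-buffered tile spanning all numCentroids columns already fits in the 512 MB budget, rmm::available_device_memory() is never called. The divisor also switches from preferredTileRows to the actual tileRows = min(preferredTileRows, numQueries), which effectively yields wider column tiles when there are fewer queries than the preferred row count. Below is a consolidated sketch of the post-merge routine, reconstructed from the added lines above; the enclosing namespace is elided, and the includes are assumptions about what the file's surrounding headers already provide.

#include <algorithm>            // std::min
#include <cstddef>              // size_t
#include <rmm/cuda_device.hpp>  // rmm::available_device_memory (assumed header)

inline void chooseTileSize(size_t numQueries,
                           size_t numCentroids,
                           size_t dim,
                           size_t elementSize,
                           size_t& tileRows,
                           size_t& tileCols)
{
  // Prefer 512 rows per tile for float32; 1024 when the GEMM k-dim is small.
  size_t preferredTileRows = (dim <= 32) ? 1024 : 512;
  tileRows                 = std::min(preferredTileRows, numQueries);

  // Fast path: two in-flight tiles covering every centroid fit in 512 MB,
  // so the device-memory query can be skipped entirely.
  size_t targetUsage = 512 * 1024 * 1024;
  if (tileRows * numCentroids * elementSize * 2 <= targetUsage) {
    tileCols = numCentroids;
  } else {
    // Only now pay for the driver query; .second is the total device memory.
    size_t totalMem = rmm::available_device_memory().second;
    if (totalMem > ((size_t)8) * 1024 * 1024 * 1024) {
      targetUsage = 1024 * 1024 * 1024;
    } else if (totalMem > ((size_t)4) * 1024 * 1024 * 1024) {
      targetUsage = 768 * 1024 * 1024;
    }
    // Columns that fit the budget across two double-buffered tiles.
    tileCols = std::min(targetUsage / (2 * elementSize * tileRows), numCentroids);
  }
}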
cpp/src/neighbors/detail/knn_brute_force.cuh (15 additions, 20 deletions)

@@ -81,14 +81,12 @@ void tiled_brute_force_knn(const raft::resources& handle,
                            const uint32_t* filter_bitmap = nullptr)
 {
   // Figure out the number of rows/cols to tile for
-  size_t tile_rows   = 0;
-  size_t tile_cols   = 0;
-  auto stream        = raft::resource::get_cuda_stream(handle);
-  auto device_memory = raft::resource::get_workspace_resource(handle);
-  auto total_mem     = rmm::available_device_memory().second;
+  size_t tile_rows = 0;
+  size_t tile_cols = 0;
+  auto stream      = raft::resource::get_cuda_stream(handle);
 
   cuvs::neighbors::detail::faiss_select::chooseTileSize(
-    m, n, d, sizeof(DistanceT), total_mem, tile_rows, tile_cols);
+    m, n, d, sizeof(DistanceT), tile_rows, tile_cols);
 
   // for unittesting, its convenient to be able to put a max size on the tiles
   // so we can test the tiling logic without having to use huge inputs.
@@ -356,27 +354,26 @@ void brute_force_knn_impl(
 
   ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
 
-  std::vector<IdxType>* id_ranges;
-  if (translations == nullptr) {
+  std::vector<IdxType> id_ranges;
+  if (translations != nullptr) {
+    // use the given translations
+    id_ranges.insert(id_ranges.end(), translations->begin(), translations->end());
+  } else if (input.size() > 1) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges       = new std::vector<IdxType>();
     IdxType total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
-      id_ranges->push_back(total_n);
+      id_ranges.push_back(total_n);
       total_n += sizes[i];
     }
-  } else {
-    // otherwise, use the given translations
-    id_ranges = translations;
   }
 
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  rmm::device_uvector<IdxType> trans(id_ranges->size(), userStream);
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
+  rmm::device_uvector<IdxType> trans(0, userStream);
+  if (id_ranges.size() > 0) {
+    trans.resize(id_ranges.size(), userStream);
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), userStream);
+  }
 
   rmm::device_uvector<DistType> all_D(0, userStream);
   rmm::device_uvector<IdxType> all_I(0, userStream);
@@ -513,8 +510,6 @@
     // no translations or partitions to combine, it can be skipped.
     knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
   }
-
-  if (translations == nullptr) delete id_ranges;
 };
 
 template <typename T,
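The id_ranges rework in brute_force_knn_impl is the memory-safety half of this merge: the old code aliased an owning raw pointer to either a fresh heap vector or the caller's translations, had to remember the trailing conditional delete, issued a host-to-device copy even for a single untranslated partition, and carried an unused cudaGetDevice call. The new code uses a plain value vector, so RAII handles cleanup and an empty vector doubles as the "skip the upload" signal. A self-contained sketch of the same pattern follows, under hypothetical names; build_offsets is an illustration, not a function in this file.

#include <cstddef>
#include <cstdint>
#include <vector>

// Value semantics instead of an owning raw pointer: nothing to delete, and an
// empty result tells the caller no device-side copy is needed.
std::vector<int64_t> build_offsets(const std::vector<int64_t>* translations,
                                   const std::vector<std::size_t>& sizes)
{
  std::vector<int64_t> id_ranges;
  if (translations != nullptr) {
    // use the caller-provided translations
    id_ranges.insert(id_ranges.end(), translations->begin(), translations->end());
  } else if (sizes.size() > 1) {
    // otherwise derive prefix-sum offsets from the local partition sizes
    int64_t total_n = 0;
    for (std::size_t size : sizes) {
      id_ranges.push_back(total_n);
      total_n += static_cast<int64_t>(size);
    }
  }
  return id_ranges;  // empty => single partition, skip the upload
}

With this shape, the caller copies to the device only when the result is non-empty, which matches the resize-then-update_device guard in the diff above.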