faiss paper benchmarks (facebookresearch#3189)
Summary:
- IVF benchmarks: `bench_fw_ivf.py bigann /checkpoint/gsz/bench_fw/ivf`
- Codec benchmarks: `bench_fw_codecs.py contriever /checkpoint/gsz/bench_fw/codecs` and `bench_fw_codecs.py deep1b /checkpoint/gsz/bench_fw/codecs`
- A range codec evaluation: `bench_fw_range.py ssnpp /checkpoint/gsz/bench_fw/range`
- Visualize with `bench_fw_notebook.ipynb`
- Support for running on a cluster

Pull Request resolved: facebookresearch#3189

Reviewed By: mdouze

Differential Revision: D52544642

Pulled By: algoriddle

fbshipit-source-id: 21dcdfd076aef6d36467c908e6be78ef851b0e98
algoriddle authored and facebook-github-bot committed Jan 5, 2024
1 parent b7681be commit beef610
Showing 12 changed files with 1,729 additions and 553 deletions.
482 changes: 325 additions & 157 deletions benchs/bench_fw/benchmark.py

Large diffs are not rendered by default.

63 changes: 46 additions & 17 deletions benchs/bench_fw/benchmark_io.py
@@ -10,6 +10,7 @@
 import os
 import pickle
 from dataclasses import dataclass
+import submitit
 from typing import Any, List, Optional
 from zipfile import ZipFile

@@ -106,7 +107,7 @@ def write_file(
         fn = self.get_local_filename(filename)
         with ZipFile(fn, "w") as zip_file:
             for key, value in zip(keys, values, strict=True):
-                with zip_file.open(key, "w") as f:
+                with zip_file.open(key, "w", force_zip64=True) as f:
                     if key in ["D", "I", "R", "lims"]:
                         np.save(f, value)
                     elif key in ["P"]:
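The new force_zip64=True matters because write_file streams serialized numpy arrays of unknown size into the archive, and without ZIP64 extensions a zip member opened for writing is capped at 4 GiB. A minimal standalone sketch of the same pattern, with a hypothetical file name and array (not taken from this diff):

import numpy as np
from zipfile import ZipFile

# Stream an array into a zip member whose final size is not known up front;
# force_zip64=True enables ZIP64 so the member may grow past 4 GiB.
vectors = np.random.rand(1000, 96).astype(np.float32)
with ZipFile("results.zip", "w") as zip_file:
    with zip_file.open("D", "w", force_zip64=True) as f:
        np.save(f, vectors)

# Reading the member back:
with ZipFile("results.zip", "r") as zip_file:
    with zip_file.open("D") as f:
        restored = np.load(f)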
@@ -117,22 +118,22 @@ def write_file(
         self.upload_file_to_blobstore(filename, overwrite=overwrite)

     def get_dataset(self, dataset):
-        if dataset.namespace is not None and dataset.namespace[:4] == "std_":
-            if dataset.tablename not in self.cached_ds:
-                self.cached_ds[dataset.tablename] = dataset_from_name(
-                    dataset.tablename,
-                )
-            p = dataset.namespace[4]
-            if p == "t":
-                return self.cached_ds[dataset.tablename].get_train()
-            elif p == "d":
-                return self.cached_ds[dataset.tablename].get_database()
-            elif p == "q":
-                return self.cached_ds[dataset.tablename].get_queries()
-            else:
-                raise ValueError
-        elif dataset not in self.cached_ds:
-            if dataset.namespace == "syn":
+        if dataset not in self.cached_ds:
+            if dataset.namespace is not None and dataset.namespace[:4] == "std_":
+                if dataset.tablename not in self.cached_ds:
+                    self.cached_ds[dataset.tablename] = dataset_from_name(
+                        dataset.tablename,
+                    )
+                p = dataset.namespace[4]
+                if p == "t":
+                    self.cached_ds[dataset] = self.cached_ds[dataset.tablename].get_train(dataset.num_vectors)
+                elif p == "d":
+                    self.cached_ds[dataset] = self.cached_ds[dataset.tablename].get_database()
+                elif p == "q":
+                    self.cached_ds[dataset] = self.cached_ds[dataset.tablename].get_queries()
+                else:
+                    raise ValueError
+            elif dataset.namespace == "syn":
                 d, seed = dataset.tablename.split("_")
                 d = int(d)
                 seed = int(seed)
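The rewritten get_dataset memoizes every resolved dataset in self.cached_ds keyed by the descriptor object itself rather than returning a fresh split on each call, which is also why descriptors need to be hashable (see the __hash__ added to IndexDescriptor below). A rough sketch of that pattern with a toy descriptor and a stand-in loader, both hypothetical:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyDescriptor:  # stand-in for the real dataset descriptor
    namespace: Optional[str] = None
    tablename: Optional[str] = None
    num_vectors: Optional[int] = None

    def __hash__(self):
        # hash the string form so instances can key a cache dict
        return hash(str(self))

_cache = {}

def get_dataset(descriptor):
    # resolve the descriptor once, then serve repeats from the cache
    if descriptor not in _cache:
        _cache[descriptor] = load_vectors(descriptor)  # hypothetical loader
    return _cache[descriptor]

def load_vectors(descriptor):
    print(f"loading {descriptor.tablename}")
    return list(range(descriptor.num_vectors or 0))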
@@ -225,3 +226,31 @@ def write_index(
         logger.info(f"Saving index to {fn}")
         faiss.write_index(index, fn)
         self.upload_file_to_blobstore(filename)
+        assert os.path.exists(fn)
+        return os.path.getsize(fn)
+
+    def launch_jobs(self, func, params, local=True):
+        if local:
+            results = [func(p) for p in params]
+            return results
+        print(f'launching {len(params)} jobs')
+        executor = submitit.AutoExecutor(folder='/checkpoint/gsz/jobs')
+        executor.update_parameters(
+            nodes=1,
+            gpus_per_node=8,
+            cpus_per_task=80,
+            # mem_gb=640,
+            tasks_per_node=1,
+            name="faiss_benchmark",
+            slurm_array_parallelism=512,
+            slurm_partition="scavenge",
+            slurm_time=4 * 60,
+            slurm_constraint="bldg1",
+        )
+        jobs = executor.map_array(func, params)
+        print(f'launched {len(jobs)} jobs')
+        # for job, param in zip(jobs, params):
+        #     print(f"{job.job_id=} {param=}")
+        results = [job.result() for job in jobs]
+        print(f'received {len(results)} results')
+        return results
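launch_jobs is the hook behind the "running on a cluster" bullet in the summary: with local=True it simply maps func over params in-process, otherwise it fans the parameters out as a submitit/SLURM job array and blocks until every result is back. A hedged usage sketch; the benchmark function, parameter dicts, and the io instance are illustrative assumptions, not part of this diff:

# run_one is a stand-in for whatever callable the benchmark framework submits.
# For the cluster path it must be picklable (a top-level function, not a lambda).
def run_one(param):
    return {"nprobe": param["nprobe"], "recall": 0.0}  # placeholder result

params = [{"nprobe": p} for p in (1, 4, 16, 64)]

# Local run: equivalent to [run_one(p) for p in params].
results = io.launch_jobs(run_one, params, local=True)

# Cluster run: each param becomes one task in the SLURM array configured above
# (partition, time limit, parallelism), and results are gathered via job.result().
# results = io.launch_jobs(run_one, params, local=False)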
27 changes: 27 additions & 0 deletions benchs/bench_fw/descriptors.py
@@ -4,8 +4,13 @@
 # LICENSE file in the root directory of this source tree.

 from dataclasses import dataclass
+import logging
 from typing import Any, Dict, List, Optional

 import faiss  # @manual=//faiss/python:pyfaiss_gpu
+from .utils import timer
+
+logger = logging.getLogger(__name__)


 @dataclass
 class IndexDescriptor:
@@ -33,6 +38,10 @@ class IndexDescriptor:
     # [radius2_from, radius2_to) -> score2
     range_metrics: Optional[Dict[str, Any]] = None
     radius: Optional[float] = None
+    training_size: Optional[int] = None
+
+    def __hash__(self):
+        return hash(str(self))


 @dataclass
@@ -85,3 +94,21 @@ def get_filename(
             filename += f"_{self.num_vectors}"
         filename += "."
         return filename
+
+    def k_means(self, io, k, dry_run):
+        logger.info(f"k_means {k} {self}")
+        kmeans_vectors = DatasetDescriptor(
+            tablename=f"{self.get_filename()}kmeans_{k}.npy"
+        )
+        meta_filename = kmeans_vectors.tablename + ".json"
+        if not io.file_exist(kmeans_vectors.tablename) or not io.file_exist(meta_filename):
+            if dry_run:
+                return None, None, kmeans_vectors.tablename
+            x = io.get_dataset(self)
+            kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True)
+            _, t, _ = timer("k_means", lambda: kmeans.train(x))
+            io.write_nparray(kmeans.centroids, kmeans_vectors.tablename)
+            io.write_json({"k_means_time": t}, meta_filename)
+        else:
+            t = io.read_json(meta_filename)["k_means_time"]
+        return kmeans_vectors, t, None
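The new k_means helper caches trained centroids as a derived dataset (an .npy named after the parent descriptor) plus a JSON record of the training time, and with dry_run=True it only reports which artifact is missing. A hedged sketch of how a caller might drive it; the descriptor values, k, and the io object are assumptions for illustration:

# ds points at a training split (the "std_t" namespace maps to get_train above);
# io is the benchmark IO object providing file_exist / get_dataset / write_* / read_json.
ds = DatasetDescriptor(namespace="std_t", tablename="bigann", num_vectors=2_000_000)

# Dry run: nothing is trained, only the missing artifact name is reported.
centroids_ds, train_time, missing = ds.k_means(io, k=65536, dry_run=True)
if missing is not None:
    print(f"would compute {missing}")

# Real run: trains faiss.Kmeans on GPU, caches centroids and k_means_time;
# subsequent calls read both back instead of retraining.
centroids_ds, train_time, _ = ds.k_means(io, k=65536, dry_run=False)
print(f"k-means took {train_time:.1f}s -> {centroids_ds.tablename}")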