From ed2aa399ec92bb90803c165a19e1d66309ab139b Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Tue, 23 May 2023 07:20:35 -0700 Subject: [PATCH] move by_residual to IndexIVF (#2870) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2870 Factor by_residual for all the IndexIVF inheritors. Some training code can be put in IndexIVF and `train_residual` is replaced with `train_encoder`. This will be used for the IndependentQuantizer work. Reviewed By: alexanderguzhva Differential Revision: D45987304 fbshipit-source-id: c79e9f21e66e185480b93b6c149841ad90bb92a1 --- c_api/IndexIVF_c.cpp | 11 ++++ c_api/IndexIVF_c.h | 6 ++ c_api/IndexScalarQuantizer_c.cpp | 10 --- c_api/IndexScalarQuantizer_c.h | 5 -- faiss/IndexIVF.cpp | 42 ++++++++++-- faiss/IndexIVF.h | 19 ++++-- faiss/IndexIVFAdditiveQuantizer.cpp | 26 +++----- faiss/IndexIVFAdditiveQuantizer.h | 5 +- faiss/IndexIVFAdditiveQuantizerFastScan.cpp | 46 +++---------- faiss/IndexIVFAdditiveQuantizerFastScan.h | 4 +- faiss/IndexIVFFastScan.cpp | 3 + faiss/IndexIVFFastScan.h | 1 - faiss/IndexIVFFlat.cpp | 7 ++ faiss/IndexIVFFlat.h | 2 +- faiss/IndexIVFPQ.cpp | 72 +++------------------ faiss/IndexIVFPQ.h | 7 +- faiss/IndexIVFPQFastScan.cpp | 52 +++------------ faiss/IndexIVFPQFastScan.h | 4 +- faiss/IndexIVFPQR.cpp | 37 ++++++++--- faiss/IndexIVFPQR.h | 4 +- faiss/IndexIVFSpectralHash.cpp | 22 +++---- faiss/IndexIVFSpectralHash.h | 12 ++-- faiss/IndexScalarQuantizer.cpp | 23 ++++--- faiss/IndexScalarQuantizer.h | 7 +- faiss/gpu/GpuIndexIVFScalarQuantizer.cu | 12 +++- faiss/impl/FaissException.h | 18 ++++++ faiss/impl/ScalarQuantizer.cpp | 26 -------- faiss/impl/ScalarQuantizer.h | 8 --- faiss/impl/index_write.cpp | 2 + 29 files changed, 224 insertions(+), 269 deletions(-) diff --git a/c_api/IndexIVF_c.cpp b/c_api/IndexIVF_c.cpp index 5c54fb6e29..9a6f39dfa1 100644 --- a/c_api/IndexIVF_c.cpp +++ b/c_api/IndexIVF_c.cpp @@ -165,6 +165,17 @@ void faiss_IndexIVF_invlists_get_ids( memcpy(invlist, list, list_size * sizeof(idx_t)); } +int faiss_IndexIVF_train_encoder( + FaissIndexIVF* index, + idx_t n, + const float* x, + const idx_t* assign) { + try { + reinterpret_cast(index)->train_encoder(n, x, assign); + } + CATCH_AND_HANDLE +} + void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats) { reinterpret_cast(stats)->reset(); } diff --git a/c_api/IndexIVF_c.h b/c_api/IndexIVF_c.h index 3ef8a9ad7f..98a09c2668 100644 --- a/c_api/IndexIVF_c.h +++ b/c_api/IndexIVF_c.h @@ -154,6 +154,12 @@ void faiss_IndexIVF_invlists_get_ids( size_t list_no, idx_t* invlist); +int faiss_IndexIVF_train_encoder( + FaissIndexIVF* index, + idx_t n, + const float* x, + const idx_t* assign); + typedef struct FaissIndexIVFStats { size_t nq; // nb of queries run size_t nlist; // nb of inverted lists scanned diff --git a/c_api/IndexScalarQuantizer_c.cpp b/c_api/IndexScalarQuantizer_c.cpp index 5c00e342c3..9f3393e831 100644 --- a/c_api/IndexScalarQuantizer_c.cpp +++ b/c_api/IndexScalarQuantizer_c.cpp @@ -110,13 +110,3 @@ int faiss_IndexIVFScalarQuantizer_add_core( } CATCH_AND_HANDLE } - -int faiss_IndexIVFScalarQuantizer_train_residual( - FaissIndexIVFScalarQuantizer* index, - idx_t n, - const float* x) { - try { - reinterpret_cast(index)->train_residual(n, x); - } - CATCH_AND_HANDLE -} diff --git a/c_api/IndexScalarQuantizer_c.h b/c_api/IndexScalarQuantizer_c.h index becdb201e0..2c5e3f2942 100644 --- a/c_api/IndexScalarQuantizer_c.h +++ b/c_api/IndexScalarQuantizer_c.h @@ -88,11 +88,6 @@ int faiss_IndexIVFScalarQuantizer_add_core( const idx_t* xids, const idx_t* precomputed_idx); -int faiss_IndexIVFScalarQuantizer_train_residual( - FaissIndexIVFScalarQuantizer* index, - idx_t n, - const float* x); - #ifdef __cplusplus } #endif diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 19e18e4666..aeaca78011 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -1061,22 +1061,52 @@ void IndexIVF::update_vectors(int n, const idx_t* new_ids, const float* x) { } void IndexIVF::train(idx_t n, const float* x) { - if (verbose) + if (verbose) { printf("Training level-1 quantizer\n"); + } train_q1(n, x, verbose, metric_type); - if (verbose) + if (verbose) { printf("Training IVF residual\n"); + } + + // optional subsampling + idx_t max_nt = train_encoder_num_vectors(); + if (max_nt <= 0) { + max_nt = (size_t)1 << 35; + } + + TransformedVectors tv( + x, fvecs_maybe_subsample(d, (size_t*)&n, max_nt, x, verbose)); + + if (by_residual) { + std::vector assign(n); + quantizer->assign(n, tv.x, assign.data()); + + std::vector residuals(n * d); + quantizer->compute_residual_n(n, tv.x, residuals.data(), assign.data()); + + train_encoder(n, residuals.data(), assign.data()); + } else { + train_encoder(n, tv.x, nullptr); + } - train_residual(n, x); is_trained = true; } -void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) { - if (verbose) - printf("IndexIVF: no residual training\n"); +idx_t IndexIVF::train_encoder_num_vectors() const { + return 0; +} + +void IndexIVF::train_encoder( + idx_t /*n*/, + const float* /*x*/, + const idx_t* assign) { // does nothing by default + if (verbose) { + printf("IndexIVF: no residual training\n"); + } } bool check_compatible_for_merge_expensive_check = true; diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index ade8b5113d..a4a40194f9 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -177,6 +177,7 @@ struct IndexIVF : Index, IndexIVFInterface { bool own_invlists = false; size_t code_size = 0; ///< code size per vector in bytes + /** Parallel mode determines how queries are parallelized with OpenMP * * 0 (default): split over queries @@ -194,6 +195,10 @@ struct IndexIVF : Index, IndexIVFInterface { * enables reconstruct() */ DirectMap direct_map; + /// do the codes in the invlists encode the vectors relative to the + /// centroids? + bool by_residual = true; + /** The Inverted file takes a quantizer (an Index) on input, * which implements the function mapping a vector to a list * identifier. @@ -207,7 +212,7 @@ struct IndexIVF : Index, IndexIVFInterface { void reset() override; - /// Trains the quantizer and calls train_residual to train sub-quantizers + /// Trains the quantizer and calls train_encoder to train sub-quantizers void train(idx_t n, const float* x) override; /// Calls add_with_ids with NULL ids @@ -252,9 +257,15 @@ struct IndexIVF : Index, IndexIVFInterface { */ void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids); - /// Sub-classes that encode the residuals can train their encoders here - /// does nothing by default - virtual void train_residual(idx_t n, const float* x); + /** Train the encoder for the vectors. + * + * If by_residual then it is called with residuals and corresponding assign + * array, otherwise x is the raw training vectors and assign=nullptr */ + virtual void train_encoder(idx_t n, const float* x, const idx_t* assign); + + /// can be redefined by subclasses to indicate how many training vectors + /// they need + virtual idx_t train_encoder_num_vectors() const; void search_preassigned( idx_t n, diff --git a/faiss/IndexIVFAdditiveQuantizer.cpp b/faiss/IndexIVFAdditiveQuantizer.cpp index 0fa836aa08..54779792b8 100644 --- a/faiss/IndexIVFAdditiveQuantizer.cpp +++ b/faiss/IndexIVFAdditiveQuantizer.cpp @@ -37,30 +37,20 @@ IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer( IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq) : IndexIVF(), aq(aq) {} -void IndexIVFAdditiveQuantizer::train_residual(idx_t n, const float* x) { - const float* x_in = x; +void IndexIVFAdditiveQuantizer::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + aq->train(n, x); +} +idx_t IndexIVFAdditiveQuantizer::train_encoder_num_vectors() const { size_t max_train_points = 1024 * ((size_t)1 << aq->nbits[0]); // we need more data to train LSQ if (dynamic_cast(aq)) { max_train_points = 1024 * aq->M * ((size_t)1 << aq->nbits[0]); } - - x = fvecs_maybe_subsample( - d, (size_t*)&n, max_train_points, x, verbose, 1234); - ScopeDeleter del_x(x_in == x ? nullptr : x); - - if (by_residual) { - std::vector idx(n); - quantizer->assign(n, x, idx.data()); - - std::vector residuals(n * d); - quantizer->compute_residual_n(n, x, residuals.data(), idx.data()); - - aq->train(n, residuals.data()); - } else { - aq->train(n, x); - } + return max_train_points; } void IndexIVFAdditiveQuantizer::encode_vectors( diff --git a/faiss/IndexIVFAdditiveQuantizer.h b/faiss/IndexIVFAdditiveQuantizer.h index 483f5e4b6e..d065947d09 100644 --- a/faiss/IndexIVFAdditiveQuantizer.h +++ b/faiss/IndexIVFAdditiveQuantizer.h @@ -26,7 +26,6 @@ namespace faiss { struct IndexIVFAdditiveQuantizer : IndexIVF { // the quantizer AdditiveQuantizer* aq; - bool by_residual = true; int use_precomputed_table = 0; // for future use using Search_type_t = AdditiveQuantizer::Search_type_t; @@ -40,7 +39,9 @@ struct IndexIVFAdditiveQuantizer : IndexIVF { explicit IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void encode_vectors( idx_t n, diff --git a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp index e44e70c77c..2f41f87bcc 100644 --- a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +++ b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp @@ -131,45 +131,20 @@ IndexIVFAdditiveQuantizerFastScan::~IndexIVFAdditiveQuantizerFastScan() {} * Training *********************************************************/ -void IndexIVFAdditiveQuantizerFastScan::train_residual( +idx_t IndexIVFAdditiveQuantizerFastScan::train_encoder_num_vectors() const { + return max_train_points; +} + +void IndexIVFAdditiveQuantizerFastScan::train_encoder( idx_t n, - const float* x_in) { + const float* x, + const idx_t* assign) { if (aq->is_trained) { return; } - const int seed = 0x12345; - size_t nt = n; - const float* x = fvecs_maybe_subsample( - d, &nt, max_train_points, x_in, verbose, seed); - n = nt; if (verbose) { - printf("training additive quantizer on %zd vectors\n", nt); - } - aq->verbose = verbose; - - std::unique_ptr del_x; - if (x != x_in) { - del_x.reset((float*)x); - } - - const float* trainset; - std::vector residuals(n * d); - std::vector assign(n); - - if (by_residual) { - if (verbose) { - printf("computing residuals\n"); - } - quantizer->assign(n, x, assign.data()); - residuals.resize(n * d); - for (idx_t i = 0; i < n; i++) { - quantizer->compute_residual( - x + i * d, residuals.data() + i * d, assign[i]); - } - trainset = residuals.data(); - } else { - trainset = x; + printf("training additive quantizer on %d vectors\n", int(n)); } if (verbose) { @@ -181,17 +156,16 @@ void IndexIVFAdditiveQuantizerFastScan::train_residual( d); } aq->verbose = verbose; - aq->train(n, trainset); + aq->train(n, x); // train norm quantizer if (by_residual && metric_type == METRIC_L2) { std::vector decoded_x(n * d); std::vector x_codes(n * aq->code_size); - aq->compute_codes(residuals.data(), x_codes.data(), n); + aq->compute_codes(x, x_codes.data(), n); aq->decode(x_codes.data(), decoded_x.data(), n); // add coarse centroids - FAISS_THROW_IF_NOT(assign.size() == n); std::vector centroid(d); for (idx_t i = 0; i < n; i++) { auto xi = decoded_x.data() + i * d; diff --git a/faiss/IndexIVFAdditiveQuantizerFastScan.h b/faiss/IndexIVFAdditiveQuantizerFastScan.h index 7a70a3ba46..24ce7287ec 100644 --- a/faiss/IndexIVFAdditiveQuantizerFastScan.h +++ b/faiss/IndexIVFAdditiveQuantizerFastScan.h @@ -63,7 +63,9 @@ struct IndexIVFAdditiveQuantizerFastScan : IndexIVFFastScan { const IndexIVFAdditiveQuantizer& orig, int bbs = 32); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void estimate_norm_scale(idx_t n, const float* x); diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 701edc1fd7..800172cc9e 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -43,6 +43,8 @@ IndexIVFFastScan::IndexIVFFastScan( size_t code_size, MetricType metric) : IndexIVF(quantizer, d, nlist, code_size, metric) { + // unlike other indexes, we prefer no residuals for performance reasons. + by_residual = false; FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); } @@ -50,6 +52,7 @@ IndexIVFFastScan::IndexIVFFastScan() { bbs = 0; M2 = 0; is_trained = false; + by_residual = false; } void IndexIVFFastScan::init_fastscan( diff --git a/faiss/IndexIVFFastScan.h b/faiss/IndexIVFFastScan.h index fd7d021137..c1a6b506c1 100644 --- a/faiss/IndexIVFFastScan.h +++ b/faiss/IndexIVFFastScan.h @@ -45,7 +45,6 @@ struct IndexIVFFastScan : IndexIVF { int implem = 0; // skip some parts of the computation (for timing) int skip = 0; - bool by_residual = false; // batching factors at search time (0 = default) int qbs = 0; diff --git a/faiss/IndexIVFFlat.cpp b/faiss/IndexIVFFlat.cpp index 03f8cb0dc4..a6f090c2c9 100644 --- a/faiss/IndexIVFFlat.cpp +++ b/faiss/IndexIVFFlat.cpp @@ -36,6 +36,11 @@ IndexIVFFlat::IndexIVFFlat( MetricType metric) : IndexIVF(quantizer, d, nlist, sizeof(float) * d, metric) { code_size = sizeof(float) * d; + by_residual = false; +} + +IndexIVFFlat::IndexIVFFlat() { + by_residual = false; } void IndexIVFFlat::add_core( @@ -45,6 +50,7 @@ void IndexIVFFlat::add_core( const int64_t* coarse_idx) { FAISS_THROW_IF_NOT(is_trained); FAISS_THROW_IF_NOT(coarse_idx); + FAISS_THROW_IF_NOT(!by_residual); assert(invlists); direct_map.check_can_add(xids); @@ -89,6 +95,7 @@ void IndexIVFFlat::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { + FAISS_THROW_IF_NOT(!by_residual); if (!include_listnos) { memcpy(codes, x, code_size * n); } else { diff --git a/faiss/IndexIVFFlat.h b/faiss/IndexIVFFlat.h index 1ecc6ffc74..a0233052fa 100644 --- a/faiss/IndexIVFFlat.h +++ b/faiss/IndexIVFFlat.h @@ -50,7 +50,7 @@ struct IndexIVFFlat : IndexIVF { void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - IndexIVFFlat() {} + IndexIVFFlat(); }; struct IndexIVFFlatDedup : IndexIVFFlat { diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index fd91738ad1..60633cc41b 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -64,74 +64,16 @@ IndexIVFPQ::IndexIVFPQ( /**************************************************************** * training */ -void IndexIVFPQ::train_residual(idx_t n, const float* x) { - train_residual_o(n, x, nullptr); -} - -void IndexIVFPQ::train_residual_o(idx_t n, const float* x, float* residuals_2) { - const float* x_in = x; - - x = fvecs_maybe_subsample( - d, - (size_t*)&n, - pq.cp.max_points_per_centroid * pq.ksub, - x, - verbose, - pq.cp.seed); - - ScopeDeleter del_x(x_in == x ? nullptr : x); - - const float* trainset; - ScopeDeleter del_residuals; - if (by_residual) { - if (verbose) - printf("computing residuals\n"); - idx_t* assign = new idx_t[n]; // assignement to coarse centroids - ScopeDeleter del(assign); - quantizer->assign(n, x, assign); - float* residuals = new float[n * d]; - del_residuals.set(residuals); - for (idx_t i = 0; i < n; i++) - quantizer->compute_residual( - x + i * d, residuals + i * d, assign[i]); - - trainset = residuals; - } else { - trainset = x; - } - if (verbose) - printf("training %zdx%zd product quantizer on %" PRId64 - " vectors in %dD\n", - pq.M, - pq.ksub, - n, - d); - pq.verbose = verbose; - pq.train(n, trainset); +void IndexIVFPQ::train_encoder(idx_t n, const float* x, const idx_t* assign) { + pq.train(n, x); if (do_polysemous_training) { if (verbose) printf("doing polysemous training for PQ\n"); PolysemousTraining default_pt; - PolysemousTraining* pt = polysemous_training; - if (!pt) - pt = &default_pt; - pt->optimize_pq_for_hamming(pq, n, trainset); - } - - // prepare second-level residuals for refine PQ - if (residuals_2) { - uint8_t* train_codes = new uint8_t[pq.code_size * n]; - ScopeDeleter del(train_codes); - pq.compute_codes(trainset, train_codes, n); - - for (idx_t i = 0; i < n; i++) { - const float* xx = trainset + i * d; - float* res = residuals_2 + i * d; - pq.decode(train_codes + i * pq.code_size, res); - for (int j = 0; j < d; j++) - res[j] = xx[j] - res[j]; - } + PolysemousTraining* pt = + polysemous_training ? polysemous_training : &default_pt; + pt->optimize_pq_for_hamming(pq, n, x); } if (by_residual) { @@ -139,6 +81,10 @@ void IndexIVFPQ::train_residual_o(idx_t n, const float* x, float* residuals_2) { } } +idx_t IndexIVFPQ::train_encoder_num_vectors() const { + return pq.cp.max_points_per_centroid * pq.ksub; +} + /**************************************************************** * IVFPQ as codec */ diff --git a/faiss/IndexIVFPQ.h b/faiss/IndexIVFPQ.h index 58c85fa27b..ab49f1e549 100644 --- a/faiss/IndexIVFPQ.h +++ b/faiss/IndexIVFPQ.h @@ -32,8 +32,6 @@ FAISS_API extern size_t precomputed_table_max_bytes; * vector is encoded as a product quantizer code. */ struct IndexIVFPQ : IndexIVF { - bool by_residual; ///< Encode residual or plain vector? - ProductQuantizer pq; ///< produces the codes bool do_polysemous_training; ///< reorder PQ centroids after training? @@ -86,10 +84,9 @@ struct IndexIVFPQ : IndexIVF { const idx_t* precomputed_idx = nullptr); /// trains the product quantizer - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; - /// same as train_residual, also output 2nd level residuals - void train_residual_o(idx_t n, const float* x, float* residuals_2); + idx_t train_encoder_num_vectors() const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; diff --git a/faiss/IndexIVFPQFastScan.cpp b/faiss/IndexIVFPQFastScan.cpp index 07d88bf50e..b44b71ec67 100644 --- a/faiss/IndexIVFPQFastScan.cpp +++ b/faiss/IndexIVFPQFastScan.cpp @@ -44,7 +44,7 @@ IndexIVFPQFastScan::IndexIVFPQFastScan( MetricType metric, int bbs) : IndexIVFFastScan(quantizer, d, nlist, 0, metric), pq(d, M, nbits) { - by_residual = false; // set to false by default because it's much faster + by_residual = false; // set to false by default because it's faster init_fastscan(M, nbits, nlist, metric, bbs); } @@ -106,54 +106,22 @@ IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs) * Training *********************************************************/ -void IndexIVFPQFastScan::train_residual(idx_t n, const float* x_in) { - const float* x = fvecs_maybe_subsample( - d, - (size_t*)&n, - pq.cp.max_points_per_centroid * pq.ksub, - x_in, - verbose, - pq.cp.seed); - - std::unique_ptr del_x; - if (x != x_in) { - del_x.reset((float*)x); - } - - const float* trainset; - AlignedTable residuals; - - if (by_residual) { - if (verbose) - printf("computing residuals\n"); - std::vector assign(n); - quantizer->assign(n, x, assign.data()); - residuals.resize(n * d); - for (idx_t i = 0; i < n; i++) { - quantizer->compute_residual( - x + i * d, residuals.data() + i * d, assign[i]); - } - trainset = residuals.data(); - } else { - trainset = x; - } - - if (verbose) { - printf("training %zdx%zd product quantizer on " - "%" PRId64 " vectors in %dD\n", - pq.M, - pq.ksub, - n, - d); - } +void IndexIVFPQFastScan::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { pq.verbose = verbose; - pq.train(n, trainset); + pq.train(n, x); if (by_residual && metric_type == METRIC_L2) { precompute_table(); } } +idx_t IndexIVFPQFastScan::train_encoder_num_vectors() const { + return pq.cp.max_points_per_centroid * pq.ksub; +} + void IndexIVFPQFastScan::precompute_table() { initialize_IVFPQ_precomputed_table( use_precomputed_table, diff --git a/faiss/IndexIVFPQFastScan.h b/faiss/IndexIVFPQFastScan.h index 55c5430b64..9a79833591 100644 --- a/faiss/IndexIVFPQFastScan.h +++ b/faiss/IndexIVFPQFastScan.h @@ -54,7 +54,9 @@ struct IndexIVFPQFastScan : IndexIVFFastScan { // built from an IndexIVFPQ explicit IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs = 32); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; /// build precomputed table, possibly updating use_precomputed_table void precompute_table(); diff --git a/faiss/IndexIVFPQR.cpp b/faiss/IndexIVFPQR.cpp index f60302396d..5a5b88d94d 100644 --- a/faiss/IndexIVFPQR.cpp +++ b/faiss/IndexIVFPQR.cpp @@ -35,10 +35,12 @@ IndexIVFPQR::IndexIVFPQR( refine_pq(d, M_refine, nbits_per_idx_refine), k_factor(4) { by_residual = true; + refine_pq.cp.max_points_per_centroid = 1000; } IndexIVFPQR::IndexIVFPQR() : k_factor(1) { by_residual = true; + refine_pq.cp.max_points_per_centroid = 1000; } void IndexIVFPQR::reset() { @@ -46,24 +48,39 @@ void IndexIVFPQR::reset() { refine_codes.clear(); } -void IndexIVFPQR::train_residual(idx_t n, const float* x) { - float* residual_2 = new float[n * d]; - ScopeDeleter del(residual_2); - - train_residual_o(n, x, residual_2); - - if (verbose) +void IndexIVFPQR::train_encoder(idx_t n, const float* x, const idx_t* assign) { + IndexIVFPQ::train_encoder(n, x, assign); + if (verbose) { printf("training %zdx%zd 2nd level PQ quantizer on %" PRId64 " %dD-vectors\n", refine_pq.M, refine_pq.ksub, n, d); - - refine_pq.cp.max_points_per_centroid = 1000; + } refine_pq.cp.verbose = verbose; - refine_pq.train(n, residual_2); + // 2nd level residual + std::vector residual_2(n * d); + std::vector train_codes(pq.code_size * n); + pq.compute_codes(x, train_codes.data(), n); + + for (idx_t i = 0; i < n; i++) { + const float* xx = x + i * d; + float* res = residual_2.data() + i * d; + pq.decode(train_codes.data() + i * pq.code_size, res); + for (int j = 0; j < d; j++) { + res[j] = xx[j] - res[j]; + } + } + + refine_pq.train(n, residual_2.data()); +} + +idx_t IndexIVFPQR::train_encoder_num_vectors() const { + return std::max( + pq.cp.max_points_per_centroid * pq.ksub, + refine_pq.cp.max_points_per_centroid * refine_pq.ksub); } void IndexIVFPQR::add_with_ids(idx_t n, const float* x, const idx_t* xids) { diff --git a/faiss/IndexIVFPQR.h b/faiss/IndexIVFPQR.h index 55756f59f9..73502879f2 100644 --- a/faiss/IndexIVFPQR.h +++ b/faiss/IndexIVFPQR.h @@ -37,7 +37,9 @@ struct IndexIVFPQR : IndexIVFPQ { size_t remove_ids(const IDSelector& sel) override; /// trains the two product quantizers - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp index 61de5aecb3..443c45dee6 100644 --- a/faiss/IndexIVFSpectralHash.cpp +++ b/faiss/IndexIVFSpectralHash.cpp @@ -31,22 +31,17 @@ IndexIVFSpectralHash::IndexIVFSpectralHash( float period) : IndexIVF(quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2), nbit(nbit), - period(period), - threshold_type(Thresh_global) { + period(period) { RandomRotationMatrix* rr = new RandomRotationMatrix(d, nbit); rr->init(1234); vt = rr; - own_fields = true; is_trained = false; + by_residual = false; } -IndexIVFSpectralHash::IndexIVFSpectralHash() - : IndexIVF(), - vt(nullptr), - own_fields(false), - nbit(0), - period(0), - threshold_type(Thresh_global) {} +IndexIVFSpectralHash::IndexIVFSpectralHash() : IndexIVF() { + by_residual = false; +} IndexIVFSpectralHash::~IndexIVFSpectralHash() { if (own_fields) { @@ -67,10 +62,14 @@ float median(size_t n, float* x) { } // namespace -void IndexIVFSpectralHash::train_residual(idx_t n, const float* x) { +void IndexIVFSpectralHash::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { if (!vt->is_trained) { vt->train(n, x); } + FAISS_THROW_IF_NOT(!by_residual); if (threshold_type == Thresh_global) { // nothing to do @@ -167,6 +166,7 @@ void IndexIVFSpectralHash::encode_vectors( uint8_t* codes, bool include_listnos) const { FAISS_THROW_IF_NOT(is_trained); + FAISS_THROW_IF_NOT(!by_residual); float freq = 2.0 / period; size_t coarse_size = include_listnos ? coarse_code_size() : 0; diff --git a/faiss/IndexIVFSpectralHash.h b/faiss/IndexIVFSpectralHash.h index ee464859c3..ae7df58e40 100644 --- a/faiss/IndexIVFSpectralHash.h +++ b/faiss/IndexIVFSpectralHash.h @@ -30,14 +30,14 @@ struct IndexPreTransform; */ struct IndexIVFSpectralHash : IndexIVF { /// transformation from d to nbit dim - VectorTransform* vt; + VectorTransform* vt = nullptr; /// own the vt - bool own_fields; + bool own_fields = true; /// nb of bits of the binary signature - int nbit; + int nbit = 0; /// interval size for 0s and 1s - float period; + float period = 0; enum ThresholdType { Thresh_global, ///< global threshold at 0 @@ -45,7 +45,7 @@ struct IndexIVFSpectralHash : IndexIVF { Thresh_centroid_half, ///< central interval around centroid Thresh_median ///< median of training set }; - ThresholdType threshold_type; + ThresholdType threshold_type = Thresh_global; /// Trained threshold. /// size nlist * nbit or 0 if Thresh_global @@ -60,7 +60,7 @@ struct IndexIVFSpectralHash : IndexIVF { IndexIVFSpectralHash(); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; void encode_vectors( idx_t n, diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp index 4189bcd034..acd3592bf9 100644 --- a/faiss/IndexScalarQuantizer.cpp +++ b/faiss/IndexScalarQuantizer.cpp @@ -122,21 +122,28 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer( size_t nlist, ScalarQuantizer::QuantizerType qtype, MetricType metric, - bool encode_residual) - : IndexIVF(quantizer, d, nlist, 0, metric), - sq(d, qtype), - by_residual(encode_residual) { + bool by_residual) + : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) { code_size = sq.code_size; + this->by_residual = by_residual; // was not known at construction time invlists->code_size = code_size; is_trained = false; } -IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() - : IndexIVF(), by_residual(true) {} +IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() : IndexIVF() { + by_residual = true; +} + +void IndexIVFScalarQuantizer::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + sq.train(n, x); +} -void IndexIVFScalarQuantizer::train_residual(idx_t n, const float* x) { - sq.train_residual(n, x, quantizer, by_residual, verbose); +idx_t IndexIVFScalarQuantizer::train_encoder_num_vectors() const { + return 100000; } void IndexIVFScalarQuantizer::encode_vectors( diff --git a/faiss/IndexScalarQuantizer.h b/faiss/IndexScalarQuantizer.h index c1e6b34f2c..c064bbeeb3 100644 --- a/faiss/IndexScalarQuantizer.h +++ b/faiss/IndexScalarQuantizer.h @@ -65,7 +65,6 @@ struct IndexScalarQuantizer : IndexFlatCodes { struct IndexIVFScalarQuantizer : IndexIVF { ScalarQuantizer sq; - bool by_residual; IndexIVFScalarQuantizer( Index* quantizer, @@ -73,11 +72,13 @@ struct IndexIVFScalarQuantizer : IndexIVF { size_t nlist, ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, - bool encode_residual = true); + bool by_residual = true); IndexIVFScalarQuantizer(); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void encode_vectors( idx_t n, diff --git a/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/faiss/gpu/GpuIndexIVFScalarQuantizer.cu index f58c72889b..7c21a770d0 100644 --- a/faiss/gpu/GpuIndexIVFScalarQuantizer.cu +++ b/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -219,7 +219,17 @@ void GpuIndexIVFScalarQuantizer::reset() { void GpuIndexIVFScalarQuantizer::trainResiduals_(idx_t n, const float* x) { // The input is already guaranteed to be on the CPU - sq.train_residual(n, x, quantizer, by_residual, verbose); + if (!by_residual) { + sq.train(n, x); + } else { + std::vector assign(n); + quantizer->assign(n, x, assign.data()); + + std::vector residuals(n * d); + quantizer->compute_residual_n(n, x, residuals.data(), assign.data()); + + sq.train(n, residuals.data()); + } } void GpuIndexIVFScalarQuantizer::train(idx_t n, const float* x) { diff --git a/faiss/impl/FaissException.h b/faiss/impl/FaissException.h index bc8bb9aca6..5e5bcf1a30 100644 --- a/faiss/impl/FaissException.h +++ b/faiss/impl/FaissException.h @@ -1,3 +1,4 @@ + /** * Copyright (c) Facebook, Inc. and its affiliates. * @@ -79,6 +80,23 @@ struct ScopeDeleter1 { } }; +/** RAII object for a set of possibly transformed vectors (deallocated only if + * they are indeed transformed) + */ +struct TransformedVectors { + const float* x; + bool own_x; + TransformedVectors(const float* x_orig, const float* x) : x(x) { + own_x = x_orig != x; + } + + ~TransformedVectors() { + if (own_x) { + delete[] x; + } + } +}; + /// make typeids more readable std::string demangle_cpp_symbol(const char* name); diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index a3cf4c744e..8d18907875 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -1115,32 +1115,6 @@ void ScalarQuantizer::train(size_t n, const float* x) { } } -void ScalarQuantizer::train_residual( - size_t n, - const float* x, - Index* quantizer, - bool by_residual, - bool verbose) { - const float* x_in = x; - - // 100k points more than enough - x = fvecs_maybe_subsample(d, (size_t*)&n, 100000, x, verbose, 1234); - - ScopeDeleter del_x(x_in == x ? nullptr : x); - - if (by_residual) { - std::vector idx(n); - quantizer->assign(n, x, idx.data()); - - std::vector residuals(n * d); - quantizer->compute_residual_n(n, x, residuals.data(), idx.data()); - - train(n, residuals.data()); - } else { - train(n, x); - } -} - ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { #ifdef USE_F16C if (d % 8 == 0) { diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h index e29a1420c9..550a979092 100644 --- a/faiss/impl/ScalarQuantizer.h +++ b/faiss/impl/ScalarQuantizer.h @@ -65,14 +65,6 @@ struct ScalarQuantizer : Quantizer { void train(size_t n, const float* x) override; - /// Used by an IVF index to train based on the residuals - void train_residual( - size_t n, - const float* x, - Index* quantizer, - bool by_residual, - bool verbose); - /** Encode a set of vectors * * @param x vectors to encode, size n * d diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 6ea40e1be7..d40f651c56 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -385,6 +385,8 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) { write_index_header(ivf, f); WRITE1(ivf->nlist); WRITE1(ivf->nprobe); + // subclasses write by_residual (some of them support only one setting of + // by_residual). write_index(ivf->quantizer, f); write_direct_map(&ivf->direct_map, f); }