diff --git a/Cargo.lock b/Cargo.lock index d16e2faf8b..992e3dc31f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -554,6 +554,7 @@ dependencies = [ "smallvec", "sqlx", "tantivy", + "tantivy-columnar", "tempdir", "thiserror", "thread-priority", @@ -749,6 +750,7 @@ version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ + "jobserver", "libc", ] @@ -1768,17 +1770,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "fail" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" -dependencies = [ - "log", - "once_cell", - "rand 0.8.5", -] - [[package]] name = "failure" version = "0.1.8" @@ -1826,20 +1817,6 @@ dependencies = [ "serde", ] -[[package]] -name = "fastfield_codecs" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374a3a53c1bd5fb31b10084229290eafb0a05f260ec90f1f726afffda4877a8a" -dependencies = [ - "fastdivide", - "itertools 0.10.5", - "log", - "ownedbytes", - "tantivy-bitpacker", - "tantivy-common", -] - [[package]] name = "fastrand" version = "2.0.1" @@ -1955,13 +1932,13 @@ dependencies = [ ] [[package]] -name = "fs2" -version = "0.4.3" +name = "fs4" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ - "libc", - "winapi", + "rustix", + "windows-sys 0.48.0", ] [[package]] @@ -3170,9 +3147,6 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] [[package]] name = "hashbrown" @@ -3803,6 +3777,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.64" @@ -4045,11 +4028,11 @@ dependencies = [ [[package]] name = "lru" -version = "0.7.8" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a" +checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" dependencies = [ - "hashbrown 0.12.3", + "hashbrown 0.14.0", ] [[package]] @@ -4063,9 +4046,9 @@ dependencies = [ [[package]] name = "lz4_flex" -version = "0.9.5" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" +checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8" [[package]] name = "mac" @@ -4290,12 +4273,9 @@ dependencies = [ [[package]] name = "murmurhash32" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" -dependencies = [ - "byteorder", -] +checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" [[package]] name = "nanorand" @@ -4794,9 +4774,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" -version = "0.4.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e957eaa64a299f39755416e5b3128c505e9d63a91d0453771ad2ccd3907f8db" +checksum = "6e8a72b918ae8198abb3a18c190288123e1d442b6b9a7d709305fd194688b4b7" dependencies = [ "stable_deref_trait", ] @@ -6474,6 +6454,15 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "sketches-ddsketch" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.9" @@ -6868,49 +6857,49 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.19.2" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb26a6b22c84d8be41d99a14016d6f04d30d8d31a2ea411a8ab553af5cc490d" +checksum = "c1d4675fed6fe2218ce11445374e181e864a8ffd0f28e7e0591ccfc38cd000ae" dependencies = [ - "aho-corasick 0.7.20", + "aho-corasick 1.1.1", "arc-swap", "async-trait", - "base64 0.13.1", + "base64 0.21.4", "bitpacking", "byteorder", "census", "crc32fast", "crossbeam-channel", "downcast-rs", - "fail", "fastdivide", - "fastfield_codecs", - "fs2", + "fs4", "htmlescape", - "itertools 0.10.5", + "itertools 0.11.0", "levenshtein_automata", "log", "lru", "lz4_flex", "measure_time", - "memmap2 0.5.10", + "memmap2 0.7.1", "murmurhash32", "num_cpus", "once_cell", "oneshot", - "ownedbytes", "rayon", "regex 1.9.5", "rust-stemmers", "rustc-hash", "serde", "serde_json", + "sketches-ddsketch", "smallvec", - "stable_deref_trait", "tantivy-bitpacker", + "tantivy-columnar", "tantivy-common", "tantivy-fst", "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", "tempfile", "thiserror", "time", @@ -6920,18 +6909,40 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e71a0c95b82d4292b097a09b989a6380d28c3a86800c841a2d03bae1fc8b9fa6" +checksum = "cecb164321482301f514dd582264fa67f70da2d7eb01872ccd71e35e0d96655a" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d85f8019af9a78b3118c11298b36ffd21c2314bd76bbcd9d12e00124cbb7e70" +dependencies = [ + "fastdivide", + "fnv", + "itertools 0.11.0", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] [[package]] name = "tantivy-common" -version = "0.4.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14fef4182bb60df9a4b92cd8ecab39ba2e50a05542934af17eef1f49660705cb" +checksum = "af4a3a975e604a2aba6b1106a04505e1e7a025e6def477fab6e410b4126471e1" dependencies = [ + "async-trait", "byteorder", "ownedbytes", + "serde", + "time", ] [[package]] @@ -6947,13 +6958,41 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.19.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343e3ada4c1c480953f6960f8a21ce9c76611480ffdd4f4e230fdddce0fc5331" +checksum = "1d39c5a03100ac10c96e0c8b07538e2ab8b17da56434ab348309b31f23fada77" dependencies = [ - "combine", - "once_cell", - "regex 1.9.5", + "nom", +] + +[[package]] +name = "tantivy-sstable" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0c1bb43e5e8b8e05eb8009610344dbf285f06066c844032fbb3e546b3c71df" +dependencies = [ + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c078595413f13f218cf6f97b23dcfd48936838f1d3d13a1016e05acd64ed6c" +dependencies = [ + "murmurhash32", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "347b6fb212b26d3505d224f438e3c4b827ab8bd847fe9953ad5ac6b8f9443b66" +dependencies = [ + "serde", ] [[package]] @@ -8844,3 +8883,32 @@ dependencies = [ "crossbeam-utils", "flate2", ] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.9+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/server/bleep/Cargo.toml b/server/bleep/Cargo.toml index db3bcaa200..1261dded77 100644 --- a/server/bleep/Cargo.toml +++ b/server/bleep/Cargo.toml @@ -30,7 +30,8 @@ harness = false [dependencies] # core -tantivy = { version = "0.19.2", features = ["mmap"] } +tantivy = { version = "0.21.0", features = ["mmap"] } +tantivy-columnar = "0.2.0" tokio = { version = "1.32.0", features = ["macros", "process", "rt", "rt-multi-thread", "io-std", "io-util", "sync", "fs"] } futures = "0.3.28" rayon = "1.8.0" diff --git a/server/bleep/src/collector/bytes_filter.rs b/server/bleep/src/collector/bytes_filter.rs index 89dde6fcbc..6cf34e0f84 100644 --- a/server/bleep/src/collector/bytes_filter.rs +++ b/server/bleep/src/collector/bytes_filter.rs @@ -1,7 +1,6 @@ // a version of tantivy::collector::FilterCollector that works on byte fast fields use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::fastfield::BytesFastFieldReader; use tantivy::schema::Field; use tantivy::{Score, SegmentReader, TantivyError}; @@ -58,7 +57,8 @@ where ))); } - let fast_field_reader = segment_reader.fast_fields().bytes(self.field)?; + let field_name = schema.get_field_name(self.field); + let fast_field_reader = segment_reader.fast_fields().bytes(field_name)?.unwrap(); let segment_collector = self .collector @@ -87,7 +87,7 @@ pub struct BytesFilterSegmentCollector where TPredicate: 'static, { - fast_field_reader: BytesFastFieldReader, + fast_field_reader: tantivy_columnar::BytesColumn, segment_collector: TSegmentCollector, predicate: TPredicate, } @@ -101,8 +101,16 @@ where type Fruit = TSegmentCollector::Fruit; fn collect(&mut self, doc: u32, score: Score) { - let value = self.fast_field_reader.get_bytes(doc); - if (self.predicate)(value) { + let mut value = Vec::new(); + self.fast_field_reader + .ords() + .values_for_doc(doc) + .for_each(|ord| { + self.fast_field_reader + .ord_to_bytes(ord, &mut value) + .unwrap(); + }); + if (self.predicate)(&value) { self.segment_collector.collect(doc, score) } } diff --git a/server/bleep/src/collector/frequency.rs b/server/bleep/src/collector/frequency.rs index 40a9aa58bd..6b28d56758 100644 --- a/server/bleep/src/collector/frequency.rs +++ b/server/bleep/src/collector/frequency.rs @@ -2,10 +2,10 @@ use std::collections::HashMap; use tantivy::{ collector::{Collector, SegmentCollector}, - fastfield::BytesFastFieldReader, schema::Field, Score, SegmentReader, }; +use tantivy_columnar::BytesColumn; pub struct FrequencyCollector(pub Field); @@ -19,7 +19,8 @@ impl Collector for FrequencyCollector { _segment_local_id: u32, segment_reader: &SegmentReader, ) -> tantivy::Result { - let reader = segment_reader.fast_fields().bytes(self.0)?; + let field_name = segment_reader.schema().get_field_name(self.0); + let reader = segment_reader.fast_fields().bytes(field_name)?.unwrap(); Ok(FrequencySegmentCollector { reader, freqs: HashMap::new(), @@ -43,7 +44,7 @@ impl Collector for FrequencyCollector { } pub struct FrequencySegmentCollector { - reader: BytesFastFieldReader, + reader: BytesColumn, freqs: HashMap, usize>, } @@ -51,11 +52,11 @@ impl SegmentCollector for FrequencySegmentCollector { type Fruit = HashMap, usize>; fn collect(&mut self, doc: u32, _score: Score) { - let k = self.reader.get_bytes(doc); - self.freqs - .entry(k.to_owned()) - .and_modify(|v| *v += 1) - .or_insert(1); + let mut k = Vec::new(); + self.reader.ords().values_for_doc(doc).for_each(|ord| { + self.reader.ord_to_bytes(ord, &mut k).unwrap(); + }); + self.freqs.entry(k).and_modify(|v| *v += 1).or_insert(1); } fn harvest(self) -> ::Fruit { diff --git a/server/bleep/src/indexes.rs b/server/bleep/src/indexes.rs index 7076603bf5..03a3558845 100644 --- a/server/bleep/src/indexes.rs +++ b/server/bleep/src/indexes.rs @@ -238,7 +238,7 @@ impl Indexer { index.set_multithread_executor(threads)?; index .tokenizers() - .register("default", NgramTokenizer::new(1, 3, false)); + .register("default", NgramTokenizer::new(1, 3, false).unwrap()); Ok(index) } diff --git a/server/bleep/src/indexes/schema.rs b/server/bleep/src/indexes/schema.rs index ba35d91b6a..f556fe6b35 100644 --- a/server/bleep/src/indexes/schema.rs +++ b/server/bleep/src/indexes/schema.rs @@ -132,6 +132,10 @@ impl File { histogram: Arc::new(Histogram::builder().build().unwrap().into()), } } + + pub fn schema(&self) -> Schema { + self.schema.clone() + } } impl Default for File { diff --git a/server/bleep/src/query/compiler.rs b/server/bleep/src/query/compiler.rs index 863efad924..68414c4a64 100644 --- a/server/bleep/src/query/compiler.rs +++ b/server/bleep/src/query/compiler.rs @@ -99,7 +99,7 @@ impl Compiler { let field_query = match extraction { Extraction::Literal(Literal::Plain(text)) => { - let tokenizer = index + let mut tokenizer = index .tokenizer_for_field(*field) .context("field is missing tokenizer")?; @@ -376,7 +376,7 @@ mod tests { let (occur, term) = &subquery.clauses()[0]; let term = term.downcast_ref::().unwrap(); assert_eq!(*occur, Occur::Should); - assert_eq!(term.term().as_str().unwrap(), expected); + assert_eq!(term.term().value().as_str().unwrap(), expected); } } } diff --git a/server/bleep/src/query/ranking.rs b/server/bleep/src/query/ranking.rs index b10d287fd0..891f8104f6 100644 --- a/server/bleep/src/query/ranking.rs +++ b/server/bleep/src/query/ranking.rs @@ -1,32 +1,37 @@ -use std::{sync::Arc, time::SystemTime}; +use std::time::SystemTime; use tantivy::{ collector::{ScoreSegmentTweaker, ScoreTweaker}, - fastfield::{BytesFastFieldReader, Column}, + fastfield::Column, DocId, Score, }; +use tantivy_columnar::{column_values::ColumnValues, BytesColumn}; use crate::indexes::file::File; pub struct DocumentTweaker(pub File); pub struct SegmentScorer { - line_length: Arc>, - lang: BytesFastFieldReader, - last_commit: Arc>, + line_length: Column, + lang: BytesColumn, + last_commit: Column, } impl ScoreSegmentTweaker for SegmentScorer { fn score(&mut self, doc: DocId, mut score: Score) -> Score { // * 1000 if it's a language we understand - score *= 1.0 + self.lang.num_bytes(doc).min(1) as f32 * 999.0; + let mut bytes = Vec::new(); + self.lang.ords().values_for_doc(doc).for_each(|ord| { + self.lang.ord_to_bytes(ord, &mut bytes).unwrap(); + }); + score *= 1.0 + bytes.len().min(1) as f32 * 999.0; // Penalty for lines that are too long - score /= self.line_length.get_val(doc).clamp(20.0, 1000.0) as f32; + score /= self.line_length.values.get_val(doc).clamp(20.0, 1000.0) as f32; score /= SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .as_secs() - .saturating_sub(self.last_commit.get_val(doc)) + .saturating_sub(self.last_commit.values.get_val(doc)) .min(5_000_000) as f32; score @@ -40,13 +45,17 @@ impl ScoreTweaker for DocumentTweaker { &self, segment_reader: &tantivy::SegmentReader, ) -> tantivy::Result { - let Self(schema) = self; + let Self(file) = self; + let schema = file.schema(); + let avg_line_length_field = schema.get_field_name(file.avg_line_length); + let lang_field = schema.get_field_name(file.lang); + let last_commit_unix_seconds_field = schema.get_field_name(file.last_commit_unix_seconds); Ok(SegmentScorer { - line_length: segment_reader.fast_fields().f64(schema.avg_line_length)?, - lang: segment_reader.fast_fields().bytes(schema.lang)?, + line_length: segment_reader.fast_fields().f64(avg_line_length_field)?, + lang: segment_reader.fast_fields().bytes(lang_field)?.unwrap(), last_commit: segment_reader .fast_fields() - .u64(schema.last_commit_unix_seconds)?, + .u64(last_commit_unix_seconds_field)?, }) } }