From 86b9550811b359a0af7a93523fbd14c5cdcc635f Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Sat, 30 Dec 2023 20:56:59 -0700 Subject: [PATCH] rustdoc-search: tighter encoding for `f` index Two optimizations for the function signature search: * Instead of using JSON arrays, like `[1,20]`, it uses VLQ hex with no commas, like `[aAd]`. * This also adds backrefs: if you have more than one function with exactly the same signature, it'll not only store it once, it'll *decode* it once, and store in the typeIdMap only once. Size change ----------- standard library ```console $ du -bs search-index-old.js search-index-new.js 4976370 search-index-old.js 4404391 search-index-new.js ``` ((4976370-4404391)/4404391)*100% = 12.9% Benchmarks are similarly shrunk: ```console $ du -hs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js 10555067 tmp/arti/toolchain_old/doc/search-index.js 8921236 tmp/arti/toolchain_new/doc/search-index.js 77018 tmp/cortex-m/toolchain_old/doc/search-index.js 66676 tmp/cortex-m/toolchain_new/doc/search-index.js 2876330 tmp/sqlx/toolchain_old/doc/search-index.js 2436812 tmp/sqlx/toolchain_new/doc/search-index.js 63632890 tmp/stm32f4/toolchain_old/doc/search-index.js 52337438 tmp/stm32f4/toolchain_new/doc/search-index.js 631150 tmp/ripgrep/toolchain_old/doc/search-index.js 541646 tmp/ripgrep/toolchain_new/doc/search-index.js ``` --- src/librustdoc/html/render/mod.rs | 146 ++++++++++++++------- src/librustdoc/html/render/search_index.rs | 29 +--- src/librustdoc/html/static/js/search.js | 81 +++++++++--- 3 files changed, 170 insertions(+), 86 deletions(-) diff --git a/src/librustdoc/html/render/mod.rs b/src/librustdoc/html/render/mod.rs index 118c6eeb289bb..345a47a21b8f7 100644 --- a/src/librustdoc/html/render/mod.rs +++ b/src/librustdoc/html/render/mod.rs @@ -58,7 +58,7 @@ use rustc_span::{ symbol::{sym, Symbol}, BytePos, FileName, RealFileName, }; -use serde::ser::{SerializeMap, SerializeSeq}; +use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::clean::{self, ItemId, RenderedLink, SelfTy}; @@ -123,44 +123,53 @@ pub(crate) struct IndexItem { } /// A type used for the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct RenderType { id: Option, generics: Option>, bindings: Option)>>, } -impl Serialize for RenderType { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let id = match &self.id { - // 0 is a sentinel, everything else is one-indexed - None => 0, - // concrete type - Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1, - // generic type parameter - Some(RenderTypeId::Index(idx)) => *idx, - _ => panic!("must convert render types to indexes before serializing"), - }; +impl RenderType { + pub fn write_to_string(&self, string: &mut String) { if self.generics.is_some() || self.bindings.is_some() { - let mut seq = serializer.serialize_seq(None)?; - seq.serialize_element(&id)?; - seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?; + string.push('{'); + // 0 is a sentinel, everything else is one-indexed + match self.id { + Some(id) => id.write_to_string(string), + None => string.push('`'), + } + string.push('{'); + for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + generic.write_to_string(string); + } + string.push('}'); if self.bindings.is_some() { - seq.serialize_element( - self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(), - )?; + string.push('{'); + for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + string.push('{'); + binding.0.write_to_string(string); + string.push('{'); + for constraint in &binding.1[..] { + constraint.write_to_string(string); + } + string.push('}'); + string.push('}'); + } + string.push('}'); } - seq.end() + string.push('}'); } else { - id.serialize(serializer) + // 0 is a sentinel, everything else is one-indexed + match self.id { + Some(id) => id.write_to_string(string), + None => string.push('`'), + } } } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum RenderTypeId { DefId(DefId), Primitive(clean::PrimitiveType), @@ -168,36 +177,50 @@ pub(crate) enum RenderTypeId { Index(isize), } -impl Serialize for RenderTypeId { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let id = match &self { +impl RenderTypeId { + pub fn write_to_string(&self, string: &mut String) { + // (sign, value) + let (sign, id): (bool, u32) = match &self { // 0 is a sentinel, everything else is one-indexed // concrete type - RenderTypeId::Index(idx) if *idx >= 0 => idx + 1, + RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()), // generic type parameter - RenderTypeId::Index(idx) => *idx, + RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()), _ => panic!("must convert render types to indexes before serializing"), }; - id.serialize(serializer) + // zig-zag notation + let value: u32 = (id << 1) | (if sign { 1 } else { 0 }); + // encode + let mut shift: u32 = 28; + let mut mask: u32 = 0xF0_00_00_00; + while shift < 32 { + let hexit = (value & mask) >> shift; + if hexit != 0 || shift == 0 { + let hex = + char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap(); + string.push(hex); + } + shift = shift.wrapping_sub(4); + mask = mask >> 4; + } } } /// Full type of functions/methods in the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct IndexItemFunctionType { inputs: Vec, output: Vec, where_clause: Vec>, } -impl Serialize for IndexItemFunctionType { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { +impl IndexItemFunctionType { + pub fn write_to_string<'a>( + &'a self, + string: &mut String, + backref_queue: &mut VecDeque<&'a IndexItemFunctionType>, + ) { + assert!(backref_queue.len() < 16); // If we couldn't figure out a type, just write `0`. let has_missing = self .inputs @@ -205,33 +228,58 @@ impl Serialize for IndexItemFunctionType { .chain(self.output.iter()) .any(|i| i.id.is_none() && i.generics.is_none()); if has_missing { - 0.serialize(serializer) + string.push('`'); + } else if let Some(idx) = backref_queue.iter().position(|other| *other == self) { + string.push( + char::try_from('0' as u32 + u32::try_from(idx).unwrap()) + .expect("last possible value is '?'"), + ); } else { - let mut seq = serializer.serialize_seq(None)?; + backref_queue.push_front(self); + if backref_queue.len() >= 16 { + backref_queue.pop_back(); + } + string.push('{'); match &self.inputs[..] { [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.inputs[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.inputs)?, } match &self.output[..] { [] if self.where_clause.is_empty() => {} [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.output[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.output)?, } for constraint in &self.where_clause { if let [one] = &constraint[..] && one.generics.is_none() && one.bindings.is_none() { - seq.serialize_element(one)?; + one.write_to_string(string); } else { - seq.serialize_element(constraint)?; + string.push('{'); + for item in &constraint[..] { + item.write_to_string(string); + } + string.push('}'); } } - seq.end() + string.push('}'); } } } diff --git a/src/librustdoc/html/render/search_index.rs b/src/librustdoc/html/render/search_index.rs index a1029320d2d27..e49df400c83ed 100644 --- a/src/librustdoc/html/render/search_index.rs +++ b/src/librustdoc/html/render/search_index.rs @@ -1,5 +1,5 @@ use std::collections::hash_map::Entry; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, VecDeque}; use rustc_data_structures::fx::{FxHashMap, FxIndexMap}; use rustc_middle::ty::TyCtxt; @@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>( let mut full_paths = Vec::with_capacity(self.items.len()); let mut descriptions = Vec::with_capacity(self.items.len()); let mut parents = Vec::with_capacity(self.items.len()); - let mut functions = Vec::with_capacity(self.items.len()); + let mut functions = String::with_capacity(self.items.len()); let mut deprecated = Vec::with_capacity(self.items.len()); + let mut backref_queue = VecDeque::new(); + for (index, item) in self.items.iter().enumerate() { let n = item.ty as u8; let c = char::try_from(n + b'A').expect("item types must fit in ASCII"); @@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>( full_paths.push((index, &item.path)); } - // Fake option to get `0` out as a sentinel instead of `null`. - // We want to use `0` because it's three less bytes. - enum FunctionOption<'a> { - Function(&'a IndexItemFunctionType), - None, - } - impl<'a> Serialize for FunctionOption<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - match self { - FunctionOption::None => 0.serialize(serializer), - FunctionOption::Function(ty) => ty.serialize(serializer), - } - } + match &item.search_type { + Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue), + None => functions.push('`'), } - functions.push(match &item.search_type { - Some(ty) => FunctionOption::Function(ty), - None => FunctionOption::None, - }); if item.deprecation.is_some() { deprecated.push(index); diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index 2df4f7020dfbb..6fb92d8fbb123 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -2767,19 +2767,65 @@ ${item.displayPath}${name}\ * The raw function search type format is generated using serde in * librustdoc/html/render/mod.rs: impl Serialize for IndexItemFunctionType * - * @param {RawFunctionSearchType} functionSearchType + * @param {{ + * string: string, + * offset: number, + * backrefQueue: FunctionSearchType[] + * }} itemFunctionDecoder * @param {Array<{name: string, ty: number}>} lowercasePaths * @param {Map} * * @return {null|FunctionSearchType} */ - function buildFunctionSearchType(functionSearchType, lowercasePaths) { - const INPUTS_DATA = 0; - const OUTPUT_DATA = 1; - // `0` is used as a sentinel because it's fewer bytes than `null` - if (functionSearchType === 0) { + function buildFunctionSearchType(itemFunctionDecoder, lowercasePaths) { + const c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset); + itemFunctionDecoder.offset += 1; + const [zero, ua, la, ob, cb] = ["0", "@", "`", "{", "}"].map(c => c.charCodeAt(0)); + // `` ` `` is used as a sentinel because it's fewer bytes than `null`, and decodes to zero + // `0` is a backref + if (c === la) { return null; } + // sixteen characters after "0" are backref + if (c >= zero && c < ua) { + return itemFunctionDecoder.backrefQueue[c - zero]; + } + if (c !== ob) { + throw ["Unexpected ", c, " in function: expected ", "{", "; this is a bug"]; + } + // call after consuming `{` + function decodeList() { + let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset); + const ret = []; + while (c !== cb) { + ret.push(decode()); + c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset); + } + itemFunctionDecoder.offset += 1; // eat cb + return ret; + } + // consumes and returns a list or integer + function decode() { + let n = 0; + let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset); + if (c === ob) { + itemFunctionDecoder.offset += 1; + return decodeList(); + } + while (c < la) { + n = (n << 4) | (c & 0xF); + itemFunctionDecoder.offset += 1; + c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset); + } + // last character >= la + n = (n << 4) | (c & 0xF); + const [sign, value] = [n & 1, n >> 1]; + itemFunctionDecoder.offset += 1; + return sign ? -value : value; + } + const functionSearchType = decodeList(); + const INPUTS_DATA = 0; + const OUTPUT_DATA = 1; let inputs, output; if (typeof functionSearchType[INPUTS_DATA] === "number") { inputs = [buildItemSearchType(functionSearchType[INPUTS_DATA], lowercasePaths)]; @@ -2808,9 +2854,14 @@ ${item.displayPath}${name}\ ? [buildItemSearchType(functionSearchType[i], lowercasePaths)] : buildItemSearchTypeAll(functionSearchType[i], lowercasePaths)); } - return { + const ret = { inputs, output, where_clause, }; + itemFunctionDecoder.backrefQueue.unshift(ret); + if (itemFunctionDecoder.backrefQueue.length >= 16) { + itemFunctionDecoder.backrefQueue.pop(); + } + return ret; } /** @@ -2992,8 +3043,12 @@ ${item.displayPath}${name}\ const itemDescs = crateCorpus.d; // an array of (Number) the parent path index + 1 to `paths`, or 0 if none const itemParentIdxs = crateCorpus.i; - // an array of (Array | 0) the type of the function, if any - const itemFunctionSearchTypes = crateCorpus.f; + // a string representing the list of function types + const itemFunctionDecoder = { + string: crateCorpus.f, + offset: 0, + backrefQueue: [], + }; // an array of (Number) indices for the deprecated items const deprecatedItems = new Set(crateCorpus.c); // an array of (Number) indices for the deprecated items @@ -3041,12 +3096,8 @@ ${item.displayPath}${name}\ word = itemNames[i].toLowerCase(); } const path = itemPaths.has(i) ? itemPaths.get(i) : lastPath; - let type = null; - if (itemFunctionSearchTypes[i] !== 0) { - type = buildFunctionSearchType( - itemFunctionSearchTypes[i], - lowercasePaths - ); + const type = buildFunctionSearchType(itemFunctionDecoder, lowercasePaths); + if (type !== null) { if (type) { const fp = functionTypeFingerprint.subarray(id * 4, (id + 1) * 4); const fps = new Set();