From 350dfe5a35c34dfc1482691f0b45e10454c7356a Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Wed, 30 Mar 2022 15:13:20 +0200 Subject: [PATCH] refactor(rome_js_parser): Refactor Parser Events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce the size of a single parser event from 16 bytes to 8 bytes each by: * Using a `NonZeroU32` for the forward parent. The forward parent can never be 0 because it stores the offset from the current event to the start of the "forwarded" parent. * Store the `start` of a node in the `CompletedMarker` (can't be computed because of forward parents) * Remove `end` from the `Finish` event and instead retrieve the last token of the node when queried (mainly to produce diagnostics). * Only store the end offset for each Token instead of the full range. The end offset is sufficient to reconstruct the length in the tree sink. This reduces the memory consumption during the parse phase significantly: * `jquery`: * Current Bytes: 4.12 MB -> 2.12 MB * Max Bytes: 5.82 MB -> 3.82 MB * Total Bytes: 8.45 MB -> 4.37 MB * `tex-chtml-full` * Current bytes: 33.11 MB -> 17.11 MB * Max bytes: 46 MB -> 30 MB * Total bytes: 67.78 -> 34.92 MB It also reduces the max bytes required during the tree sink phase. The changes do improve parse times but not as much as I did hope for: ``` group event main ----- ----- ---- parser/checker.ts 1.00 63.6±1.84ms 40.9 MB/sec 1.00 63.8±0.45ms 40.8 MB/sec parser/compiler.js 1.00 36.3±0.77ms 28.9 MB/sec 1.03 37.5±0.38ms 27.9 MB/sec parser/d3.min.js 1.00 24.3±0.25ms 10.8 MB/sec 1.03 25.1±2.39ms 10.4 MB/sec parser/dojo.js 1.00 2.2±0.00ms 30.9 MB/sec 1.03 2.3±0.02ms 30.0 MB/sec parser/ios.d.ts 1.00 52.7±0.55ms 35.4 MB/sec 1.19 62.6±0.58ms 29.8 MB/sec parser/jquery.min.js 1.00 6.6±0.13ms 12.6 MB/sec 1.05 6.9±0.26ms 12.0 MB/sec parser/math.js 1.00 45.4±0.90ms 14.3 MB/sec 1.02 46.3±0.59ms 14.0 MB/sec parser/parser.ts 1.00 1525.9±16.73µs 31.7 MB/sec 1.02 1556.6±21.54µs 31.0 MB/sec parser/pixi.min.js 1.00 28.9±0.67ms 15.2 MB/sec 1.01 29.3±0.14ms 15.0 MB/sec parser/react-dom.production.min.js 1.00 9.0±0.01ms 12.7 MB/sec 1.02 9.2±0.05ms 12.5 MB/sec parser/react.production.min.js 1.00 466.9±1.03µs 13.2 MB/sec 1.03 481.5±3.49µs 12.8 MB/sec parser/router.ts 1.00 1186.9±8.65µs 50.4 MB/sec 1.03 1222.2±10.20µs 48.9 MB/sec parser/tex-chtml-full.js 1.00 60.5±0.68ms 15.1 MB/sec 1.10 66.4±1.53ms 13.7 MB/sec parser/three.min.js 1.00 32.1±0.24ms 18.3 MB/sec 1.03 33.0±0.43ms 17.8 MB/sec parser/typescript.js 1.00 279.9±4.87ms 33.9 MB/sec 1.04 292.2±2.93ms 32.5 MB/sec parser/vue.global.prod.js 1.00 11.4±0.34ms 10.6 MB/sec 1.01 11.5±0.03ms 10.5 MB/sec ``` ## Tests `cargo test` --- crates/rome_js_parser/src/event.rs | 41 +++---- crates/rome_js_parser/src/lib.rs | 2 +- .../rome_js_parser/src/lossless_tree_sink.rs | 10 +- crates/rome_js_parser/src/parser.rs | 102 +++++++----------- .../src/parser/rewrite_parser.rs | 20 ++-- crates/rome_js_parser/src/syntax/class.rs | 5 +- crates/rome_js_parser/src/syntax/function.rs | 6 +- crates/rome_js_parser/src/syntax/module.rs | 5 +- 8 files changed, 74 insertions(+), 117 deletions(-) diff --git a/crates/rome_js_parser/src/event.rs b/crates/rome_js_parser/src/event.rs index cd979ed1477..0b3cd989548 100644 --- a/crates/rome_js_parser/src/event.rs +++ b/crates/rome_js_parser/src/event.rs @@ -1,14 +1,13 @@ //! Events emitted by the Parser which are then constructed into a syntax tree use std::mem; +use std::num::NonZeroU32; use crate::lexer::TextSize; -use crate::{ParseDiagnostic, Parser, TreeSink}; -use rome_js_syntax::JsSyntaxKind::{self, *}; -use rome_rowan::TextRange; - use crate::parser::rewrite_parser::{RewriteParser, RewriteToken}; use crate::parser::Checkpoint; +use crate::{ParseDiagnostic, Parser, TreeSink}; +use rome_js_syntax::JsSyntaxKind::{self, *}; /// Events emitted by the Parser, these events are later /// made into a syntax tree with `process` into TreeSink. @@ -23,29 +22,25 @@ pub enum Event { /// become the children of the respective node. Start { kind: JsSyntaxKind, - start: TextSize, - forward_parent: Option, + forward_parent: Option, }, /// Complete the previous `Start` event - Finish { end: TextSize }, + Finish, /// Produce a single leaf-element. - /// `n_raw_tokens` is used to glue complex contextual tokens. - /// For example, lexer tokenizes `>>` as `>`, `>`, and - /// `n_raw_tokens = 2` is used to produced a single `>>`. Token { kind: JsSyntaxKind, - range: TextRange, + /// The end offset of this token. + end: TextSize, }, } impl Event { - pub fn tombstone(start: TextSize) -> Self { + pub fn tombstone() -> Self { Event::Start { kind: TOMBSTONE, forward_parent: None, - start, } } } @@ -57,7 +52,7 @@ pub fn process(sink: &mut impl TreeSink, mut events: Vec, errors: Vec (), @@ -72,14 +67,13 @@ pub fn process(sink: &mut impl TreeSink, mut events: Vec, errors: Vec, errors: Vec sink.finish_node(), - Event::Token { kind, range } => { - sink.token(kind, range.len()); + Event::Token { kind, end } => { + sink.token(*kind, *end); } } } @@ -113,9 +107,9 @@ struct RewriteParseEventsTreeSink<'r, 'p, T> { } impl<'r, 'p, T: RewriteParseEvents> TreeSink for RewriteParseEventsTreeSink<'r, 'p, T> { - fn token(&mut self, kind: JsSyntaxKind, length: TextSize) { + fn token(&mut self, kind: JsSyntaxKind, end: TextSize) { self.reparse - .token(RewriteToken::new(kind, length), &mut self.parser); + .token(RewriteToken::new(kind, end), &mut self.parser); } fn start_node(&mut self, kind: JsSyntaxKind) { @@ -156,7 +150,6 @@ pub(crate) fn rewrite_events( // The current parsed grammar is a super-set of the grammar that gets re-parsed. Thus, any // error that applied to the old grammar also applies to the sub-grammar. let events: Vec<_> = p.events.split_off(checkpoint.event_pos + 1); - p.last_token_event_pos = checkpoint.last_token_pos; let mut sink = RewriteParseEventsTreeSink { parser: RewriteParser::new(p, checkpoint.token_source), diff --git a/crates/rome_js_parser/src/lib.rs b/crates/rome_js_parser/src/lib.rs index b01fec3393e..7f0d98170d5 100644 --- a/crates/rome_js_parser/src/lib.rs +++ b/crates/rome_js_parser/src/lib.rs @@ -100,7 +100,7 @@ use std::path::Path; /// An abstraction for syntax tree implementations pub trait TreeSink { /// Adds new token to the current branch. - fn token(&mut self, kind: JsSyntaxKind, length: TextSize); + fn token(&mut self, kind: JsSyntaxKind, end: TextSize); /// Start new branch and make it current. fn start_node(&mut self, kind: JsSyntaxKind); diff --git a/crates/rome_js_parser/src/lossless_tree_sink.rs b/crates/rome_js_parser/src/lossless_tree_sink.rs index fec8db76925..0c36883e5bc 100644 --- a/crates/rome_js_parser/src/lossless_tree_sink.rs +++ b/crates/rome_js_parser/src/lossless_tree_sink.rs @@ -21,8 +21,8 @@ pub struct LosslessTreeSink<'a> { } impl<'a> TreeSink for LosslessTreeSink<'a> { - fn token(&mut self, kind: JsSyntaxKind, length: TextSize) { - self.do_token(kind, length); + fn token(&mut self, kind: JsSyntaxKind, end: TextSize) { + self.do_token(kind, end); } fn start_node(&mut self, kind: JsSyntaxKind) { @@ -34,7 +34,7 @@ impl<'a> TreeSink for LosslessTreeSink<'a> { self.parents_count -= 1; if self.parents_count == 0 && self.needs_eof { - self.do_token(JsSyntaxKind::EOF, TextSize::default()); + self.do_token(JsSyntaxKind::EOF, TextSize::from(self.text.len() as u32)); } self.inner.finish_node(); @@ -69,7 +69,7 @@ impl<'a> LosslessTreeSink<'a> { } #[inline] - fn do_token(&mut self, kind: JsSyntaxKind, length: TextSize) { + fn do_token(&mut self, kind: JsSyntaxKind, token_end: TextSize) { if kind == JsSyntaxKind::EOF { self.needs_eof = false; } @@ -80,7 +80,7 @@ impl<'a> LosslessTreeSink<'a> { self.eat_trivia(false); let trailing_start = self.trivia_pieces.len(); - self.text_pos += length; + self.text_pos = token_end; // Everything until the next linebreak (but not including it) // will be the trailing trivia... diff --git a/crates/rome_js_parser/src/parser.rs b/crates/rome_js_parser/src/parser.rs index 0854c461b19..499a8e284b3 100644 --- a/crates/rome_js_parser/src/parser.rs +++ b/crates/rome_js_parser/src/parser.rs @@ -15,6 +15,7 @@ use rome_js_syntax::{ JsSyntaxKind::{self}, TextRange, }; +use std::num::NonZeroU32; pub(crate) use parse_error::*; pub(crate) use parse_lists::{ParseNodeList, ParseSeparatedList}; @@ -73,8 +74,6 @@ pub(crate) struct Parser<'s> { pub(super) state: ParserState, pub source_type: SourceType, pub diagnostics: Vec, - // A `u32` is sufficient because the parser only supports files up to `u32` bytes. - pub(super) last_token_event_pos: Option, // If the parser should skip tokens as trivia skipping: bool, } @@ -89,7 +88,6 @@ impl<'s> Parser<'s> { events: vec![], state: ParserState::new(&source_type), tokens: token_source, - last_token_event_pos: None, source_type, diagnostics: vec![], skipping: false, @@ -135,20 +133,18 @@ impl<'s> Parser<'s> { /// Returns the kind of the last bumped token. pub fn last(&self) -> Option { - self.last_token_event_pos - .map(|pos| match self.events[pos as usize] { - Event::Token { kind, .. } => kind, - _ => unreachable!(), - }) + self.events.iter().rev().find_map(|event| match event { + Event::Token { kind, .. } => Some(*kind), + _ => None, + }) } - /// Returns the range of the last bumped token. - pub fn last_range(&self) -> Option { - self.last_token_event_pos - .map(|pos| match self.events[pos as usize] { - Event::Token { range, .. } => range, - _ => unreachable!(), - }) + /// Returns the end offset of the last bumped token. + pub fn last_end(&self) -> Option { + self.events.iter().rev().find_map(|event| match event { + Event::Token { end, .. } => Some(*end), + _ => None, + }) } /// Consume the next token if `kind` matches. @@ -169,7 +165,7 @@ impl<'s> Parser<'s> { pub fn start(&mut self) -> Marker { let pos = self.events.len() as u32; let start = self.tokens.position(); - self.push_event(Event::tombstone(start)); + self.push_event(Event::tombstone()); Marker::new(pos, start) } @@ -271,13 +267,12 @@ impl<'s> Parser<'s> { } else { let range = self.cur_range(); self.tokens.bump(context); - self.push_token(kind, range); + self.push_token(kind, range.end()); } } - fn push_token(&mut self, kind: JsSyntaxKind, range: TextRange) { - self.last_token_event_pos = Some(self.events.len() as u32); - self.push_event(Event::Token { kind, range }); + fn push_token(&mut self, kind: JsSyntaxKind, end: TextSize) { + self.push_event(Event::Token { kind, end }); } fn push_event(&mut self, event: Event) { @@ -312,10 +307,8 @@ impl<'s> Parser<'s> { event_pos, errors_pos, state, - last_token_pos, } = checkpoint; self.tokens.rewind(token_source); - self.last_token_event_pos = last_token_pos; self.drain_events(self.cur_event_pos() - event_pos); self.diagnostics.truncate(errors_pos as usize); self.state.restore(state) @@ -326,7 +319,6 @@ impl<'s> Parser<'s> { pub fn checkpoint(&self) -> Checkpoint { Checkpoint { token_source: self.tokens.checkpoint(), - last_token_pos: self.last_token_event_pos, event_pos: self.cur_event_pos(), errors_pos: self.diagnostics.len() as u32, state: self.state.checkpoint(), @@ -432,21 +424,7 @@ impl Marker { /// Finishes the syntax tree node and assigns `kind` to it, /// and mark the create a `CompletedMarker` for possible future /// operation like `.precede()` to deal with forward_parent. - pub fn complete(self, p: &mut Parser, kind: JsSyntaxKind) -> CompletedMarker { - let end_pos = TextSize::max( - p.last_range().map(|t| t.end()).unwrap_or(self.start), - self.start, - ); - - self.complete_at(p, kind, end_pos) - } - - fn complete_at( - mut self, - p: &mut Parser, - kind: JsSyntaxKind, - end_pos: TextSize, - ) -> CompletedMarker { + pub fn complete(mut self, p: &mut Parser, kind: JsSyntaxKind) -> CompletedMarker { self.bomb.defuse(); let idx = self.pos as usize; match p.events[idx] { @@ -458,11 +436,9 @@ impl Marker { _ => unreachable!(), } let finish_pos = p.events.len() as u32; + p.push_event(Event::Finish); - assert!(end_pos >= self.start); - p.push_event(Event::Finish { end: end_pos }); - - let new = CompletedMarker::new(self.pos, finish_pos, kind); + let new = CompletedMarker::new(self.pos, finish_pos, self.start, kind); new.old_start(self.old_start) } @@ -503,6 +479,7 @@ impl Marker { #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(crate) struct CompletedMarker { start_pos: u32, + offset: TextSize, // Hack for parsing completed markers which have been preceded // This should be redone completely in the future old_start: u32, @@ -511,9 +488,10 @@ pub(crate) struct CompletedMarker { } impl CompletedMarker { - pub fn new(start_pos: u32, finish_pos: u32, kind: JsSyntaxKind) -> Self { + pub fn new(start_pos: u32, finish_pos: u32, offset: TextSize, kind: JsSyntaxKind) -> Self { CompletedMarker { start_pos, + offset, old_start: start_pos, finish_pos, kind, @@ -549,15 +527,16 @@ impl CompletedMarker { /// Get the range of the marker pub fn range(&self, p: &Parser) -> TextRange { - let start = match p.events[self.old_start as usize] { - Event::Start { start, .. } => start, - _ => unreachable!(), - }; - let end = match p.events[self.finish_pos as usize] { - Event::Finish { end } => end, - _ => unreachable!(), - }; - TextRange::new(start, end) + let end = p.events[self.old_start as usize..self.finish_pos as usize] + .iter() + .rev() + .find_map(|event| match event { + Event::Token { end, .. } => Some(*end), + _ => None, + }) + .unwrap_or(self.offset); + + TextRange::new(self.offset, end) } /// Get the underlying text of a marker @@ -583,15 +562,16 @@ impl CompletedMarker { match p.events[idx] { Event::Start { ref mut forward_parent, - start, .. } => { - *forward_parent = Some(new_pos.pos - self.start_pos); - new_pos.start = start; + // Safety: The new marker is always inserted after the start marker of this node, thus + // subtracting the two positions can never be 0. + *forward_parent = Some(NonZeroU32::try_from(new_pos.pos - self.start_pos).unwrap()); } _ => unreachable!(), } new_pos.child_idx = Some(self.start_pos as usize); + new_pos.start = self.offset; new_pos.old_start(self.old_start as u32) } @@ -599,24 +579,19 @@ impl CompletedMarker { pub fn undo_completion(self, p: &mut Parser) -> Marker { let start_idx = self.start_pos as usize; let finish_idx = self.finish_pos as usize; - let start_pos; match p.events[start_idx] { Event::Start { ref mut kind, forward_parent: None, - start, - } => { - start_pos = start; - *kind = JsSyntaxKind::TOMBSTONE - } + } => *kind = JsSyntaxKind::TOMBSTONE, _ => unreachable!(), } match p.events[finish_idx] { - ref mut slot @ Event::Finish { .. } => *slot = Event::tombstone(start_pos), + ref mut slot @ Event::Finish { .. } => *slot = Event::tombstone(), _ => unreachable!(), } - Marker::new(self.start_pos, start_pos) + Marker::new(self.start_pos, self.offset) } pub fn kind(&self) -> JsSyntaxKind { @@ -632,7 +607,6 @@ pub struct Checkpoint { /// Safety: The parser only supports files <= 4Gb. Storing a `u32` is sufficient to store one error /// for each single character in the file, which should be sufficient for any realistic file. errors_pos: u32, - pub(super) last_token_pos: Option, state: ParserStateCheckpoint, pub(super) token_source: TokenSourceCheckpoint, } diff --git a/crates/rome_js_parser/src/parser/rewrite_parser.rs b/crates/rome_js_parser/src/parser/rewrite_parser.rs index dc68b8201f7..ff998f9b0f5 100644 --- a/crates/rome_js_parser/src/parser/rewrite_parser.rs +++ b/crates/rome_js_parser/src/parser/rewrite_parser.rs @@ -37,16 +37,15 @@ impl<'parser, 'source> RewriteParser<'parser, 'source> { pub fn start(&mut self) -> RewriteMarker { let pos = self.inner.events.len() as u32; self.skip_trivia(false); - self.inner.push_event(Event::tombstone(self.offset)); + self.inner.push_event(Event::tombstone()); RewriteMarker(Marker::new(pos, self.offset)) } /// Bumps the passed in token pub fn bump(&mut self, token: RewriteToken) { self.skip_trivia(false); - self.inner - .push_token(token.kind, TextRange::at(self.offset, token.length)); - self.offset += token.length; + self.inner.push_token(token.kind, token.end); + self.offset = token.end; self.skip_trivia(true); } @@ -92,12 +91,12 @@ impl<'parser, 'source> RewriteParser<'parser, 'source> { #[derive(Debug, Clone, Copy)] pub(crate) struct RewriteToken { pub(crate) kind: JsSyntaxKind, - length: TextSize, + end: TextSize, } impl RewriteToken { - pub fn new(kind: JsSyntaxKind, length: TextSize) -> Self { - Self { kind, length } + pub fn new(kind: JsSyntaxKind, end: TextSize) -> Self { + Self { kind, end } } } @@ -107,12 +106,7 @@ pub(crate) struct RewriteMarker(Marker); impl RewriteMarker { /// Completes the node with the specified kind pub fn complete(self, p: &mut RewriteParser, kind: JsSyntaxKind) -> RewriteCompletedMarker { - let mut end_pos = p.inner.last_range().map(|t| t.end()).unwrap_or_default(); - if end_pos < self.0.start { - end_pos = p.offset; - } - - RewriteCompletedMarker(self.0.complete_at(p.inner, kind, end_pos)) + RewriteCompletedMarker(self.0.complete(p.inner, kind)) } } diff --git a/crates/rome_js_parser/src/syntax/class.rs b/crates/rome_js_parser/src/syntax/class.rs index a00b4895b45..bf3e74a0159 100644 --- a/crates/rome_js_parser/src/syntax/class.rs +++ b/crates/rome_js_parser/src/syntax/class.rs @@ -958,10 +958,7 @@ fn parse_property_class_member_body( fn expect_member_semi(p: &mut Parser, member_marker: &Marker, name: &str) { if !optional_semi(p) { // Gets the start of the member - let end = p - .last_range() - .map(|r| r.end()) - .unwrap_or_else(|| p.cur_range().start()); + let end = p.last_end().unwrap_or_else(|| p.cur_range().start()); let err = p .err_builder(&format!( diff --git a/crates/rome_js_parser/src/syntax/function.rs b/crates/rome_js_parser/src/syntax/function.rs index 4429666a5fb..dfe40d54477 100644 --- a/crates/rome_js_parser/src/syntax/function.rs +++ b/crates/rome_js_parser/src/syntax/function.rs @@ -186,9 +186,11 @@ fn parse_function(p: &mut Parser, m: Marker, kind: FunctionKind) -> CompletedMar } p.expect(T![function]); - let generator_range = if p.eat(T![*]) { + let generator_range = if p.at(T![*]) { + let range = p.cur_range(); + p.bump(T![*]); flags |= SignatureFlags::GENERATOR; - p.last_range() + Some(range) } else { None }; diff --git a/crates/rome_js_parser/src/syntax/module.rs b/crates/rome_js_parser/src/syntax/module.rs index 6dc1978198b..417dfc830f4 100644 --- a/crates/rome_js_parser/src/syntax/module.rs +++ b/crates/rome_js_parser/src/syntax/module.rs @@ -237,10 +237,7 @@ fn parse_import_default_or_named_clause_rest( parse_named_import(p).or_add_diagnostic(p, expected_named_import); if is_typed { - let end = p - .last_range() - .map(|r| r.end()) - .unwrap_or_else(|| p.cur_range().start()); + let end = p.last_end().unwrap_or_else(|| p.cur_range().start()); // test_err ts ts_typed_default_import_with_named // import type A, { B, C } from './a';