Skip to content
This repository has been archived by the owner on Aug 31, 2023. It is now read-only.

Commit

Permalink
fix(rome_js_parser): correctly parse regex with unicode chars (#2344)
Browse files Browse the repository at this point in the history
* correctly parse regex with unicode chars
  • Loading branch information
xunilrj committed Apr 4, 2022
1 parent 59e3eda commit d6bd13f
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 13 deletions.
38 changes: 29 additions & 9 deletions crates/rome_js_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,8 @@ impl<'src> Lexer<'src> {
}
}

/// Get the unicode char which starts at the current byte
/// Get the UTF8 char which starts at the current byte
/// Safety: Must be called at the beginning of a UTF8 char.
fn current_char_unchecked(&self) -> char {
// This is unreachable for all intents and purposes, but this is just a precautionary measure
debug_assert!(!self.is_eof());
Expand Down Expand Up @@ -561,6 +562,19 @@ impl<'src> Lexer<'src> {
self.current_byte()
}

/// Steps over the UTF8 char at the current position and yields the char that follows it.
///
/// Returns `None` when the step leaves the lexer at the end of the input.
/// Safety: Must be called at the beginning of a UTF8 char.
#[inline]
fn next_char_unchecked(&mut self) -> Option<char> {
    // Move past the whole char first, so multi-byte chars are never split.
    self.advance_char_unchecked();

    // Nothing left to decode once the end of the input has been reached.
    if self.is_eof() {
        return None;
    }

    Some(self.current_char_unchecked())
}

/// Get the next byte but only advance the index if there is a next byte.
/// This is really just a hack for certain methods like escapes
#[inline]
Expand Down Expand Up @@ -595,6 +609,14 @@ impl<'src> Lexer<'src> {
self.position += n;
}

/// Moves the current position forward by the UTF8 encoded length of the current char.
/// Safety: Must be called at the beginning of a UTF8 char.
#[inline]
fn advance_char_unchecked(&mut self) {
    // The width is measured in bytes, which is the unit `position` is tracked in.
    let width = self.current_char_unchecked().len_utf8();
    self.position += width;
}

/// Returns `true` if the parser is at or past the end of the file.
#[inline]
fn is_eof(&self) -> bool {
Expand Down Expand Up @@ -1444,7 +1466,6 @@ impl<'src> Lexer<'src> {
}
COMMENT
}
// _ if self.state.expr_allowed => self.read_regex(),
Some(b'=') => {
self.advance(2);
SLASHEQ
Expand All @@ -1467,19 +1488,18 @@ impl<'src> Lexer<'src> {
#[allow(clippy::many_single_char_names)]
fn read_regex(&mut self) -> JsSyntaxKind {
let current = unsafe { self.current_unchecked() };

if current != b'/' {
return self.lex_token();
}

let start = self.position;
let mut in_class = false;

while let Some(byte) = self.next_byte() {
match byte {
b'[' => in_class = true,
b']' => in_class = false,
b'/' => {
while let Some(c) = self.next_char_unchecked() {
match c {
'[' => in_class = true,
']' => in_class = false,
'/' => {
if !in_class {
let (mut g, mut i, mut m, mut s, mut u, mut y, mut d) =
(false, false, false, false, false, false, false);
Expand Down Expand Up @@ -1546,7 +1566,7 @@ impl<'src> Lexer<'src> {
return JsSyntaxKind::JS_REGEX_LITERAL;
}
}
b'\\' => {
'\\' => {
if self.next_byte_bounded().is_none() {
self.diagnostics.push(
Diagnostic::error(
Expand Down
1 change: 1 addition & 0 deletions crates/rome_js_parser/src/syntax/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ pub(crate) fn parse_expression_or_recover_to_next_statement(
// 0, 0.0, 0n, 0e00
// "test\
// new-line";
// /^[يفمئامئ‍ئاسۆند]/i; //regex with unicode

// test_err literals
// 00, 012, 08, 091, 0789 // parser errors
Expand Down
1 change: 1 addition & 0 deletions crates/rome_js_parser/test_data/inline/ok/literals.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ null
0, 0.0, 0n, 0e00
"test\
new-line";
/^[يفمئامئ‍ئاسۆند]/i; //regex with unicode
18 changes: 14 additions & 4 deletions crates/rome_js_parser/test_data/inline/ok/literals.rast
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,20 @@ JsModule {
},
semicolon_token: SEMICOLON@66..67 ";" [] [],
},
JsExpressionStatement {
expression: JsRegexLiteralExpression {
value_token: JS_REGEX_LITERAL@67..103 "/^[يفمئامئ\u{200d}ئاسۆند]/i" [Newline("\n")] [],
},
semicolon_token: SEMICOLON@103..125 ";" [] [Whitespace(" "), Comments("//regex with unicode")],
},
],
eof_token: EOF@67..68 "" [Newline("\n")] [],
eof_token: EOF@125..126 "" [Newline("\n")] [],
}

0: JS_MODULE@0..68
0: JS_MODULE@0..126
0: (empty)
1: JS_DIRECTIVE_LIST@0..0
2: JS_MODULE_ITEM_LIST@0..67
2: JS_MODULE_ITEM_LIST@0..125
0: JS_EXPRESSION_STATEMENT@0..1
0: JS_NUMBER_LITERAL_EXPRESSION@0..1
0: JS_NUMBER_LITERAL@0..1 "5" [] []
Expand Down Expand Up @@ -130,4 +136,8 @@ JsModule {
0: JS_STRING_LITERAL_EXPRESSION@49..66
0: JS_STRING_LITERAL@49..66 "\"test\\\nnew-line\"" [Newline("\n")] []
1: SEMICOLON@66..67 ";" [] []
3: EOF@67..68 "" [Newline("\n")] []
9: JS_EXPRESSION_STATEMENT@67..125
0: JS_REGEX_LITERAL_EXPRESSION@67..103
0: JS_REGEX_LITERAL@67..103 "/^[يفمئامئ\u{200d}ئاسۆند]/i" [Newline("\n")] []
1: SEMICOLON@103..125 ";" [] [Whitespace(" "), Comments("//regex with unicode")]
3: EOF@125..126 "" [Newline("\n")] []

0 comments on commit d6bd13f

Please sign in to comment.