fix(rome_js_parser): correctly parse regex with unicode chars (#2344)

* correctly parse regex with unicode chars
rome · Apr 4, 2022 · d6bd13f · d6bd13f
1 parent 59e3eda
commit d6bd13f
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 13 deletions.
diff --git a/crates/rome_js_parser/src/lexer/mod.rs b/crates/rome_js_parser/src/lexer/mod.rs
@@ -505,7 +505,8 @@ impl<'src> Lexer<'src> {
         }
     }
 
-    /// Get the unicode char which starts at the current byte
+    /// Get the UTF8 char which starts at the current byte
+    /// Safety: Must be called at the beginning of a UTF8 char.
     fn current_char_unchecked(&self) -> char {
         // This is unreachable for all intents and purposes, but this is just a precautionary measure
         debug_assert!(!self.is_eof());
@@ -561,6 +562,19 @@ impl<'src> Lexer<'src> {
         self.current_byte()
     }
 
+    /// Advances the position by the current char's UTF8 length and returns the char that follows, or `None` at EOF
+    /// Safety: Must be called at the beginning of a UTF8 char.
+    #[inline]
+    fn next_char_unchecked(&mut self) -> Option<char> {
+        self.advance_char_unchecked();
+
+        if self.is_eof() {
+            None
+        } else {
+            Some(self.current_char_unchecked())
+        }
+    }
+
     /// Get the next byte but only advance the index if there is a next byte.
     /// This is really just a hack for certain methods like escapes
     #[inline]
@@ -595,6 +609,14 @@ impl<'src> Lexer<'src> {
         self.position += n;
     }
 
+    /// Advances the current position by the UTF8 length of the current char
+    /// Safety: Must be called at the beginning of a UTF8 char.
+    #[inline]
+    fn advance_char_unchecked(&mut self) {
+        let c = self.current_char_unchecked();
+        self.position += c.len_utf8(); // between 1 and 4 bytes, depending on the char
+    }
+
     /// Returns `true` if the parser is at or passed the end of the file.
     #[inline]
     fn is_eof(&self) -> bool {
@@ -1444,7 +1466,6 @@ impl<'src> Lexer<'src> {
                 }
                 COMMENT
             }
-            // _ if self.state.expr_allowed => self.read_regex(),
             Some(b'=') => {
                 self.advance(2);
                 SLASHEQ
@@ -1467,19 +1488,18 @@ impl<'src> Lexer<'src> {
     #[allow(clippy::many_single_char_names)]
     fn read_regex(&mut self) -> JsSyntaxKind {
         let current = unsafe { self.current_unchecked() };
-
         if current != b'/' {
             return self.lex_token();
         }
 
         let start = self.position;
         let mut in_class = false;
 
-        while let Some(byte) = self.next_byte() {
-            match byte {
-                b'[' => in_class = true,
-                b']' => in_class = false,
-                b'/' => {
+        while let Some(c) = self.next_char_unchecked() {
+            match c {
+                '[' => in_class = true,
+                ']' => in_class = false,
+                '/' => {
                     if !in_class {
                         let (mut g, mut i, mut m, mut s, mut u, mut y, mut d) =
                             (false, false, false, false, false, false, false);
@@ -1546,7 +1566,7 @@ impl<'src> Lexer<'src> {
                         return JsSyntaxKind::JS_REGEX_LITERAL;
                     }
                 }
-                b'\\' => {
+                '\\' => {
                     if self.next_byte_bounded().is_none() {
                         self.diagnostics.push(
                             Diagnostic::error(

diff --git a/crates/rome_js_parser/src/syntax/expr.rs b/crates/rome_js_parser/src/syntax/expr.rs
@@ -144,6 +144,7 @@ pub(crate) fn parse_expression_or_recover_to_next_statement(
 // 0, 0.0, 0n, 0e00
 // "test\
 // new-line";
+// /^[يفمئامئ‍ئاسۆند]/i; //regex with unicode
 
 // test_err literals
 // 00, 012, 08, 091, 0789 // parser errors

diff --git a/crates/rome_js_parser/test_data/inline/ok/literals.js b/crates/rome_js_parser/test_data/inline/ok/literals.js
@@ -8,3 +8,4 @@ null
 0, 0.0, 0n, 0e00
 "test\
 new-line";
+/^[يفمئامئ‍ئاسۆند]/i; //regex with unicode
diff --git a/crates/rome_js_parser/test_data/inline/ok/literals.rast b/crates/rome_js_parser/test_data/inline/ok/literals.rast
@@ -74,14 +74,20 @@ JsModule {
             },
             semicolon_token: SEMICOLON@66..67 ";" [] [],
         },
+        JsExpressionStatement {
+            expression: JsRegexLiteralExpression {
+                value_token: JS_REGEX_LITERAL@67..103 "/^[يفمئامئ\u{200d}ئاسۆند]/i" [Newline("\n")] [],
+            },
+            semicolon_token: SEMICOLON@103..125 ";" [] [Whitespace(" "), Comments("//regex with unicode")],
+        },
     ],
-    eof_token: EOF@67..68 "" [Newline("\n")] [],
+    eof_token: EOF@125..126 "" [Newline("\n")] [],
 }
 
-0: JS_MODULE@0..68
+0: JS_MODULE@0..126
   0: (empty)
   1: JS_DIRECTIVE_LIST@0..0
-  2: JS_MODULE_ITEM_LIST@0..67
+  2: JS_MODULE_ITEM_LIST@0..125
     0: JS_EXPRESSION_STATEMENT@0..1
       0: JS_NUMBER_LITERAL_EXPRESSION@0..1
         0: JS_NUMBER_LITERAL@0..1 "5" [] []
@@ -130,4 +136,8 @@ JsModule {
       0: JS_STRING_LITERAL_EXPRESSION@49..66
         0: JS_STRING_LITERAL@49..66 "\"test\\\nnew-line\"" [Newline("\n")] []
       1: SEMICOLON@66..67 ";" [] []
-  3: EOF@67..68 "" [Newline("\n")] []
+    9: JS_EXPRESSION_STATEMENT@67..125
+      0: JS_REGEX_LITERAL_EXPRESSION@67..103
+        0: JS_REGEX_LITERAL@67..103 "/^[يفمئامئ\u{200d}ئاسۆند]/i" [Newline("\n")] []
+      1: SEMICOLON@103..125 ";" [] [Whitespace(" "), Comments("//regex with unicode")]
+  3: EOF@125..126 "" [Newline("\n")] []