Skip to content
This repository has been archived by the owner on Aug 31, 2023. It is now read-only.

feat(rome_js_analyze): noControlCharactersInRegex #4656

Merged
merged 3 commits into from
Jul 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/rome_diagnostics_categories/src/categories.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ define_categories! {
"lint/nursery/noConfusingArrow": "https://docs.rome.tools/lint/rules/noConfusingArrow",
"lint/nursery/noConsoleLog": "https://docs.rome.tools/lint/rules/noConsoleLog",
"lint/nursery/noConstantCondition": "https://docs.rome.tools/lint/rules/noConstantCondition",
"lint/nursery/noControlCharactersInRegex": "https://docs.rome.tools/lint/rules/noControlCharactersInRegex",
"lint/nursery/noDuplicateJsonKeys": "https://docs.rome.tools/lint/rules/noDuplicateJsonKeys",
"lint/nursery/noDuplicateJsxProps": "https://docs.rome.tools/lint/rules/noDuplicateJsxProps",
"lint/nursery/noExcessiveComplexity": "https://docs.rome.tools/lint/rules/noExcessiveComplexity",
Expand Down
2 changes: 2 additions & 0 deletions crates/rome_js_analyze/src/analyzers/nursery.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
use crate::utils::escape_string;
use rome_analyze::{context::RuleContext, declare_rule, Ast, Rule, RuleDiagnostic};
use rome_console::markup;
use rome_js_syntax::{
AnyJsExpression, JsCallArguments, JsCallExpression, JsNewExpression, JsRegexLiteralExpression,
JsStringLiteralExpression,
};
use rome_rowan::{declare_node_union, AstNode, AstSeparatedList};
use std::{iter::Peekable, str::Chars};

declare_rule! {
/// Prevents from having control characters and some escape sequences that match control characters in regular expressions.
///
/// Control characters are hidden special characters that are numbered from 0 to 31 in the ASCII system.
/// They're not commonly used in JavaScript text. So, if you see them in a pattern (called a regular expression), it's probably a mistake.
///
/// The following elements of regular expression patterns are considered possible errors in typing and are therefore disallowed by this rule:
///
/// - Hexadecimal character escapes from `\x00` to `\x1F`
/// - Unicode character escapes from `\u0000` to `\u001F`
/// - Unicode code point escapes from `\u{0}` to `\u{1F}`
/// - Unescaped raw characters from U+0000 to U+001F
///
/// Control escapes such as `\t` and `\n` are allowed by this rule.
///
/// Source: https://eslint.org/docs/latest/rules/no-control-regex
///
/// ## Examples
///
/// ### Invalid
/// ```js,expect_diagnostic
/// var pattern1 = /\x00/;
/// ```
/// ```js,expect_diagnostic
/// var pattern2 = /\x0C/;
/// ```
/// ```js,expect_diagnostic
/// var pattern3 = /\x1F/;
/// ```
/// ```js,expect_diagnostic
/// var pattern4 = /\u000C/;
/// ```
/// ```js,expect_diagnostic
/// var pattern5 = /\u{C}/u;
/// ```
/// ```js,expect_diagnostic
/// var pattern7 = new RegExp("\x0C");
/// ```
/// ```js,expect_diagnostic
/// var pattern7 = new RegExp("\\x0C");
/// ```
///
/// ### Valid
/// ```js
/// var pattern1 = /\x20/;
/// var pattern2 = /\u0020/;
/// var pattern3 = /\u{20}/u;
/// var pattern4 = /\t/;
/// var pattern5 = /\n/;
/// var pattern6 = new RegExp("\x20");
/// ```
///
pub(crate) NoControlCharactersInRegex {
version: "next",
name: "noControlCharactersInRegex",
recommended: true,
}
}

declare_node_union! {
pub(crate) RegexExpressionLike = JsNewExpression | JsCallExpression | JsRegexLiteralExpression
}

fn decode_hex_character_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
let first = iter.next()?;
let second = iter.next()?;
let digits = format!("{first}{second}");
let code_point = i64::from_str_radix(&digits, 16).ok()?;
Some((digits, code_point))
}

fn decode_unicode_escape_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
let mut digits = String::new();
// Loop 4 times as unicode escape sequence has exactly 4 hexadecimal digits
for _ in 0..4 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you leave a comment to explain why 0..4? To me, as a first reader, don't make sense

if let Some(&c) = iter.peek() {
match c {
'0'..='9' | 'a'..='f' | 'A'..='F' => digits.push(iter.next()?),
_ => continue,
}
}
}
let code_point = i64::from_str_radix(digits.as_str(), 16).ok()?;
Some((digits, code_point))
}

fn decode_escaped_code_point_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
let mut digits = String::new();
if iter.peek() == Some(&'{') {
iter.next();
while let Some(&c) = iter.peek() {
if c == '}' {
iter.next();
let code_point = i64::from_str_radix(&digits, 16).ok()?;
return Some((format!("{{{}}}", digits), code_point));
} else {
digits.push(iter.next()?);
}
}
}
None
}

fn add_control_character_to_vec(
prefix: &str,
iter: &mut Peekable<Chars>,
control_characters: &mut Vec<String>,
decode: fn(&mut Peekable<Chars>) -> Option<(String, i64)>,
) {
if let Some((s, code_point)) = decode(iter) {
// ASCII control characters are represented by code points from 0 to 31
if (0..=31).contains(&code_point) {
control_characters.push(format!("{prefix}{s}"));
}
}
}

/// Collecting control characters for regex. The following characters in regular expression patterns are considered as control characters:
/// - Hexadecimal character escapes from `\x00` to `\x1F`.
/// - Unicode character escapes from `\u0000` to `\u001F`.
/// - Unicode code point escapes range from `\u{0}` to `\u{1F}`.
/// - The Unicode flag must be set as true in order for these Unicode code point escapes to work: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode.
/// - Unescaped raw characters from U+0000 to U+001F.
fn collect_control_characters(pattern: String, flags: Option<String>) -> Option<Vec<String>> {
let mut control_characters: Vec<String> = Vec::new();
let is_unicode_flag_set = flags.unwrap_or_default().contains('u');
let mut iter = pattern.chars().peekable();

while let Some(c) = iter.next() {
match c {
'\\' => match iter.next() {
Some('x') => add_control_character_to_vec(
"\\x",
&mut iter,
&mut control_characters,
decode_hex_character_to_code_point,
),
Some('u') if is_unicode_flag_set => add_control_character_to_vec(
"\\u",
&mut iter,
&mut control_characters,
decode_escaped_code_point_to_code_point,
),
Some('u') => add_control_character_to_vec(
"\\u",
&mut iter,
&mut control_characters,
decode_unicode_escape_to_code_point,
),
Some('\\') => continue,
_ => break,
},
_ => continue,
}
}

if !control_characters.is_empty() {
Some(control_characters)
} else {
None
}
}

fn collect_control_characters_from_expression(
callee: &AnyJsExpression,
js_call_arguments: &JsCallArguments,
) -> Option<Vec<String>> {
let js_identifier = match callee {
AnyJsExpression::JsIdentifierExpression(js_identifier) => js_identifier,
_ => return None,
};

if js_identifier.name().ok()?.has_name("RegExp") {
let mut args = js_call_arguments.args().iter();
let raw_pattern = args
.next()
.and_then(|arg| arg.ok())
.and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax()))
.and_then(|js_string_literal| js_string_literal.inner_string_text().ok())?
.to_string();

let pattern = escape_string(&raw_pattern).unwrap_or(raw_pattern);

let regexp_flags = args
.next()
.and_then(|arg| arg.ok())
.and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax()))
.map(|js_string_literal| js_string_literal.text());

return collect_control_characters(pattern, regexp_flags);
}
None
}

impl Rule for NoControlCharactersInRegex {
type Query = Ast<RegexExpressionLike>;
type State = Vec<String>;
type Signals = Option<Self::State>;
type Options = ();

fn run(ctx: &RuleContext<Self>) -> Self::Signals {
let node = ctx.query();
match node {
RegexExpressionLike::JsNewExpression(js_new_expression) => {
collect_control_characters_from_expression(
&js_new_expression.callee().ok()?,
&js_new_expression.arguments()?,
)
}
RegexExpressionLike::JsCallExpression(js_call_expression) => {
collect_control_characters_from_expression(
&js_call_expression.callee().ok()?,
&js_call_expression.arguments().ok()?,
)
}
RegexExpressionLike::JsRegexLiteralExpression(js_regex_literal_expression) => {
collect_control_characters(
js_regex_literal_expression.pattern().ok()?,
js_regex_literal_expression.flags().ok(),
)
}
}
}

fn diagnostic(ctx: &RuleContext<Self>, state: &Self::State) -> Option<RuleDiagnostic> {
Some(RuleDiagnostic::new(
rule_category!(),
ctx.query().range(),
markup! {
"Unexpected control character(s) in regular expression: "<Emphasis>{state.join(", ")}</Emphasis>""
},
Comment on lines +239 to +241
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should let the user know what they should do here. For example, I see the diagnostic for the first time, and I don't know how to solve the issue 😄

).note(
markup! {
"Control characters are unusual and potentially incorrect inputs, so they are disallowed."
}
))
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_collect_control_characters() {
assert_eq!(
collect_control_characters(String::from("\\x00\\x0F\\u0010\\u001F"), None),
Some(vec![
String::from("\\x00"),
String::from("\\x0F"),
String::from("\\u0010"),
String::from("\\u001F")
])
);
assert_eq!(
collect_control_characters(String::from("\\u{0}\\u{1F}"), Some(String::from("u"))),
Some(vec![String::from("\\u{0}"), String::from("\\u{1F}")])
);
assert_eq!(
collect_control_characters(String::from("\\x20\\u0020\\u{20}\\t\\n"), None),
None
);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
var regex = RegExp("\\x1f");
var regex = RegExp("\\u{1111}*\\x1F", "u");
var regex = new RegExp("\\x1f\\x1e");
var regex = new RegExp("\\x1fFOO\\x00");
var regex = new RegExp("FOO\\x1fFOO\\x1f");
var regex = new RegExp("\\x1f");
var regex = new RegExp("\\u001F", flags);
var regex = new RegExp("\\u{1111}*\\x1F", "u");
var regex = new RegExp("\\u{1F}", "u");
var regex = new RegExp("\\u{1F}", "gui");
var regex = new RegExp("\\x0C");
var regex = new RegExp("\x0C");
var regex = /\x00/;
var regex = /\x0C/;
var regex = /\x1F/;
var regex = /\u000C/;
var regex = /\u{C}/u;
var regex = /\\\x1f\\x1e/;
var regex = /\\\x1fFOO\\x00/;
var regex = /FOO\\\x1fFOO\\x1f/;
var regex = /(?<a>\\x1f)/;
var regex = /(?<\u{1d49c}>.)\x1f/;
var regex = /\u{1111}*\x1F/u;
var regex = /\u{1F}/u;
var regex = /\u{1F}/gui;
Loading
Loading