Skip to content

Commit

Permalink
Improve extraction of tokens from regexes
Browse files Browse the repository at this point in the history
Fixed flawed extraction of tokens from regexes containing optional
sequences, i.e. sequences whose quantifier allows zero occurrences.
Related issue:
- uBlockOrigin/uBlock-issues#2367

Ignore look-around sequences, as suggested, when normalizing a regex
into a tokenizable string.
Related issue:
- uBlockOrigin/uBlock-issues#2368

Fix the regex analyzer throwing on a trailing `-` in a character
class sequence.
Related issue:
- AdguardTeam/AdguardFilters#134630
  • Loading branch information
gorhill committed Nov 17, 2022
1 parent 560c1fe commit d51b7e0
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 9 deletions.
44 changes: 35 additions & 9 deletions src/js/static-filtering-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => {

class regex {
static firstCharCodeClass(s) {
return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0;
return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0;
}

static lastCharCodeClass(s) {
return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0;
return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0;
}

static tokenizableStrFromNode(node) {
Expand Down Expand Up @@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => {
return String.fromCharCode(firstChar, lastChar);
}
case 4: /* T_GROUP, 'Group' */ {
if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
if (
node.flags.LookAhead === 1 ||
node.flags.NegativeLookAhead === 1 ||
node.flags.LookBehind === 1 ||
node.flags.NegativeLookBehind === 1
) {
return '';
}
return this.tokenizableStrFromNode(node.val);
}
case 16: /* T_QUANTIFIER, 'Quantifier' */ {
const s = this.tokenizableStrFromNode(node.val);
const first = this.firstCharCodeClass(s);
const last = this.lastCharCodeClass(s);
if ( node.flags.min === 0 && first === 0 && last === 0 ) {
return '';
if ( node.flags.min !== 0 ) {
return String.fromCharCode(first, last);
}
return String.fromCharCode(first, last);
return String.fromCharCode(first+2, last+2);
}
case 64: /* T_HEXCHAR, 'HexChar' */ {
return String.fromCharCode(parseInt(node.val.slice(1), 16));
Expand Down Expand Up @@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => {

static toTokenizableStr(reStr) {
if ( regexAnalyzer === null ) { return ''; }
let s = '';
try {
return this.tokenizableStrFromNode(
s = this.tokenizableStrFromNode(
regexAnalyzer(reStr, false).tree()
);
} catch(ex) {
}
return '';
// Process optional sequences
const reOptional = /[\x02\x03]+/g;
for (;;) {
const match = reOptional.exec(s);
if ( match === null ) { break; }
const left = s.slice(0, match.index);
const middle = match[0];
const right = s.slice(match.index + middle.length);
s = left;
s += this.firstCharCodeClass(right) === 1 ||
this.firstCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += this.lastCharCodeClass(left) === 1 ||
this.lastCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += right;
}
return s;
}
}

Expand Down
Empty file.
6 changes: 6 additions & 0 deletions src/lib/regexanalyzer/regex.js
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); },
}
}

if ( isRange && escaped === false && ']' === ch )
{
isRange = false;
chars.push('-');
}

if ( isRange )
{
if ( chars.length )
Expand Down

0 comments on commit d51b7e0

Please sign in to comment.