Assume UTF-8 when no encoding can be looked up.

This makes it less likely that uBO will skip HTML filtering and the `replace=` filter option for a response: previously, the response body filterer required an encoding to be explicitly declared before it would act on the response body. UTF-8 usage is currently reported as ~98.2%: https://w3techs.com/technologies/history_overview/character_encoding
gorhill · Mar 5, 2024 · 63acdcb · 63acdcb
1 parent b95a1e9
commit 63acdcb
Showing 1 changed file with 10 additions and 6 deletions.
diff --git a/src/js/traffic.js b/src/js/traffic.js
@@ -749,7 +749,7 @@ const bodyFilterer = (( ) => {
  /* t */ if ( bytes[i+6] !== 0x74 ) { continue; }
  break;
  }
- if ( (i - 40) >= 65536 ) { return; }
+ if ( (i + 40) >= 65536 ) { return; }
  i += 8;
  // find first alpha character
  let j = -1;
@@ -827,13 +827,17 @@ const bodyFilterer = (( ) => {
  }
  if ( this.status !== 'finishedtransferringdata' ) { return; }
 
- // If encoding is still unknown, try to extract from stream data
+ // If encoding is still unknown, try to extract from stream data.
+ // Just assume utf-8 if ultimately no encoding can be looked up.
  if ( session.charset === undefined ) {
  const charsetFound = charsetFromStream(session.buffer);
- if ( charsetFound === undefined ) { return streamClose(session); }
- const charsetUsed = textEncode.normalizeCharset(charsetFound);
- if ( charsetUsed === undefined ) { return streamClose(session); }
- session.charset = charsetUsed;
+ if ( charsetFound !== undefined ) {
+ const charsetUsed = textEncode.normalizeCharset(charsetFound);
+ if ( charsetUsed === undefined ) { return streamClose(session); }
+ session.charset = charsetUsed;
+ } else {
+ session.charset = 'utf-8';
+ }
  }
 
  while ( session.jobs.length !== 0 ) {