Skip to content

Commit

Permalink
Speed and GC optimizations in html()
Browse files Browse the repository at this point in the history
Attribute: removed regex matchers to check for key validity, using a simple scan now.

Entities: use a charBuf to append supplemental characters

Improves on #2183
  • Loading branch information
jhy committed Jul 30, 2024
1 parent 60281c5 commit 27e7d5f
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 11 deletions.
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
root `Document` node. This enables source position tracking on the Document node (which was previously unset). And
it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182)
* Speed optimized `html()` and `Entities.escape()` when the input contains UTF characters in a supplementary plane, by
around 22%. [2183](https://github.com/jhy/jsoup/pull/2183)
around 49%. [2183](https://github.com/jhy/jsoup/pull/2183)

### Bug Fixes

Expand Down
40 changes: 33 additions & 7 deletions src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,29 +174,55 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
}
}

private static final Pattern xmlKeyValid = Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");
private static final Pattern xmlKeyReplace = Pattern.compile("[^-a-zA-Z0-9_:.]+");
private static final Pattern htmlKeyValid = Pattern.compile("[^\\x00-\\x1f\\x7f-\\x9f \"'/=]+");
private static final Pattern htmlKeyReplace = Pattern.compile("[\\x00-\\x1f\\x7f-\\x9f \"'/=]+");

/**
* Get a valid attribute key for the given syntax. If the key is not valid, it will be coerced into a valid key.
* @param key the original attribute key
* @param syntax HTML or XML
* @return the original key if it's valid; a key with invalid characters replaced with "_" otherwise; or null if a valid key could not be created.
*/
@Nullable public static String getValidKey(String key, Syntax syntax) {
if (syntax == Syntax.xml && !xmlKeyValid.matcher(key).matches()) {
if (syntax == Syntax.xml && !isValidXmlKey(key)) {
key = xmlKeyReplace.matcher(key).replaceAll("_");
return xmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
return isValidXmlKey(key) ? key : null; // null if could not be coerced
}
else if (syntax == Syntax.html && !htmlKeyValid.matcher(key).matches()) {
else if (syntax == Syntax.html && !isValidHtmlKey(key)) {
key = htmlKeyReplace.matcher(key).replaceAll("_");
return htmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
return isValidHtmlKey(key) ? key : null; // null if could not be coerced
}
return key;
}

// Performance critical in html(), so key validity is checked with a manual character scan rather than a regex.
// None of the characters tested for are in the supplementary planes, so iterating by charAt (UTF-16 code units) is OK.
/**
 * Tests whether the key is usable as-is as an XML attribute name. Equivalent to matching the whole key against
 * {@code [a-zA-Z_:][-a-zA-Z0-9_:.]*}, but implemented as a manual scan to avoid regex overhead.
 * @param key the attribute key to test
 * @return true if the key is non-empty, starts with a letter, '_' or ':', and every following character is a
 * letter, digit, '-', '_', ':' or '.'
 */
private static boolean isValidXmlKey(String key) {
    final int length = key.length();
    if (length == 0) return false;

    // first character: [a-zA-Z_:] -- digits, '-' and '.' are not allowed to lead
    char c = key.charAt(0);
    if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':'))
        return false;

    // remaining characters: [-a-zA-Z0-9_:.] -- a wider set than the first character
    for (int i = 1; i < length; i++) {
        c = key.charAt(i);
        if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
            || c == '-' || c == '_' || c == ':' || c == '.'))
            return false;
    }
    return true;
}

/**
 * Tests whether the key is usable as-is as an HTML attribute name. Equivalent to matching the whole key against
 * {@code [^\x00-\x1f\x7f-\x9f "'/=]+}, i.e. one or more characters, none of which is a control character, a space,
 * a quote, a slash or an equals sign.
 * @param key the attribute key to test
 * @return true if the key is non-empty and contains none of the disallowed characters
 */
private static boolean isValidHtmlKey(String key) {
    final int len = key.length();
    if (len == 0) return false;

    int pos = 0;
    while (pos < len) {
        final char ch = key.charAt(pos++);
        // control characters: \x00-\x1f and \x7f-\x9f
        final boolean isControl = ch <= 0x1f || (ch >= 0x7f && ch <= 0x9f);
        if (isControl) return false;
        switch (ch) {
            case ' ':
            case '"':
            case '\'':
            case '/':
            case '=':
                return false;
            default:
                break; // acceptable character, keep scanning
        }
    }
    return true;
}

/**
Get the string representation of this attribute, implemented as {@link #html()}.
@return string
Expand Down
15 changes: 12 additions & 3 deletions src/main/java/org/jsoup/nodes/Entities.java
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,21 @@ private static void appendEscaped(Appendable accum, OutputSettings out, int opti
}
} else {
if (canEncode(coreCharset, c, encoder)) {
String s = new String(Character.toChars(codePoint));
accum.append(s);
} else appendEncoded(accum, escapeMode, codePoint);
// reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
char[] chars = charBuf.get();
int len = Character.toChars(codePoint, chars, 0);
if (accum instanceof StringBuilder) // true unless the user supplied their own
((StringBuilder) accum).append(chars, 0, len);
else
accum.append(new String(chars, 0, len));
} else {
appendEncoded(accum, escapeMode, codePoint);
}
}
}

// Per-thread reusable scratch buffer that Character.toChars(codePoint, chars, 0) writes into when appending a
// code point; two chars is enough to hold a surrogate pair. Reused to avoid allocating a new array per character.
private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]);

private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException {
if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;");
else accum.append("&#xa0;");
Expand Down

0 comments on commit 27e7d5f

Please sign in to comment.