Skip to content

Commit

Permalink
Simplified Entities.escape to not require an OutputSettings
Browse files Browse the repository at this point in the history
This decouples OutputSettings and Entities more cleanly than the previous implementation, which required lazy initialisation of OutputSettings.

Also simplified how we get a fallback encoder.

Related to #1910, #2042
  • Loading branch information
jhy committed Aug 12, 2024
1 parent c778c3a commit b731fd7
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 62 deletions.
43 changes: 16 additions & 27 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.jspecify.annotations.Nullable;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.List;

import static org.jsoup.parser.Parser.NamespaceHtml;
Expand Down Expand Up @@ -395,29 +394,32 @@ public static class OutputSettings implements Cloneable {
* The output serialization syntax.
*/
public enum Syntax {html, xml}

private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
private Charset charset;
Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8
private final ThreadLocal<CharsetEncoder> encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor

private Charset charset = DataUtil.UTF_8;
private boolean prettyPrint = true;
private boolean outline = false;
private int indentAmount = 1;
private int maxPaddingWidth = 30;
private Syntax syntax = Syntax.html;

/**
Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
indent amount of 1).
*/
public OutputSettings() {
charset(DataUtil.UTF_8);
}

/**
* Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
* entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
* which uses the complete set of HTML named entities.
* <p>
* The default escape mode is <code>base</code>.
* @return the document's current escape mode
Get the document's current entity escape mode:
<ul>
<li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
<li><code>base</code>, which provides a limited set of named HTML
entities and escapes other characters as numbered entities for maximum compatibility</li>
<li><code>extended</code>,
which uses the complete set of HTML named entities.</li>
</ul>
<p>The default escape mode is <code>base</code>.
@return the document's current escape mode
*/
public Entities.EscapeMode escapeMode() {
return escapeMode;
Expand Down Expand Up @@ -453,7 +455,6 @@ public Charset charset() {
*/
public OutputSettings charset(Charset charset) {
this.charset = charset;
coreCharset = Entities.CoreCharset.byName(charset.name());
return this;
}

Expand All @@ -467,18 +468,6 @@ public OutputSettings charset(String charset) {
return this;
}

CharsetEncoder prepareEncoder() {
// created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
CharsetEncoder encoder = charset.newEncoder();
encoderThreadLocal.set(encoder);
return encoder;
}

CharsetEncoder encoder() {
CharsetEncoder encoder = encoderThreadLocal.get();
return encoder != null ? encoder : prepareEncoder();
}

/**
* Get the document's current output syntax.
* @return current syntax
Expand Down
83 changes: 49 additions & 34 deletions src/main/java/org/jsoup/nodes/Entities.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
package org.jsoup.nodes;

import org.jsoup.SerializationException;
import org.jsoup.helper.DataUtil;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.parser.CharacterReader;
import org.jsoup.parser.Parser;
import org.jspecify.annotations.Nullable;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.HashMap;
Expand Down Expand Up @@ -136,51 +137,55 @@ public static int codepointsForName(final String name, final int[] codepoints) {
/**
HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
both in attributes and in text data.
@param string the un-escaped string to escape
@param data the un-escaped string to escape
@param out the output settings to use. This configures the character set escaped against (that is, if a
character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
settings.
@return the escaped string
*/
public static String escape(String string, OutputSettings out) {
if (string == null)
public static String escape(String data, OutputSettings out) {
return escapeString(data, out.escapeMode(), out.syntax(), out.charset());
}

/**
HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is
returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
@param data the un-escaped string to escape
@return the escaped string
@see #escape(String, OutputSettings)
*/
public static String escape(String data) {
return escapeString(data, base, Syntax.html, DataUtil.UTF_8);
}

private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) {
if (data == null)
return "";
StringBuilder accum = StringUtil.borrowBuilder();
try {
escape(accum, string, out, ForText | ForAttribute); // for text and for attribute; preserve whitespaces
doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute);
} catch (IOException e) {
throw new SerializationException(e); // doesn't happen
}
return StringUtil.releaseBuilder(accum);
}

/**
* HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
* {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
*
* @param string the un-escaped string to escape
* @return the escaped string
* @see #escape(String, OutputSettings)
*/
public static String escape(String string) {
if (DefaultOutput == null)
DefaultOutput = new OutputSettings();
return escape(string, DefaultOutput);

static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException {
doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options);
}
private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings

static void escape(Appendable accum, String string, OutputSettings out, int options) throws IOException {
final EscapeMode escapeMode = out.escapeMode();
final CharsetEncoder encoder = out.encoder();
final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
final int length = string.length();
private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException {
final CoreCharset coreCharset = CoreCharset.byName(charset.name());
final CharsetEncoder fallback = encoderFor(charset);
final int length = data.length();

int codePoint;
boolean lastWasWhite = false;
boolean reachedNonWhite = false;
boolean skipped = false;
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
codePoint = string.codePointAt(offset);
codePoint = data.codePointAt(offset);

if ((options & Normalise) != 0) {
if (StringUtil.isWhitespace(codePoint)) {
Expand All @@ -202,12 +207,12 @@ static void escape(Appendable accum, String string, OutputSettings out, int opti
}
}
}
appendEscaped(accum, out, options, codePoint, escapeMode, encoder, coreCharset);
appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback);
}
}

private static void appendEscaped(Appendable accum, OutputSettings out, int options,
int codePoint, EscapeMode escapeMode, CharsetEncoder encoder, CoreCharset coreCharset) throws IOException {
private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode,
Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException {

// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
final char c = (char) codePoint;
Expand All @@ -222,7 +227,7 @@ private static void appendEscaped(Appendable accum, OutputSettings out, int opti
break;
case '<':
// escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
appendLt(accum, options, escapeMode, out);
appendLt(accum, options, escapeMode, syntax);
break;
case '>':
if ((options & ForText) != 0) accum.append("&gt;");
Expand All @@ -243,11 +248,11 @@ private static void appendEscaped(Appendable accum, OutputSettings out, int opti
accum.append(c);
break;
default:
if (c < 0x20 || !canEncode(coreCharset, c, encoder)) appendEncoded(accum, escapeMode, codePoint);
if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint);
else accum.append(c);
}
} else {
if (canEncode(coreCharset, c, encoder)) {
if (canEncode(coreCharset, c, fallback)) {
// reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
char[] chars = charBuf.get();
int len = Character.toChars(codePoint, chars, 0);
Expand All @@ -268,9 +273,9 @@ private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws I
else accum.append("&#xa0;");
}

private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, OutputSettings out) throws IOException {
if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) accum.append("&lt;");
else accum.append('<');
private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException {
if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("&lt;");
else accum.append('<'); // no need to escape < when in an HTML attribute
}

private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException {
Expand All @@ -282,7 +287,6 @@ private static void appendApos(Appendable accum, int options, EscapeMode escapeM
}
}


private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
final String name = escapeMode.nameForCodepoint(codePoint);
if (!emptyName.equals(name)) // ok for identity check
Expand Down Expand Up @@ -349,6 +353,17 @@ static CoreCharset byName(final String name) {
}
}

// cache the last used fallback encoder to save recreating on every use
private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>();
private static CharsetEncoder encoderFor(Charset charset) {
CharsetEncoder encoder = LocalEncoder.get();
if (encoder == null || !encoder.charset().equals(charset)) {
encoder = charset.newEncoder();
LocalEncoder.set(encoder);
}
return encoder;
}

private static void load(EscapeMode e, String pointsData, int size) {
e.nameKeys = new String[size];
e.codeVals = new int[size];
Expand Down
1 change: 0 additions & 1 deletion src/main/java/org/jsoup/nodes/Node.java
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,6 @@ private static class OuterHtmlVisitor implements NodeVisitor {
OuterHtmlVisitor(Appendable accum, Document.OutputSettings out) {
this.accum = accum;
this.out = out;
out.prepareEncoder();
}

public void head(Node node, int depth) {
Expand Down

0 comments on commit b731fd7

Please sign in to comment.