Skip to content

Commit

Permalink
Improve buffer management throughout the load/fetch and parse lifecyc…
Browse files Browse the repository at this point in the history
…le (#2186)

* Introduces SoftPool

A SoftPool is a ThreadLocal SofReference Stack of <T>, with borrow and release methods. Allows simple recycling of objects between uses, and for those to be reaped when inactive.

* Refactored bufferUp in CharacterReader
Removed the use of a backing BufferedInputReader, which was redundant and creating large char array buffers, and reuse the char[] buffer via SoftPool.

* Maintain the string flyweight cache between runs in CharacterReader

* Updated ControllableInputStream to use new SimpleBufferedInput, instead of BufferedInputStream, so we can reuse those byte[] buffers. Also, those are only borrowed/created on the first need for the buffer; otherwise fill reads cascade down to the underlying stream. And removes synchronized / lock sections, as this is not passed between threads concurrently.

* Reduced the DefaultBufferSize to 8K from 32K

* In DataUtil, when reading ahead to detect the charset, instead of buffering into a new ByteBuffer, reuse the ControllableInputStream.

* Simplify BOM character skip

Overall, heap allocations and GC during the read & parse phase are down between -6% and -89%, and throughput improved up to +143%.
  • Loading branch information
jhy committed Aug 10, 2024
1 parent 1714af2 commit ec2d0c4
Show file tree
Hide file tree
Showing 20 changed files with 719 additions and 300 deletions.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
<ignore>java.util.Set</ignore> <!-- Set#stream() -->
<ignore>java.util.Spliterator</ignore>
<ignore>java.util.Spliterators</ignore>
<ignore>java.nio.ByteBuffer</ignore> <!-- .flip(); added in API1; possibly due to .flip previously returning Buffer, later ByteBuffer; return unused -->

<ignore>java.net.HttpURLConnection</ignore><!-- .setAuthenticator(java.net.Authenticator) in Java 9; only used in multirelease 9+ version -->
</ignores>
Expand Down
103 changes: 38 additions & 65 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,14 @@
import org.jspecify.annotations.Nullable;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
Expand Down Expand Up @@ -119,8 +116,7 @@ public static Document load(Path path, @Nullable String charsetName, String base
* @since 1.17.2
*/
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = openStream(path);
return parseInputStream(stream, charsetName, baseUri, parser);
return parseInputStream(openStream(path), charsetName, baseUri, parser);
}

/**
Expand All @@ -144,14 +140,13 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St
String charsetName = charset != null? charset.name() : null;
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it

return streamer;
}

/** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
private static InputStream openStream(Path path) throws IOException {
private static ControllableInputStream openStream(Path path) throws IOException {
final SeekableByteChannel byteChannel = Files.newByteChannel(path);
InputStream stream = Channels.newInputStream(byteChannel);
String name = Normalizer.lowerCase(path.getFileName().toString());
Expand All @@ -162,7 +157,7 @@ private static InputStream openStream(Path path) throws IOException {
stream = new GZIPInputStream(stream);
}
}
return stream;
return ControllableInputStream.wrap(stream, 0);
}

/**
Expand All @@ -174,7 +169,7 @@ private static InputStream openStream(Path path) throws IOException {
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
}

/**
Expand All @@ -187,7 +182,7 @@ public static Document load(InputStream in, @Nullable String charsetName, String
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
return parseInputStream(in, charsetName, baseUri, parser);
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
}

/**
Expand All @@ -209,17 +204,15 @@ static class CharsetDoc {
Charset charset;
InputStream input;
@Nullable Document doc;
boolean skip;

CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
this.charset = charset;
this.input = input;
this.doc = doc;
this.skip = skip;
}
}

static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
if (input == null) // empty body // todo reconsider?
return new Document(baseUri);

Expand All @@ -235,30 +228,28 @@ static Document parseInputStream(@Nullable InputStream input, @Nullable String c
return doc;
}

static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
Document doc = null;

// read the start of the stream and look for a BOM or meta charset
InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
wrappedInputStream.mark(DefaultBufferSize);
ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
boolean fullyRead = (wrappedInputStream.read() == -1);
wrappedInputStream.reset();

// read the start of the stream and look for a BOM or meta charset:
// look for BOM - overrides any other header or input
BomCharset bomCharset = detectCharsetFromBom(firstBytes);
String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
if (bomCharset != null)
charsetName = bomCharset.charset;
charsetName = bomCharset;

if (charsetName == null) { // determine from meta. safe first parse as UTF-8
if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
int origMax = input.max();
input.max(firstReadBufferSize);
input.mark(firstReadBufferSize);
input.allowClose(false); // ignores closes during parse, in case we need to rewind
try {
CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
if (defaultDecoded.hasArray())
doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
else
doc = parser.parseInput(defaultDecoded.toString(), baseUri);
Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
doc = parser.parseInput(reader, baseUri);
input.reset();
input.max(origMax); // reset for a full read if required
} catch (UncheckedIOException e) {
throw e.getCause();
} finally {
input.allowClose(true);
}

// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
Expand Down Expand Up @@ -293,7 +284,9 @@ else if (first instanceof Comment) {
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
charsetName = foundCharset;
doc = null;
} else if (!fullyRead) {
} else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
input.close(); // the parser tried to close it
} else {
doc = null;
}
} else { // specified by content type header (or by user on file load)
Expand All @@ -304,9 +297,7 @@ else if (first instanceof Comment) {
if (charsetName == null)
charsetName = defaultCharsetName;
Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
// if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
return new CharsetDoc(charset, doc, wrappedInputStream, skip);
return new CharsetDoc(charset, doc, input);
}

static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
Expand All @@ -318,8 +309,7 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
Validate.notNull(input);
final Document doc;
final Charset charset = charsetDoc.charset;
try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) {
maybeSkipBom(reader, charsetDoc);
try (Reader reader = new InputStreamReader(input, charset)) {
try {
doc = parser.parseInput(reader, baseUri);
} catch (UncheckedIOException e) {
Expand All @@ -335,13 +325,6 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
return doc;
}

static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
if (charsetDoc.skip) {
long skipped = reader.skip(1);
Validate.isTrue(skipped == 1); // WTF if this fails.
}
}

/**
* Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
* method is executing on. The data read until being interrupted will be available.
Expand Down Expand Up @@ -400,34 +383,24 @@ static String mimeBoundary() {
return StringUtil.releaseBuilder(mime);
}

private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
@SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
buffer.mark();
private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
byte[] bom = new byte[4];
if (byteData.remaining() >= bom.length) {
byteData.get(bom);
buffer.rewind();
}
input.mark(bom.length);
//noinspection ResultOfMethodCallIgnored
input.read(bom, 0, 4);
input.reset();

// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
return new BomCharset("UTF-32", false); // and I hope it's on your system
return "UTF-32"; // and I hope it's on your system
} else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
return new BomCharset("UTF-16", false); // in all Javas
return "UTF-16"; // in all Javas
} else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
return new BomCharset("UTF-8", true); // in all Javas
// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
input.read(bom, 0, 3); // consume the UTF-8 BOM
return "UTF-8"; // in all Javas
}
return null;
}

private static class BomCharset {
private final String charset;
private final boolean offset;

public BomCharset(String charset, boolean offset) {
this.charset = charset;
this.offset = offset;
}
}
}
34 changes: 24 additions & 10 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Functions;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -52,6 +51,7 @@
import static org.jsoup.Connection.Method.HEAD;
import static org.jsoup.helper.DataUtil.UTF_8;
import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
* Implementation of {@link Connection}.
Expand Down Expand Up @@ -915,7 +915,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate"))
stream = new InflaterInputStream(stream, new Inflater(true));

res.bodyStream = ControllableInputStream.wrap(
stream, SharedConstants.DefaultBufferSize, req.maxBodySize())
stream, DefaultBufferSize, req.maxBodySize())
.timeout(startTime, req.timeout());

if (req.responseProgress != null) // set response progress listener
Expand Down Expand Up @@ -965,11 +965,12 @@ public String contentType() {
}

/** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
private InputStream prepareParse() {
private ControllableInputStream prepareParse() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
InputStream stream = bodyStream;
ControllableInputStream stream = bodyStream;
if (byteData != null) { // bytes have been read in to the buffer, parse that
stream = new ByteArrayInputStream(byteData.array());
ByteArrayInputStream bytes = new ByteArrayInputStream(byteData.array(), 0, byteData.limit());
stream = ControllableInputStream.wrap(bytes, 0); // no max
inputStreamRead = false; // ok to reparse if in bytes
}
Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
Expand All @@ -979,7 +980,7 @@ private InputStream prepareParse() {
}

@Override public Document parse() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
Expand All @@ -988,15 +989,14 @@ private InputStream prepareParse() {
}

@Override public StreamParser streamParser() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
String baseUri = url.toExternalForm();
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
// note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.

// set up the stream parser and rig this connection up to the parsed doc:
StreamParser streamer = new StreamParser(req.parser());
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
DataUtil.maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
streamer.document().connection(new HttpConnection(req, this));
charset = charsetDoc.charset.name();
Expand Down Expand Up @@ -1035,7 +1035,19 @@ public String body() {
public byte[] bodyAsBytes() {
prepareByteData();
Validate.notNull(byteData);
return byteData.array();
Validate.isTrue(byteData.hasArray()); // we made it, so it should

byte[] array = byteData.array();
int offset = byteData.arrayOffset();
int length = byteData.limit();

if (offset == 0 && length == array.length) { // exact, just return it
return array;
} else { // trim to size
byte[] exactArray = new byte[length];
System.arraycopy(array, offset, exactArray, 0, length);
return exactArray;
}
}

@Override
Expand All @@ -1050,7 +1062,9 @@ public BufferedInputStream bodyStream() {

// if we have read to bytes (via buffer up), return those as a stream.
if (byteData != null) {
return new BufferedInputStream(new ByteArrayInputStream(byteData.array()), SharedConstants.DefaultBufferSize);
return new BufferedInputStream(
new ByteArrayInputStream(byteData.array(), 0, byteData.limit()),
DefaultBufferSize);
}

Validate.isFalse(inputStreamRead, "Request has already been read");
Expand Down
Loading

0 comments on commit ec2d0c4

Please sign in to comment.