Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve buffer management throughout the load/fetch and parse lifecycle #2186

Merged
merged 10 commits into from
Aug 10, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
<ignore>java.util.Set</ignore> <!-- Set#stream() -->
<ignore>java.util.Spliterator</ignore>
<ignore>java.util.Spliterators</ignore>
<ignore>java.nio.ByteBuffer</ignore> <!-- .flip(); added in API1; possibly due to .flip previously returning Buffer, later ByteBuffer; return unused -->

<ignore>java.net.HttpURLConnection</ignore><!-- .setAuthenticator(java.net.Authenticator) in Java 9; only used in multirelease 9+ version -->
</ignores>
Expand Down
103 changes: 38 additions & 65 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,14 @@
import org.jspecify.annotations.Nullable;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
Expand Down Expand Up @@ -119,8 +116,7 @@
* @since 1.17.2
*/
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = openStream(path);
return parseInputStream(stream, charsetName, baseUri, parser);
return parseInputStream(openStream(path), charsetName, baseUri, parser);
}

/**
Expand All @@ -144,14 +140,13 @@
String charsetName = charset != null? charset.name() : null;
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it

return streamer;
}

/** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
private static InputStream openStream(Path path) throws IOException {
private static ControllableInputStream openStream(Path path) throws IOException {
final SeekableByteChannel byteChannel = Files.newByteChannel(path);
InputStream stream = Channels.newInputStream(byteChannel);
String name = Normalizer.lowerCase(path.getFileName().toString());
Expand All @@ -162,7 +157,7 @@
stream = new GZIPInputStream(stream);
}
}
return stream;
return ControllableInputStream.wrap(stream, 0);
}

/**
Expand All @@ -174,7 +169,7 @@
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
}

/**
Expand All @@ -187,7 +182,7 @@
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
return parseInputStream(in, charsetName, baseUri, parser);
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
}

/**
Expand All @@ -209,17 +204,15 @@
Charset charset;
InputStream input;
@Nullable Document doc;
boolean skip;

CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
this.charset = charset;
this.input = input;
this.doc = doc;
this.skip = skip;
}
}

static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
if (input == null) // empty body // todo reconsider?
return new Document(baseUri);

Expand All @@ -235,30 +228,28 @@
return doc;
}

static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
Document doc = null;

// read the start of the stream and look for a BOM or meta charset
InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
wrappedInputStream.mark(DefaultBufferSize);
ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
boolean fullyRead = (wrappedInputStream.read() == -1);
wrappedInputStream.reset();

// read the start of the stream and look for a BOM or meta charset:
// look for BOM - overrides any other header or input
BomCharset bomCharset = detectCharsetFromBom(firstBytes);
String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
if (bomCharset != null)
charsetName = bomCharset.charset;
charsetName = bomCharset;

if (charsetName == null) { // determine from meta. safe first parse as UTF-8
if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
int origMax = input.max();
input.max(firstReadBufferSize);
input.mark(firstReadBufferSize);
input.allowClose(false); // ignores closes during parse, in case we need to rewind
try {
CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
if (defaultDecoded.hasArray())
doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
else
doc = parser.parseInput(defaultDecoded.toString(), baseUri);
Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
doc = parser.parseInput(reader, baseUri);
input.reset();
input.max(origMax); // reset for a full read if required
} catch (UncheckedIOException e) {
throw e.getCause();
} finally {
input.allowClose(true);
}

// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
Expand Down Expand Up @@ -293,7 +284,9 @@
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
charsetName = foundCharset;
doc = null;
} else if (!fullyRead) {
} else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
input.close(); // the parser tried to close it
} else {
doc = null;
}
} else { // specified by content type header (or by user on file load)
Expand All @@ -304,9 +297,7 @@
if (charsetName == null)
charsetName = defaultCharsetName;
Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
// if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
return new CharsetDoc(charset, doc, wrappedInputStream, skip);
return new CharsetDoc(charset, doc, input);
}

static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
Expand All @@ -318,8 +309,7 @@
Validate.notNull(input);
final Document doc;
final Charset charset = charsetDoc.charset;
try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) {
maybeSkipBom(reader, charsetDoc);
try (Reader reader = new InputStreamReader(input, charset)) {
try {
doc = parser.parseInput(reader, baseUri);
} catch (UncheckedIOException e) {
Expand All @@ -335,13 +325,6 @@
return doc;
}

static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
if (charsetDoc.skip) {
long skipped = reader.skip(1);
Validate.isTrue(skipped == 1); // WTF if this fails.
}
}

/**
* Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
* method is executing on. The data read until being interrupted will be available.
Expand Down Expand Up @@ -400,34 +383,24 @@
return StringUtil.releaseBuilder(mime);
}

private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
@SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
buffer.mark();
private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
byte[] bom = new byte[4];
if (byteData.remaining() >= bom.length) {
byteData.get(bom);
buffer.rewind();
}
input.mark(bom.length);
//noinspection ResultOfMethodCallIgnored
input.read(bom, 0, 4);
Fixed Show fixed Hide fixed
Dismissed Show dismissed Hide dismissed
input.reset();

// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
return new BomCharset("UTF-32", false); // and I hope it's on your system
return "UTF-32"; // and I hope it's on your system
} else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
return new BomCharset("UTF-16", false); // in all Javas
return "UTF-16"; // in all Javas
} else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
return new BomCharset("UTF-8", true); // in all Javas
// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
input.read(bom, 0, 3); // consume the UTF-8 BOM
Dismissed Show dismissed Hide dismissed
return "UTF-8"; // in all Javas
}
return null;
}

private static class BomCharset {
private final String charset;
private final boolean offset;

public BomCharset(String charset, boolean offset) {
this.charset = charset;
this.offset = offset;
}
}
}
34 changes: 24 additions & 10 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Functions;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -52,6 +51,7 @@
import static org.jsoup.Connection.Method.HEAD;
import static org.jsoup.helper.DataUtil.UTF_8;
import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
* Implementation of {@link Connection}.
Expand Down Expand Up @@ -915,7 +915,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate"))
stream = new InflaterInputStream(stream, new Inflater(true));

res.bodyStream = ControllableInputStream.wrap(
stream, SharedConstants.DefaultBufferSize, req.maxBodySize())
stream, DefaultBufferSize, req.maxBodySize())
.timeout(startTime, req.timeout());

if (req.responseProgress != null) // set response progress listener
Expand Down Expand Up @@ -965,11 +965,12 @@ public String contentType() {
}

/** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
private InputStream prepareParse() {
private ControllableInputStream prepareParse() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
InputStream stream = bodyStream;
ControllableInputStream stream = bodyStream;
if (byteData != null) { // bytes have been read in to the buffer, parse that
stream = new ByteArrayInputStream(byteData.array());
ByteArrayInputStream bytes = new ByteArrayInputStream(byteData.array(), 0, byteData.limit());
stream = ControllableInputStream.wrap(bytes, 0); // no max
inputStreamRead = false; // ok to reparse if in bytes
}
Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
Expand All @@ -979,7 +980,7 @@ private InputStream prepareParse() {
}

@Override public Document parse() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
Expand All @@ -988,15 +989,14 @@ private InputStream prepareParse() {
}

@Override public StreamParser streamParser() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
String baseUri = url.toExternalForm();
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
// note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.

// set up the stream parser and rig this connection up to the parsed doc:
StreamParser streamer = new StreamParser(req.parser());
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
DataUtil.maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
streamer.document().connection(new HttpConnection(req, this));
charset = charsetDoc.charset.name();
Expand Down Expand Up @@ -1035,7 +1035,19 @@ public String body() {
public byte[] bodyAsBytes() {
prepareByteData();
Validate.notNull(byteData);
return byteData.array();
Validate.isTrue(byteData.hasArray()); // we made it, so it should

byte[] array = byteData.array();
int offset = byteData.arrayOffset();
int length = byteData.limit();

if (offset == 0 && length == array.length) { // exact, just return it
return array;
} else { // trim to size
byte[] exactArray = new byte[length];
System.arraycopy(array, offset, exactArray, 0, length);
return exactArray;
}
}

@Override
Expand All @@ -1050,7 +1062,9 @@ public BufferedInputStream bodyStream() {

// if we have read to bytes (via buffer up), return those as a stream.
if (byteData != null) {
return new BufferedInputStream(new ByteArrayInputStream(byteData.array()), SharedConstants.DefaultBufferSize);
return new BufferedInputStream(
new ByteArrayInputStream(byteData.array(), 0, byteData.limit()),
DefaultBufferSize);
}

Validate.isFalse(inputStreamRead, "Request has already been read");
Expand Down
Loading
Loading