Skip to content

Commit

Permalink
In DataUtil, reuse the input stream when detecting charset
Browse files Browse the repository at this point in the history
When reading ahead to detect the charset, instead of buffering into a new ByteBuffer, reuse the ControllableInputStream.
  • Loading branch information
jhy committed Aug 8, 2024
1 parent 5071f6c commit 848cfe0
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 77 deletions.
62 changes: 30 additions & 32 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,7 @@ public static Document load(Path path, @Nullable String charsetName, String base
* @since 1.17.2
*/
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = openStream(path);
return parseInputStream(stream, charsetName, baseUri, parser);
return parseInputStream(openStream(path), charsetName, baseUri, parser);
}

/**
Expand Down Expand Up @@ -151,7 +150,7 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St
}

/** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
private static InputStream openStream(Path path) throws IOException {
private static ControllableInputStream openStream(Path path) throws IOException {
final SeekableByteChannel byteChannel = Files.newByteChannel(path);
InputStream stream = Channels.newInputStream(byteChannel);
String name = Normalizer.lowerCase(path.getFileName().toString());
Expand All @@ -162,7 +161,7 @@ private static InputStream openStream(Path path) throws IOException {
stream = new GZIPInputStream(stream);
}
}
return stream;
return ControllableInputStream.wrap(stream, 0);
}

/**
Expand All @@ -174,7 +173,7 @@ private static InputStream openStream(Path path) throws IOException {
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
}

/**
Expand All @@ -187,7 +186,7 @@ public static Document load(InputStream in, @Nullable String charsetName, String
* @throws IOException on IO error
*/
public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
return parseInputStream(in, charsetName, baseUri, parser);
return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
}

/**
Expand Down Expand Up @@ -219,7 +218,7 @@ static class CharsetDoc {
}
}

static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
if (input == null) // empty body // todo reconsider?
return new Document(baseUri);

Expand All @@ -235,30 +234,28 @@ static Document parseInputStream(@Nullable InputStream input, @Nullable String c
return doc;
}

static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
Document doc = null;

// read the start of the stream and look for a BOM or meta charset
InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
wrappedInputStream.mark(DefaultBufferSize);
ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
boolean fullyRead = (wrappedInputStream.read() == -1);
wrappedInputStream.reset();

// read the start of the stream and look for a BOM or meta charset:
// look for BOM - overrides any other header or input
BomCharset bomCharset = detectCharsetFromBom(firstBytes);
BomCharset bomCharset = detectCharsetFromBom(input); // resets
if (bomCharset != null)
charsetName = bomCharset.charset;

if (charsetName == null) { // determine from meta. safe first parse as UTF-8
if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
int origMax = input.max();
input.max(firstReadBufferSize);
input.mark(firstReadBufferSize);
input.allowClose(false); // ignores closes during parse, in case we need to rewind
try {
CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
if (defaultDecoded.hasArray())
doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
else
doc = parser.parseInput(defaultDecoded.toString(), baseUri);
Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
doc = parser.parseInput(reader, baseUri);
input.reset();
input.max(origMax); // reset for a full read if required
} catch (UncheckedIOException e) {
throw e.getCause();
} finally {
input.allowClose(true);
}

// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
Expand Down Expand Up @@ -293,7 +290,9 @@ else if (first instanceof Comment) {
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
charsetName = foundCharset;
doc = null;
} else if (!fullyRead) {
} else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
input.close(); // the parser tried to close it
} else {
doc = null;
}
} else { // specified by content type header (or by user on file load)
Expand All @@ -306,7 +305,7 @@ else if (first instanceof Comment) {
Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
// if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
return new CharsetDoc(charset, doc, wrappedInputStream, skip);
return new CharsetDoc(charset, doc, input, skip);
}

static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
Expand Down Expand Up @@ -400,14 +399,13 @@ static String mimeBoundary() {
return StringUtil.releaseBuilder(mime);
}

private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
@SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
buffer.mark();
private static @Nullable BomCharset detectCharsetFromBom(ControllableInputStream input) throws IOException {
byte[] bom = new byte[4];
if (byteData.remaining() >= bom.length) {
byteData.get(bom);
buffer.rewind();
}
input.mark(bom.length);
//noinspection ResultOfMethodCallIgnored
input.read(bom, 0, 4);

Check notice

Code scanning / CodeQL

Ignored error status of call Note

Method detectCharsetFromBom ignores exceptional return value of ControllableInputStream.read.
input.reset();

if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
return new BomCharset("UTF-32", false); // and I hope it's on your system
Expand Down
16 changes: 9 additions & 7 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import static org.jsoup.Connection.Method.HEAD;
import static org.jsoup.helper.DataUtil.UTF_8;
import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
* Implementation of {@link Connection}.
Expand Down Expand Up @@ -915,7 +916,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate"))
stream = new InflaterInputStream(stream, new Inflater(true));

res.bodyStream = ControllableInputStream.wrap(
stream, SharedConstants.DefaultBufferSize, req.maxBodySize())
stream, DefaultBufferSize, req.maxBodySize())
.timeout(startTime, req.timeout());

if (req.responseProgress != null) // set response progress listener
Expand Down Expand Up @@ -965,11 +966,12 @@ public String contentType() {
}

/** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
private InputStream prepareParse() {
private ControllableInputStream prepareParse() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
InputStream stream = bodyStream;
ControllableInputStream stream = bodyStream;
if (byteData != null) { // bytes have been read in to the buffer, parse that
stream = new ByteArrayInputStream(byteData.array(), 0, byteData.limit());
ByteArrayInputStream bytes = new ByteArrayInputStream(byteData.array(), 0, byteData.limit());
stream = ControllableInputStream.wrap(bytes, 0); // no max
inputStreamRead = false; // ok to reparse if in bytes
}
Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
Expand All @@ -979,7 +981,7 @@ private InputStream prepareParse() {
}

@Override public Document parse() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
Expand All @@ -988,7 +990,7 @@ private InputStream prepareParse() {
}

@Override public StreamParser streamParser() throws IOException {
InputStream stream = prepareParse();
ControllableInputStream stream = prepareParse();
String baseUri = url.toExternalForm();
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
// note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.
Expand Down Expand Up @@ -1064,7 +1066,7 @@ public BufferedInputStream bodyStream() {
if (byteData != null) {
return new BufferedInputStream(
new ByteArrayInputStream(byteData.array(), 0, byteData.limit()),
SharedConstants.DefaultBufferSize);
DefaultBufferSize);
}

Validate.isFalse(inputStreamRead, "Request has already been read");
Expand Down
98 changes: 68 additions & 30 deletions src/main/java/org/jsoup/internal/ControllableInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,30 @@
import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
* A jsoup internal class (so don't use it as there is no contract API) that enables controls on a Buffered Input Stream,
* A jsoup internal class (so don't use it as there is no contract API) that enables controls on a buffered input stream,
* namely a maximum read size, and the ability to Thread.interrupt() the read.
*/
// reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read
public class ControllableInputStream extends FilterInputStream {
private final InputStream buff;
private final boolean capped;
private final int maxSize;
private final SimpleBufferedInput buff; // super.in, but typed as SimpleBufferedInput
private int maxSize;
private long startTime;
private long timeout = 0; // optional max time of request
private int remaining;
private int markPos;
private boolean interrupted;
private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser

// if we are tracking progress, will have the expected content length, progress callback, connection
private @Nullable Progress<?> progress;
private @Nullable Object progressContext;
private int contentLength = -1;
private int readPos = 0; // amount read; can be reset()

private ControllableInputStream(InputStream in, int maxSize) {
private ControllableInputStream(SimpleBufferedInput in, int maxSize) {
super(in);
Validate.isTrue(maxSize >= 0);
buff = in;
capped = maxSize != 0;
this.maxSize = maxSize;
remaining = maxSize;
markPos = -1;
Expand All @@ -48,52 +47,63 @@ private ControllableInputStream(InputStream in, int maxSize) {
/**
* If this InputStream is not already a ControllableInputStream, let it be one.
* @param in the input stream to (maybe) wrap
* @param bufferSize the buffer size to use when reading
* @param maxSize the maximum size to allow to be read. 0 == infinite.
* @return a controllable input stream
*/
public static ControllableInputStream wrap(InputStream in, int bufferSize, int maxSize) {
public static ControllableInputStream wrap(InputStream in, int maxSize) {
// bufferSize currently unused; consider implementing as a min size in the SoftPool recycler
if (in instanceof ControllableInputStream)
return (ControllableInputStream) in;
else if (in instanceof BufferedInputStream)
return new ControllableInputStream(in, maxSize);
else
return new ControllableInputStream(new SimpleBufferedInput(in), maxSize);
}

/**
* If this InputStream is not already a ControllableInputStream, let it be one.
* @param in the input stream to (maybe) wrap
* @param bufferSize the buffer size to use when reading
* @param maxSize the maximum size to allow to be read. 0 == infinite.
* @return a controllable input stream
*/
public static ControllableInputStream wrap(InputStream in, int bufferSize, int maxSize) {

Check notice

Code scanning / CodeQL

Useless parameter Note

The parameter 'bufferSize' is never used.
// todo - bufferSize currently unused; consider implementing as a min size in the SoftPool recycler; or just deprecate if always DefaultBufferSize
return wrap(in, maxSize);
}

@Override
public int read(byte[] b, int off, int len) throws IOException {
if (readPos == 0) emitProgress(); // emits a progress


boolean capped = maxSize != 0;
if (interrupted || capped && remaining <= 0)
return -1;
if (Thread.currentThread().isInterrupted()) {
// interrupted latches, because parse() may call twice
interrupted = true;
return -1;
}
if (expired())
throw new SocketTimeoutException("Read timeout");

if (capped && len > remaining)
len = remaining; // don't read more than desired, even if available

try {
final int read = super.read(b, off, len);
if (read == -1) { // completed
contentLength = readPos;
} else {
remaining -= read;
readPos += read;
}
emitProgress();

return read;
} catch (SocketTimeoutException e) {
while (true) { // loop trying to read until we get some data or hit the overall timeout, if we have one
if (expired())
throw e;
return 0;
throw new SocketTimeoutException("Read timeout");

try {
final int read = super.read(b, off, len);
if (read == -1) { // completed
contentLength = readPos;
} else {
remaining -= read;
readPos += read;
}
emitProgress();
return read;
} catch (SocketTimeoutException e) {
if (expired() || timeout == 0)
throw e;
}
}
}

Expand Down Expand Up @@ -146,6 +156,36 @@ public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOExce
markPos = maxSize - remaining;
}

/**
Check if the underlying InputStream has been read fully. There may still content in buffers to be consumed, and
read methods may return -1 if hit the read limit.
@return true if the underlying inputstream has been read fully.
*/
public boolean baseReadFully() {
return buff.baseReadFully();
}

/**
Get the max size of this stream (how far at most will be read from the underlying stream)
* @return the max size
*/
public int max() {
return maxSize;
}

public void max(int newMax) {
remaining += newMax - maxSize; // update remaining to reflect the difference in the new maxsize
maxSize = newMax;
}

public void allowClose(boolean allowClose) {
this.allowClose = allowClose;
}

@Override public void close() throws IOException {
if (allowClose) super.close();
}

public ControllableInputStream timeout(long startTimeNanos, long timeoutMillis) {
this.startTime = startTimeNanos;
this.timeout = timeoutMillis * 1000000;
Expand Down Expand Up @@ -181,8 +221,6 @@ private boolean expired() {

public BufferedInputStream inputStream() {
// called via HttpConnection.Response.bodyStream(), needs an OG BufferedInputStream
if (buff instanceof BufferedInputStream)
return (BufferedInputStream) buff; // if originally supplied a BIS in .wrap()
else return new BufferedInputStream(buff);
return new BufferedInputStream(buff);
}
}
Loading

0 comments on commit 848cfe0

Please sign in to comment.