diff --git a/pom.xml b/pom.xml index ae8403bf72..02b4207e16 100644 --- a/pom.xml +++ b/pom.xml @@ -105,6 +105,7 @@ java.util.Set java.util.Spliterator java.util.Spliterators + java.nio.ByteBuffer java.net.HttpURLConnection diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index 9664d3eac1..71429c3262 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -15,7 +15,6 @@ import org.jspecify.annotations.Nullable; import java.io.BufferedReader; -import java.io.CharArrayReader; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -23,9 +22,7 @@ import java.io.OutputStream; import java.io.Reader; import java.io.UncheckedIOException; -import java.nio.Buffer; import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.nio.channels.Channels; import java.nio.channels.SeekableByteChannel; import java.nio.charset.Charset; @@ -119,8 +116,7 @@ public static Document load(Path path, @Nullable String charsetName, String base * @since 1.17.2 */ public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - InputStream stream = openStream(path); - return parseInputStream(stream, charsetName, baseUri, parser); + return parseInputStream(openStream(path), charsetName, baseUri, parser); } /** @@ -144,14 +140,13 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St String charsetName = charset != null? 
charset.name() : null; DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize); - maybeSkipBom(reader, charsetDoc); streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it return streamer; } /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */ - private static InputStream openStream(Path path) throws IOException { + private static ControllableInputStream openStream(Path path) throws IOException { final SeekableByteChannel byteChannel = Files.newByteChannel(path); InputStream stream = Channels.newInputStream(byteChannel); String name = Normalizer.lowerCase(path.getFileName().toString()); @@ -162,7 +157,7 @@ private static InputStream openStream(Path path) throws IOException { stream = new GZIPInputStream(stream); } } - return stream; + return ControllableInputStream.wrap(stream, 0); } /** @@ -174,7 +169,7 @@ private static InputStream openStream(Path path) throws IOException { * @throws IOException on IO error */ public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { - return parseInputStream(in, charsetName, baseUri, Parser.htmlParser()); + return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser()); } /** @@ -187,7 +182,7 @@ public static Document load(InputStream in, @Nullable String charsetName, String * @throws IOException on IO error */ public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - return parseInputStream(in, charsetName, baseUri, parser); + return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser); } /** @@ -209,17 +204,15 @@ static class CharsetDoc { Charset charset; InputStream input; 
@Nullable Document doc; - boolean skip; - CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) { + CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) { this.charset = charset; this.input = input; this.doc = doc; - this.skip = skip; } } - static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { if (input == null) // empty body // todo reconsider? return new Document(baseUri); @@ -235,30 +228,28 @@ static Document parseInputStream(@Nullable InputStream input, @Nullable String c return doc; } - static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { Document doc = null; - - // read the start of the stream and look for a BOM or meta charset - InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0); - wrappedInputStream.mark(DefaultBufferSize); - ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. - boolean fullyRead = (wrappedInputStream.read() == -1); - wrappedInputStream.reset(); - + // read the start of the stream and look for a BOM or meta charset: // look for BOM - overrides any other header or input - BomCharset bomCharset = detectCharsetFromBom(firstBytes); + String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately if (bomCharset != null) - charsetName = bomCharset.charset; + charsetName = bomCharset; - if (charsetName == null) { // determine from meta. 
safe first parse as UTF-8 + if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 + int origMax = input.max(); + input.max(firstReadBufferSize); + input.mark(firstReadBufferSize); + input.allowClose(false); // ignores closes during parse, in case we need to rewind try { - CharBuffer defaultDecoded = UTF_8.decode(firstBytes); - if (defaultDecoded.hasArray()) - doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); - else - doc = parser.parseInput(defaultDecoded.toString(), baseUri); + Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize + doc = parser.parseInput(reader, baseUri); + input.reset(); + input.max(origMax); // reset for a full read if required } catch (UncheckedIOException e) { throw e.getCause(); + } finally { + input.allowClose(true); } // look for or HTML5 @@ -293,7 +284,9 @@ else if (first instanceof Comment) { foundCharset = foundCharset.trim().replaceAll("[\"']", ""); charsetName = foundCharset; doc = null; - } else if (!fullyRead) { + } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse + input.close(); // the parser tried to close it + } else { doc = null; } } else { // specified by content type header (or by user on file load) @@ -304,9 +297,7 @@ else if (first instanceof Comment) { if (charsetName == null) charsetName = defaultCharsetName; Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); - boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset - // if consumer needs to parse the input; prep it if there's a BOM. 
Can't skip in inputstream as wrapping buffer will ignore the pos - return new CharsetDoc(charset, doc, wrappedInputStream, skip); + return new CharsetDoc(charset, doc, input); } static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { @@ -318,8 +309,7 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p Validate.notNull(input); final Document doc; final Charset charset = charsetDoc.charset; - try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) { - maybeSkipBom(reader, charsetDoc); + try (Reader reader = new InputStreamReader(input, charset)) { try { doc = parser.parseInput(reader, baseUri); } catch (UncheckedIOException e) { @@ -335,13 +325,6 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p return doc; } - static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException { - if (charsetDoc.skip) { - long skipped = reader.skip(1); - Validate.isTrue(skipped == 1); // WTF if this fails. - } - } - /** * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this * method is executing on. The data read until being interrupted will be available. 
@@ -400,34 +383,24 @@ static String mimeBoundary() { return StringUtil.releaseBuilder(mime); } - private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) { - @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat - buffer.mark(); + private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException { byte[] bom = new byte[4]; - if (byteData.remaining() >= bom.length) { - byteData.get(bom); - buffer.rewind(); - } + input.mark(bom.length); + //noinspection ResultOfMethodCallIgnored + input.read(bom, 0, 4); + input.reset(); + + // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE - return new BomCharset("UTF-32", false); // and I hope it's on your system + return "UTF-32"; // and I hope it's on your system } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) { - return new BomCharset("UTF-16", false); // in all Javas + return "UTF-16"; // in all Javas } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) { - return new BomCharset("UTF-8", true); // in all Javas - // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here + input.read(bom, 0, 3); // consume the UTF-8 BOM + return "UTF-8"; // in all Javas } return null; } - - private static class BomCharset { - private final String charset; - private final boolean offset; - - public BomCharset(String charset, boolean offset) { - this.charset = charset; - this.offset = offset; - } - } } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index 
e52c39433e..db2877a7bb 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -7,7 +7,6 @@ import org.jsoup.UnsupportedMimeTypeException; import org.jsoup.internal.ControllableInputStream; import org.jsoup.internal.Functions; -import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; @@ -52,6 +51,7 @@ import static org.jsoup.Connection.Method.HEAD; import static org.jsoup.helper.DataUtil.UTF_8; import static org.jsoup.internal.Normalizer.lowerCase; +import static org.jsoup.internal.SharedConstants.DefaultBufferSize; /** * Implementation of {@link Connection}. @@ -915,7 +915,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate")) stream = new InflaterInputStream(stream, new Inflater(true)); res.bodyStream = ControllableInputStream.wrap( - stream, SharedConstants.DefaultBufferSize, req.maxBodySize()) + stream, DefaultBufferSize, req.maxBodySize()) .timeout(startTime, req.timeout()); if (req.responseProgress != null) // set response progress listener @@ -965,11 +965,12 @@ public String contentType() { } /** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. 
*/ - private InputStream prepareParse() { + private ControllableInputStream prepareParse() { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response"); - InputStream stream = bodyStream; + ControllableInputStream stream = bodyStream; if (byteData != null) { // bytes have been read in to the buffer, parse that - stream = new ByteArrayInputStream(byteData.array()); + ByteArrayInputStream bytes = new ByteArrayInputStream(byteData.array(), 0, byteData.limit()); + stream = ControllableInputStream.wrap(bytes, 0); // no max inputStreamRead = false; // ok to reparse if in bytes } Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read."); @@ -979,7 +980,7 @@ private InputStream prepareParse() { } @Override public Document parse() throws IOException { - InputStream stream = prepareParse(); + ControllableInputStream stream = prepareParse(); Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser()); doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req? charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly @@ -988,7 +989,7 @@ private InputStream prepareParse() { } @Override public StreamParser streamParser() throws IOException { - InputStream stream = prepareParse(); + ControllableInputStream stream = prepareParse(); String baseUri = url.toExternalForm(); DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser()); // note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit. 
@@ -996,7 +997,6 @@ private InputStream prepareParse() { // set up the stream parser and rig this connection up to the parsed doc: StreamParser streamer = new StreamParser(req.parser()); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset)); - DataUtil.maybeSkipBom(reader, charsetDoc); streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it streamer.document().connection(new HttpConnection(req, this)); charset = charsetDoc.charset.name(); @@ -1035,7 +1035,19 @@ public String body() { public byte[] bodyAsBytes() { prepareByteData(); Validate.notNull(byteData); - return byteData.array(); + Validate.isTrue(byteData.hasArray()); // we made it, so it should + + byte[] array = byteData.array(); + int offset = byteData.arrayOffset(); + int length = byteData.limit(); + + if (offset == 0 && length == array.length) { // exact, just return it + return array; + } else { // trim to size + byte[] exactArray = new byte[length]; + System.arraycopy(array, offset, exactArray, 0, length); + return exactArray; + } } @Override @@ -1050,7 +1062,9 @@ public BufferedInputStream bodyStream() { // if we have read to bytes (via buffer up), return those as a stream. 
if (byteData != null) { - return new BufferedInputStream(new ByteArrayInputStream(byteData.array()), SharedConstants.DefaultBufferSize); + return new BufferedInputStream( + new ByteArrayInputStream(byteData.array(), 0, byteData.limit()), + DefaultBufferSize); } Validate.isFalse(inputStreamRead, "Request has already been read"); diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java index 96cbd15c0e..5d8e8bba93 100644 --- a/src/main/java/org/jsoup/internal/ControllableInputStream.java +++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java @@ -5,7 +5,6 @@ import org.jspecify.annotations.Nullable; import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; @@ -15,19 +14,19 @@ import static org.jsoup.internal.SharedConstants.DefaultBufferSize; /** - * A jsoup internal class (so don't use it as there is no contract API) that enables controls on a Buffered Input Stream, + * A jsoup internal class (so don't use it as there is no contract API) that enables controls on a buffered input stream, * namely a maximum read size, and the ability to Thread.interrupt() the read. 
*/ // reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read public class ControllableInputStream extends FilterInputStream { - private final BufferedInputStream buff; - private final boolean capped; - private final int maxSize; + private final SimpleBufferedInput buff; // super.in, but typed as SimpleBufferedInput + private int maxSize; private long startTime; private long timeout = 0; // optional max time of request private int remaining; private int markPos; private boolean interrupted; + private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser // if we are tracking progress, will have the expected content length, progress callback, connection private @Nullable Progress progress; @@ -35,11 +34,10 @@ public class ControllableInputStream extends FilterInputStream { private int contentLength = -1; private int readPos = 0; // amount read; can be reset() - private ControllableInputStream(BufferedInputStream in, int maxSize) { + private ControllableInputStream(SimpleBufferedInput in, int maxSize) { super(in); Validate.isTrue(maxSize >= 0); buff = in; - capped = maxSize != 0; this.maxSize = maxSize; remaining = maxSize; markPos = -1; @@ -49,23 +47,34 @@ private ControllableInputStream(BufferedInputStream in, int maxSize) { /** * If this InputStream is not already a ControllableInputStream, let it be one. * @param in the input stream to (maybe) wrap - * @param bufferSize the buffer size to use when reading * @param maxSize the maximum size to allow to be read. 0 == infinite. 
* @return a controllable input stream */ - public static ControllableInputStream wrap(InputStream in, int bufferSize, int maxSize) { + public static ControllableInputStream wrap(InputStream in, int maxSize) { + // bufferSize currently unused; consider implementing as a min size in the SoftPool recycler if (in instanceof ControllableInputStream) return (ControllableInputStream) in; - else if (in instanceof BufferedInputStream) - return new ControllableInputStream((BufferedInputStream) in, maxSize); else - return new ControllableInputStream(new BufferedInputStream(in, bufferSize), maxSize); + return new ControllableInputStream(new SimpleBufferedInput(in), maxSize); + } + + /** + * If this InputStream is not already a ControllableInputStream, let it be one. + * @param in the input stream to (maybe) wrap + * @param bufferSize the buffer size to use when reading + * @param maxSize the maximum size to allow to be read. 0 == infinite. + * @return a controllable input stream + */ + public static ControllableInputStream wrap(InputStream in, int bufferSize, int maxSize) { + // todo - bufferSize currently unused; consider implementing as a min size in the SoftPool recycler; or just deprecate if always DefaultBufferSize + return wrap(in, maxSize); } @Override public int read(byte[] b, int off, int len) throws IOException { if (readPos == 0) emitProgress(); // emits a progress - + + boolean capped = maxSize != 0; if (interrupted || capped && remaining <= 0) return -1; if (Thread.currentThread().isInterrupted()) { @@ -73,27 +82,28 @@ public int read(byte[] b, int off, int len) throws IOException { interrupted = true; return -1; } - if (expired()) - throw new SocketTimeoutException("Read timeout"); if (capped && len > remaining) len = remaining; // don't read more than desired, even if available - try { - final int read = super.read(b, off, len); - if (read == -1) { // completed - contentLength = readPos; - } else { - remaining -= read; - readPos += read; - } - emitProgress(); - 
- return read; - } catch (SocketTimeoutException e) { + while (true) { // loop trying to read until we get some data or hit the overall timeout, if we have one if (expired()) - throw e; - return 0; + throw new SocketTimeoutException("Read timeout"); + + try { + final int read = super.read(b, off, len); + if (read == -1) { // completed + contentLength = readPos; + } else { + remaining -= read; + readPos += read; + } + emitProgress(); + return read; + } catch (SocketTimeoutException e) { + if (expired() || timeout == 0) + throw e; + } } } @@ -104,26 +114,33 @@ public int read(byte[] b, int off, int len) throws IOException { public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOException { Validate.isTrue(max >= 0, "maxSize must be 0 (unlimited) or larger"); Validate.notNull(in); - final boolean localCapped = max > 0; // still possibly capped in total stream - final int bufferSize = localCapped && max < DefaultBufferSize ? max : DefaultBufferSize; - final byte[] readBuffer = new byte[bufferSize]; - final ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); - - int read; - int remaining = max; - while (true) { - read = in.read(readBuffer, 0, localCapped ? Math.min(remaining, bufferSize) : bufferSize); - if (read == -1) break; - if (localCapped) { // this local byteBuffer cap may be smaller than the overall maxSize (like when reading first bytes) - if (read >= remaining) { - outStream.write(readBuffer, 0, remaining); - break; + final boolean capped = max > 0; + final byte[] readBuf = SimpleBufferedInput.BufferPool.borrow(); // Share the same byte[] pool as SBI + final int outSize = capped ? Math.min(max, DefaultBufferSize) : DefaultBufferSize; + ByteBuffer outBuf = ByteBuffer.allocate(outSize); + + try { + int remaining = max; + int read; + while ((read = in.read(readBuf, 0, capped ? 
Math.min(remaining, DefaultBufferSize) : DefaultBufferSize)) != -1) { + if (outBuf.remaining() < read) { // needs to grow + int newCapacity = (int) Math.max(outBuf.capacity() * 1.5, outBuf.capacity() + read); + ByteBuffer newBuffer = ByteBuffer.allocate(newCapacity); + outBuf.flip(); + newBuffer.put(outBuf); + outBuf = newBuffer; + } + outBuf.put(readBuf, 0, read); + if (capped) { + remaining -= read; + if (remaining <= 0) break; } - remaining -= read; } - outStream.write(readBuffer, 0, read); + outBuf.flip(); // Prepare the buffer for reading + return outBuf; + } finally { + SimpleBufferedInput.BufferPool.release(readBuf); } - return ByteBuffer.wrap(outStream.toByteArray()); } @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs @@ -139,6 +156,36 @@ public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOExce markPos = maxSize - remaining; } + /** + Check if the underlying InputStream has been read fully. There may still be content in buffers to be consumed, and + read methods may return -1 if they hit the read limit. + @return true if the underlying inputstream has been read fully.
+ */ + public boolean baseReadFully() { + return buff.baseReadFully(); + } + + /** + Get the max size of this stream (how far at most will be read from the underlying stream) + * @return the max size + */ + public int max() { + return maxSize; + } + + public void max(int newMax) { + remaining += newMax - maxSize; // update remaining to reflect the difference in the new maxsize + maxSize = newMax; + } + + public void allowClose(boolean allowClose) { + this.allowClose = allowClose; + } + + @Override public void close() throws IOException { + if (allowClose) super.close(); + } + public ControllableInputStream timeout(long startTimeNanos, long timeoutMillis) { this.startTime = startTimeNanos; this.timeout = timeoutMillis * 1000000; @@ -173,6 +220,7 @@ private boolean expired() { } public BufferedInputStream inputStream() { - return buff; + // called via HttpConnection.Response.bodyStream(), needs an OG BufferedInputStream + return new BufferedInputStream(buff); } } diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java index 8e8520cf62..6fdf05e76d 100644 --- a/src/main/java/org/jsoup/internal/SharedConstants.java +++ b/src/main/java/org/jsoup/internal/SharedConstants.java @@ -10,7 +10,7 @@ public final class SharedConstants { public static final String RangeKey = "jsoup.start"; public static final String EndRangeKey = "jsoup.end"; - public static final int DefaultBufferSize = 1024 * 32; + public static final int DefaultBufferSize = 8 * 1024; public static final String[] FormSubmitTags = { "input", "keygen", "object", "select", "textarea" diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java new file mode 100644 index 0000000000..08bbb8914e --- /dev/null +++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java @@ -0,0 +1,156 @@ +package org.jsoup.internal; + +import org.jsoup.helper.Validate; +import 
org.jspecify.annotations.Nullable; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; + +import static org.jsoup.internal.SharedConstants.DefaultBufferSize; + +/** + A simple implementation of a buffered input stream, in which we can control the byte[] buffer to recycle it. Not safe for + use between threads; no sync or locks. The buffer is borrowed on initial demand in fill. + @since 1.18.2 + */ +class SimpleBufferedInput extends FilterInputStream { + static final int BufferSize = DefaultBufferSize; + static final SoftPool BufferPool = new SoftPool<>(() -> new byte[BufferSize]); + + private byte @Nullable [] byteBuf; // the byte buffer; recycled via SoftPool. Created in fill if required + private int bufPos; + private int bufLength; + private int bufMark = -1; + private boolean inReadFully = false; // true when the underlying inputstream has been read fully + + SimpleBufferedInput(InputStream in) { + super(in); + } + + @Override + public int read() throws IOException { + if (bufPos >= bufLength) { + fill(); + if (bufPos >= bufLength) + return -1; + } + return getBuf()[bufPos++] & 0xff; + } + + @Override + public int read(byte[] dest, int offset, int desiredLen) throws IOException { + Validate.notNull(dest); + if (offset < 0 || desiredLen < 0 || desiredLen > dest.length - offset) { + throw new IndexOutOfBoundsException(); + } else if (desiredLen == 0) { + return 0; + } + + int bufAvail = bufLength - bufPos; + if (bufAvail <= 0) { // can't serve from the buffer + if (!inReadFully && bufMark < 0) { + // skip creating / copying into a local buffer; just pass through + int read = in.read(dest, offset, desiredLen); + closeIfDone(read); + return read; + } + fill(); + bufAvail = bufLength - bufPos; + } + + int read = Math.min(bufAvail, desiredLen); + if (read <= 0) { + return -1; + } + + System.arraycopy(getBuf(), bufPos, dest, offset, read); + bufPos += read; + return read; + } + + private void fill() throws IOException { + if
(inReadFully) return; + if (byteBuf == null) { // get one on first demand + byteBuf = BufferPool.borrow(); + } + + if (bufMark < 0) { // no mark, can lose buffer (assumes we've read to bufLen) + bufPos = 0; + } else if (bufPos >= BufferSize) { // no room left in buffer + if (bufMark > 0) { // can throw away early part of the buffer + int size = bufPos - bufMark; + System.arraycopy(byteBuf, bufMark, byteBuf, 0, size); + bufPos = size; + bufMark = 0; + } else { // invalidate mark + bufMark = -1; + bufPos = 0; + } + } + bufLength = bufPos; + int read = in.read(byteBuf, bufPos, byteBuf.length - bufPos); + if (read > 0) { + bufLength = read + bufPos; + while (byteBuf.length - bufLength > 0) { // read in more if we have space, without blocking + if (in.available() < 1) break; + read = in.read(byteBuf, bufLength, byteBuf.length - bufLength); + if (read <= 0) break; + bufLength += read; + } + } + closeIfDone(read); + } + + private void closeIfDone(int read) throws IOException { + if (read == -1) { + inReadFully = true; + super.close(); // close underlying stream immediately; frees resources a little earlier + } + } + + byte[] getBuf() { + Validate.notNull(byteBuf); + return byteBuf; + } + + /** + Check if the underlying InputStream has been read fully. There may still be content in this buffer to be consumed. + @return true if the underlying inputstream has been read fully. + */ + boolean baseReadFully() { + return inReadFully; + } + + @Override + public int available() throws IOException { + if (byteBuf != null && bufLength - bufPos > 0) + return bufLength - bufPos; // doesn't include those in.available(), but mostly used as a block test + return inReadFully ?
0 : in.available(); + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced + @Override + public void mark(int readlimit) { + if (readlimit > BufferSize) { + throw new IllegalArgumentException("Read-ahead limit is greater than buffer size"); + } + bufMark = bufPos; + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced + @Override + public void reset() throws IOException { + if (bufMark < 0) + throw new IOException("Resetting to invalid mark"); + bufPos = bufMark; + } + + @Override + public void close() throws IOException { + super.close(); + if (byteBuf == null) return; // already closed, or never allocated + BufferPool.release(byteBuf); // return the buffer to the pool + byteBuf = null; // NPE further attempts to read + } +} diff --git a/src/main/java/org/jsoup/internal/SoftPool.java b/src/main/java/org/jsoup/internal/SoftPool.java new file mode 100644 index 0000000000..1125b7d1a0 --- /dev/null +++ b/src/main/java/org/jsoup/internal/SoftPool.java @@ -0,0 +1,66 @@ +package org.jsoup.internal; + +import java.lang.ref.SoftReference; +import java.util.Stack; +import java.util.function.Supplier; + +/** + A SoftPool is a ThreadLocal that holds a SoftReference to a pool of initializable objects. This allows us to reuse + expensive objects (buffers, etc.) between invocations (the ThreadLocal), but also for those objects to be reaped if + they are no longer in use. +

<p>Like a ThreadLocal, should be stored in a static field.</p>
+ @param <T> the type of object to pool. + @since 1.18.2 + */ +public class SoftPool { + final ThreadLocal>> threadLocalStack; + private final Supplier initializer; + /** + How many total uses of the creating object might be instantiated on the same thread at once. More than this and + those objects aren't recycled. Doesn't need to be too conservative, as they can still be GCed as SoftRefs. + */ + static final int MaxIdle = 12; + + /** + Create a new SoftPool. + @param initializer a supplier that creates a new object when one is needed. + */ + public SoftPool(Supplier initializer) { + this.initializer = initializer; + this.threadLocalStack = ThreadLocal.withInitial(() -> new SoftReference<>(new Stack<>())); + } + + /** + Borrow an object from the pool, creating a new one if the pool is empty. Make sure to release it back to the pool + when done, so that it can be reused. + @return an object from the pool, as defined by the initializer. + */ + public T borrow() { + Stack stack = getStack(); + if (!stack.isEmpty()) { + return stack.pop(); + } + return initializer.get(); + } + + /** + Release an object back to the pool. If the pool is full, the object is not retained. If you don't want to reuse a + borrowed object (for e.g. a StringBuilder that grew too large), just don't release it. + @param value the object to release back to the pool.
+ */ + public void release(T value) { + Stack stack = getStack(); + if (stack.size() < MaxIdle) { + stack.push(value); + } + } + + Stack getStack() { + Stack stack = threadLocalStack.get().get(); + if (stack == null) { + stack = new Stack<>(); + threadLocalStack.set(new SoftReference<>(stack)); + } + return stack; + } +} diff --git a/src/main/java/org/jsoup/internal/StringUtil.java b/src/main/java/org/jsoup/internal/StringUtil.java index cb2ebec97d..9d6b0154c3 100644 --- a/src/main/java/org/jsoup/internal/StringUtil.java +++ b/src/main/java/org/jsoup/internal/StringUtil.java @@ -8,7 +8,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.Iterator; -import java.util.Stack; import java.util.regex.Pattern; import java.util.stream.Collector; import java.util.stream.Collectors; @@ -337,7 +336,10 @@ private static String stripControlChars(final String input) { return controlChars.matcher(input).replaceAll(""); } - private static final ThreadLocal> threadLocalBuilders = ThreadLocal.withInitial(Stack::new); + private static final int InitBuilderSize = 1024; + private static final int MaxBuilderSize = 8 * 1024; + private static final SoftPool BuilderPool = new SoftPool<>( + () -> new StringBuilder(InitBuilderSize)); /** * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is @@ -347,10 +349,7 @@ private static String stripControlChars(final String input) { * @return an empty StringBuilder */ public static StringBuilder borrowBuilder() { - Stack builders = threadLocalBuilders.get(); - return builders.empty() ? 
- new StringBuilder(MaxCachedBuilderSize) : - builders.pop(); + return BuilderPool.borrow(); } /** @@ -363,17 +362,12 @@ public static String releaseBuilder(StringBuilder sb) { Validate.notNull(sb); String string = sb.toString(); - if (sb.length() > MaxCachedBuilderSize) - sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big - else + // if it hasn't grown too big, reset it and return it to the pool: + if (sb.length() <= MaxBuilderSize) { sb.delete(0, sb.length()); // make sure it's emptied on release - - Stack builders = threadLocalBuilders.get(); - builders.push(sb); - - while (builders.size() > MaxIdleBuilders) { - builders.pop(); + BuilderPool.release(sb); } + return string; } @@ -394,6 +388,4 @@ public static String releaseBuilder(StringBuilder sb) { StringJoiner::complete); } - private static final int MaxCachedBuilderSize = 8 * 1024; - private static final int MaxIdleBuilders = 8; } diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 9710c414a9..1e015b0c06 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -2,6 +2,7 @@ import org.jsoup.UncheckedIOException; import org.jsoup.helper.Validate; +import org.jsoup.internal.SoftPool; import org.jspecify.annotations.Nullable; import java.io.IOException; @@ -17,38 +18,43 @@ */ public final class CharacterReader { static final char EOF = (char) -1; - private static final int maxStringCacheLen = 12; - static final int maxBufferLen = 1024 * 32; // visible for testing - static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing - private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this. 
- - private char[] charBuf; - private Reader reader; - private int bufLength; - private int bufSplitPoint; - private int bufPos; - private int readerPos; - private int bufMark = -1; - private static final int stringCacheSize = 512; - private String[] stringCache = new String[stringCacheSize]; // holds reused strings in this doc, to lessen garbage + private static final int MaxStringCacheLen = 12; + private static final int StringCacheSize = 512; + private String[] stringCache; // holds reused strings in this doc, to lessen garbage + private static final SoftPool StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations + + static final int BufferSize = 1024 * 2; // visible for testing + static final int RefillPoint = BufferSize / 2; // when bufPos characters read, refill; visible for testing + private static final int RewindLimit = 1024; // the maximum we can rewind. No HTML entities can be larger than this. + + private Reader reader; // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader + private char[] charBuf; // character buffer we consume from; filled from Reader + private int bufPos; // position in charBuf that's been consumed to + private int bufLength; // the num of characters actually buffered in charBuf, <= charBuf.length + private int fillPoint = 0; // how far into the charBuf we read before re-filling. 
0.5 of charBuf.length after bufferUp + private int consumed; // how many characters total have been consumed from this CharacterReader (less the current bufPos) + private int bufMark = -1; // if not -1, the marked rewind position + private boolean readFully; // if the underlying stream has been completely read, no value in further buffering + + private static final SoftPool BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer @Nullable private ArrayList newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] public CharacterReader(Reader input, int sz) { - Validate.notNull(input); - Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not."); - reader = input; - charBuf = new char[Math.min(sz, maxBufferLen)]; - bufferUp(); + this(input); // sz is no longer used } public CharacterReader(Reader input) { - this(input, maxBufferLen); + Validate.notNull(input); + reader = input; + charBuf = BufferPool.borrow(); + stringCache = StringPool.borrow(); + bufferUp(); } public CharacterReader(String input) { - this(new StringReader(input), input.length()); + this(new StringReader(input)); } public void close() { @@ -59,61 +65,79 @@ public void close() { } catch (IOException ignored) { } finally { reader = null; + Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. 
Not required, but acts as a safety net, and makes debug view clearer + BufferPool.release(charBuf); charBuf = null; + StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents stringCache = null; } } - private boolean readFully; // if the underlying stream has been completely read, no value in further buffering private void bufferUp() { - if (readFully || bufPos < bufSplitPoint) + if (readFully || bufPos < fillPoint || bufMark != -1) return; - - final int pos; - final int offset; - if (bufMark != -1) { - pos = bufMark; - offset = bufPos - bufMark; - } else { - pos = bufPos; - offset = 0; - } - - try { - final long skipped = reader.skip(pos); - reader.mark(maxBufferLen); - int read = 0; - while (read <= minReadAheadLen) { - int thisRead = reader.read(charBuf, read, charBuf.length - read); - if (thisRead == -1) + doBufferUp(); // structured so bufferUp may become an intrinsic candidate + } + + private void doBufferUp() { + /* + The flow: + - if read fully, or if bufPos < fillPoint, or if marked - do not fill. + - update consumed (total amount consumed from this CharacterReader) += bufPos + - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount + - loop read the Reader until we fill charBuf. bufLength += read. 
+ - readFully = true when read = -1 + */ + consumed += bufPos; + bufLength -= bufPos; + if (bufLength > 0) + System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength); + bufPos = 0; + while (bufLength < BufferSize) { + try { + int read = reader.read(charBuf, bufLength, charBuf.length - bufLength); + if (read == -1) { readFully = true; - if (thisRead <= 0) break; - read += thisRead; - } - reader.reset(); - if (read > 0) { - Validate.isTrue(skipped == pos); // Previously asserted that there is room in buf to skip, so this will be a WTF - bufLength = read; - readerPos += pos; - bufPos = offset; - if (bufMark != -1) - bufMark = 0; - bufSplitPoint = Math.min(bufLength, readAheadLimit); + } + bufLength += read; + } catch (IOException e) { + throw new UncheckedIOException(e); } - } catch (IOException e) { - throw new UncheckedIOException(e); } + fillPoint = Math.min(bufLength, RefillPoint); + scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking lastIcSeq = null; // cache for last containsIgnoreCase(seq) } + void mark() { + // make sure there is enough look ahead capacity + if (bufLength - bufPos < RewindLimit) + fillPoint = 0; + + bufferUp(); + bufMark = bufPos; + } + + void unmark() { + bufMark = -1; + } + + void rewindToMark() { + if (bufMark == -1) + throw new UncheckedIOException(new IOException("Mark invalid")); + + bufPos = bufMark; + unmark(); + } + /** * Gets the position currently read to in the content. Starts at 0. * @return current position */ public int pos() { - return readerPos + bufPos; + return consumed + bufPos; } /** Tests if the buffer has been fully read. 
*/ @@ -131,7 +155,7 @@ boolean readFully() { */ public void trackNewlines(boolean track) { if (track && newlinePositions == null) { - newlinePositions = new ArrayList<>(maxBufferLen / 80); // rough guess of likely count + newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp } else if (!track) @@ -216,7 +240,7 @@ private void scanBufferForNewlines() { if (newlinePositions.size() > 0) { // work out the line number that we have read up to (as we have likely scanned past this point) - int index = lineNumIndex(readerPos); + int index = lineNumIndex(consumed); if (index == -1) index = 0; // first line int linePos = newlinePositions.get(index); lineNumberOffset += index; // the num lines we've read up to @@ -226,7 +250,7 @@ private void scanBufferForNewlines() { for (int i = bufPos; i < bufLength; i++) { if (charBuf[i] == '\n') - newlinePositions.add(1 + readerPos + i); + newlinePositions.add(1 + consumed + i); } } @@ -276,27 +300,6 @@ public void advance() { bufPos++; } - void mark() { - // make sure there is enough look ahead capacity - if (bufLength - bufPos < minReadAheadLen) - bufSplitPoint = 0; - - bufferUp(); - bufMark = bufPos; - } - - void unmark() { - bufMark = -1; - } - - void rewindToMark() { - if (bufMark == -1) - throw new UncheckedIOException(new IOException("Mark invalid")); - - bufPos = bufMark; - unmark(); - } - /** * Returns the number of characters between the current position and the next instance of the input char * @param c scan target @@ -716,20 +719,20 @@ public String toString() { * some more duplicates. 
*/ private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { - // limit (no cache): - if (count > maxStringCacheLen) + if (count > MaxStringCacheLen) // don't cache strings that are too big return new String(charBuf, start, count); if (count < 1) return ""; // calculate hash: int hash = 0; - for (int i = 0; i < count; i++) { - hash = 31 * hash + charBuf[start + i]; + int end = count + start; + for (int i = start; i < end; i++) { + hash = 31 * hash + charBuf[i]; } // get from cache - final int index = hash & stringCacheSize - 1; + final int index = hash & StringCacheSize - 1; String cached = stringCache[index]; if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index a588c98e0e..a583ddc0af 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -2,6 +2,7 @@ import org.jsoup.Jsoup; import org.jsoup.integration.ParseTest; +import org.jsoup.internal.ControllableInputStream; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; @@ -37,12 +38,12 @@ public void testQuotedCharset() { assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'")); } - private InputStream stream(String data) { - return new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)); + private ControllableInputStream stream(String data) { + return ControllableInputStream.wrap(new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)), 0); } - private InputStream stream(String data, String charset) { - return new ByteArrayInputStream(data.getBytes(Charset.forName(charset))); + private ControllableInputStream stream(String data, String charset) { + return ControllableInputStream.wrap(new ByteArrayInputStream(data.getBytes(Charset.forName(charset))), 0); } 
@Test @@ -143,7 +144,8 @@ public void parseSequenceInputStream() throws IOException { stream(firstPart), stream(secondPart) ); - Document doc = DataUtil.parseInputStream(sequenceStream, null, "", Parser.htmlParser()); + ControllableInputStream stream = ControllableInputStream.wrap(sequenceStream, 0); + Document doc = DataUtil.parseInputStream(stream, null, "", Parser.htmlParser()); assertEquals(fileContent, doc.outerHtml()); } @@ -331,7 +333,7 @@ void handlesUnlimitedRead() throws IOException { VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); - String read = new String(byteBuffer.array()); + String read = new String(byteBuffer.array(), 0, byteBuffer.limit(), StandardCharsets.UTF_8); assertEquals(input, read); } diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java index 92c10c1b40..672fed2911 100644 --- a/src/test/java/org/jsoup/integration/ConnectIT.java +++ b/src/test/java/org/jsoup/integration/ConnectIT.java @@ -54,6 +54,7 @@ public void canInterruptBodyStringRead() throws InterruptedException { @Test public void canInterruptDocumentRead() throws InterruptedException { // todo - implement in interruptable channels, so it's immediate + long start = System.currentTimeMillis(); final String[] body = new String[1]; Thread runner = new Thread(() -> { try { @@ -68,12 +69,15 @@ public void canInterruptDocumentRead() throws InterruptedException { }); runner.start(); - Thread.sleep(1000 * 3); + Thread.sleep(3 * 1000); runner.interrupt(); assertTrue(runner.isInterrupted()); runner.join(); - assertEquals(0, body[0].length()); // doesn't read a failed doc + long end = System.currentTimeMillis(); + // check we are between 3 and connect timeout seconds (should be just over 3; but allow some slack for slow CI runners) + assertTrue(end - start > 3 * 1000); + assertTrue(end - start < 10 * 1000); } @Test public 
void canInterruptThenJoinASpawnedThread() throws InterruptedException { @@ -184,6 +188,8 @@ public void infiniteReadSupported() throws IOException { assertTrue(caught); } + private static final int LargeHtmlSize = 280735; + @Test public void remainingAfterFirstRead() throws IOException { int bufferSize = 5 * 1024; @@ -214,33 +220,34 @@ public void remainingAfterFirstRead() throws IOException { // bodyStream is not capped to body size - only for jsoup consumed stream assertTrue(fullArray.length > capSize); - assertEquals(280735, fullArray.length); - String fullText = new String(fullArray, StandardCharsets.UTF_8); + assertEquals(LargeHtmlSize, fullRead.limit()); + String fullText = new String(fullRead.array(), 0, fullRead.limit(), StandardCharsets.UTF_8); assertTrue(fullText.startsWith(firstText)); + assertEquals(LargeHtmlSize, fullText.length()); } } @Test public void noLimitAfterFirstRead() throws IOException { - int bufferSize = 5 * 1024; + int firstMaxRead = 5 * 1024; String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K try (BufferedInputStream stream = Jsoup.connect(url).execute().bodyStream()) { // simulates parse which does a limited read first - stream.mark(bufferSize); - ByteBuffer firstBytes = DataUtil.readToByteBuffer(stream, bufferSize); + stream.mark(firstMaxRead); + ByteBuffer firstBytes = DataUtil.readToByteBuffer(stream, firstMaxRead); byte[] array = firstBytes.array(); String firstText = new String(array, StandardCharsets.UTF_8); assertTrue(firstText.startsWith("Large")); - assertEquals(bufferSize, array.length); + assertEquals(firstMaxRead, array.length); // reset and read fully stream.reset(); ByteBuffer fullRead = DataUtil.readToByteBuffer(stream, 0); - byte[] fullArray = fullRead.array(); - assertEquals(280735, fullArray.length); - String fullText = new String(fullArray, StandardCharsets.UTF_8); + assertEquals(LargeHtmlSize, fullRead.limit()); + String fullText = new String(fullRead.array(), 0, fullRead.limit(), 
StandardCharsets.UTF_8); assertTrue(fullText.startsWith(firstText)); + assertEquals(LargeHtmlSize, fullText.length()); } } @@ -255,8 +262,7 @@ public void noLimitAfterFirstRead() throws IOException { .bodyStream()) { ByteBuffer cappedRead = DataUtil.readToByteBuffer(stream, 0); - byte[] cappedArray = cappedRead.array(); - assertEquals(cap, cappedArray.length); + assertEquals(cap, cappedRead.limit()); } } } diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 688713dc0a..4c1f7d378a 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -33,6 +33,7 @@ import java.net.URL; import java.net.URLDecoder; import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; @@ -603,13 +604,22 @@ public void testBinaryContentTypeThrowsException() throws IOException { @Test public void canFetchBinaryAsBytes() throws IOException { - Connection.Response res = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg")) + String path = "/htmltests/thumb.jpg"; + int actualSize = 1052; + + Connection.Response res = Jsoup.connect(FileServlet.urlTo(path)) .data(FileServlet.ContentTypeParam, "image/jpeg") .ignoreContentType(true) .execute(); - byte[] bytes = res.bodyAsBytes(); - assertEquals(1052, bytes.length); + byte[] resBytes = res.bodyAsBytes(); + assertEquals(actualSize, resBytes.length); + + // compare the content of the file and the bytes: + Path filePath = ParseTest.getPath(path); + byte[] fileBytes = Files.readAllBytes(filePath); + assertEquals(actualSize, fileBytes.length); + assertArrayEquals(fileBytes, resBytes); } @Test @@ -996,8 +1006,14 @@ void progressListener(String path) throws IOException { // should expect to see events relative to how large the buffer is. 
int expected = LargeDocFileLen / 8192; - assertTrue(numProgress.get() > expected * 0.75); - assertTrue(numProgress.get() < expected * 1.25); + + int num = numProgress.get(); + // debug log if not in those ranges: + if (num < expected * 0.75 || num > expected * 1.25) { + System.err.println("Expected: " + expected + ", got: " + num); + } + assertTrue(num > expected * 0.75); + assertTrue(num < expected * 1.25); // check the document works assertEquals(LargeDocTextLen, document.text().length()); diff --git a/src/test/java/org/jsoup/integration/ParseTest.java b/src/test/java/org/jsoup/integration/ParseTest.java index d84c103497..d963d1d621 100644 --- a/src/test/java/org/jsoup/integration/ParseTest.java +++ b/src/test/java/org/jsoup/integration/ParseTest.java @@ -153,7 +153,8 @@ public static String getFileAsString(File file) throws IOException { if (file.getName().endsWith(".gz")) { InputStream stream = new GZIPInputStream(new FileInputStream(file)); ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); - bytes = byteBuffer.array(); + bytes = new byte[byteBuffer.limit()]; + System.arraycopy(byteBuffer.array(), 0, bytes, 0, byteBuffer.limit()); } else { bytes = Files.readAllBytes(file.toPath()); } diff --git a/src/test/java/org/jsoup/integration/servlets/EchoServlet.java b/src/test/java/org/jsoup/integration/servlets/EchoServlet.java index 6ff31a2a0d..76ef7ff5d0 100644 --- a/src/test/java/org/jsoup/integration/servlets/EchoServlet.java +++ b/src/test/java/org/jsoup/integration/servlets/EchoServlet.java @@ -90,7 +90,7 @@ protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOEx // post body ByteBuffer byteBuffer = DataUtil.readToByteBuffer(req.getInputStream(), 0); - String postData = new String(byteBuffer.array(), StandardCharsets.UTF_8); + String postData = new String(byteBuffer.array(), byteBuffer.arrayOffset(), byteBuffer.limit(), StandardCharsets.UTF_8); if (!StringUtil.isBlank(postData)) { write(w, "Post Data", postData); } diff --git 
a/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java b/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java index 26b2fef9f5..4158e1e454 100644 --- a/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java +++ b/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java @@ -27,8 +27,8 @@ protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOEx StringBuilder sb = new StringBuilder(); sb.append("<title>Something"); - while (sb.length() <= CharacterReaderTest.maxBufferLen) { - sb.append("A suitable amount of data. \n"); + while (sb.length() <= 32 * 1024) { + sb.append("
A suitable amount of data.
\n"); } sb.append("

Finale.

"); String data = sb.toString(); diff --git a/src/test/java/org/jsoup/internal/SoftPoolTest.java b/src/test/java/org/jsoup/internal/SoftPoolTest.java new file mode 100644 index 0000000000..bca2c199a4 --- /dev/null +++ b/src/test/java/org/jsoup/internal/SoftPoolTest.java @@ -0,0 +1,139 @@ +package org.jsoup.internal; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.Stack; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.junit.jupiter.api.Assertions.*; + +public class SoftPoolTest { + + private static final int BufSize = 12; + private static final int NumThreads = 5; + private static final int NumObjects = 3; + + @Test + public void testSoftLocalPool() throws InterruptedException { + SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); + + ExecutorService executorService = Executors.newFixedThreadPool(NumThreads); + CountDownLatch latch = new CountDownLatch(NumThreads); + + Set allBuffers = new HashSet<>(); + Set[] threadLocalBuffers = new Set[NumThreads]; + + for (int i = 0; i < NumThreads; i++) { + threadLocalBuffers[i] = new HashSet<>(); + } + + AtomicInteger threadCount = new AtomicInteger(); + + Runnable task = () -> { + try { + int threadIndex = threadCount.getAndIncrement(); + Set localBuffers = new HashSet<>(); + // First borrow + for (int i = 0; i < NumObjects; i++) { + char[] buffer = softLocalPool.borrow(); + assertEquals(BufSize, buffer.length); + localBuffers.add(buffer); + } + + // Release buffers back to the pool + for (char[] buffer : localBuffers) { + softLocalPool.release(buffer); + } + + // Borrow again and ensure buffers are reused + for (int i = 0; i < NumObjects; i++) { + char[] buffer = softLocalPool.borrow(); + assertTrue(localBuffers.contains(buffer), "Buffer was not 
reused in the same thread"); + threadLocalBuffers[threadIndex].add(buffer); + } + + synchronized (allBuffers) { + allBuffers.addAll(threadLocalBuffers[threadIndex]); + } + } finally { + latch.countDown(); + } + }; + + // Run the tasks + for (int i = 0; i < NumThreads; i++) { + executorService.submit(task::run); + } + + // Wait for all threads to complete + latch.await(); + executorService.shutdown(); + + // Ensure no buffers are shared between threads + Set uniqueBuffers = new HashSet<>(); + for (Set bufferSet : threadLocalBuffers) { + for (char[] buffer : bufferSet) { + assertTrue(uniqueBuffers.add(buffer), "Buffer was shared between threads"); + } + } + } + + @Test + public void testSoftReferenceBehavior() { + SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); + + // Borrow and release an object + char[] buffer = softLocalPool.borrow(); + assertEquals(BufSize, buffer.length); + softLocalPool.release(buffer); + + // Fake a GC + softLocalPool.threadLocalStack.get().clear(); + + // Ensure the object is garbage collected + assertNull(softLocalPool.threadLocalStack.get().get()); + + char[] second = softLocalPool.borrow(); + // should be different, but same size + assertNotEquals(buffer, second); + assertEquals(BufSize, second.length); + } + + @Test + public void testBorrowFromEmptyPool() { + SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); + + // Borrow from an empty pool + char[] buffer = softLocalPool.borrow(); + assertNotNull(buffer, "Borrowed null from an empty pool"); + assertEquals(BufSize, buffer.length); + } + + @Test + public void testReleaseMoreThanMaxIdle() { + SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); + + // Borrow more than MaxIdle objects + List borrowedBuffers = new ArrayList<>(); + for (int i = 0; i < SoftPool.MaxIdle + 5; i++) { + char[] buffer = softLocalPool.borrow(); + borrowedBuffers.add(buffer); + } + + // Release all borrowed objects back to the pool + for (char[] buffer : borrowedBuffers) 
{ + softLocalPool.release(buffer); + } + + // Ensure the pool size does not exceed MaxIdle + Stack stack = softLocalPool.getStack(); + assertTrue(stack.size() <= SoftPool.MaxIdle, "Pool size exceeded MaxIdle limit"); + } +} \ No newline at end of file diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index 7071bfe51d..8f0ee30ca2 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -1,6 +1,7 @@ package org.jsoup.parser; import org.jsoup.integration.ParseTest; +import org.jsoup.internal.StringUtil; import org.junit.jupiter.api.Test; import java.io.BufferedReader; @@ -16,7 +17,7 @@ * @author Jonathan Hedley, jonathan@hedley.net */ public class CharacterReaderTest { - public final static int maxBufferLen = CharacterReader.maxBufferLen; + public final static int maxBufferLen = CharacterReader.BufferSize; @Test public void consume() { CharacterReader r = new CharacterReader("one"); @@ -359,24 +360,23 @@ public void consumeToNonexistentEndWhenAtAnd() { @Test public void notEmptyAtBufferSplitPoint() { - CharacterReader r = new CharacterReader(new StringReader("How about now"), 3); - assertEquals("How", r.consumeTo(' ')); - assertFalse(r.isEmpty(), "Should not be empty"); - - assertEquals(' ', r.consume()); - assertFalse(r.isEmpty()); - assertEquals(4, r.pos()); - assertEquals('a', r.consume()); - assertEquals(5, r.pos()); - assertEquals('b', r.consume()); - assertEquals('o', r.consume()); - assertEquals('u', r.consume()); - assertEquals('t', r.consume()); - assertEquals(' ', r.consume()); - assertEquals('n', r.consume()); - assertEquals('o', r.consume()); - assertEquals('w', r.consume()); + int len = CharacterReader.BufferSize * 12; + StringBuilder builder = StringUtil.borrowBuilder(); + while (builder.length() <= len) builder.append('!'); + CharacterReader r = new CharacterReader(builder.toString()); + 
StringUtil.releaseBuilder(builder); + + // consume through + for (int pos = 0; pos < len; pos ++) { + assertEquals(pos, r.pos()); + assertFalse(r.isEmpty()); + assertEquals('!', r.consume()); + assertEquals(pos + 1, r.pos()); + assertFalse(r.isEmpty()); + } + assertEquals('!', r.consume()); assertTrue(r.isEmpty()); + assertEquals(CharacterReader.EOF, r.consume()); } @Test public void bufferUp() { @@ -437,10 +437,10 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]"); - assertEquals(32778, noTrack.pos()); + assertEquals(2090, noTrack.pos()); assertEquals(1, noTrack.lineNumber()); assertEquals(noTrack.pos()+1, noTrack.columnNumber()); - assertEquals("1:32779", noTrack.posLineCol()); + assertEquals("1:2091", noTrack.posLineCol()); // and the line numbers: "\n\n\n" assertEquals(0, track.pos()); @@ -468,12 +468,12 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!track.matches("[foo]")) track.consumeTo("[foo]"); - assertEquals(32778, track.pos()); + assertEquals(2090, track.pos()); assertEquals(4, track.lineNumber()); - assertEquals(32761, track.columnNumber()); - assertEquals("4:32761", track.posLineCol()); + assertEquals(2073, track.columnNumber()); + assertEquals("4:2073", track.posLineCol()); track.consumeTo('\n'); - assertEquals("4:32766", track.posLineCol()); + assertEquals("4:2078", track.posLineCol()); track.consumeTo("[bar]"); assertEquals(5, track.lineNumber()); @@ -491,9 +491,11 @@ public void notEmptyAtBufferSplitPoint() { reader.trackNewlines(true); assertEquals("1:1", reader.posLineCol()); + StringBuilder seen = new StringBuilder(); while (!reader.isEmpty()) - reader.consume(); - assertEquals(131096, reader.pos()); + seen.append(reader.consume()); + assertEquals(content, seen.toString()); + assertEquals(content.length(), reader.pos()); assertEquals(reader.pos() + 1, reader.columnNumber()); assertEquals(1, reader.lineNumber()); } diff --git 
a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index f911fe161b..7fa7a67a59 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -368,7 +368,7 @@ private static Stream dupeAttributeData() { @Test public void handlesCdataAcrossBuffer() { StringBuilder sb = new StringBuilder(); - while (sb.length() <= CharacterReader.maxBufferLen) { + while (sb.length() <= CharacterReader.BufferSize) { sb.append("A suitable amount of CData.\n"); } String cdata = sb.toString(); diff --git a/src/test/java/org/jsoup/parser/TokeniserStateTest.java b/src/test/java/org/jsoup/parser/TokeniserStateTest.java index d34c38fcea..6d9b5f7a77 100644 --- a/src/test/java/org/jsoup/parser/TokeniserStateTest.java +++ b/src/test/java/org/jsoup/parser/TokeniserStateTest.java @@ -208,21 +208,21 @@ public void testPublicAndSystemIdentifiersWithWhitespace() { @Test public void testUnconsumeAtBufferBoundary() { String triggeringSnippet = "One"; @@ -60,10 +60,10 @@ public void bufferUpInAttributeVal() { } @Test public void handleSuperLargeAttributeName() { - StringBuilder sb = new StringBuilder(maxBufferLen); + StringBuilder sb = new StringBuilder(BufferSize); do { sb.append("LargAttributeName"); - } while (sb.length() < maxBufferLen); + } while (sb.length() < BufferSize); String attrName = sb.toString(); String html = "

One

"; @@ -79,10 +79,10 @@ public void bufferUpInAttributeVal() { } @Test public void handleLargeText() { - StringBuilder sb = new StringBuilder(maxBufferLen); + StringBuilder sb = new StringBuilder(BufferSize); do { sb.append("A Large Amount of Text"); - } while (sb.length() < maxBufferLen); + } while (sb.length() < BufferSize); String text = sb.toString(); String html = "

" + text + "

"; @@ -96,10 +96,10 @@ public void bufferUpInAttributeVal() { } @Test public void handleLargeComment() { - StringBuilder sb = new StringBuilder(maxBufferLen); + StringBuilder sb = new StringBuilder(BufferSize); do { sb.append("Quite a comment "); - } while (sb.length() < maxBufferLen); + } while (sb.length() < BufferSize); String comment = sb.toString(); String html = "

"; @@ -114,10 +114,10 @@ public void bufferUpInAttributeVal() { } @Test public void handleLargeCdata() { - StringBuilder sb = new StringBuilder(maxBufferLen); + StringBuilder sb = new StringBuilder(BufferSize); do { sb.append("Quite a lot of CDATA <><><><>"); - } while (sb.length() < maxBufferLen); + } while (sb.length() < BufferSize); String cdata = sb.toString(); String html = "

"; @@ -133,10 +133,10 @@ public void bufferUpInAttributeVal() { } @Test public void handleLargeTitle() { - StringBuilder sb = new StringBuilder(maxBufferLen); + StringBuilder sb = new StringBuilder(BufferSize); do { sb.append("Quite a long title"); - } while (sb.length() < maxBufferLen); + } while (sb.length() < BufferSize); String title = sb.toString(); String html = "" + title + ""; @@ -178,10 +178,10 @@ public void bufferUpInAttributeVal() { } @Test public void canParseVeryLongBogusComment() { - StringBuilder commentData = new StringBuilder(maxBufferLen); + StringBuilder commentData = new StringBuilder(BufferSize); do { commentData.append("blah blah blah blah "); - } while (commentData.length() < maxBufferLen); + } while (commentData.length() < BufferSize); String expectedCommentData = commentData.toString(); String testMarkup = ""; Parser parser = new Parser(new HtmlTreeBuilder()); @@ -196,7 +196,7 @@ public void bufferUpInAttributeVal() { @Test public void canParseCdataEndingAtEdgeOfBuffer() { String cdataStart = ""; - int bufLen = maxBufferLen - cdataStart.length() - 1; // also breaks with -2, but not with -3 or 0 + int bufLen = BufferSize - cdataStart.length() - 1; // also breaks with -2, but not with -3 or 0 char[] cdataContentsArray = new char[bufLen]; Arrays.fill(cdataContentsArray, 'x'); String cdataContents = new String(cdataContentsArray);