Improve buffer management throughout the load/fetch and parse lifecyc…

…le (#2186) * Introduces SoftPool A SoftPool is a ThreadLocal SofReference Stack of <T>, with borrow and release methods. Allows simple recycling of objects between uses, and for those to be reaped when inactive. * Refactored bufferUp in CharacterReader Removed the use of a backing BufferedInputReader, which was redundant and creating large char array buffers, and reuse the char[] buffer via SoftPool. * Maintain the string flyweight cache between runs in CharacterReader * Updated ControllableInputStream to use new SimpleBufferedInput, instead of BufferedInputStream, so we can reuse those byte[] buffers. Also, those are only borrowed/created on the first need for the buffer; otherwise fill reads cascade down to the underlying stream. And removes synchronized / lock sections, as this is not passed between threads concurrently. * Reduced the DefaultBufferSize to 8K from 32K * In DataUtil, when reading ahead to detect the charset, instead of buffering into a new ByteBuffer, reuse the ControllableInputStream. * Simplify BOM character skip Overall, heap allocations and GC during the read & parse phase are down between -6% and -89%, and throughput improved up to +143%.
jhy · Aug 10, 2024 · ec2d0c4 · ec2d0c4
1 parent 1714af2
commit ec2d0c4
Show file tree

Hide file tree

Showing 20 changed files with 719 additions and 300 deletions.
diff --git a/pom.xml b/pom.xml
@@ -105,6 +105,7 @@
                 <ignore>java.util.Set</ignore> <!-- Set#stream() -->
                 <ignore>java.util.Spliterator</ignore>
                 <ignore>java.util.Spliterators</ignore>
+                <ignore>java.nio.ByteBuffer</ignore> <!-- .flip(); added in API1; possibly due to .flip previously returning Buffer, later ByteBuffer; return unused -->
 
                 <ignore>java.net.HttpURLConnection</ignore><!-- .setAuthenticator(java.net.Authenticator) in Java 9; only used in multirelease 9+ version -->
               </ignores>

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -15,17 +15,14 @@
 import org.jspecify.annotations.Nullable;
 
 import java.io.BufferedReader;
-import java.io.CharArrayReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.Reader;
 import java.io.UncheckedIOException;
-import java.nio.Buffer;
 import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.Charset;
@@ -119,8 +116,7 @@ public static Document load(Path path, @Nullable String charsetName, String base
      * @since 1.17.2
      */
     public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
-        InputStream stream = openStream(path);
-        return parseInputStream(stream, charsetName, baseUri, parser);
+        return parseInputStream(openStream(path), charsetName, baseUri, parser);
     }
 
     /**
@@ -144,14 +140,13 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St
         String charsetName = charset != null? charset.name() : null;
         DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
         BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
-        maybeSkipBom(reader, charsetDoc);
         streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
 
         return streamer;
     }
 
     /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
-    private static InputStream openStream(Path path) throws IOException {
+    private static ControllableInputStream openStream(Path path) throws IOException {
         final SeekableByteChannel byteChannel = Files.newByteChannel(path);
         InputStream stream = Channels.newInputStream(byteChannel);
         String name = Normalizer.lowerCase(path.getFileName().toString());
@@ -162,7 +157,7 @@ private static InputStream openStream(Path path) throws IOException {
                 stream = new GZIPInputStream(stream);
             }
         }
-        return stream;
+        return ControllableInputStream.wrap(stream, 0);
     }
 
     /**
@@ -174,7 +169,7 @@ private static InputStream openStream(Path path) throws IOException {
      * @throws IOException on IO error
      */
     public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
-        return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
+        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
     }
 
     /**
@@ -187,7 +182,7 @@ public static Document load(InputStream in, @Nullable String charsetName, String
      * @throws IOException on IO error
      */
     public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
-        return parseInputStream(in, charsetName, baseUri, parser);
+        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
     }
 
     /**
@@ -209,17 +204,15 @@ static class CharsetDoc {
         Charset charset;
         InputStream input;
         @Nullable Document doc;
-        boolean skip;
 
-        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
+        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
             this.charset = charset;
             this.input = input;
             this.doc = doc;
-            this.skip = skip;
         }
     }
 
-    static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
         if (input == null) // empty body // todo reconsider?
             return new Document(baseUri);
 
@@ -235,30 +228,28 @@ static Document parseInputStream(@Nullable InputStream input, @Nullable String c
         return doc;
     }
 
-    static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
         Document doc = null;
-
-        // read the start of the stream and look for a BOM or meta charset
-        InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
-        wrappedInputStream.mark(DefaultBufferSize);
-        ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
-        boolean fullyRead = (wrappedInputStream.read() == -1);
-        wrappedInputStream.reset();
-
+        // read the start of the stream and look for a BOM or meta charset:
         // look for BOM - overrides any other header or input
-        BomCharset bomCharset = detectCharsetFromBom(firstBytes);
+        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
         if (bomCharset != null)
-            charsetName = bomCharset.charset;
+            charsetName = bomCharset;
 
-        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
+        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
+            int origMax = input.max();
+            input.max(firstReadBufferSize);
+            input.mark(firstReadBufferSize);
+            input.allowClose(false); // ignores closes during parse, in case we need to rewind
             try {
-                CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
-                if (defaultDecoded.hasArray())
-                    doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
-                else
-                    doc = parser.parseInput(defaultDecoded.toString(), baseUri);
+                Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
+                doc = parser.parseInput(reader, baseUri);
+                input.reset();
+                input.max(origMax); // reset for a full read if required
             } catch (UncheckedIOException e) {
                 throw e.getCause();
+            } finally {
+                input.allowClose(true);
             }
 
             // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
@@ -293,7 +284,9 @@ else if (first instanceof Comment) {
                 foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                 charsetName = foundCharset;
                 doc = null;
-            } else if (!fullyRead) {
+            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
+                input.close(); // the parser tried to close it
+            } else {
                 doc = null;
             }
         } else { // specified by content type header (or by user on file load)
@@ -304,9 +297,7 @@ else if (first instanceof Comment) {
         if (charsetName == null)
             charsetName = defaultCharsetName;
         Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
-        boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
-        // if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
-        return new CharsetDoc(charset, doc, wrappedInputStream, skip);
+        return new CharsetDoc(charset, doc, input);
     }
 
     static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
@@ -318,8 +309,7 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
         Validate.notNull(input);
         final Document doc;
         final Charset charset = charsetDoc.charset;
-        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) {
-            maybeSkipBom(reader, charsetDoc);
+        try (Reader reader = new InputStreamReader(input, charset)) {
             try {
                 doc = parser.parseInput(reader, baseUri);
             } catch (UncheckedIOException e) {
@@ -335,13 +325,6 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
         return doc;
     }
 
-    static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
-        if (charsetDoc.skip) {
-            long skipped = reader.skip(1);
-            Validate.isTrue(skipped == 1); // WTF if this fails.
-        }
-    }
-
     /**
      * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
      * method is executing on. The data read until being interrupted will be available.
@@ -400,34 +383,24 @@ static String mimeBoundary() {
         return StringUtil.releaseBuilder(mime);
     }
 
-    private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
-        @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
-        buffer.mark();
+    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
         byte[] bom = new byte[4];
-        if (byteData.remaining() >= bom.length) {
-            byteData.get(bom);
-            buffer.rewind();
-        }
+        input.mark(bom.length);
+        //noinspection ResultOfMethodCallIgnored
+        input.read(bom, 0, 4);
+        input.reset();
+
+        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
         if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
-            return new BomCharset("UTF-32", false); // and I hope it's on your system
+            return "UTF-32"; // and I hope it's on your system
         } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
-            return new BomCharset("UTF-16", false); // in all Javas
+            return "UTF-16"; // in all Javas
         } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
-            return new BomCharset("UTF-8", true); // in all Javas
-            // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
+            input.read(bom, 0, 3); // consume the UTF-8 BOM
+            return "UTF-8"; // in all Javas
         }
         return null;
     }
-
-    private static class BomCharset {
-        private final String charset;
-        private final boolean offset;
-
-        public BomCharset(String charset, boolean offset) {
-            this.charset = charset;
-            this.offset = offset;
-        }
-    }
 }
diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java
@@ -7,7 +7,6 @@
 import org.jsoup.UnsupportedMimeTypeException;
 import org.jsoup.internal.ControllableInputStream;
 import org.jsoup.internal.Functions;
-import org.jsoup.internal.SharedConstants;
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.parser.Parser;
@@ -52,6 +51,7 @@
 import static org.jsoup.Connection.Method.HEAD;
 import static org.jsoup.helper.DataUtil.UTF_8;
 import static org.jsoup.internal.Normalizer.lowerCase;
+import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
 
 /**
  * Implementation of {@link Connection}.
@@ -915,7 +915,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate"))
                         stream = new InflaterInputStream(stream, new Inflater(true));
 
                     res.bodyStream = ControllableInputStream.wrap(
-                        stream, SharedConstants.DefaultBufferSize, req.maxBodySize())
+                        stream, DefaultBufferSize, req.maxBodySize())
                         .timeout(startTime, req.timeout());
 
                     if (req.responseProgress != null) // set response progress listener
@@ -965,11 +965,12 @@ public String contentType() {
         }
 
         /** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
-        private InputStream prepareParse() {
+        private ControllableInputStream prepareParse() {
             Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
-            InputStream stream = bodyStream;
+            ControllableInputStream stream = bodyStream;
             if (byteData != null) { // bytes have been read in to the buffer, parse that
-                stream = new ByteArrayInputStream(byteData.array());
+                ByteArrayInputStream bytes = new ByteArrayInputStream(byteData.array(), 0, byteData.limit());
+                stream = ControllableInputStream.wrap(bytes, 0); // no max
                 inputStreamRead = false; // ok to reparse if in bytes
             }
             Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
@@ -979,7 +980,7 @@ private InputStream prepareParse() {
         }
 
         @Override public Document parse() throws IOException {
-            InputStream stream = prepareParse();
+            ControllableInputStream stream = prepareParse();
             Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
             doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
             charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
@@ -988,15 +989,14 @@ private InputStream prepareParse() {
         }
 
         @Override public StreamParser streamParser() throws IOException {
-            InputStream stream = prepareParse();
+            ControllableInputStream stream = prepareParse();
             String baseUri = url.toExternalForm();
             DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
             // note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.
 
             // set up the stream parser and rig this connection up to the parsed doc:
             StreamParser streamer = new StreamParser(req.parser());
             BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
-            DataUtil.maybeSkipBom(reader, charsetDoc);
             streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
             streamer.document().connection(new HttpConnection(req, this));
             charset = charsetDoc.charset.name();
@@ -1035,7 +1035,19 @@ public String body() {
         public byte[] bodyAsBytes() {
             prepareByteData();
             Validate.notNull(byteData);
-            return byteData.array();
+            Validate.isTrue(byteData.hasArray()); // we made it, so it should
+
+            byte[] array = byteData.array();
+            int offset = byteData.arrayOffset();
+            int length = byteData.limit();
+
+            if (offset == 0 && length == array.length) { // exact, just return it
+                return array;
+            } else { // trim to size
+                byte[] exactArray = new byte[length];
+                System.arraycopy(array, offset, exactArray, 0, length);
+                return exactArray;
+            }
         }
 
         @Override
@@ -1050,7 +1062,9 @@ public BufferedInputStream bodyStream() {
 
             // if we have read to bytes (via buffer up), return those as a stream.
             if (byteData != null) {
-                return new BufferedInputStream(new ByteArrayInputStream(byteData.array()), SharedConstants.DefaultBufferSize);
+                return new BufferedInputStream(
+                    new ByteArrayInputStream(byteData.array(), 0, byteData.limit()),
+                    DefaultBufferSize);
             }
 
             Validate.isFalse(inputStreamRead, "Request has already been read");