Skip to content

Commit

Permalink
Simplify BOM character skip
Browse files Browse the repository at this point in the history
Also close underlying reader if passthrough read is done.
  • Loading branch information
jhy committed Aug 8, 2024
1 parent 848cfe0 commit 92104bf
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 41 deletions.
45 changes: 10 additions & 35 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,14 @@
import org.jspecify.annotations.Nullable;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
Expand Down Expand Up @@ -143,7 +140,6 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St
String charsetName = charset != null? charset.name() : null;
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it

return streamer;
Expand Down Expand Up @@ -208,13 +204,11 @@ static class CharsetDoc {
Charset charset;
InputStream input;
@Nullable Document doc;
boolean skip;

CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
this.charset = charset;
this.input = input;
this.doc = doc;
this.skip = skip;
}
}

Expand All @@ -238,9 +232,9 @@ static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String
Document doc = null;
// read the start of the stream and look for a BOM or meta charset:
// look for BOM - overrides any other header or input
BomCharset bomCharset = detectCharsetFromBom(input); // resets
String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
if (bomCharset != null)
charsetName = bomCharset.charset;
charsetName = bomCharset;

if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
int origMax = input.max();
Expand Down Expand Up @@ -303,9 +297,7 @@ else if (first instanceof Comment) {
if (charsetName == null)
charsetName = defaultCharsetName;
Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
// if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
return new CharsetDoc(charset, doc, input, skip);
return new CharsetDoc(charset, doc, input);
}

static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
Expand All @@ -318,7 +310,6 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
final Document doc;
final Charset charset = charsetDoc.charset;
try (Reader reader = new InputStreamReader(input, charset)) {
maybeSkipBom(reader, charsetDoc);
try {
doc = parser.parseInput(reader, baseUri);
} catch (UncheckedIOException e) {
Expand All @@ -334,13 +325,6 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p
return doc;
}

static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
if (charsetDoc.skip) {
long skipped = reader.skip(1);
Validate.isTrue(skipped == 1); // WTF if this fails.
}
}

/**
* Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
* method is executing on. The data read until being interrupted will be available.
Expand Down Expand Up @@ -399,33 +383,24 @@ static String mimeBoundary() {
return StringUtil.releaseBuilder(mime);
}

private static @Nullable BomCharset detectCharsetFromBom(ControllableInputStream input) throws IOException {
private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
byte[] bom = new byte[4];
input.mark(bom.length);
//noinspection ResultOfMethodCallIgnored
input.read(bom, 0, 4);

Check notice

Code scanning / CodeQL

Ignored error status of call Note

Method detectCharsetFromBom ignores exceptional return value of ControllableInputStream.read.
input.reset();

// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
return new BomCharset("UTF-32", false); // and I hope it's on your system
return "UTF-32"; // and I hope it's on your system
} else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
return new BomCharset("UTF-16", false); // in all Javas
return "UTF-16"; // in all Javas
} else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
return new BomCharset("UTF-8", true); // in all Javas
// 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
input.read(bom, 0, 3); // consume the UTF-8 BOM

Check notice

Code scanning / CodeQL

Ignored error status of call Note

Method detectCharsetFromBom ignores exceptional return value of ControllableInputStream.read.
return "UTF-8"; // in all Javas
}
return null;
}

private static class BomCharset {
private final String charset;
private final boolean offset;

public BomCharset(String charset, boolean offset) {
this.charset = charset;
this.offset = offset;
}
}
}
2 changes: 0 additions & 2 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Functions;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -998,7 +997,6 @@ private ControllableInputStream prepareParse() {
// set up the stream parser and rig this connection up to the parsed doc:
StreamParser streamer = new StreamParser(req.parser());
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
DataUtil.maybeSkipBom(reader, charsetDoc);
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
streamer.document().connection(new HttpConnection(req, this));
charset = charsetDoc.charset.name();
Expand Down
14 changes: 10 additions & 4 deletions src/main/java/org/jsoup/internal/SimpleBufferedInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,12 @@ public int read(byte[] dest, int offset, int desiredLen) throws IOException {
}

int bufAvail = bufLength - bufPos;
if (bufAvail <= 0) {
if (desiredLen >= BufferSize && bufMark < 0) {
// We can skip creating / copying into a local buffer; just pass through
return in.read(dest, offset, desiredLen);
if (bufAvail <= 0) { // can't serve from the buffer
if (!inReadFully && bufMark < 0) {
// skip creating / copying into a local buffer; just pass through
int read = in.read(dest, offset, desiredLen);
closeIfDone(read);
return read;
}
fill();
bufAvail = bufLength - bufPos;
Expand Down Expand Up @@ -97,6 +99,10 @@ private void fill() throws IOException {
bufLength += read;
}
}
closeIfDone(read);
}

private void closeIfDone(int read) throws IOException {
if (read == -1) {
inReadFully = true;
super.close(); // close underlying stream immediately; frees resources a little earlier
Expand Down

0 comments on commit 92104bf

Please sign in to comment.