From 8601e85f96a67f8548216eec5e27737085d54488 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 10 Sep 2024 16:36:20 +1000 Subject: [PATCH] Parse in quirksmode if no doctype html Fixes #2197 --- CHANGES.md | 2 ++ .../jsoup/parser/HtmlTreeBuilderState.java | 5 +++-- .../java/org/jsoup/parser/HtmlParserTest.java | 20 +++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index c2280ef6c6..0981c4f97d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -22,6 +22,8 @@ character. [2169](https://github.com/jhy/jsoup/issues/2169) * When tracking source ranges, a text node following an invalid self-closing element may be left untracked.[2175](https://github.com/jhy/jsoup/issues/2175) +* When a document has no doctype, or a doctype not named `html`, it should be parsed in Quirks + Mode. [2197](https://github.com/jhy/jsoup/issues/2197) ## 1.18.1 (2024-Jul-10) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index f1b2d7b239..470a785a50 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -25,18 +25,19 @@ enum HtmlTreeBuilderState { tb.insertCommentNode(t.asComment()); } else if (t.isDoctype()) { // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids Token.Doctype d = t.asDoctype(); DocumentType doctype = new DocumentType( tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier()); doctype.setPubSysKey(d.getPubSysKey()); tb.getDocument().appendChild(doctype); tb.onNodeInserted(doctype); - if (d.isForceQuirks()) + // todo: quirk state check on more doctype ids, if deemed useful (most are ancient legacy and presumably irrelevant) + if (d.isForceQuirks() || !doctype.name().equals("html") || doctype.publicId().equalsIgnoreCase("HTML")) tb.getDocument().quirksMode(Document.QuirksMode.quirks); tb.transition(BeforeHtml); } else { // todo: check not iframe srcdoc + tb.getDocument().quirksMode(Document.QuirksMode.quirks); // missing doctype tb.transition(BeforeHtml); return tb.process(t); // re-process token } diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 7fa7a67a59..a67003a839 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1888,4 +1888,24 @@ private static void assertMathNamespace(Element el) { img.ownerDocument().outputSettings().charset("ascii"); assertEquals("", img.outerHtml()); } + + @Test void tableInPInQuirksMode() { + // https://github.com/jhy/jsoup/issues/2197 + String html = "

Hello table data

"; + Document doc = Jsoup.parse(html); + assertEquals(Document.QuirksMode.quirks, doc.quirksMode()); + assertEquals( + "

Hello table data

", // quirks, allows table in p + TextUtil.normalizeSpaces(doc.body().html()) + ); + + // doctype set, no quirks + html ="

Hello table data

"; + doc = Jsoup.parse(html); + assertEquals(Document.QuirksMode.noQuirks, doc.quirksMode()); + assertEquals( + "

Hello table data

", // no quirks, p gets closed + TextUtil.normalizeSpaces(doc.body().html()) + ); + } }