From 2bca7bd84a5cf13af8f5633dd7d3d519fc990d67 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 23 Jul 2024 05:53:46 +0900 Subject: [PATCH] Add support for detecting invalid XML that has unsupported content before root element (#184) ## Why? XML with content at the start of the document is invalid. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc ``` [27] Misc ::= Comment | PI | S ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI ``` [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget ``` [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl ``` [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' ``` See: https://github.com/ruby/rexml/pull/164#discussion_r1683552024 --- lib/rexml/parsers/baseparser.rb | 10 ++++-- test/parse/test_comment.rb | 12 +++++++ test/parse/test_processing_instruction.rb | 43 +++++++++++++---------- test/parse/test_text.rb | 17 +++++++++ 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index bbdcfc6c..54014e57 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -486,11 +486,15 @@ def pull_event if text.chomp!("<") @source.position -= "<".bytesize end - if @tags.empty? and @have_root + if @tags.empty? 
unless /\A\s*\z/.match?(text) - raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + if @have_root + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + else + raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source) + end end - return pull_event + return pull_event if @have_root end return [ :text, text ] end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index b7892232..4475dca7 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -110,6 +110,18 @@ def test_after_doctype_malformed_comment_end end end + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + def test_after_root parser = REXML::Parsers::BaseParser.new('') diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 7943cd3c..8d42e964 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -25,25 +25,6 @@ def test_no_name DETAIL end - def test_garbage_text - # TODO: This should be parse error. - # Create test/parse/test_document.rb or something and move this to it. - doc = parse(<<-XML) -x?> - - XML - pi = doc.children[1] - assert_equal([ - "x", - "y\n?> + + XML + assert_equal([["x", "y\n"]], + [[doc.children[0].target, doc.children[0].content], + [doc.children[1].target, doc.children[1].content]]) + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? 
+ event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + def test_after_root parser = REXML::Parsers::BaseParser.new('') diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index 1acefc40..04f553ae 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -4,6 +4,23 @@ module REXMLTests class TestParseText < Test::Unit::TestCase class TestInvalid < self + def test_before_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('b') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'b') + Line: 1 + Position: 4 + Last 80 unconsumed characters: + + DETAIL + end + def test_after_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('c')