From c8110b4830c990453e167ccca934e65858308fa1 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 17 Aug 2024 11:25:28 +0900 Subject: [PATCH] Fix to not allow parameter entity references at internal subsets (#191) ## Why? In the internal subset of DTD, references to parameter entities are not allowed within markup declarations. See: https://www.w3.org/TR/xml/#wfc-PEinInternalSubset > Well-formedness constraint: PEs in Internal Subset > In the internal DTD subset, parameter-entity references MUST NOT occur within markup declarations; they may occur where markup declarations can occur. (This does not apply to references that occur in external parameter entities or to the external subset.) --- lib/rexml/entity.rb | 52 ++-------------- lib/rexml/parsers/baseparser.rb | 3 + test/test_document.rb | 50 ---------------- test/test_entity.rb | 102 +++++++++++++++++++++++++------- 4 files changed, 89 insertions(+), 118 deletions(-) diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 573db691..12bbad3f 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -12,6 +12,7 @@ class Entity < Child EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" NDATADECL = "\\s+NDATA\\s+#{NAME}" PEREFERENCE = "%#{NAME};" + PEREFERENCE_RE = /#{PEREFERENCE}/um ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" @@ -19,7 +20,7 @@ class Entity < Child GEDECL = "" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - attr_reader :name, :external, :ref, :ndata, :pubid + attr_reader :name, :external, :ref, :ndata, :pubid, :value # Create a new entity. Simple entities can be constructed by passing a # name, value to the constructor; this creates a generic, plain entity @@ -68,14 +69,11 @@ def Entity::matches? string end # Evaluates to the unnormalized value of this entity; that is, replacing - # all entities -- both %ent; and &ent; entities. This differs from - # +value()+ in that +value+ only replaces %ent; entities. + # &ent; entities. def unnormalized document.record_entity_expansion unless document.nil? - v = value() - return nil if v.nil? - @unnormalized = Text::unnormalize(v, parent) - @unnormalized + return nil if @value.nil? + @unnormalized = Text::unnormalize(@value, parent) end #once :unnormalized @@ -121,46 +119,6 @@ def to_s write rv rv end - - PEREFERENCE_RE = /#{PEREFERENCE}/um - # Returns the value of this entity. At the moment, only internal entities - # are processed. If the value contains internal references (IE, - # %blah;), those are replaced with their values. IE, if the doctype - # contains: - # - # - # then: - # doctype.entity('yada').value #-> "nanoo bar nanoo" - def value - @resolved_value ||= resolve_value - end - - def parent=(other) - @resolved_value = nil - super - end - - private - def resolve_value - return nil if @value.nil? - return @value unless @value.match?(PEREFERENCE_RE) - - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) - end - end - rv - end end # This is a set of entity constants -- the ones defined in the XML diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c03f375f..e38ff86e 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -141,6 +141,7 @@ class BaseParser } module Private + PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -350,6 +351,8 @@ def pull_event match[4] = match[4][1..-2] # HREF match.delete_at(5) if match.size > 5 # Chop out NDATA decl # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + elsif Private::PEREFERENCE_PATTERN.match?(match[2]) + raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source) else match[2] = match[2][1..-2] match.pop if match.size == 4 diff --git a/test/test_document.rb b/test/test_document.rb index 72ec3579..25a8828f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -147,56 +147,6 @@ def test_entity_expansion_text_limit assert_equal(90, doc.root.children.first.value.bytesize) end end - - class ParameterEntityTest < self - def test_have_value - xml = < - - - - - - - -]> - -EOF - - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - REXML::Security.entity_expansion_limit = 100 - assert_equal(100, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - - def test_empty_value - xml = < - - - - - - - -]> - -EOF - - REXML::Document.new(xml) - REXML::Security.entity_expansion_limit = 90 - assert_equal(90, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - end end def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source diff --git a/test/test_entity.rb b/test/test_entity.rb index a2b262f7..89f83894 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -59,8 +59,7 @@ def test_parse_entity def test_constructor one = [ %q{}, - %q{}, - %q{}, + %q{}, '', '' ] source = %q{ - - + ', + "a", + "B", + "B", + "B", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_without_reference + entity = REXML::Entity.new([:entitydecl, "a", "&b;"]) + assert_equal([ + '', + "a", + "&b;", + "&b;", + "&b;", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_with_nested_references + doctype = REXML::DocType.new('root') + doctype.add(REXML::Entity.new([:entitydecl, "a", "&b;"])) + doctype.add(REXML::Entity.new([:entitydecl, "b", "X"])) + assert_equal([ + "a", + "&b;", + "&b;", + "X", + "b", + "X", + "X", + "X", + ], + [ + doctype.entities["a"].name, + doctype.entities["a"].value, + doctype.entities["a"].normalized, + doctype.entities["a"].unnormalized, + doctype.entities["b"].name, + doctype.entities["b"].value, + doctype.entities["b"].normalized, + doctype.entities["b"].unnormalized, + ]) + end + + def test_parameter_entity_reference_forbidden_by_internal_subset_in_parser + source = ' ]>' + parser = REXML::Parsers::BaseParser.new(source) + exception = assert_raise(REXML::ParseException) do + while parser.has_next? + parser.pull + end + end + assert_equal(<<-DETAIL, exception.to_s) +Parameter entity references forbidden in internal subset: "%a;" +Line: 1 +Position: 54 +Last 80 unconsumed characters: + DETAIL + end + def test_entity_string_limit template = ' ]> $' len = 5120 # 5k per entity @@ -122,22 +198,6 @@ def test_entity_string_limit end end - def test_entity_string_limit_for_parameter_entity - template = ' ]>' - len = 5120 # 5k per entity - template.sub!(/\^/, "B" * len) - - # 10k is OK - entities = '%a;' * 2 # 5k entity * 2 = 10k - REXML::Document.new(template.sub(/\$/, entities)) - - # above 10k explodes - entities = '%a;' * 3 # 5k entity * 2 = 15k - assert_raise(REXML::ParseException) do - REXML::Document.new(template.sub(/\$/, entities)) - end - end - def test_raw source = ' @@ -161,7 +221,7 @@ def test_lazy_evaluation def test_entity_replacement source = %q{ - ]> + ]> &WhatHeSaid;} d = REXML::Document.new( source )