From 86f7f6364491b0099d215db858ecdc0c89ded040 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 27 Aug 2022 14:05:40 -0400 Subject: [PATCH] fix: replace recursive approach to cdata with escaping solution --- lib/loofah/html5/scrub.rb | 40 +++++++++++++++++++++++++++++++++ lib/loofah/scrubber.rb | 4 ++++ lib/loofah/scrubbers.rb | 8 ++----- test/integration/test_ad_hoc.rb | 17 +++++++++++--- 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/lib/loofah/html5/scrub.rb b/lib/loofah/html5/scrub.rb index 55fba469..9880b787 100644 --- a/lib/loofah/html5/scrub.rb +++ b/lib/loofah/html5/scrub.rb @@ -182,6 +182,46 @@ def force_correct_attribute_escaping!(node) end.force_encoding(encoding) end end + + def cdata_needs_escaping?(node) + # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way + node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script")) + end + + def cdata_escape(node) + escaped_text = escape_tags(node.text) + if Nokogiri.jruby? + node.document.create_text_node(escaped_text) + else + node.document.create_cdata(escaped_text) + end + end + + TABLE_FOR_ESCAPE_HTML__ = { + '<' => '<', + '>' => '>', + '&' => '&', + } + + def escape_tags(string) + # modified version of CGI.escapeHTML from ruby 3.1 + enc = string.encoding + unless enc.ascii_compatible? + if enc.dummy? + origenc = enc + enc = Encoding::Converter.asciicompat_encoding(enc) + string = enc ? string.encode(enc) : string.b + end + table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}] + string = string.gsub(/#{"[<>&]".encode(enc)}/, table) + string.encode!(origenc) if origenc + string + else + string = string.b + string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__) + string.force_encoding(enc) + end + end end end end diff --git a/lib/loofah/scrubber.rb b/lib/loofah/scrubber.rb index 1569e44e..38c994b4 100644 --- a/lib/loofah/scrubber.rb +++ b/lib/loofah/scrubber.rb @@ -108,6 +108,10 @@ def html5lib_sanitize(node) return Scrubber::CONTINUE end when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE + if HTML5::Scrub.cdata_needs_escaping?(node) + node.before(HTML5::Scrub.cdata_escape(node)) + return Scrubber::STOP + end return Scrubber::CONTINUE end Scrubber::STOP diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index a0093e98..f1313e02 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -100,13 +100,9 @@ def initialize def scrub(node) return CONTINUE if html5lib_sanitize(node) == CONTINUE - if node.children.length == 1 && node.children.first.cdata? - sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html - node.before Nokogiri::XML::Text.new(sanitized_text, node.document) - else - node.before node.children - end + node.before(node.children) node.remove + return STOP end end diff --git a/test/integration/test_ad_hoc.rb b/test/integration/test_ad_hoc.rb index 9f871877..a9735986 100644 --- a/test/integration/test_ad_hoc.rb +++ b/test/integration/test_ad_hoc.rb @@ -100,17 +100,28 @@ def test_return_empty_string_when_nothing_left end def test_nested_script_cdata_tags_should_be_scrubbed - html = "" + html = "" stripped = Loofah.fragment(html).scrub!(:strip) + assert_empty stripped.xpath("//script") - refute_match("" * n) + "alert(1);" + ("" * n) + "" + expected = "
" + ("<script>" * (n-1)) + "alert(1);
" + actual = Loofah.fragment(html).scrub!(:strip).to_html + + assert_equal(expected, actual) end def test_removal_of_all_tags