From 86f7f6364491b0099d215db858ecdc0c89ded040 Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sat, 27 Aug 2022 14:05:40 -0400
Subject: [PATCH] fix: replace recursive approach to cdata with escaping
 solution

---
 lib/loofah/html5/scrub.rb       | 40 +++++++++++++++++++++++++++++++++
 lib/loofah/scrubber.rb          |  4 ++++
 lib/loofah/scrubbers.rb         |  8 ++-----
 test/integration/test_ad_hoc.rb | 17 +++++++++++---
 4 files changed, 60 insertions(+), 9 deletions(-)
diff --git a/lib/loofah/html5/scrub.rb b/lib/loofah/html5/scrub.rb
index 55fba469..9880b787 100644
--- a/lib/loofah/html5/scrub.rb
+++ b/lib/loofah/html5/scrub.rb
@@ -182,6 +182,46 @@ def force_correct_attribute_escaping!(node)
             end.force_encoding(encoding)
           end
         end
+
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+        end
+
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
+          end
+        end
       end
     end
   end
diff --git a/lib/loofah/scrubber.rb b/lib/loofah/scrubber.rb
index 1569e44e..38c994b4 100644
--- a/lib/loofah/scrubber.rb
+++ b/lib/loofah/scrubber.rb
@@ -108,6 +108,10 @@ def html5lib_sanitize(node)
           return Scrubber::CONTINUE
         end
       when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        if HTML5::Scrub.cdata_needs_escaping?(node)
+          node.before(HTML5::Scrub.cdata_escape(node))
+          return Scrubber::STOP
+        end
         return Scrubber::CONTINUE
       end
       Scrubber::STOP
diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb
index a0093e98..f1313e02 100644
--- a/lib/loofah/scrubbers.rb
+++ b/lib/loofah/scrubbers.rb
@@ -100,13 +100,9 @@ def initialize
 
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        if node.children.length == 1 && node.children.first.cdata?
-          sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
-          node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
-        else
-          node.before node.children
-        end
+        node.before(node.children)
         node.remove
+        return STOP
       end
     end
 
diff --git a/test/integration/test_ad_hoc.rb b/test/integration/test_ad_hoc.rb
index 9f871877..a9735986 100644
--- a/test/integration/test_ad_hoc.rb
+++ b/test/integration/test_ad_hoc.rb
@@ -100,17 +100,28 @@ def test_return_empty_string_when_nothing_left
     end
 
     def test_nested_script_cdata_tags_should_be_scrubbed
-      html = "<script><script src='malicious.js'></script>"
+      html = "<script><script src=\"malicious.js\">this & that</script>"
       stripped = Loofah.fragment(html).scrub!(:strip)
+
       assert_empty stripped.xpath("//script")
-      refute_match("<script", stripped.to_html)
+      assert_equal("&lt;script src=\"malicious.js\"&gt;this &amp; that", stripped.to_html)
     end
 
     def test_nested_script_cdata_tags_should_be_scrubbed_2
       html = "<script><script>alert('a');</script></script>"
       stripped = Loofah.fragment(html).scrub!(:strip)
+
       assert_empty stripped.xpath("//script")
-      refute_match("<script", stripped.to_html)
+      assert_equal("&lt;script&gt;alert('a');", stripped.to_html)
+    end
+
+    def test_nested_script_cdata_tags_should_be_scrubbed_max_recursion
+      n = 40
+      html = "<div>" + ("<script>" * n) + "alert(1);" + ("</script>" * n) + "</div>"
+      expected = "<div>" + ("&lt;script&gt;" * (n-1)) + "alert(1);</div>"
+      actual = Loofah.fragment(html).scrub!(:strip).to_html
+
+      assert_equal(expected, actual)
     end
 
     def test_removal_of_all_tags