From 48d2142b441049968b4716b774d0f3b48115a538 Mon Sep 17 00:00:00 2001 From: Earlopain Date: Tue, 16 May 2023 08:41:43 +0200 Subject: [PATCH] feat: add an option to preserve whitespace to FullSanitizer --- README.md | 7 ++++ lib/rails/html/sanitizer.rb | 16 +++++++-- test/sanitizer_test.rb | 66 ++++++++++++++++++++++++------------- 3 files changed, 64 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index dd1f8ab..8c34637 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,13 @@ All sanitizers respond to `sanitize`. full_sanitizer = Rails::Html::FullSanitizer.new full_sanitizer.sanitize("Bold no more! See more here...") # => Bold no more! See more here... + +# Whitespace is swallowed by default. If whitespace is significant you must pass an option to preserve it: + +full_sanitizer = Rails::Html::FullSanitizer.new +full_sanitizer.sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) +# => \nParagraphs\n and \n newlines + ``` #### LinkSanitizer diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb index 531ff0d..0ce4838 100644 --- a/lib/rails/html/sanitizer.rb +++ b/lib/rails/html/sanitizer.rb @@ -24,6 +24,14 @@ def properly_encode(fragment, options) # full_sanitizer = Rails::Html::FullSanitizer.new # full_sanitizer.sanitize("Bold no more! See more here...") # # => Bold no more! See more here... + # + # === Options + # + # If whitespace is significant you can pass preserve_whitespace: true. + # + # full_sanitizer = Rails::Html::FullSanitizer.new + # full_sanitizer.sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) + # # => \nParagraphs\n and \n newlines class FullSanitizer < Sanitizer def sanitize(html, options = {}) return unless html @@ -31,9 +39,13 @@ def sanitize(html, options = {}) loofah_fragment = Loofah.fragment(html) - loofah_fragment.scrub!(TextOnlyScrubber.new) + if options[:preserve_whitespace] + loofah_fragment.to_text + else + loofah_fragment.scrub!(TextOnlyScrubber.new) - properly_encode(loofah_fragment, encoding: "UTF-8") + properly_encode(loofah_fragment, encoding: "UTF-8") + end end end diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb index dae5001..a24051e 100644 --- a/test/sanitizer_test.rb +++ b/test/sanitizer_test.rb @@ -78,37 +78,41 @@ def test_remove_xpaths_called_with_enumerable_xpaths def test_strip_tags_with_quote input = '<" hi' - result = full_sanitize(input) acceptable_results = [ # libxml2 >= 2.9.14 and xerces+neko %{<" hi}, # other libxml2 %{ hi}, + # preserve_whitespace: true + "<" hi", ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_invalid_html - assert_equal "<<", full_sanitize("<<This is a test.\n\n\n\n

It no longer contains any HTML.

\n} + acceptable_results = [ + %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}, + # preserve_whitespace: true + %{\nThis is a test.\n\nIt no longer contains any HTML.\n\n} + ] - assert_equal expected, full_sanitize(input) + assert_full_sanitized acceptable_results, input end def test_remove_unclosed_tags input = "This is <-- not\n a comment here." - result = full_sanitize(input) acceptable_results = [ # libxml2 >= 2.9.14 and xerces+neko %{This is <-- not\n a comment here.}, @@ -116,12 +120,11 @@ def test_remove_unclosed_tags %{This is }, ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_cdata input = "This has a ]]> here." - result = full_sanitize(input) acceptable_results = [ # libxml2 = 2.9.14 %{This has a <![CDATA[]]> here.}, @@ -131,7 +134,7 @@ def test_strip_cdata %{This has a here.}, ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_unclosed_cdata @@ -153,40 +156,52 @@ def test_strip_unclosed_cdata def test_strip_blank_string assert_nil full_sanitize(nil) - assert_equal "", full_sanitize("") - assert_equal " ", full_sanitize(" ") + assert_nil full_sanitize(nil, preserve_whitespace: true) + assert_full_sanitized "", "" + assert_full_sanitized " ", " " end def test_strip_tags_with_plaintext - assert_equal "Don't touch me", full_sanitize("Don't touch me") + assert_full_sanitized "Don't touch me", "Don't touch me" end def test_strip_tags_with_tags - assert_equal "This is a test.", full_sanitize("

This is a test.

") + assert_full_sanitized "This is a test.", "This is a test." end def test_escape_tags_with_many_open_quotes - assert_equal "<<", full_sanitize("<<") + assert_full_sanitized "<<", "<<" end def test_strip_tags_with_sentence - assert_equal "This is a test.", full_sanitize("This is a test.") + assert_full_sanitized "This is a test.", "This is a test." end def test_strip_tags_with_comment - assert_equal "This has a here.", full_sanitize("This has a here.") + assert_full_sanitized "This has a here.", "This has a here." end def test_strip_tags_with_frozen_string - assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags") + assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags" end def test_full_sanitize_respect_html_escaping_of_the_given_string - assert_equal 'test\r\nstring', full_sanitize('test\r\nstring') - assert_equal "&", full_sanitize("&") - assert_equal "&", full_sanitize("&") - assert_equal "&amp;", full_sanitize("&amp;") - assert_equal "omg <script>BOM</script>", full_sanitize("omg <script>BOM</script>") + assert_full_sanitized 'test\r\nstring', 'test\r\nstring' + assert_full_sanitized "&", "&" + assert_full_sanitized "&", "&" + assert_full_sanitized "&amp;", "&amp;" + assert_full_sanitized "omg <script>BOM</script>", "omg <script>BOM</script>" + end + + def test_full_sanitize_preserve_whitespace + assert_equal "\na\n\nb\n", full_sanitize("

a

b

", preserve_whitespace: true) + end + + def test_full_sanitize_preserve_whitespace_ascii_8bit_string + full_sanitize("hello".encode("ASCII-8BIT")).tap do |sanitized| + assert_equal "hello", sanitized + assert_equal Encoding::UTF_8, sanitized.encoding + end end def test_strip_links_with_tags_in_tags @@ -917,6 +932,11 @@ def assert_sanitized(input, expected = nil) assert_equal((expected || input), safe_list_sanitize(input)) end + def assert_full_sanitized(acceptable_results, input) + assert_includes(Array(acceptable_results), full_sanitize(input)) + assert_includes(Array(acceptable_results), full_sanitize(input, preserve_whitespace: true)) + end + def sanitize_css(input) Rails::Html::SafeListSanitizer.new.sanitize_css(input) end