Skip to content

Commit

Permalink
feat: add an option to preserve whitespace to FullSanitizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Earlopain committed May 16, 2023
1 parent 5a1006f commit 48d2142
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 25 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ All sanitizers respond to `sanitize`.
full_sanitizer = Rails::Html::FullSanitizer.new
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# => Bold no more! See more here...

# Whitespace around block-level elements is swallowed by default. If that whitespace is significant, pass the `preserve_whitespace: true` option to keep it:

full_sanitizer = Rails::Html::FullSanitizer.new
full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# => \nParagraphs\n and \n newlines

```

#### LinkSanitizer
Expand Down
16 changes: 14 additions & 2 deletions lib/rails/html/sanitizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,28 @@ def properly_encode(fragment, options)
# full_sanitizer = Rails::Html::FullSanitizer.new
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# # => Bold no more! See more here...
#
# === Options
#
# If whitespace is significant, pass <tt>preserve_whitespace: true</tt> to keep it.
#
# full_sanitizer = Rails::Html::FullSanitizer.new
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# # => \nParagraphs\n and \n newlines
class FullSanitizer < Sanitizer
  # Removes all markup from +html+ and returns the remaining text.
  #
  # html    - markup to sanitize. +nil+ returns +nil+; an empty string is
  #           returned unchanged.
  # options - Hash of options. When +:preserve_whitespace+ is truthy the text
  #           is produced by Loofah's +to_text+, which emits newlines around
  #           paragraphs and line breaks instead of collapsing them.
  def sanitize(html, options = {})
    return unless html
    return html if html.empty?

    loofah_fragment = Loofah.fragment(html)

    if options[:preserve_whitespace]
      # The fragment must NOT be scrubbed first: +to_text+ needs the original
      # elements to know where to insert newlines.
      loofah_fragment.to_text
    else
      loofah_fragment.scrub!(TextOnlyScrubber.new)
      properly_encode(loofah_fragment, encoding: "UTF-8")
    end
  end
end

Expand Down
66 changes: 43 additions & 23 deletions test/sanitizer_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,50 +78,53 @@ def test_remove_xpaths_called_with_enumerable_xpaths

# A stray quote before a tag must not let the tag's attributes leak through,
# in either sanitizer mode. The exact output depends on the parser in use.
def test_strip_tags_with_quote
  input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
  acceptable_results = [
    # libxml2 >= 2.9.14 and xerces+neko
    %{&lt;" hi},
    # other libxml2
    %{ hi},
    # preserve_whitespace: true
    "&lt;&quot; hi",
  ]

  assert_full_sanitized(acceptable_results, input)
end

# Malformed markup is reduced to its escaped leading angle brackets in both
# sanitizer modes.
def test_strip_invalid_html
  assert_full_sanitized "&lt;&lt;", "<<<bad html"
end

# A tag nested inside a broken tag is stripped while the surrounding text is
# escaped, in both sanitizer modes.
def test_strip_nested_tags
  expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
  input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
  assert_full_sanitized expected, input
end

# Multi-line markup keeps its text; the newline layout differs between the
# default mode and preserve_whitespace mode, so both layouts are accepted.
def test_strip_tags_multiline
  input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
  acceptable_results = [
    %{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
    # preserve_whitespace: true
    %{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
  ]

  assert_full_sanitized acceptable_results, input
end

# Text that merely looks like the start of a comment is either escaped or
# dropped, depending on the parser; checked for both sanitizer modes.
def test_remove_unclosed_tags
  input = "This is <-- not\n a comment here."
  acceptable_results = [
    # libxml2 >= 2.9.14 and xerces+neko
    %{This is &lt;-- not\n a comment here.},
    # other libxml2
    %{This is },
  ]

  assert_full_sanitized(acceptable_results, input)
end

def test_strip_cdata
input = "This has a <![CDATA[<section>]]> here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has a &lt;![CDATA[]]&gt; here.},
Expand All @@ -131,7 +134,7 @@ def test_strip_cdata
%{This has a here.},
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_unclosed_cdata
Expand All @@ -153,40 +156,52 @@ def test_strip_unclosed_cdata

# nil stays nil and blank strings pass through untouched, with and without
# preserve_whitespace.
def test_strip_blank_string
  assert_nil full_sanitize(nil)
  assert_nil full_sanitize(nil, preserve_whitespace: true)
  assert_full_sanitized "", ""
  assert_full_sanitized " ", " "
end

# Plain text without markup is returned unchanged in both sanitizer modes.
def test_strip_tags_with_plaintext
  assert_full_sanitized "Don't touch me", "Don't touch me"
end

# Nested markup is stripped down to its text in both sanitizer modes. The
# input wraps the text in <b> so one expected string can cover both modes.
def test_strip_tags_with_tags
  assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
end

# Repeated opening angle brackets are escaped and the trailing broken tag is
# dropped, in both sanitizer modes.
def test_escape_tags_with_many_open_quotes
  assert_full_sanitized "&lt;&lt;", "<<<bad html>"
end

# A plain sentence is returned unchanged in both sanitizer modes.
def test_strip_tags_with_sentence
  assert_full_sanitized "This is a test.", "This is a test."
end

# HTML comments are removed from the output in both sanitizer modes.
def test_strip_tags_with_comment
  assert_full_sanitized "This has a here.", "This has a <!-- comment --> here."
end

# A tag-free string literal is returned unchanged in both sanitizer modes.
def test_strip_tags_with_frozen_string
  assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
end

# Existing entity escaping in the input is respected (never double-unescaped)
# and bare ampersands are escaped, in both sanitizer modes.
def test_full_sanitize_respect_html_escaping_of_the_given_string
  assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
  assert_full_sanitized "&amp;", "&"
  assert_full_sanitized "&amp;", "&amp;"
  assert_full_sanitized "&amp;amp;", "&amp;amp;"
  assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
end

# With preserve_whitespace enabled, each paragraph is surrounded by newlines
# in the sanitized text.
def test_full_sanitize_preserve_whitespace
  sanitized = full_sanitize("<p>a</p><p>b</p>", preserve_whitespace: true)

  assert_equal "\na\n\nb\n", sanitized
end

# Binary-encoded input must come back as UTF-8 text when sanitized with
# preserve_whitespace enabled.
def test_full_sanitize_preserve_whitespace_ascii_8bit_string
  binary_input = "<a>hello</a>".encode("ASCII-8BIT")

  # The option has to be passed explicitly; without it this test would only
  # exercise the default path despite its name.
  # NOTE(review): confirm the preserve_whitespace path yields UTF-8 on every
  # supported parser backend.
  full_sanitize(binary_input, preserve_whitespace: true).tap do |sanitized|
    assert_equal "hello", sanitized
    assert_equal Encoding::UTF_8, sanitized.encoding
  end
end

def test_strip_links_with_tags_in_tags
Expand Down Expand Up @@ -917,6 +932,11 @@ def assert_sanitized(input, expected = nil)
assert_equal((expected || input), safe_list_sanitize(input))
end

# Asserts that full-sanitizing +input+ yields one of +acceptable_results+
# (a single value or an Array of values), first with default options and then
# with preserve_whitespace: true.
def assert_full_sanitized(acceptable_results, input)
  candidates = Array(acceptable_results)

  assert_includes(candidates, full_sanitize(input))
  assert_includes(candidates, full_sanitize(input, preserve_whitespace: true))
end

# Runs +input+ through a fresh SafeListSanitizer's CSS sanitizer and returns
# the result.
def sanitize_css(input)
  css_sanitizer = Rails::Html::SafeListSanitizer.new
  css_sanitizer.sanitize_css(input)
end
Expand Down

0 comments on commit 48d2142

Please sign in to comment.