# coding=utf-8

from __future__ import unicode_literals

from contextlib import contextmanager
from unittest import TestCase

from .sanitizer import Sanitizer


class SanitizerTestCase(TestCase):
    """Table-driven tests for ``Sanitizer.sanitize``.

    Each test builds a sequence of ``(before, after)`` pairs and hands it to
    :meth:`run_tests`, optionally together with a specially configured
    ``Sanitizer`` instance.

    NOTE(review): the fixture strings below contain no markup even though the
    test names refer to tags (a, p, li, br, style). They look like they lost
    their HTML during extraction -- confirm against version control before
    relying on them. Line breaks inside the fixtures are written as ``\\n``
    escapes so that the module parses.
    """

    if not hasattr(TestCase, "subTest"):
        # Fallback for unittest versions without ``subTest``: a no-op context
        # manager so ``run_tests`` can use the same ``with`` statement.

        @contextmanager
        def subTest(self, *args, **kwargs):
            yield

    def run_tests(self, entries, sanitizer=None):
        """Sanitize every ``before`` value and compare it with ``after``.

        :param entries: iterable of ``(before, after)`` pairs. An ``after``
            of ``None`` means the input is expected to come back unchanged.
        :param sanitizer: the ``Sanitizer`` to use. When omitted, a fresh
            default-configured instance is created for this call (instead of
            one instance built at class-definition time and shared by all
            tests).
        """
        if sanitizer is None:
            sanitizer = Sanitizer()

        for before, after in entries:
            with self.subTest(before=before, after=after):
                after = before if after is None else after
                result = sanitizer.sanitize(before)
                self.assertEqual(
                    result,
                    after,
                    "Cleaning '%s', expected '%s' but got '%s'"
                    % (
                        before.encode("unicode-escape"),
                        after.encode("unicode-escape"),
                        result.encode("unicode-escape"),
                    ),
                )

    def test_01_sanitize(self):
        entries = [
            ("\n\n", " "),
            ("\n\n", " "),
            (
                'Something\n\n',
                "Something",
            ),
            (
                "\n\nabc def ghi jkl mno\n\n",
                "\n\nabc def ghi jkl mno\n\n",
            ),
            (
                'Something\n\n',
                "Something",
            ),
            ('\n\nbla\n\n', "\n\nbla\n\n"),
            ('\n\nbla\n\n', "\n\nbla\n\n"),
            ("\n\nabc\ndef\n\n", "\n\nabc\ndef\n\n"),
            ("\n\n", ""),
            ("\n\nabc\n\n", "\n\nabc\n\n"),
            ("\n\nbla\n\n", "\n\nbla\n\n"),
            (
                "\n\njust testing\n\n",
                "\n\njust testing\n\n",
            ),
            (
                "\n\nHallo\nWelt\n\nHallo\nWelt\n\n",
                "\n\nHallo\nWelt\nHallo\nWelt\n\n",
            ),
            (
                "\n\nZeile 1\nZeile 2\nZeile 3\n\n",
                "\n\nZeile 1\nZeile 2\nZeile 3\n\n",
            ),
            (
                "\n\nA, B" " und C\n\n",
                "\n\nA, B" " und C\n\n",
            ),
            ("\n\n", "\n\nZeile 1\n\n"),
            # Suboptimal, should be cleaned further
            ("", "\n\nZeile 2\n\n"),
            ("1\n\n2", "1 2"),
            ("1\n\n2", "12"),
            ("\n\nsonderbar\n\n", "\n\nsonderbar\n\n"),
            # Empty a tags are allowed...
            (" ", " "),
            # ...but breaks without any additional content are still removed
            ("\n", " "),
        ]

        self.run_tests(entries)

    def test_02_a_tag(self):
        entries = (
            ('foo', None),
            (
                'foo',
                'foo',
            ),
            ('foo', None),
            ('foo', None),
            ('foo', 'foo'),
            ('foo', 'foo'),
            ('foo', None),
            ('foo', None),
        )

        self.run_tests(entries)

    def test_03_merge(self):
        entries = (
            ("\n\nfoo\n\nbar\n\n", "\n\nfoo bar\n\n"),
            ("\n\nfoo\n\nbar\n\n", "\n\nfoo bar\n\n"),
        )

        self.run_tests(entries)

    def test_04_p_in_li(self):
        entries = (
            ("\n\nfoo\n\n", "\n\nfoo\n\n"),
            ("\n\nfoo\n\n", "\n\nfoo\n\n"),
            (
                "\n\nfoo\n\nbarxxrab\n\nbaz" "a\n\n",
                "\n\nfoo bar xxrabbaza b" " c\n\n",
            ),
        )

        self.run_tests(entries)

    def test_05_p_in_p(self):
        entries = (
            ("\n\nfoo\n\n", "\n\nfoo\n\n"),
            ("\n\n", " "),
            # This is actually correct as the second paragraph tag implicitly
            # closes the first paragraph, and the trailing closing tag is
            # deleted because it has no matching opening tag.
            # NOTE(review): the opening of the following entry was missing in
            # the source (swallowed by the comment above); it is restored here
            # as the minimal syntactically valid reading -- confirm.
            ("\n\nfoo\n\nbar\n\nbaz\n\n", "\n\nfoo\n\nbar\n\nbaz"),
            ("\n\nbla\n\nblub\n\nblaaa\n\n", "\n\nbla\n\nblub\n\nblaaa"),
            (
                "\n\ntext1\n\ntext2\n\ntail2\n\ntext3\n\ntail3\n\ntail1",
                "\n\ntext1\n\ntext2\n\ntail2\n\ntext3\n\ntail3 tail1",
            ),
        )

        self.run_tests(entries)

    def test_06_allowlist(self):
        entries = (
            ('', ""),
            ('', ""),
        )

        self.run_tests(entries)

    def test_07_configuration(self):
        sanitizer = Sanitizer(
            {"tags": ["h1", "h2"], "empty": (), "separate": (), "attributes": {}}
        )

        entries = (
            ("\n\nfoo\n\n", None),
            ("\n\nfoo\n\nbar\n\nbaz\n\n", "\n\nfoo\n\nbar\n\nbaz"),
        )

        self.run_tests(entries, sanitizer=sanitizer)

    def test_08_li_with_marker(self):
        entries = (
            ("\n\n- foo\n\n", "\n\nfoo\n\n"),
            ("\n\n* foo\n\n", "\n\nfoo\n\n"),
        )

        self.run_tests(entries)

    def test_09_empty_p_text_in_li(self):
        # this results in an empty p.text
        entries = (
            ("\n\nfoo\n\n", "\n\nfoo\n\n"),
            ("\n\nfoo\n\n", "\n\nfoo\n\n"),
        )

        self.run_tests(entries)

    def test_10_broken_html(self):
        entries = (
            ("\n\nbla", "\n\nbla\n\n"),
            # NOTE(review): this entry is a single string, not a
            # (before, after) pair -- the separator appears to have been lost
            # together with the markup. As written, unpacking it in
            # run_tests() will fail; restore the original pair.
            ("\nbla<>/dsiad\nbla<>/dsiad\n"),
        )

        self.run_tests(entries)

    def test_11_nofollow(self):
        sanitizer = Sanitizer({"add_nofollow": True})

        entries = (
            (
                '\nexample.com\n',
                '\nexample.com\n',
            ),
        )

        self.run_tests(entries, sanitizer=sanitizer)

    def test_12_replacements(self):
        entries = (
            ("Bla", "Bla"),
            ("Bla", "Bla"),
        )

        self.run_tests(entries)

    def test_13_autolink(self):
        # Default configuration.
        self.run_tests([("\nhttps://github.com/\n", "\nhttps://github.com/\n")])

        sanitizer = Sanitizer({"autolink": True})

        self.run_tests(
            [
                (
                    "\nhttps://github.com/\n",
                    '\nhttps://github.com/\n',
                ),
                (
                    # localhost is not autolinked by default by lxml
                    "\nhttps://localhost/\n",
                    "\nhttps://localhost/\n",
                ),
            ],
            sanitizer=sanitizer,
        )

        sanitizer = Sanitizer({"autolink": True, "add_nofollow": True})

        self.run_tests(
            [
                (
                    "\nhttps://github.com/\n",
                    '\nhttps://github.com/\n',
                )
            ],
            sanitizer=sanitizer,
        )

        sanitizer = Sanitizer({"autolink": {"avoid_hosts": []}})

        self.run_tests(
            [
                (
                    "\nhttps://github.com/\n",
                    '\nhttps://github.com/\n',
                ),
                (
                    "\nhttps://localhost/\n",
                    '\nhttps://localhost/\n',
                ),
            ],
            sanitizer=sanitizer,
        )

    def test_14_classes(self):
        """Class attributes should not be treated specially"""
        sanitizer = Sanitizer(
            {
                "tags": {"h1", "h2", "p", "a", "span"},
                "attributes": {
                    "a": ("href", "name", "target", "title", "id", "rel"),
                    "h1": ("class",),
                    "p": ("class",),
                    "span": ("class",),
                },
                "empty": set(),
                "separate": {"a", "p"},
            }
        )

        self.run_tests(
            [
                ('\nTest\n', '\nTest\n'),
                (
                    '\nTest span\n',
                    '\nTest span\n',
                ),
                (
                    '\nTest span' 'span\n',
                    '\nTest span span\n',
                ),
                ('\nTest\n', '\nTest\n'),
                ('\nTest\n', "\nTest\n"),
            ],
            sanitizer=sanitizer,
        )

    def test_15_classes(self):
        """Class attributes may disable merging"""
        sanitizer = Sanitizer(
            {
                "tags": {"h1", "h2", "p", "a", "span"},
                "attributes": {
                    "a": ("href", "name", "target", "title", "id", "rel"),
                    "h1": ("class",),
                    "p": ("class",),
                    "span": ("class",),
                },
                "empty": set(),
                "separate": {"a", "p"},
                "is_mergeable": lambda e1, e2: e1.get("class") == e2.get("class"),
            }
        )

        self.run_tests(
            [
                (
                    '\nTest span' 'span\n',
                    '\nTest span' 'span\n',
                ),
                (
                    '\nTest span' 'span\n',
                    '\nTest span span\n',
                ),
            ],
            sanitizer=sanitizer,
        )

    def test_16_emoji(self):
        self.run_tests([("\n😂\n", "\n😂\n"), ("\n💕\n", "\n💕\n")])

    def test_target_blank(self):
        self.run_tests(
            [
                (
                    'test',
                    'test',
                )
            ]
        )

    def test_remove_everything(self):
        sanitizer = Sanitizer(
            {"tags": {"__never"}, "attributes": {}, "empty": set(), "separate": set()}
        )

        self.run_tests(
            [
                (
                    '11:44:14',
                    "11:44:14",
                )
            ],
            sanitizer=sanitizer,
        )

    def test_more_merging(self):
        self.run_tests(
            [
                ("\n", "\n"),
                ("\n", "\n"),
                (
                    '',
                    '',
                ),
            ]
        )

    def test_keep_consecutive_br_tags(self):
        sanitizer = Sanitizer({"whitespace": set(), "separate": {"br"}})

        self.run_tests(
            [
                ("\nHello\n\nWorld\n", "\nHello\n\nWorld\n"),
                ("\nHello\n\n", "\nHello\n\n"),
                ("\n\nWorld\n", "\n\nWorld\n"),
                ("\n\n", "\n\n"),
                ("\n\n", "\n\n"),
            ],
            sanitizer=sanitizer,
        )

    def test_custom_allowed_attribute(self):
        sanitizer = Sanitizer({"attributes": {"a": ("href", "custom")}})

        self.run_tests(
            [
                (
                    'Test',
                    'Test',
                )
            ],
            sanitizer=sanitizer,
        )

    def test_blob(self):
        source = """\
1.2. Definition des Spesenbegriffs

Als Spesen im Sinne dieses Reglements gelten die Auslagen, die einem Mitarbeitenden im Interesse des Arbeitgebers angefallen sind. Sämtliche Mitarbeitende sind verpflichtet, ihre Spesen im Rahmen dieses Reglements möglichst tief zu halten. Aufwendungen, die für die Arbeitsausführung nicht notwendig waren, werden von der Firma nicht übernommen, sondern sind von den Mitarbeitenden selbst zu tragen.

Im Wesentlichen werden den Mitarbeitenden folgende geschäftlich bedingten Auslagen ersetzt:

- Fahrtkosten (nachfolgend 2.)

- Verpflegungskosten (nachfolgend 3.)

- Übernachtungskosten (nachfolgend 4.)

- Übrige Kosten (nachfolgend 5.)

"""  # noqa

        result = """\
1.2. Definition des Spesenbegriffs

Als Spesen im Sinne dieses Reglements gelten die Auslagen, die einem Mitarbeitenden im Interesse des Arbeitgebers angefallen sind. Sämtliche Mitarbeitende sind verpflichtet, ihre Spesen im Rahmen dieses Reglements möglichst tief zu halten. Aufwendungen, die für die Arbeitsausführung nicht notwendig waren, werden von der Firma nicht übernommen, sondern sind von den Mitarbeitenden selbst zu tragen.

Im Wesentlichen werden den Mitarbeitenden folgende geschäftlich bedingten Auslagen ersetzt:

- Fahrtkosten (nachfolgend 2.)
- Verpflegungskosten (nachfolgend 3.)
- Übernachtungskosten (nachfolgend 4.)
- Übrige Kosten (nachfolgend 5.)
"""  # noqa

        # XXX An exact match isn't really required. Using Django's
        # assertHTMLEqual would be great but we'd have to depend on Django in
        # the test suite for this (not a big problem really, because then we
        # could also test html_sanitizer.django but I didn't yet *have* to do
        # this)
        self.run_tests([(source, result)])

    def test_keep_typographic_whitespace(self):
        sanitizer = Sanitizer({"keep_typographic_whitespace": True})

        # Note some unicode normalization of typographic whitespace
        self.run_tests(
            [
                (
                    "\u200a\u2001\u202f\u2004\xa0\u2007\u2002\u2000"
                    "\u2003\u2009\u205f\u2005\u2006\u2008\u3000",
                    "\u200a\u2003\u202f\u2004\xa0\u2007\u2002\u2002"
                    "\u2003\u2009\u205f\u2005\u2006\u2008\u3000",
                )
            ],
            sanitizer=sanitizer,
        )

    def test_strip_typographic_whitespace(self):
        sanitizer = Sanitizer({"keep_typographic_whitespace": False})

        self.run_tests(
            [
                (
                    "\u200a\u2001\u202f\u2004\xa0\u2007\u2002\u2000"
                    "\u2003\u2009\u205f\u2005\u2006\u2008\u3000",
                    " ",
                )
            ],
            sanitizer=sanitizer,
        )

    def test_anchor_names(self):
        self.run_tests(
            [
                ('', '',),
                ('', '',),
                ('', '',),
            ],
        )

    def test_style_tag(self):
        # don't allow style tag (default)
        self.run_tests(
            [("foobar", "foobar")],
            sanitizer=Sanitizer(
                {
                    "tags": {"impossible tag"},
                    "attributes": {},
                    "empty": set(),
                    "separate": set(),
                }
            ),
        )

        # allow style tag but no style attribute
        self.run_tests(
            [
                (
                    "foobar",
                    "foobar",
                ),
                ('\nbla\n', "\nbla\n"),
            ],
            sanitizer=Sanitizer(
                {
                    "tags": {"h2", "style"},
                    "attributes": {},
                    "empty": set(),
                    "separate": set(),
                }
            ),
        )

        # allow style tag and style attribute
        self.run_tests(
            [
                (
                    "foobar",
                    "foobar",
                ),
                (
                    '\nbla\n',
                    '\nbla\n',
                ),
            ],
            sanitizer=Sanitizer(
                {
                    "tags": {"h2", "style"},
                    "attributes": {"h2": {"style"}},
                    "empty": set(),
                    "separate": set(),
                }
            ),
        )