1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
import pytest
from ftfy import fix_text, fix_text_segment
from ftfy.fixes import unescape_html
def test_entities():
    """Exercise HTML-entity unescaping in fix_text, fix_text_segment and unescape_html.

    Every input is spelled with literal entity references (``&amp;``,
    ``&lt;``, ``&#x81;`` ...) so that each assertion compares an escaped
    input with the output expected for the given ``unescape_html`` setting.
    """
    example = "&amp;\n<html>\n&amp;"

    # Default setting: fix_text is expected to unescape the line before the
    # "<html>" line but not the one after it, while fix_text_segment is
    # expected to leave the whole string untouched.
    assert fix_text(example) == "&\n<html>\n&amp;"
    assert fix_text_segment(example) == "&amp;\n<html>\n&amp;"

    # Explicit True: entities are unescaped even though "<" is present.
    assert fix_text(example, unescape_html=True) == "&\n<html>\n&"
    assert fix_text_segment(example, unescape_html=True) == "&\n<html>\n&"

    # Explicit False: nothing is unescaped.
    assert fix_text(example, unescape_html=False) == "&amp;\n<html>\n&amp;"
    assert fix_text_segment(example, unescape_html=False) == "&amp;\n<html>\n&amp;"

    # Escaped angle brackets contain no literal "<", so the default setting
    # is expected to behave like unescape_html=True here.
    assert fix_text_segment("&lt;&gt;", unescape_html=False) == "&lt;&gt;"
    assert fix_text_segment("&lt;&gt;", unescape_html=True) == "<>"
    assert fix_text_segment("&lt;&gt;") == "<>"

    # Named entities, in lower-case and capitalised forms.
    assert fix_text_segment("jednocze&sacute;nie") == "jednocześnie"
    assert fix_text_segment("JEDNOCZE&Sacute;NIE") == "JEDNOCZEŚNIE"

    # Numeric references 133 / 0x85 are expected to come out as an ellipsis,
    # which NFKC normalization then expands to three dots; 0x81 is expected
    # to come out as the raw "\x81" character.
    assert fix_text_segment("ellipsis&#133;", normalization="NFKC") == "ellipsis..."
    assert fix_text_segment("ellipsis&#x85;", normalization="NFKC") == "ellipsis..."
    assert fix_text_segment("broken&#x81;") == "broken\x81"

    # Repeated escaping is expected to be undone completely.
    assert fix_text_segment("&amp;amp;amp;") == "&"

    # Direct calls to unescape_html: numeric 0x80 becomes the euro sign, and
    # all-caps entity names are accepted.
    assert unescape_html("euro &#x80;") == "euro €"
    assert unescape_html("EURO &EURO;") == "EURO €"
    # "&#20x6;" is not a well-formed reference and must be left alone.
    assert unescape_html("not an entity &#20x6;") == "not an entity &#20x6;"
    assert unescape_html("JEDNOCZE&SACUTE;NIE") == "JEDNOCZEŚNIE"
    assert unescape_html("V&SCARON;ICHNI") == "VŠICHNI"
    # 0xFFFF is expected to vanish; an out-of-range code point is expected
    # to become U+FFFD.
    assert unescape_html("&#xffff;") == ""
    assert unescape_html("&#xffffffff;") == "\ufffd"

    # "&not" without a terminating semicolon must not be treated as an entity.
    assert (
        fix_text_segment("this is just informal english &not html")
        == "this is just informal english &not html"
    )
def test_old_parameter_name():
    """Check that the old ``fix_entities`` keyword is still honoured but deprecated.

    Each call must emit a deprecation warning (enforced by
    ``pytest.deprecated_call``) and produce the asserted result: entities
    unescaped for ``True``, left untouched for ``False``.
    """
    # Input is written with literal "&amp;" so the two settings give
    # different, distinguishable outputs.
    example = "&amp;\n<html>\n&amp;"
    with pytest.deprecated_call():
        assert fix_text(example, fix_entities=True) == "&\n<html>\n&"
    with pytest.deprecated_call():
        assert fix_text(example, fix_entities=False) == "&amp;\n<html>\n&amp;"
|