File: test_characters.py

package info (click to toggle)
python-ftfy 6.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 808 kB
  • sloc: python: 1,716; makefile: 148
file content (56 lines) | stat: -rw-r--r-- 1,741 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from ftfy import (
    fix_and_explain,
    fix_encoding,
    fix_text,
)
from ftfy.chardata import possible_encoding
from ftfy.fixes import fix_surrogates, remove_control_chars


def test_possible_encoding():
    """Check that each of the first 256 code points is accepted for "latin-1"."""
    for char in map(chr, range(256)):
        assert possible_encoding(char, "latin-1")


def test_byte_order_mark():
    """A UTF-8 byte order mark that was mis-decoded must be repaired to U+FEFF."""
    # The UTF-8 BOM is the byte sequence EF BB BF. Decoded with a single-byte
    # encoding such as Latin-1 it turns into the three characters U+00EF,
    # U+00BB, U+00BF. The input is written with escapes so that it cannot be
    # silently dropped or altered by editors and other tools that treat
    # BOM-like sequences specially, which would leave an empty string here.
    assert fix_encoding("\xef\xbb\xbf") == "\ufeff"


def test_control_chars():
    """remove_control_chars strips the stray code points and keeps the CRLF."""
    noisy = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized.\r\n"
    )
    expected = "Sometimes, bad ideas like these characters get standardized.\r\n"
    cleaned = remove_control_chars(noisy)
    assert cleaned == expected


def test_welsh_flag():
    """Tag characters in the "Flag of Wales" emoji sequence must be preserved."""
    # ftfy used to remove "tag characters", but they have been repurposed in the
    # "Flag of England", "Flag of Scotland", and "Flag of Wales" emoji sequences.
    #
    # The sequence is written with explicit escapes because the tag characters
    # are invisible and are easily lost when the file passes through an editor
    # or other tool: U+1F3F4 (black flag), then the tags for "g", "b", "w",
    # "l", "s", and finally the cancel tag U+E007F.
    text = (
        "This flag has a dragon on it "
        "\U0001f3f4\U000e0067\U000e0062\U000e0077\U000e006c\U000e0073\U000e007f"
    )
    assert remove_control_chars(text) == text


def test_ohio_flag():
    """The "Flag of Ohio" tag sequence must pass through fix_text unchanged.

    I did not expect to find the "Flag of Ohio" emoji in the wild but there it
    is, even though no emoji database believes it has been implemented.
    """
    post = (
        "#superman #ohio "
        "🏴\U000e0075\U000e0073\U000e006f\U000e0068\U000e007f"
        " #cleveland #usa 🇺🇸"
    )
    assert fix_text(post) == post


def test_surrogates():
    """fix_surrogates turns each surrogate pair into the single expected character."""
    cases = (
        ("\udbff\udfff", "\U0010ffff"),
        ("\ud800\udc00", "\U00010000"),
    )
    for pair, combined in cases:
        assert fix_surrogates(pair) == combined


def test_color_escapes():
    """fix_and_explain yields "foo" and a two-step plan for the escaped input."""
    expected_plan = [
        ("apply", "remove_terminal_escapes"),
        ("apply", "remove_control_chars"),
    ]
    cleaned, steps = fix_and_explain("\001\033[36;44mfoo")
    # Printed so the actual plan is visible in the captured output on failure.
    print(steps)
    assert cleaned == "foo"
    assert steps == expected_plan