File: escape.py

package info (click to toggle)
python-clevercsv 0.8.4%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,080 kB
  • sloc: python: 6,211; ansic: 870; makefile: 90
file content (84 lines) | stat: -rw-r--r-- 1,955 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-

"""
Common functions for dealing with escape characters.

Author: Gertjan van den Burg
Date: 2018-11-06
"""

import codecs
import sys
import unicodedata

from typing import Iterable
from typing import Optional
from typing import Set

#: Default characters that are *never* considered as an escape character
DEFAULT_BLOCK_CHARS: Set[str] = {
    "!", "?", '"', "'",
    ".", ",", ";", ":",
    "%", "*", "&", "#",
}

#: Every character whose Unicode general category is "Po"
UNICODE_PO_CHARS: Set[str] = {
    chr(codepoint)
    for codepoint in range(sys.maxunicode + 1)
    if unicodedata.category(chr(codepoint)) == "Po"
}


def is_potential_escapechar(
    char: str, encoding: str, block_char: Optional[Iterable[str]] = None
) -> bool:
    """Decide whether a character may serve as an escape character.

    The character qualifies when it is a member of
    :py:data:`UNICODE_PO_CHARS` (the Unicode "Punctuation, Other" category)
    and is not one of the blocked characters.

    Parameters
    ----------
    char : str
        The character to check

    encoding : str
        The encoding of the character. For any encoding other than "utf-8"
        or "ascii" (compared case-insensitively) the character is encoded to
        bytes with this encoding and decoded back before it is checked.

    block_char : Optional[Iterable[str]]
        Characters in the Punctuation Other category that should nevertheless
        not be considered as escape character. If None, the default set
        defined in :py:data:`DEFAULT_BLOCK_CHARS` is used.

    Returns
    -------
    is_escape : bool
        True if the character is considered a potential escape character,
        False otherwise.

    """
    if encoding.lower() in ("utf-8", "ascii"):
        candidate = char
    else:
        # Round-trip the character through the requested encoding.
        raw = bytes(char, encoding)
        candidate = codecs.decode(raw, encoding=encoding)

    # The blocked set is built before the membership test so that a
    # caller-supplied iterable is always consumed, whatever the outcome.
    if block_char is None:
        blocked = DEFAULT_BLOCK_CHARS
    else:
        blocked = set(block_char)

    return candidate in UNICODE_PO_CHARS and candidate not in blocked