File: potential_dialects.py

package info (click to toggle)
python-clevercsv 0.7.5%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 872 kB
  • sloc: python: 5,076; ansic: 763; makefile: 81
file content (282 lines) | stat: -rw-r--r-- 7,945 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# -*- coding: utf-8 -*-

"""
Code for selecting the potential dialects of a file.

Author: Gertjan van den Burg

"""

import codecs
import itertools
import unicodedata

import regex

from .detect_type import PATTERNS
from .dialect import SimpleDialect
from .escape import is_potential_escapechar
from .utils import pairwise


def get_dialects(
    data, encoding="UTF-8", delimiters=None, test_masked_by_quotes=False
):
    """Return the possible dialects for the given data.

    We consider as escape characters those characters for which
    is_potential_escapechar() is True and that occur at least once before a
    quote character or delimiter in the dialect.

    One may wonder if self-escaping is an issue here (i.e. "\\\\", two times
    backslash). It is not. In a file where a single backslash is desired and
    escaping with a backslash is used, then it only makes sense to do this in a
    file where the backslash is already used as an escape character (in which
    case we include it). If it is never used as escape for the delimiter or
    quotechar, then it is not necessary to self-escape. This is an assumption,
    but it holds in general and it reduces noise.

    Parameters
    ----------
    data: str
        The data for the file

    encoding: str
        The encoding of the file

    delimiters: iterable
        Set of delimiters to consider. See :func:`get_delimiters` for more
        info.

    test_masked_by_quotes : bool
        Remove dialects where the delimiter is always masked by the quote
        character. Enabling this typically removes a number of potential
        dialects from the list, which can remove false positives. It however
        not a very fast operation, so it is disabled by default.

    Returns
    -------
    dialects: list
        List of SimpleDialect objects that are considered potential dialects.

    """
    # URLs are removed to reduce noise
    no_url = filter_urls(data)
    delims = get_delimiters(no_url, encoding, delimiters=delimiters)
    quotechars = get_quotechars(no_url)
    escapechars = {}

    for delim, quotechar in itertools.product(delims, quotechars):
        escapechars[(delim, quotechar)] = set([""])

    # escapechars are those that precede a delimiter or quotechar
    for u, v in pairwise(data):
        if not is_potential_escapechar(u, encoding):
            continue
        for delim, quotechar in itertools.product(delims, quotechars):
            if v == delim or v == quotechar:
                escapechars[(delim, quotechar)].add(u)

    # remove dialects where the delimiter is always masked by quotes.
    dialects = []
    for delim in delims:
        for quotechar in quotechars:
            for escapechar in escapechars[(delim, quotechar)]:
                if test_masked_by_quotes and masked_by_quotechar(
                    data, quotechar, escapechar, delim
                ):
                    continue
                d = SimpleDialect(delim, quotechar, escapechar)
                dialects.append(d)

    return dialects


def unicode_category(x, encoding=None):
    """Return the Unicode category of a character

    Parameters
    ----------
    x : str
        character

    encoding: str
        Encoding of the character

    Returns
    -------
    category: str
        The Unicode category of the character.

    """
    as_unicode = codecs.decode(bytes(x, encoding), encoding=encoding)
    return unicodedata.category(as_unicode)


def filter_urls(data):
    """Filter URLs from the data"""
    pat = PATTERNS["url"]
    return regex.sub(pat, "U", data, count=0)


def get_delimiters(
    data, encoding, delimiters=None, block_cat=None, block_char=None
):
    """Get potential delimiters

    The set of potential delimiters is constructed as follows. For each unique
    character of the file, we check if its Unicode character category is in the
    set ``block_cat`` of prohibited categories.  If it is, we don't allow it to
    be a delimiter, with the exception of Tab (which is in the Control
    category).  We furthermore block characters in :attr:`block_char` from
    being delimiters.

    Parameters
    ----------
    data: str
        The data of the file

    encoding: str
        The encoding of the file

    delimiters: iterable
        Allowed delimiters. If provided, it overrides the block_cat/block_char
        mechanism and only the provided characters will be considered
        delimiters (if they occur in the file). If None, all characters can be
        considered delimiters subject to the :attr:`block_cat` and
        :attr:`block_char` parameters.

    block_cat: list
        List of Unicode categories (2-letter abbreviations) for characters that
        should not be considered as delimiters. If None, the following default
        set is used::

        ["Lu", "Ll", "Lt", "Lm", "Lo", "Nd", "Nl", "No", "Ps", "Pe", "Co"]

    block_char: list
        Explicit list of characters that should not be considered delimiters.
        If None, the following default set is used::

        [".", "/", '"', "'", "\\n", "\\r"]


    Returns
    -------
    delims: set
        Set of potential delimiters. The empty string is added by default.

    """
    if block_cat is None:
        block_cat = [
            "Lu",
            "Ll",
            "Lt",
            "Lm",
            "Lo",
            "Nd",
            "Nl",
            "No",
            "Ps",
            "Pe",
            "Co",
        ]
    if block_char is None:
        block_char = [".", "/", '"', "'", "\n", "\r"]

    D = set()
    for x in set(data):
        c = unicode_category(x, encoding=encoding)
        if delimiters is None:
            if x == "\t" or ((x not in block_char) and (c not in block_cat)):
                D.add(x)
        else:
            if x in delimiters:
                D.add(x)
    D.add("")
    return D


def get_quotechars(data, quote_chars=None):
    """Get potential quote characters

    Quote characters are those that occur in the ``quote_chars`` set and are
    found at least once in the file.

    Parameters
    ----------

    data: str
        The data of the file as a string

    quote_chars: iterable
        Characters that should be considered quote characters. If it is None,
        the following default set is used::

        ["'", '"', "~", "`"]


    Returns
    -------
    quotes: set
        Set of potential quote characters. The empty string is added by
        default.

    """
    if quote_chars is None:
        quote_chars = ["'", '"', "~", "`"]
    Q = set(quote_chars) & set(data)
    Q.add("")
    return Q


def masked_by_quotechar(data, quotechar, escapechar, test_char):
    """Test if a character is always masked by quote characters

    This function tests if a given character is always within quoted segments
    (defined by the quote character). Double quoting and escaping is supported.

    Parameters
    ----------
    data: str
        The data of the file as a string

    quotechar: str
        The quote character

    escapechar: str
        The escape character

    test_char: str
        The character to test

    Returns
    -------
    masked: bool
        Returns True if the test character is never outside quoted segements,
        False otherwise.

    """
    if test_char == "":
        return False
    escape_next = False
    in_quotes = False
    i = 0
    while i < len(data):
        s = data[i]
        if s == quotechar:
            if escape_next:
                i += 1
                continue
            if not in_quotes:
                in_quotes = True
            else:
                if i + 1 < len(data) and data[i + 1] == quotechar:
                    i += 1
                else:
                    in_quotes = False
        elif s == test_char and not in_quotes:
            return False
        elif s == escapechar:
            escape_next = True
        i += 1
    return True