File: normal_form.py

package info (click to toggle)
python-clevercsv 0.7.5%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 872 kB
  • sloc: python: 5,076; ansic: 763; makefile: 81
file content (347 lines) | stat: -rw-r--r-- 9,125 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
# -*- coding: utf-8 -*-

"""
Detect the dialect with very strict functional tests.

This module uses so-called "normal forms" to detect the dialect of CSV files.
Normal forms are detected with strict functional tests. The normal forms are 
used as a pre-test to check if files are simple enough that computing the data 
consistency measure is not necessary.

Author: Gertjan van den Burg

"""

import itertools

import regex

from .dialect import SimpleDialect
from .escape import is_potential_escapechar
from .utils import pairwise

DELIMS = [",", ";", "|", "\t"]
QUOTECHARS = ["'", '"']


def detect_dialect_normal(
    data, encoding="UTF-8", delimiters=None, verbose=False
):
    """Try to identify the dialect of a sample by matching normal forms

    The sample is rejected outright when, for any candidate delimiter and
    quote character, it may contain an escape character. Otherwise a fixed
    sequence of (normal form test, dialect) candidates is tried and the
    dialect of the first test that accepts the data is returned.

    Parameters
    ----------
    data : str
        The data as a single string

    encoding : str
        The encoding of the data, passed on to the escape character check

    delimiters : iterable, optional
        The candidate delimiters to consider. When None, the module-level
        DELIMS list is used.

    verbose : bool
        Whether to print which normal form matched (or why none did)

    Returns
    -------
    dialect : SimpleDialect
        The dialect detected using normal forms, or None if the data may
        contain an escape character or no normal form matches.
    """
    candidates = list(DELIMS if delimiters is None else delimiters)

    if any(
        maybe_has_escapechar(data, encoding, delim, quotechar)
        for delim, quotechar in itertools.product(candidates, QUOTECHARS)
    ):
        if verbose:
            print("Not normal, has potential escapechar.")
        return None

    # The order of this list determines which form wins when several match:
    # form 2 for every delimiter first, ...
    tests = [
        (
            2,
            is_form_2,
            SimpleDialect(delimiter=delim, quotechar="", escapechar=""),
        )
        for delim in candidates
    ]

    # ... then forms 1, 3 and 5 for every delimiter/quotechar combination,
    # all three sharing the same dialect object, ...
    for delim, quotechar in itertools.product(candidates, QUOTECHARS):
        quoted = SimpleDialect(
            delimiter=delim, quotechar=quotechar, escapechar=""
        )
        tests.extend(
            [(1, is_form_1, quoted), (3, is_form_3, quoted), (5, is_form_5, quoted)]
        )

    # ... and finally the single-column form 4, quoted variants before the
    # unquoted one.
    tests.extend(
        (
            4,
            is_form_4,
            SimpleDialect(delimiter="", quotechar=quotechar, escapechar=""),
        )
        for quotechar in itertools.chain(QUOTECHARS, [""])
    )

    for form_id, form_test, dialect in tests:
        if form_test(data, dialect):
            if verbose:
                print("Matched normal form %i." % form_id)
            return dialect

    if verbose:
        print("Didn't match any normal forms.")
    return None


def is_quoted_cell(cell, quotechar):
    """Check whether a cell both starts and ends with the given quotechar

    A cell shorter than two characters is never considered quoted, so a
    lone quote character does not count.
    """
    return (
        len(cell) >= 2
        and cell[0] == quotechar
        and cell[-1] == quotechar
    )


def is_any_quoted_cell(cell):
    """Check whether a cell is quoted with single or with double quotes"""
    return any(is_quoted_cell(cell, quote) for quote in ("'", '"'))


def is_any_partial_quoted_cell(cell):
    """Check whether a cell starts or ends with a single or double quote

    Only the first and the last character are inspected; an empty cell is
    never partially quoted.
    """
    quotes = ('"', "'")
    return len(cell) > 0 and (cell[0] in quotes or cell[-1] in quotes)


def is_empty_quoted(cell, quotechar):
    """Check whether a cell is exactly two characters long and quoted

    In other words: the cell holds nothing but an opening and a closing
    quotechar.
    """
    if len(cell) != 2:
        return False
    return is_quoted_cell(cell, quotechar)


def is_empty_unquoted(cell):
    """Check whether a cell is the empty string"""
    empty = ""
    return cell == empty


def is_any_empty(cell):
    """Check whether a cell is empty, either unquoted or quoted

    Both single and double quotes are accepted for the quoted case.
    """
    if is_empty_unquoted(cell):
        return True
    return any(is_empty_quoted(cell, quote) for quote in ("'", '"'))


def has_delimiter(string, delim):
    """Check whether delim occurs in string

    This is a plain containment test, so for string arguments an empty
    delim is found in every string.
    """
    found = delim in string
    return found


def has_nested_quotes(string, quotechar):
    """Check whether quotechar occurs strictly inside string

    The first and the last character of the string are skipped, so the
    outer quotes of a quoted cell are not counted.
    """
    interior = string[1:-1]
    return quotechar in interior


def maybe_has_escapechar(data, encoding, delim, quotechar):
    """Check whether the data may use an escape character

    Returns True as soon as a delimiter or quote character is found that is
    directly preceded by a character that is_potential_escapechar accepts
    for the given encoding. When neither the delimiter nor the quote
    character occurs in the data at all, the answer is False without
    scanning.
    """
    if delim not in data and quotechar not in data:
        return False

    targets = [delim, quotechar]
    for previous, current in pairwise(data):
        if current not in targets:
            continue
        if is_potential_escapechar(previous, encoding):
            return True
    return False


def strip_trailing_crnl(data):
    """Remove all trailing carriage return and newline characters

    Parameters
    ----------
    data : str
        The data as a single string

    Returns
    -------
    data : str
        The data without any "\\r" or "\\n" characters at its end. Line
        terminators elsewhere in the data are left untouched.
    """
    # Strip both characters in one pass. Stripping "\n" first and "\r"
    # afterwards leaves a terminator behind for data that ends in repeated
    # CRLF pairs ("a\r\n\r\n" would become "a\r\n").
    return data.rstrip("\r\n")


def every_row_has_delim(rows, dialect):
    """Check whether the delimiter of the dialect occurs in every row"""
    return all(has_delimiter(row, dialect.delimiter) for row in rows)


def is_elementary(cell):
    """Check whether a cell consists solely of "elementary" characters

    Parameters
    ----------
    cell : str
        The cell to check

    Returns
    -------
    bool
        True if the whole cell is a non-empty sequence of ASCII letters,
        digits, spaces, or any of the characters ``. _ & - @ + % ( ) /``;
        False otherwise (including for the empty string).
    """
    # The pattern must be a raw string: sequences such as "\." or "\_" are
    # invalid escapes in a regular string literal and trigger a warning
    # (and eventually an error) at compile time. The regex itself is
    # unchanged.
    pattern = r"[a-zA-Z0-9\.\_\&\-\@\+\%\(\)\ \/]+"
    return regex.fullmatch(pattern, cell) is not None


def even_rows(rows, dialect):
    """Check whether all rows split into the same number of cells

    Rows are split with split_row using the given dialect. The result is
    False when there are no rows at all.
    """
    widths = {len(split_row(row, dialect)) for row in rows}
    return len(widths) == 1


def split_file(data):
    """Split the data into rows

    Trailing line terminators are removed with strip_trailing_crnl first.
    The remaining data is split on the first of "\\r\\n", "\\n" and "\\r"
    (tried in that order) that occurs in it. Data without any of these is
    returned as a single row.
    """
    stripped = strip_trailing_crnl(data)
    for newline in ("\r\n", "\n", "\r"):
        if newline in stripped:
            return stripped.split(newline)
    return [stripped]


def split_row(row, dialect):
    """Split a single row into cells, respecting quoted sections

    A delimiter that occurs between an opening and a closing quote
    character does not split the row. Quote characters are kept as part of
    the returned cells.

    Parameters
    ----------
    row : str
        A single row of the file, without line terminator

    dialect : SimpleDialect
        The dialect to split with. Only its ``delimiter`` and ``quotechar``
        attributes are read; an empty string means "not used".

    Returns
    -------
    cells : list of str
        The cells of the row. A trailing delimiter yields a trailing empty
        cell, regardless of whether the row contains quote characters.
    """
    # Fast path: without a quote character in the row there is nothing to
    # protect, so a plain split suffices.
    if dialect.quotechar == "" or dialect.quotechar not in row:
        if dialect.delimiter == "":
            return [row]
        return row.split(dialect.delimiter)

    cells = []
    current = []
    in_quotes = False
    for c in row:
        if c == dialect.delimiter and not in_quotes:
            cells.append("".join(current))
            current = []
            continue
        if c == dialect.quotechar:
            in_quotes = not in_quotes
        current.append(c)

    # Always emit the last cell, even when it is empty, so that a row ending
    # in a delimiter gets the same trailing empty cell that str.split
    # produces in the fast path above. Dropping it would make such rows
    # look one cell shorter than they are.
    cells.append("".join(current))
    return cells


def is_form_1(data, dialect=None):
    """Test for normal form 1: every cell quoted, more than one column

    The data matches when every row contains the delimiter, all rows have
    the same number of cells (and more than one), and every cell is quoted
    with the quotechar of the dialect without that quotechar occurring
    inside the cell. Quoted empty cells are allowed, unquoted empty cells
    are not.

    Parameters
    ----------
    data : str
        The data as a single string

    dialect : SimpleDialect
        The dialect to test. Its delimiter and quotechar are used, so a
        dialect has to be supplied despite the default of None.

    Returns
    -------
    bool
        Whether the data is in normal form 1 for this dialect
    """
    lines = split_file(data)

    if not (
        every_row_has_delim(lines, dialect) and even_rows(lines, dialect)
    ):
        return False

    for line in lines:
        fields = split_row(line, dialect)
        if len(fields) == 1:
            return False
        for field in fields:
            violates = (
                # an unquoted empty cell
                is_empty_unquoted(field)
                # a cell that is not quoted with the dialect's quotechar
                or not is_quoted_cell(field, dialect.quotechar)
                # a quotechar inside the quotes
                or has_nested_quotes(field, dialect.quotechar)
            )
            if violates:
                return False

    return True


def is_form_2(data, dialect):
    """Test for normal form 2: no quoting, only elementary or empty cells

    The data matches when every row contains the delimiter, all rows have
    the same number of cells (and more than one), and every cell neither
    starts nor ends with a single or double quote and is either empty or
    elementary (see is_elementary).

    Parameters
    ----------
    data : str
        The data as a single string

    dialect : SimpleDialect
        The dialect to test

    Returns
    -------
    bool
        Whether the data is in normal form 2 for this dialect
    """
    lines = split_file(data)

    if not (
        every_row_has_delim(lines, dialect) and even_rows(lines, dialect)
    ):
        return False

    for line in lines:
        fields = split_row(line, dialect)
        if len(fields) == 1:
            return False
        for field in fields:
            # No fully or partially quoted cells
            if is_any_quoted_cell(field) or is_any_partial_quoted_cell(field):
                return False
            # Anything that is not empty has to be elementary
            if not (is_empty_unquoted(field) or is_elementary(field)):
                return False
    return True


def is_form_3(data, dialect):
    """Test for normal form 3: quoted and unquoted cells mixed, none empty

    The data matches when there is more than one row, every row contains
    the delimiter, all rows have the same number of cells (and more than
    one), and every cell is non-empty and either quoted with the quotechar
    of the dialect or unquoted and elementary (see is_elementary). Cells
    quoted with the other quote character, and quoted empty cells, are
    rejected.

    NOTE(review): the contents of quoted cells are not inspected here, so
    nested quotes are not explicitly ruled out by this test.

    Parameters
    ----------
    data : str
        The data as a single string

    dialect : SimpleDialect
        The dialect to test

    Returns
    -------
    bool
        Whether the data is in normal form 3 for this dialect
    """
    lines = split_file(data)

    if not (
        every_row_has_delim(lines, dialect) and even_rows(lines, dialect)
    ):
        return False
    if len(lines) <= 1:
        return False

    for line in lines:
        fields = split_row(line, dialect)
        if len(fields) == 1:
            return False
        for field in fields:
            if is_any_empty(field):
                return False

            if is_any_quoted_cell(field):
                # a quoted cell has to use the quotechar of the dialect
                acceptable = is_quoted_cell(field, dialect.quotechar)
            else:
                # an unquoted cell has to be elementary
                acceptable = is_elementary(field)
            if not acceptable:
                return False

    return True


def is_form_4(data, dialect):
    """Test for normal form 4: a single column without delimiter

    The data needs more than one row. When the dialect has no quotechar,
    no row may be quoted (with single or double quotes) and rows may only
    contain ASCII letters, digits and the characters ``. _ & -``. When the
    dialect has a quotechar, every row has to be quoted with it and the
    text between the quotes may additionally contain spaces.

    Parameters
    ----------
    data : str
        The data as a single string

    dialect : SimpleDialect
        The dialect to test; only its quotechar is used

    Returns
    -------
    bool
        Whether the data is in normal form 4 for this dialect
    """
    lines = split_file(data)
    if len(lines) <= 1:
        return False

    # Both searches look for the first character that is NOT allowed
    find_bad_unquoted = regex.compile(r"[^A-Za-z0-9.\_&\-]").search
    find_bad_quoted = regex.compile(r"[^A-Za-z0-9.\_&\-\ ]").search

    for line in lines:
        if dialect.quotechar == "":
            if is_any_quoted_cell(line) or find_bad_unquoted(line):
                return False
            continue
        if not is_quoted_cell(line, dialect.quotechar):
            return False
        if find_bad_quoted(line[1:-1]):
            return False

    return True


def is_form_5(data, dialect):
    """Test for normal form 5: form 2 with quotes around each whole row

    The data needs more than one row, every row has to contain the
    delimiter, and every row has to be longer than two characters and start
    and end with the quotechar of the dialect. The rows are then stripped
    of their first and last character, joined with newlines, and handed to
    is_form_2 with the same dialect.

    Parameters
    ----------
    data : str
        The data as a single string

    dialect : SimpleDialect
        The dialect to test

    Returns
    -------
    bool
        Whether the data is in normal form 5 for this dialect
    """
    lines = split_file(data)

    if not every_row_has_delim(lines, dialect):
        return False
    if len(lines) <= 1:
        return False

    for line in lines:
        wrapped = (
            len(line) > 2
            and line[0] == dialect.quotechar
            and line[-1] == dialect.quotechar
        )
        if not wrapped:
            return False

    unwrapped = "\n".join(line[1:-1] for line in lines)
    return is_form_2(unwrapped, dialect)