File: break_ties.py

package info (click to toggle)
python-clevercsv 0.7.5%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 872 kB
  • sloc: python: 5,076; ansic: 763; makefile: 81
file content (390 lines) | stat: -rw-r--r-- 12,291 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# -*- coding: utf-8 -*-

"""
Break ties in the data consistency measure.

Author: Gertjan van den Burg

"""

from .cparser_util import parse_string
from .utils import pairwise


def tie_breaker(data, dialects):
    """
    Select a single dialect from a group of tied dialects, if possible.

    The work is delegated based on the size of the group: two dialects go to
    :func:`break_ties_two`, three to :func:`break_ties_three`, and four to
    :func:`break_ties_four`. Groups of any other size are not handled.

    Parameters
    ----------
    data: str
        The data as a single string
    dialects: list
        Dialects that are tied, i.e. that received the same value for the
        data consistency measure

    Returns
    -------
    dialect: SimpleDialect
        One of the dialects from the list provided, or None when the group
        size is not supported or the delegate could not break the tie.

    """
    n_tied = len(dialects)
    if n_tied == 2:
        return break_ties_two(data, *dialects)
    if n_tied == 3:
        return break_ties_three(data, *dialects)
    if n_tied == 4:
        # break_ties_four takes the list itself, not unpacked dialects
        return break_ties_four(data, dialects)
    return None


def reduce_pairwise(data, dialects):
    """Reduce the set of dialects by breaking pairwise ties

    The dialects are paired up with ``pairwise``. Every pair whose parsing
    results are identical is handed to :func:`break_ties_two`, and the pair
    is replaced by the winner (or dropped entirely when that tie can't be
    broken). Dialects that were not part of any such pair are kept as they
    are.

    Parameters
    ----------

    data: str
        The data of the file as a string

    dialects: list
        List of SimpleDialect objects

    Returns
    -------
    dialects: list or None
        List of SimpleDialect objects without duplicates: first the winners
        of the pairwise ties (in pair order), then the untouched dialects (in
        input order). None is returned when the dialects do not all share a
        single delimiter, which includes an empty input list.

    """
    if len({d.delimiter for d in dialects}) != 1:
        return None

    # Parse the data once per dialect. A dialect can occur in more than one
    # pair, and re-parsing the whole file for each pair is wasted work.
    parsed = [(d, list(parse_string(data, d))) for d in dialects]

    # First, identify dialects that result in the same parsing result.
    equal_dialects = [
        (a, b)
        for (a, rows_a), (b, rows_b) in pairwise(parsed)
        if rows_a == rows_b
    ]

    # Try to break the ties in these pairs. A dict is used as an ordered set
    # so that the order of the result does not depend on hash values, which
    # can change from one interpreter run to the next.
    reduced = {}
    visited = set()
    for a, b in equal_dialects:
        winner = break_ties_two(data, a, b)
        if winner is not None:
            reduced[winner] = None
        visited.update((a, b))

    # and add the dialects that we didn't visit
    for d in dialects:
        if d not in visited:
            reduced[d] = None

    return list(reduced)


def break_ties_two(data, A, B):
    """Pick one of two dialects that received the same score.

    The cases below are tried in order and only the first one whose
    precondition holds is considered:

    1. Delimiter and escapechar are the same and one of the quote characters
    is the empty string. The data is parsed with both dialects; if the
    results are identical the quotechar has no effect and the dialect without
    quotechar is chosen, otherwise the dialect with quotechar is chosen.
    2. Quotechar and escapechar are the same. If the delimiters are comma and
    space, the comma dialect is chosen. Otherwise, if either delimiter is the
    hyphen, the other dialect is chosen.
    3. Delimiter and quotechar are the same, so the dialects differ in
    escapechar. The data is parsed with both dialects and the tie is broken
    by counting how often the escapechar directly precedes the quotechar in a
    cell that differs between the two results.
    4. Only the delimiter is the same and one of the dialects has neither a
    quotechar nor an escapechar. If the parsing results have the same shape
    and every differing cell contains the escapechar+quotechar combination,
    the dialect that uses them is chosen.

    If none of these cases yields an answer, the tie is not broken and None
    is returned.

    Parameters
    ----------

    data: str
        The data of the file as a string.

    A: SimpleDialect
        A potential dialect

    B: SimpleDialect
        A potential dialect

    Returns
    -------

    dialect: SimpleDialect or None
        The chosen dialect if the tie can be broken, None otherwise.

    """

    def differ_at_most_in(field):
        # True when A and B agree on every dialect field other than
        # ``field``; they may or may not agree on ``field`` itself.
        return all(
            getattr(A, name) == getattr(B, name)
            for name in ("delimiter", "quotechar", "escapechar")
            if name != field
        )

    def same_shape(rows_a, rows_b):
        # Same number of rows and, row by row, the same number of cells.
        if len(rows_a) != len(rows_b):
            return False
        return all(len(ra) == len(rb) for ra, rb in zip(rows_a, rows_b))

    if differ_at_most_in("quotechar"):
        if A.quotechar != "" and B.quotechar != "":
            return None

        d_no = A if A.quotechar == "" else B
        d_yes = B if d_no == A else A

        rows_no = list(parse_string(data, dialect=d_no))
        rows_yes = list(parse_string(data, dialect=d_yes))

        # Identical results mean the quotechar has no effect on the data.
        return d_no if rows_no == rows_yes else d_yes

    if differ_at_most_in("delimiter"):
        if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]):
            # Artifact due to type detection (comma as radix point)
            return A if A.delimiter == "," else B
        # Artifact due to type detection (dash as minus sign): prefer the
        # dialect that does not use the dash.
        if A.delimiter == "-":
            return B
        if B.delimiter == "-":
            return A
        return None

    if differ_at_most_in("escapechar"):
        Dnone, Descape = (A, B) if A.escapechar == "" else (B, A)

        rows_none = list(parse_string(data, Dnone))
        rows_escape = list(parse_string(data, Descape))

        # Double check shape. Usually if the shape differs the pattern score
        # should have caught it, but if by a freakish occurrence it hasn't
        # then we can't break this tie (for now).
        if not same_shape(rows_none, rows_escape):
            return None

        # Cells, as parsed without the escapechar, whose content changes
        # when the escapechar is taken into account.
        differing = [
            u
            for row_none, row_escape in zip(rows_none, rows_escape)
            for u, v in zip(row_none, row_escape)
            if u != v
        ]

        # If the escapechar precedes the quotechar an even (non-zero) number
        # of times within the offending cell, then we think it is a
        # functional escape and the escaped version is the correct dialect.
        # Note that if an odd number of escaped quotechars would occur, then
        # the shape of the file will be different if it is ignored. Only if
        # it occurs an even number of times within the cell can we get the
        # same shape.
        #
        # NOTE(review): both outcomes return, so only the first differing
        # cell is ever inspected; later cells do not influence the result.
        for cell in differing:
            n_escaped_quotes = sum(
                1
                for a, b in pairwise(cell)
                if a == Descape.escapechar and b == Descape.quotechar
            )
            if n_escaped_quotes > 0 and n_escaped_quotes % 2 == 0:
                return Descape
            return Dnone
        return None

    if A.delimiter == B.delimiter:
        nothing = ("", "")
        a_chars = (A.quotechar, A.escapechar)
        b_chars = (B.quotechar, B.escapechar)
        if a_chars != nothing and b_chars != nothing:
            return None

        # This case is activated if the escapechar+quotechar combination
        # occurs in the cells (i.e. "Jill\'s data") but no actual quoting
        # is done with the quote character.
        d_no = A if a_chars == nothing else B
        d_yes = B if d_no == A else A

        rows_no = list(parse_string(data, dialect=d_no))
        rows_yes = list(parse_string(data, dialect=d_yes))

        if not same_shape(rows_no, rows_yes):
            return None

        # There is no effect on structure, so test whether the only cells
        # that differ are those that contain the escapechar+quotechar
        # combination.
        eq = d_yes.escapechar + d_yes.quotechar
        for row_no, row_yes in zip(rows_no, rows_yes):
            for x, y in zip(row_no, row_yes):
                if x != y and eq not in x:
                    return None

        # Every difference is explained by the escapechar+quotechar
        # combination, so return the dialect that uses them.
        return d_yes

    return None


def break_ties_three(data, A, B, C):
    """Pick one of three dialects that received the same score.

    Ties are only broken when all three dialects share the same delimiter.
    Two situations are then distinguished:

    If the escape characters are all equal as well, the dialects differ only
    in quotechar. The data is parsed with each dialect and the dialect
    without quotechar is looked up. When at most one of the parsing results
    differs from the result of that dialect, the tie is passed on to
    :func:`reduce_pairwise` and is broken only if a single dialect remains.

    If the escape characters are not all equal, :func:`break_ties_two` is
    used on the dialects that do have a quotechar, provided there are exactly
    two of these.

    Parameters
    ----------

    data: str
        The data of the file as a string

    A: SimpleDialect
        a dialect

    B: SimpleDialect
        a dialect

    C: SimpleDialect
        a dialect

    Returns
    -------

    dialect: SimpleDialect
        The chosen dialect if the tie can be broken, None otherwise.

    Notes
    -----
    We have only observed one tie for each case during development, so
    this may need to be improved in the future.

    """
    dialects = [A, B, C]
    equal_delim = A.delimiter == B.delimiter == C.delimiter
    equal_escape = A.escapechar == B.escapechar == C.escapechar

    if not equal_delim:
        return None

    if not equal_escape:
        # Difference is in quotechar *and* escapechar.
        #
        # The reasoning here is as follows. If we are in this situation,
        # then there is both a potential escapechar and there are
        # quotechars, but the pattern score is the same and the type score
        # can't make a difference because no cells become clean if we
        # interpret the quote/escape correctly. This implies that the quote
        # and escape do have a function. Thus, we find the dialects that
        # have a quote and defer to break_ties_two.
        quoted = [d for d in dialects if d.quotechar != ""]
        if len(quoted) == 2:
            return break_ties_two(data, *quoted)
        return None

    # From here on the difference is *only* in quotechar.
    parsed = [list(parse_string(data, d)) for d in dialects]

    # All three results must have the same number of rows.
    if len({len(rows) for rows in parsed}) != 1:
        return None

    # Find the first dialect without a quotechar and its parsing result.
    p_none = d_none = None
    for rows, dialect in zip(parsed, dialects):
        if dialect.quotechar == "":
            p_none, d_none = rows, dialect
            break
    if p_none is None:
        return None

    # Results (with their dialect) that differ from the no-quotechar result.
    rem = [
        (rows, dialect)
        for rows, dialect in zip(parsed, dialects)
        if not rows == p_none
    ]

    if len(rem) <= 1:
        # This case was reached for the file
        # 6da5ab459bcc7c3a5ed2e06d65810958.csv from the GitHub corpus of
        # the CSV paper. When fixing the delimiter to Tab, rem = [].
        # Try to reduce pairwise
        new_dialects = reduce_pairwise(data, dialects)
        if len(new_dialects) == 1:
            return new_dialects[0]
        return None

    # NOTE(review): every entry of ``rem`` was selected because its parsing
    # result differs from ``p_none``, so this comparison does not look like
    # it can ever succeed. It is kept to mirror the existing behaviour;
    # the intent should be confirmed.
    for rows, dialect in rem:
        if p_none == rows:
            return break_ties_two(data, d_none, dialect)

    return None


def break_ties_four(data, dialects):
    """Pick one of four dialects that received the same score.

    The group is first shrunk with :func:`reduce_pairwise`, which breaks the
    ties between pairs of dialects that give the same parsing result (if
    any). When a single dialect is left it is the answer; when two or three
    are left the decision is passed on to :func:`break_ties_two` or
    :func:`break_ties_three`. Otherwise the tie can't be broken.

    Ties are only broken if all dialects have the same delimiter.

    Parameters
    ----------

    data: str
        The data of the file as a string

    dialects: list
        List of SimpleDialect objects

    Returns
    -------
    dialect: SimpleDialect
        The chosen dialect if the tie can be broken, None otherwise.

    Notes
    -----
    We have only observed one case during development where this
    function was needed. It may need to be revisited in the future if other
    examples are found.

    """
    delimiters = {d.delimiter for d in dialects}
    if len(delimiters) != 1:
        return None

    remaining = reduce_pairwise(data, dialects)
    n_remaining = len(remaining)

    # Defer to other functions if the number of dialects was reduced
    if n_remaining == 1:
        return remaining[0]
    if n_remaining == 2:
        return break_ties_two(data, *remaining)
    if n_remaining == 3:
        return break_ties_three(data, *remaining)
    return None