File: detect.py

Package: python-clevercsv 0.7.5+ds-1
# -*- coding: utf-8 -*-

import json
import sys
import time

from wilderness import Command

from clevercsv.wrappers import detect_dialect

from ._docs import FLAG_DESCRIPTIONS
from ._utils import parse_int


class DetectCommand(Command):

    _description = "Detect the dialect of a CSV file."

    def __init__(self):
        super().__init__(
            name="detect",
            title="Detect the dialect of a CSV file",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self):
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-c",
            "--consistency",
            action="store_true",
            help="Only use the consistency measure for detection.",
            description=(
                "By default, the dialect of CSV files is detected using "
                "atwo-step process. First, a strict set of checks is used to "
                "see if the file adheres to a very basic format (for example, "
                "when all cells in the file are integers). If none of these "
                "checks succeed, the data consistency measure of Van den "
                "Burg, et al. (2019) is used to detect the dialect. With this "
                "option, you can force the detection to always use the data "
                "consistency measure. This can be useful for testing or "
                "research purposes, for instance."
            ),
        )
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            help="Number of characters to use for detection",
            type=int,
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        group = self.add_mutually_exclusive_group()
        group.add_argument(
            "-p",
            "--plain",
            action="store_true",
            help="Print the components of the dialect on separate lines",
        )
        group.add_argument(
            "-j",
            "--json",
            action="store_true",
            help="Print the components of the dialect as a JSON object",
            description=(
                "Print the dialect to standard output in the form of a JSON "
                "object. This object will always have the 'delimiter', "
                "'quotechar', 'escapechar', and 'strict' keys. If "
                "--add-runtime is specified, it will also have a 'runtime' "
                "key."
            ),
        )
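        # Shape of the --json output described above; the values here are
        # illustrative only:
        #   {"delimiter": ",", "quotechar": "\"", "escapechar": "", "strict": false}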
        self.add_argument(
            "--no-skip",
            action="store_true",
            help="Don't skip type detection for dialects with a low pattern score",
            description=(
                "The data consistency score used for dialect detection "
                "consists of two components: a pattern score and a type "
                "score. The type score lies between 0 and 1. When computing "
                "the data consistency measures for different dialects, we "
                "skip the computation of the type score if we see that the "
                "pattern score is lower than the best data consistency score "
                "we've seen so far. This option can be used to disable this "
                "behaviour and compute the type score for all dialects. This "
                "is mainly useful for debugging and testing purposes."
            ),
        )
        self.add_argument(
            "--add-runtime",
            action="store_true",
            help="Add the runtime of the detection to the detection output.",
        )

    def handle(self):
        verbose = self.args.verbose
        num_chars = parse_int(self.args.num_chars, "num-chars")
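        # --consistency forces the data consistency measure; "auto" first tries
        # the strict checks described in register() above.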
        method = "consistency" if self.args.consistency else "auto"
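        # Unless --no-skip is given, skip computing the type score for dialects
        # whose pattern score is already below the best consistency score seen.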
        skip = not self.args.no_skip

        t_start = time.time()
        dialect = detect_dialect(
            self.args.path,
            num_chars=num_chars,
            encoding=self.args.encoding,
            verbose=verbose,
            method=method,
            skip=skip,
        )
        runtime = time.time() - t_start

        if dialect is None:
            print("Error: Dialect detection failed.", file=sys.stderr)
            return 1

        if self.args.plain:
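            # .strip() drops the trailing space left when a component (e.g. the
            # escapechar) is the empty string.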
            print(f"delimiter = {dialect.delimiter}".strip())
            print(f"quotechar = {dialect.quotechar}".strip())
            print(f"escapechar = {dialect.escapechar}".strip())
            if self.args.add_runtime:
                print(f"runtime = {runtime}")
        elif self.args.json:
            dialect_dict = dialect.to_dict()
            if self.args.add_runtime:
                dialect_dict["runtime"] = runtime
            print(json.dumps(dialect_dict))
        else:
            print("Detected: " + str(dialect))
            if self.args.add_runtime:
                print(f"Runtime: {runtime:.6f} seconds")
        return 0
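

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): this command is
# a thin wrapper around clevercsv.wrappers.detect_dialect, so the same
# detection can be run directly from Python. "data.csv" is a placeholder path.
#
#     from clevercsv.wrappers import detect_dialect
#
#     dialect = detect_dialect("data.csv", method="auto", skip=True)
#     if dialect is None:
#         raise RuntimeError("dialect detection failed")
#     print(dialect.delimiter, dialect.quotechar, dialect.escapechar)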