1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
|
# -*- coding: utf-8 -*-
"""
Detect the dialect with very strict functional tests.
This module uses so-called "normal forms" to detect the dialect of CSV files.
Normal forms are detected with strict functional tests. The normal forms are
used as a pre-test to check if files are simple enough that computing the data
consistency measure is not necessary.
Author: Gertjan van den Burg
"""
import itertools
import regex
from .dialect import SimpleDialect
from .escape import is_potential_escapechar
from .utils import pairwise
# Candidate delimiters tried by detect_dialect_normal when the caller does not
# pass its own ``delimiters``.
DELIMS = [",", ";", "|", "\t"]
# Quote characters that detect_dialect_normal combines with every candidate
# delimiter.
QUOTECHARS = ["'", '"']
def detect_dialect_normal(
    data, encoding="UTF-8", delimiters=None, verbose=False
):
    """Try to identify the dialect of a sample by matching a normal form.

    Parameters
    ----------
    data : str
        The data as a single string
    encoding : str
        The encoding of the data; forwarded to the escape character check.
    delimiters : iterable of str, optional
        Candidate delimiters. When None, the module-level ``DELIMS`` are used.
    verbose : bool
        When True, print which normal form matched, or why none did.

    Returns
    -------
    dialect : SimpleDialect
        The dialect of the first normal form that matches, or None when the
        data may contain an escape character or matches no normal form.
    """
    candidates = list(DELIMS if delimiters is None else delimiters)
    delim_quote_pairs = list(itertools.product(candidates, QUOTECHARS))

    # Every dialect tried below has an empty escapechar, so give up as soon as
    # the data hints at an escape character.
    for delim, quotechar in delim_quote_pairs:
        if not maybe_has_escapechar(data, encoding, delim, quotechar):
            continue
        if verbose:
            print("Not normal, has potential escapechar.")
        return None

    # Checks are tried in this order: form 2 per delimiter, then forms 1, 3
    # and 5 per (delimiter, quotechar) pair, then the single-column form 4.
    checks = [
        (
            2,
            is_form_2,
            SimpleDialect(delimiter=delim, quotechar="", escapechar=""),
        )
        for delim in candidates
    ]
    for delim, quotechar in delim_quote_pairs:
        quoted = SimpleDialect(
            delimiter=delim, quotechar=quotechar, escapechar=""
        )
        checks.extend(
            [(1, is_form_1, quoted), (3, is_form_3, quoted), (5, is_form_5, quoted)]
        )
    checks.extend(
        (
            4,
            is_form_4,
            SimpleDialect(delimiter="", quotechar=quotechar, escapechar=""),
        )
        for quotechar in QUOTECHARS
    )
    checks.append(
        (4, is_form_4, SimpleDialect(delimiter="", quotechar="", escapechar=""))
    )

    for form_id, matches_form, dialect in checks:
        if matches_form(data, dialect):
            if verbose:
                print("Matched normal form %i." % form_id)
            return dialect

    if verbose:
        print("Didn't match any normal forms.")
    return None
def is_quoted_cell(cell, quotechar):
    """Return True if ``cell`` both starts and ends with ``quotechar``.

    Cells shorter than two characters are never considered quoted, because a
    single character cannot be both the opening and the closing quote.
    """
    return (
        len(cell) >= 2
        and cell[0] == quotechar
        and cell[-1] == quotechar
    )
def is_any_quoted_cell(cell):
    """Return True if ``cell`` is wrapped in single or in double quotes."""
    return any(is_quoted_cell(cell, quote) for quote in ("'", '"'))
def is_any_partial_quoted_cell(cell):
    """Return True if ``cell`` begins or ends with a single or double quote.

    Only the first and last characters are inspected; an empty cell is never
    partially quoted.
    """
    if not cell:
        return False
    quotes = ('"', "'")
    return cell[0] in quotes or cell[-1] in quotes
def is_empty_quoted(cell, quotechar):
    """Return True if ``cell`` is only an opening and a closing ``quotechar``."""
    if len(cell) != 2:
        return False
    return is_quoted_cell(cell, quotechar)
def is_empty_unquoted(cell):
    """Return True if ``cell`` is the empty string."""
    is_blank = cell == ""
    return is_blank
def is_any_empty(cell):
    """Return True if ``cell`` is empty, either bare or as a pair of quotes."""
    if is_empty_unquoted(cell):
        return True
    return any(is_empty_quoted(cell, quote) for quote in ("'", '"'))
def has_delimiter(string, delim):
    """Return True if ``delim`` occurs anywhere in ``string``."""
    found = delim in string
    return found
def has_nested_quotes(string, quotechar):
    """Return True if ``quotechar`` occurs strictly inside ``string``.

    The first and last characters are skipped, so the quotes that wrap a
    quoted cell are not counted.
    """
    interior = string[1:-1]
    return quotechar in interior
def maybe_has_escapechar(data, encoding, delim, quotechar):
    """Return True if the data might use an escape character.

    That is the case when some delimiter or quote character in ``data`` is
    directly preceded by a character that ``is_potential_escapechar`` accepts
    for the given ``encoding``.
    """
    # Nothing can be escaped when neither special character is present.
    if delim not in data and quotechar not in data:
        return False

    specials = (delim, quotechar)
    # NOTE(review): assumes ``pairwise`` yields consecutive character pairs
    # (previous, next) of ``data`` -- confirm against ``.utils``.
    return any(
        following in specials
        and is_potential_escapechar(preceding, encoding)
        for preceding, following in pairwise(data)
    )
def strip_trailing_crnl(data):
    """Remove every trailing carriage return and line feed from ``data``.

    Both characters are stripped together, so any mix of trailing ``\\r`` and
    ``\\n`` is removed. Stripping them one after the other would leave a
    line ending behind for data that ends in ``"\\r\\n\\r\\n"``.

    Parameters
    ----------
    data : str
        The data as a single string

    Returns
    -------
    data : str
        The data without trailing ``\\r`` and ``\\n`` characters. Other
        trailing whitespace is kept.
    """
    return data.rstrip("\r\n")
def every_row_has_delim(rows, dialect):
    """Return True if each row in ``rows`` contains the dialect's delimiter."""
    return all(has_delimiter(row, dialect.delimiter) for row in rows)
def is_elementary(cell):
    """Return True if ``cell`` consists only of "elementary" characters.

    Elementary characters are ASCII letters, digits, spaces and the
    punctuation ``. _ & - @ + % ( ) /``. The whole cell has to match, and an
    empty cell is not elementary.
    """
    # Raw string: the backslash escapes are meant for the regex engine and are
    # not valid Python string escapes, which triggers a warning otherwise.
    return (
        regex.fullmatch(r"[a-zA-Z0-9\.\_\&\-\@\+\%\(\)\ \/]+", cell)
        is not None
    )
def even_rows(rows, dialect):
    """Return True if every row splits into the same number of cells.

    Returns False when ``rows`` is empty, as there is then no cell count.
    """
    widths = {len(split_row(row, dialect)) for row in rows}
    return len(widths) == 1
def split_file(data):
    """Split ``data`` into rows after removing trailing line endings.

    The first line ending found in the data, checked in the order ``\\r\\n``,
    ``\\n``, ``\\r``, is used as the only row separator. Data without any
    line ending is returned as a single row.
    """
    stripped = strip_trailing_crnl(data)
    for newline in ("\r\n", "\n", "\r"):
        if newline in stripped:
            return stripped.split(newline)
    return [stripped]
def split_row(row, dialect):
    """Split a single row into cells according to ``dialect``.

    Delimiters inside a quoted section do not split the row. Quote characters
    are kept in the returned cells. A trailing delimiter yields a trailing
    empty cell whether or not the row contains quotes, matching ``str.split``.

    Parameters
    ----------
    row : str
        A single row of the file, without line ending.
    dialect : SimpleDialect
        Only the ``delimiter`` and ``quotechar`` attributes are used.

    Returns
    -------
    cells : list of str
        The cells of the row.
    """
    delimiter = dialect.delimiter
    quotechar = dialect.quotechar

    # Without quotes in the row a plain split is enough.
    if quotechar == "" or quotechar not in row:
        if delimiter == "":
            return [row]
        return row.split(delimiter)

    cells = []
    current = []
    in_quotes = False
    for char in row:
        if char == delimiter and not in_quotes:
            cells.append("".join(current))
            current = []
            continue
        if char == quotechar:
            # Each quote character toggles the quoted state.
            in_quotes = not in_quotes
        current.append(char)

    # Always emit the last cell, so that a trailing delimiter produces an
    # empty cell instead of being silently dropped.
    cells.append("".join(current))
    return cells
def is_form_1(data, dialect=None):
    """Check whether ``data`` is in normal form 1 for ``dialect``.

    Form 1 requires more than one column, the same number of cells in every
    row, and every cell quoted with the dialect's quotechar without further
    quote characters inside. Quoted empty cells are allowed, unquoted empty
    cells are not.
    """
    rows = split_file(data)
    if not (every_row_has_delim(rows, dialect) and even_rows(rows, dialect)):
        return False

    for row in rows:
        cells = split_row(row, dialect)
        if len(cells) == 1:
            return False
        for cell in cells:
            acceptable = (
                not is_empty_unquoted(cell)
                and is_quoted_cell(cell, dialect.quotechar)
                and not has_nested_quotes(cell, dialect.quotechar)
            )
            if not acceptable:
                return False
    return True
def is_form_2(data, dialect):
    """Check whether ``data`` is in normal form 2 for ``dialect``.

    Form 2 requires more than one column, the same number of cells in every
    row, and no cell that starts or ends with a quote character. Empty cells
    are allowed; every non-empty cell has to be elementary.
    """
    rows = split_file(data)
    if not (every_row_has_delim(rows, dialect) and even_rows(rows, dialect)):
        return False

    for row in rows:
        cells = split_row(row, dialect)
        if len(cells) == 1:
            return False
        for cell in cells:
            # Neither fully nor partially quoted cells are allowed.
            if is_any_quoted_cell(cell) or is_any_partial_quoted_cell(cell):
                return False
            if is_empty_unquoted(cell):
                continue
            if not is_elementary(cell):
                return False
    return True
def is_form_3(data, dialect):
    """Check whether ``data`` is in normal form 3 for ``dialect``.

    Form 3 requires more than one row, more than one column and the same
    number of cells in every row. No cell may be empty (bare or as an empty
    pair of quotes). A quoted cell has to use the dialect's quotechar, and an
    unquoted cell has to be elementary.
    """
    # NOTE(review): the original summary of this form also listed "no nested
    # quotes", but no such check is performed here -- confirm the intent.
    rows = split_file(data)
    shape_ok = (
        every_row_has_delim(rows, dialect)
        and even_rows(rows, dialect)
        and len(rows) > 1
    )
    if not shape_ok:
        return False

    for row in rows:
        cells = split_row(row, dialect)
        if len(cells) == 1:
            return False
        for cell in cells:
            if is_any_empty(cell):
                return False
            if is_any_quoted_cell(cell):
                # Quoted cells have to use this dialect's quote character.
                cell_ok = is_quoted_cell(cell, dialect.quotechar)
            else:
                cell_ok = is_elementary(cell)
            if not cell_ok:
                return False
    return True
def is_form_4(data, dialect):
    """Check whether ``data`` is in normal form 4 for ``dialect``.

    Form 4 is a single column without delimiter and with more than one row.
    With an empty quotechar, no row may be quoted and rows may only contain
    letters, digits and ``. _ & -``. Otherwise every row has to be wrapped in
    the dialect's quotechar, and the text between the quotes may additionally
    contain spaces.
    """
    rows = split_file(data)
    if len(rows) <= 1:
        return False

    quotechar = dialect.quotechar
    if quotechar == "":
        find_forbidden = regex.compile(r"[^A-Za-z0-9.\_&\-]").search
        return not any(
            is_any_quoted_cell(row) or find_forbidden(row) for row in rows
        )

    find_forbidden = regex.compile(r"[^A-Za-z0-9.\_&\-\ ]").search
    return all(
        is_quoted_cell(row, quotechar) and not find_forbidden(row[1:-1])
        for row in rows
    )
def is_form_5(data, dialect):
    """Check whether ``data`` is in normal form 5 for ``dialect``.

    Form 5 requires more than one row, with every row containing the
    delimiter and being wrapped as a whole in the dialect's quotechar. With
    the wrapping quotes removed, the rows have to satisfy normal form 2.
    """
    rows = split_file(data)
    if not every_row_has_delim(rows, dialect):
        return False
    if len(rows) <= 1:
        return False

    quote = dialect.quotechar
    for row in rows:
        # A row needs the two quotes plus at least one character in between.
        if len(row) <= 2 or row[0] != quote or row[-1] != quote:
            return False

    unwrapped = "\n".join(row[1:-1] for row in rows)
    return is_form_2(unwrapped, dialect)
|