1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
|
# -*- coding: utf-8 -*-
"""
Code for selecting the potential dialects of a file.
Author: Gertjan van den Burg
"""
import codecs
import itertools
import unicodedata
import regex
from .detect_type import PATTERNS
from .dialect import SimpleDialect
from .escape import is_potential_escapechar
from .utils import pairwise
def get_dialects(
    data, encoding="UTF-8", delimiters=None, test_masked_by_quotes=False
):
    """Return the possible dialects for the given data.

    Candidate delimiters and quote characters are collected from a copy of
    the data in which URLs have been replaced, to reduce noise. For every
    (delimiter, quotechar) combination the candidate escape characters are
    the empty string plus every character for which
    is_potential_escapechar() is True and that directly precedes that
    delimiter or that quote character at least once in the original data.

    Self-escaping (an escape character preceded by itself) is deliberately
    not treated as evidence on its own. Self-escaping only makes sense in a
    file where the character is already used to escape a delimiter or quote
    character, in which case it is picked up anyway. This is an assumption,
    but it holds in general and it reduces noise.

    Parameters
    ----------
    data: str
        The data for the file

    encoding: str
        The encoding of the file

    delimiters: iterable
        Set of delimiters to consider. See :func:`get_delimiters` for more
        info.

    test_masked_by_quotes : bool
        Remove dialects where the delimiter is always masked by the quote
        character. Enabling this typically removes a number of potential
        dialects from the list, which can remove false positives. It is
        however not a very fast operation, so it is disabled by default.

    Returns
    -------
    dialects: list
        List of SimpleDialect objects that are considered potential dialects.

    """
    # URLs are replaced before looking for delimiters and quote characters
    cleaned = filter_urls(data)
    delims = get_delimiters(cleaned, encoding, delimiters=delimiters)
    quotechars = get_quotechars(cleaned)

    # Every (delimiter, quotechar) pair, in delimiter-major order. The empty
    # string (no escape character) is always a candidate.
    combos = list(itertools.product(delims, quotechars))
    escapechars = {combo: {""} for combo in combos}

    # A potential escape character counts for a pair when it directly
    # precedes the pair's delimiter or quote character in the data.
    for before, after in pairwise(data):
        if is_potential_escapechar(before, encoding):
            for combo in combos:
                if after in combo:
                    escapechars[combo].add(before)

    dialects = []
    for delim, quotechar in combos:
        for escapechar in escapechars[(delim, quotechar)]:
            # Optionally drop dialects whose delimiter never occurs outside
            # of a quoted segment.
            always_masked = test_masked_by_quotes and masked_by_quotechar(
                data, quotechar, escapechar, delim
            )
            if not always_masked:
                dialects.append(SimpleDialect(delim, quotechar, escapechar))
    return dialects
def unicode_category(x, encoding=None):
    """Return the Unicode category of a character

    When an encoding is given, the character is first encoded to bytes with
    that encoding and decoded again before it is classified. When no encoding
    is given the character is classified directly, so the default argument
    value is usable.

    Parameters
    ----------
    x : str
        character

    encoding: str
        Encoding of the character. If None, no encode/decode round-trip is
        performed.

    Returns
    -------
    category: str
        The Unicode category of the character.

    Raises
    ------
    UnicodeEncodeError
        If an encoding is given and the character cannot be represented in
        it.

    """
    if encoding is None:
        # bytes(x, None) raises TypeError, so without an encoding the string
        # is classified as-is.
        return unicodedata.category(x)
    as_unicode = codecs.decode(bytes(x, encoding), encoding=encoding)
    return unicodedata.category(as_unicode)
def filter_urls(data):
    """Replace each match of the ``url`` pattern in the data by ``U``

    Parameters
    ----------
    data: str
        The data of the file as a string

    Returns
    -------
    filtered: str
        The data with every match of ``PATTERNS["url"]`` substituted by the
        single character ``U``.

    """
    # count=0 places no limit on the number of substitutions
    return regex.sub(PATTERNS["url"], "U", data, count=0)
def get_delimiters(
    data, encoding, delimiters=None, block_cat=None, block_char=None
):
    """Get potential delimiters

    The set of potential delimiters is constructed as follows. For each unique
    character of the file, we check if its Unicode character category is in the
    set ``block_cat`` of prohibited categories. If it is, we don't allow it to
    be a delimiter, with the exception of Tab (which is in the Control
    category). We furthermore block characters in :attr:`block_char` from
    being delimiters.

    Parameters
    ----------
    data: str
        The data of the file

    encoding: str
        The encoding of the file

    delimiters: iterable
        Allowed delimiters. If provided, it overrides the block_cat/block_char
        mechanism and only the provided characters will be considered
        delimiters (if they occur in the file). If None, all characters can be
        considered delimiters subject to the :attr:`block_cat` and
        :attr:`block_char` parameters. Any iterable is accepted, including
        one-shot iterators, because it is materialised once.

    block_cat: list
        List of Unicode categories (2-letter abbreviations) for characters that
        should not be considered as delimiters. If None, the following default
        set is used::

        ["Lu", "Ll", "Lt", "Lm", "Lo", "Nd", "Nl", "No", "Ps", "Pe", "Co"]

    block_char: list
        Explicit list of characters that should not be considered delimiters.
        If None, the following default set is used::

        [".", "/", '"', "'", "\\n", "\\r"]

    Returns
    -------
    delims: set
        Set of potential delimiters. The empty string is added by default.

    """
    if block_cat is None:
        block_cat = [
            "Lu",
            "Ll",
            "Lt",
            "Lm",
            "Lo",
            "Nd",
            "Nl",
            "No",
            "Ps",
            "Pe",
            "Co",
        ]

    if block_char is None:
        block_char = [".", "/", '"', "'", "\n", "\r"]

    unique_chars = set(data)

    if delimiters is not None:
        # Materialise once: repeated ``in`` tests would exhaust a generator
        # after the first probe. The Unicode category is irrelevant on this
        # path, so it is not computed.
        D = unique_chars & set(delimiters)
    else:
        D = set()
        for x in unique_chars:
            # Tab is always allowed even though it is a control character.
            # The category is only looked up for characters that are not
            # already decided by the tab rule or the explicit block list.
            if x == "\t" or (
                x not in block_char
                and unicode_category(x, encoding=encoding) not in block_cat
            ):
                D.add(x)

    # The empty string stands for "no delimiter"
    D.add("")
    return D
def get_quotechars(data, quote_chars=None):
    """Get potential quote characters

    A character is a potential quote character when it is one of the
    candidates in ``quote_chars`` and it occurs at least once in the data.

    Parameters
    ----------
    data: str
        The data of the file as a string

    quote_chars: iterable
        Characters that should be considered quote characters. If it is None,
        the following default set is used::

        ["'", '"', "~", "`"]

    Returns
    -------
    quotes: set
        Set of potential quote characters. The empty string is added by
        default.

    """
    candidates = ["'", '"', "~", "`"] if quote_chars is None else quote_chars
    # Keep only the candidates that actually occur in the data
    present = set(candidates).intersection(set(data))
    # The empty string stands for "no quote character"
    return present | {""}
def masked_by_quotechar(data, quotechar, escapechar, test_char):
    """Test if a character is always masked by quote characters

    This function tests if a given character is always within quoted segments
    (defined by the quote character). Double quoting and escaping is supported:
    a doubled quote character inside a quoted segment does not close it, and a
    quote character directly preceded by the escape character neither opens
    nor closes a segment. An escape only affects the single character that
    follows it, and an escaped escape character does not itself escape.

    A test character that occurs outside a quoted segment counts as unmasked
    even when it is preceded by the escape character, since it is not masked
    by the *quote* character.

    Parameters
    ----------
    data: str
        The data of the file as a string

    quotechar: str
        The quote character

    escapechar: str
        The escape character

    test_char: str
        The character to test

    Returns
    -------
    masked: bool
        Returns True if the test character is never outside quoted segments,
        False otherwise. Always False when the test character is the empty
        string.

    """
    if test_char == "":
        return False

    escape_next = False
    in_quotes = False
    n = len(data)
    i = 0
    while i < n:
        s = data[i]
        # The escape only covers the character directly after the escape
        # character, so the flag is consumed here on every iteration.
        # Leaving it set would make all later quote characters be ignored.
        escaped = escape_next
        escape_next = False

        if s == quotechar:
            if escaped:
                # escaped quote: does not open or close a quoted segment
                i += 1
                continue
            if not in_quotes:
                in_quotes = True
            elif i + 1 < n and data[i + 1] == quotechar:
                # doubled quote inside a quoted segment: skip the second one
                i += 1
            else:
                in_quotes = False
        elif s == test_char and not in_quotes:
            return False
        elif s == escapechar and not escaped:
            # an escaped escape character is a literal and escapes nothing
            escape_next = True
        i += 1
    return True
|