File: misc.py

package info (click to toggle)
kitchen 1.2.4-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,044 kB
  • ctags: 1,493
  • sloc: python: 10,651; makefile: 14; sh: 4
file content (368 lines) | stat: -rw-r--r-- 14,520 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
# -*- coding: utf-8 -*-
# Copyright (c) 2012 Red Hat, Inc
# Copyright (c) 2010 Seth Vidal
#
# kitchen is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# kitchen is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with kitchen; if not, see <http://www.gnu.org/licenses/>
#
# Authors:
#   James Antill
#   Toshio Kuratomi <toshio@fedoraproject.org>
#   Seth Vidal
#
# Portions of this code taken from yum/misc.py and yum/i18n.py
'''
---------------------------------------------
Miscellaneous functions for manipulating text
---------------------------------------------

Collection of text functions that don't fit in another category.

.. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
    Added :func:`~kitchen.text.misc.isbasestring`,
    :func:`~kitchen.text.misc.isbytestring`, and
    :func:`~kitchen.text.misc.isunicodestring` to help tell which string type
    is which on python2 and python3
'''
import htmlentitydefs
import itertools
import re

try:
    import chardet
except ImportError:
    chardet = None

from kitchen.text.exceptions import ControlCharError

# Minimum confidence chardet has to report before we trust its guess.  Below
# this, byte strings whose encoding we are guessing at are decoded as latin1.
_CHARDET_THRESHHOLD = 0.6

# Code points treated as control codes that must not appear in xml output:
#
# * The C0 (ASCII) control codes that are illegal in xml 1.0.  Of the code
#   points below 32, xml 1.0 only permits tab (9), newline (10), and carriage
#   return (13), so everything else in 0-31 is listed -- including backspace
#   (8), which an earlier ``range(0, 8)`` left out by mistake.
# * The C1 control codes (128-159), which this module also treats as invalid.
_CONTROL_CODES = frozenset(itertools.chain(
    range(0, 9), (11, 12), range(14, 32), range(128, 160)))
# The same code points as one-character unicode strings, for membership tests
# against text
_CONTROL_CHARS = frozenset(unichr(code) for code in _CONTROL_CODES)
# Tables for unicode.translate(): mapping a code point to None deletes the
# character, mapping it to u'?' substitutes a question mark
_IGNORE_TABLE = dict.fromkeys(_CONTROL_CODES)
_REPLACE_TABLE = dict.fromkeys(_CONTROL_CODES, u'?')

# Matches either a whole markup tag (``<...>``; the (?s) flag lets it be found
# inside multi-line text) or a character/entity reference such as ``&amp;``,
# ``&#38;``, or ``&#x26;``
_ENTITY_RE = re.compile(r'(?s)<[^>]*>|&#?\w+;')

def isbasestring(obj):
    '''Report whether obj is a string of either flavour

    This is equivalent to ``isinstance(obj, basestring)``: both byte
    :class:`str` and :class:`unicode` strings qualify.  It exists as a named
    helper so that code which needs this test (commonly unicode-bytes
    conversion functions) does not have to reference :class:`basestring`
    itself, which eases porting.

    :arg obj: Object to test
    :returns: :data:`True` if the object is a :class:`basestring`.  Otherwise
        :data:`False`.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, basestring)

def isbytestring(obj):
    '''Report whether obj is a byte :class:`str`

    This is equivalent to ``isinstance(obj, str)``, so subclasses of
    :class:`str` count as byte strings too.

    :arg obj: Object to test
    :returns: :data:`True` if the object is a byte :class:`str`.  Otherwise,
        :data:`False`.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, str)

def isunicodestring(obj):
    '''Report whether obj is a :class:`unicode` string

    This is equivalent to ``isinstance(obj, unicode)``, so subclasses of
    :class:`unicode` count as unicode strings too.

    :arg obj: Object to test
    :returns: :data:`True` if the object is a :class:`unicode` string.
        Otherwise, :data:`False`.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, unicode)

def guess_encoding(byte_string, disable_chardet=False):
    '''Make a best guess at the encoding of a byte :class:`str`

    :arg byte_string: byte :class:`str` whose encoding should be guessed
    :kwarg disable_chardet: When this is :data:`True`, :mod:`chardet` is never
        consulted, even if it is installed.  Use this when the result must be
        the same whether or not :mod:`chardet` is available.
        Default: :data:`False`.
    :raises TypeError: if :attr:`byte_string` is not a byte :class:`str` type
    :returns: string naming the guessed encoding of :attr:`byte_string`.
        This is appropriate to pass as the encoding argument when encoding
        and decoding unicode strings.

    The guess is made in three steps:

    1. If :attr:`byte_string` decodes strictly as :term:`UTF-8` the answer is
       ``utf-8``.
    2. Otherwise, if :mod:`chardet` could be imported and
       :attr:`disable_chardet` is False, :mod:`chardet` is asked.  Its answer
       is used when it names an encoding and its confidence reaches the
       module's threshold (``_CHARDET_THRESHHOLD``).
    3. Otherwise we rather arbitrarily claim ``latin-1``.  Every byte is
       valid in ``latin-1`` so decoding with it will not cause
       :exc:`UnicodeErrors`, although the output might be mangled.
    '''
    if not isbytestring(byte_string):
        raise TypeError('first argument must be a byte string (str)')

    # Step 1: anything that is valid UTF-8 is reported as UTF-8
    try:
        unicode(byte_string, 'utf-8', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'utf-8'

    # Step 2: chardet is None when the import at the top of the module failed
    if chardet and not disable_chardet:
        guess = chardet.detect(byte_string)
        if guess['confidence'] >= _CHARDET_THRESHHOLD:
            detected = guess['encoding']
            if detected:
                return detected

    # Step 3: nothing trustworthy found
    return 'latin-1'

def str_eq(str1, str2, encoding='utf-8', errors='replace'):
    '''Test two strings for equality, comparing as byte :class:`str` when one
    of them is :class:`unicode` and they cannot be compared directly

    :arg str1: First string to compare
    :arg str2: Second string to compare
    :kwarg encoding: Encoding used if one string has to be converted into
        a byte :class:`str` for the comparison.  Default is :term:`utf-8`.
    :kwarg errors: What to do if we encounter errors when encoding the string.
        See the :func:`kitchen.text.converters.to_bytes` documentation for
        possible values.  The default is ``replace``.
    :returns: :data:`True` if the strings are equal, :data:`False` otherwise

    Comparing a :class:`unicode` string with a byte :class:`str` makes python
    convert via :term:`ASCII`, which leads to :exc:`UnicodeError`
    (python-2.4 or less) or :exc:`UnicodeWarning` (python 2.5 and higher)
    when the byte :class:`str` holds non-:term:`ASCII` data.  This function
    avoids that by converting with :term:`utf-8` or another encoding of your
    choice instead.

    .. note::

        When a conversion is needed it is the :class:`unicode` string that is
        encoded into a byte :class:`str`.  That means that strings can
        compare differently if you use different encodings for each.

    Note that ``str1 == str2`` is faster than this function if you can accept
    the following limitations:

    * Limited to python-2.5+ (otherwise a :exc:`UnicodeDecodeError` may be
      thrown)
    * Will generate a :exc:`UnicodeWarning` if non-:term:`ASCII` byte
      :class:`str` is compared to :class:`unicode` string.
    '''
    try:
        # Equal means neither smaller nor larger.  The ordering operators are
        # used for the first attempt; if they raise UnicodeError we fall
        # through to the byte comparison below.
        return not (str1 < str2 or str1 > str2)
    except UnicodeError:
        pass

    # The direct comparison failed: encode whichever side is unicode and
    # compare the two as bytes.
    if isunicodestring(str1):
        left, right = str1.encode(encoding, errors), str2
    else:
        left, right = str1, str2.encode(encoding, errors)
    # bool() keeps the result a plain True/False whatever == hands back
    return bool(left == right)

def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and deal with them

    :arg string: :class:`unicode` string to search for :term:`control
        characters`
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`.  This selects what happens when one is found.
        Valid options are:

        :replace: (default) Put a ``"?"`` in place of each :term:`control
            character`
        :ignore: Drop the characters from the output
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`unicode` string with no :term:`control characters` in
        it.  When the input holds none, the input itself is returned.

    .. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
        Strip out the C1 control characters in addition to the C0 control
        characters.
    '''
    if not isunicodestring(string):
        raise TypeError('process_control_char must have a unicode type as'
                ' the first argument.')
    if strategy not in ('replace', 'ignore', 'strict'):
        raise ValueError('The strategy argument to process_control_chars'
                ' must be one of ignore, replace, or strict')

    # Most strings hold no control chars, and checking for them costs less
    # than translating, so get the common case out of the way first
    if _CONTROL_CHARS.isdisjoint(string):
        return string

    if strategy == 'strict':
        raise ControlCharError('ASCII control code present in string'
                ' input')
    if strategy == 'ignore':
        return string.translate(_IGNORE_TABLE)
    # The validation above leaves 'replace' as the only other possibility
    return string.translate(_REPLACE_TABLE)

# Originally written by Fredrik Lundh (January 15, 2003) and placed in the
# public domain::
#
#   Unless otherwise noted, source code can be be used freely. Examples, test
#   scripts and other short code fragments can be considered as being in the
#   public domain.
#
# http://effbot.org/zone/re-sub.htm#unescape-html
# http://effbot.org/zone/copyright.htm
#
def html_entities_unescape(string):
    '''Substitute unicode characters for HTML entities

    :arg string: :class:`unicode` string to substitute out html entities
    :raises TypeError: if something other than a :class:`unicode` string is
        given
    :rtype: :class:`unicode` string
    :returns: The plain text without html entities

    Three kinds of text are recognized in :attr:`string`:

    * Markup tags (``<...>``) are removed.
    * Numeric character references, decimal (``&#38;``) or hexadecimal
      (``&#x26;``), are replaced with the character they number.
    * Named entities (``&amp;``) that are listed in
      :data:`htmlentitydefs.entitydefs` are replaced with the character they
      name.

    Anything that cannot be converted -- an unknown entity name, or a number
    that is malformed or not a usable code point, however large -- is left in
    the output as is.
    '''
    if not isunicodestring(string):
        raise TypeError('html_entities_unescape must have a unicode type'
                ' for its first argument')

    def fixup(match):
        '''Return the replacement text for a single tag or entity match'''
        text = match.group(0)
        if text[:1] == u"<":
            # Tags are dropped from the output
            return u""
        if text[:2] == u"&#":
            # Numeric character reference
            try:
                if text[:3] == u"&#x":
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits are malformed or the value is
                # outside the unicode codepoint range.  OverflowError: the
                # value is too large for unichr() to even accept.  Either
                # way, leave it in the output as is.
                pass
        elif text[:1] == u"&":
            # Named entity; entitydefs is keyed by byte str names
            entity = htmlentitydefs.entitydefs.get(text[1:-1].encode('utf-8'))
            if entity:
                if entity[:2] == "&#":
                    # The definition is itself a numeric reference
                    try:
                        return unichr(int(entity[2:-1]))
                    except (ValueError, OverflowError):
                        # Not a usable code point: leave it in the output
                        # as is
                        pass
                else:
                    # The definition is the character as a latin-1 byte
                    return unicode(entity, "iso-8859-1")
        return text # leave as is

    return _ENTITY_RE.sub(fixup, string)

def byte_string_valid_xml(byte_string, encoding='utf-8'):
    '''Check whether a byte :class:`str` could be placed in an xml file as is

    :arg byte_string: Byte :class:`str` to check
    :arg encoding: Encoding of the xml file.  Default: :term:`UTF-8`
    :returns: :data:`True` if the string is valid.  :data:`False` if it would
        be invalid in the xml file

    The string is considered invalid when any of these hold:

    * it is not a byte :class:`str` at all
    * it cannot be decoded using :attr:`encoding`
    * once decoded, it contains one of the :term:`control characters` this
      module rejects

    This is handy when you have a whole bunch of byte strings and, rather
    than transforming each to :class:`unicode` and back to byte :class:`str`
    for output to xml, you only want to make sure they work with the xml file
    you're constructing.  Example::

        ARRAY_OF_MOSTLY_UTF8_STRINGS = [...]
        processed_array = []
        for string in ARRAY_OF_MOSTLY_UTF8_STRINGS:
            if byte_string_valid_xml(string, 'utf-8'):
                processed_array.append(string)
            else:
                processed_array.append(guess_bytes_to_xml(string, encoding='utf-8'))
        output_xml(processed_array)
    '''
    if not isbytestring(byte_string):
        # Only byte strings can qualify
        return False

    try:
        decoded = unicode(byte_string, encoding)
    except UnicodeError:
        # The bytes are not valid in the xml file's encoding
        return False

    # Compatible with the xml file only if no control character is present
    return _CONTROL_CHARS.isdisjoint(decoded)

def byte_string_valid_encoding(byte_string, encoding='utf-8'):
    '''Detect if a byte :class:`str` is valid in a specific encoding

    :arg byte_string: Byte :class:`str` to test for bytes not valid in this
        encoding
    :kwarg encoding: encoding to test against.  Defaults to :term:`UTF-8`.
    :returns: :data:`True` if :attr:`byte_string` decodes without error using
        :attr:`encoding`.  :data:`False` if the decoding fails with
        a :exc:`UnicodeError`.

    .. note::

        This function checks whether the byte :class:`str` is valid in the
        specified encoding.  It **does not** detect whether the byte
        :class:`str` actually was encoded in that encoding.  If you want that
        sort of functionality, you probably want to use
        :func:`~kitchen.text.misc.guess_encoding` instead.
    '''
    is_valid = True
    try:
        # The decoded text is thrown away; only success or failure matters
        unicode(byte_string, encoding)
    except UnicodeError:
        # Some of the bytes are not valid in this encoding
        is_valid = False
    return is_valid

# Public names of this module, in alphabetical order
__all__ = (
    'byte_string_valid_encoding',
    'byte_string_valid_xml',
    'guess_encoding',
    'html_entities_unescape',
    'isbasestring',
    'isbytestring',
    'isunicodestring',
    'process_control_chars',
    'str_eq',
)