File: utils.py

package info (click to toggle)
python-flanker 0.9.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 17,976 kB
  • sloc: python: 9,308; makefile: 4
file content (69 lines) | stat: -rw-r--r-- 1,805 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import chardet as fallback_detector
import six

# Made cchardet optional according to https://github.com/mailgun/flanker/pull/84
try:
    import cchardet as primary_detector
except ImportError:
    primary_detector = fallback_detector

from flanker.mime.message import errors


def _guess_and_convert_with(value, detector=primary_detector):
    """
    Detect the encoding of the passed value with the given detector and
    return the value decoded with that encoding.

    The detector is a module exposing ``detect(value)`` whose result carries
    the guess under the "encoding" key (chardet or cchardet). Decoding uses
    the "replace" error handler.

    Raises errors.DecodingError when the detector reports no encoding, or
    when decoding with the reported encoding fails.
    """
    guessed = detector.detect(value)["encoding"]
    if not guessed:
        raise errors.DecodingError("Failed to guess encoding")

    try:
        return value.decode(guessed, "replace")
    except (UnicodeError, LookupError) as exc:
        # LookupError covers a guessed codec name that Python does not know.
        raise errors.DecodingError(str(exc))


def _guess_and_convert(value):
    """
    Decode the passed value after guessing its encoding.

    The primary detector (cchardet when it could be imported) gets the first
    attempt. If that attempt raises for any reason, the much slower chardet
    based fallback detector is tried, and whatever it raises propagates to
    the caller.
    """
    try:
        decoded = _guess_and_convert_with(value, detector=primary_detector)
    except Exception:
        # Deliberately broad: any failure of the first attempt triggers the
        # second one rather than aborting the conversion.
        decoded = _guess_and_convert_with(value, detector=fallback_detector)
    return decoded


def _make_unicode(value, charset=None):
    """
    Return the passed value as text, decoding it when necessary.

    Values that already are text (six.text_type) are returned untouched.
    Anything else is strictly decoded with the given charset, or with
    "utf-8" when the charset is empty or None. If that strict decoding fails,
    or the charset name is unknown, the encoding is guessed instead.
    """
    if isinstance(value, six.text_type):
        return value

    try:
        return value.decode(charset or "utf-8", "strict")
    except (UnicodeError, LookupError):
        # The declared charset was wrong or unknown, so detect it instead.
        return _guess_and_convert(value)


def to_utf8(value, charset=None):
    """
    Return the given string encoded as UTF-8.

    The value is first turned into text by _make_unicode, with the optional
    charset passed along as the declared encoding of the value, and the
    resulting text is then strictly encoded as "utf-8".
    """
    return _make_unicode(value, charset).encode("utf-8", "strict")


def to_unicode(value, charset=None):
    """
    Return the given value as text.

    This simply delegates to _make_unicode, passing the optional charset
    along as the declared encoding of the value.
    """
    text = _make_unicode(value, charset)
    return text