1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
import chardet as fallback_detector
import six
# Made cchardet optional according to https://github.com/mailgun/flanker/pull/84
try:
import cchardet as primary_detector
except ImportError:
primary_detector = fallback_detector
from flanker.mime.message import errors
def _guess_and_convert_with(value, detector=primary_detector):
    """
    Detect the encoding of ``value`` with ``detector`` and decode it.

    ``detector`` is a module exposing ``detect()`` -- either chardet or
    cchardet. Decoding uses the "replace" error handler, so bytes that are
    invalid in the guessed encoding are substituted instead of rejected.

    Raises errors.DecodingError when the detector reports no encoding, or
    when decoding with the reported encoding fails (for example because the
    codec name is unknown to Python).
    """
    encoding = detector.detect(value)["encoding"]
    if encoding:
        try:
            return value.decode(encoding, "replace")
        except (UnicodeError, LookupError) as exc:
            raise errors.DecodingError(str(exc))
    raise errors.DecodingError("Failed to guess encoding")
def _guess_and_convert(value):
    """
    Try to guess the encoding of the passed value and decode it.

    The primary detector (cchardet when it is installed) is tried first. If
    guessing or decoding fails, chardet, which is much slower, is used as a
    fallback.

    When cchardet is not installed the primary detector already is chardet;
    in that case the original error is re-raised instead of running the same
    slow detection a second time on the same input.

    Raises whatever the last attempted detector/decoder raised (normally
    errors.DecodingError).
    """
    try:
        return _guess_and_convert_with(value, detector=primary_detector)
    except Exception:
        if primary_detector is fallback_detector:
            # Same detector and same input: retrying would only repeat the
            # identical (and expensive) detection, so propagate the failure.
            raise
        return _guess_and_convert_with(value, detector=fallback_detector)
def _make_unicode(value, charset=None):
    """
    Return ``value`` as a text (unicode) string.

    Text input is returned unchanged. Anything else is decoded strictly with
    ``charset`` (UTF-8 when ``charset`` is empty or None); if that decoding
    fails, or the charset name is unknown, the encoding is guessed instead
    via ``_guess_and_convert``.
    """
    if not isinstance(value, six.text_type):
        encoding = charset if charset else "utf-8"
        try:
            return value.decode(encoding, "strict")
        except (UnicodeError, LookupError):
            # The declared charset was wrong or unknown: guess the encoding.
            return _guess_and_convert(value)
    return value
def to_utf8(value, charset=None):
    """
    Return ``value`` as UTF-8 encoded bytes.

    ``value`` is first converted to text with ``_make_unicode`` (passing
    ``charset`` through), and that text is then strictly encoded as UTF-8.
    """
    return _make_unicode(value, charset).encode("utf-8", "strict")
def to_unicode(value, charset=None):
    """
    Return ``value`` as a text (unicode) string.

    Public wrapper that delegates to ``_make_unicode`` with the same
    ``value`` and ``charset`` arguments.
    """
    text = _make_unicode(value, charset)
    return text
|