1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
"""Parse a Python source code encoding string"""
import codecs
import re
# Regexp to match python magic encoding line
PYTHON_MAGIC_COMMENT_re = re.compile(
r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', re.VERBOSE)
def parse_encoding(lines):
"""Deduce the encoding of a source file from magic comment.
It does this in the same way as the `Python interpreter`__
.. __: http://docs.python.org/ref/encodings.html
The ``lines`` argument should be a list of the first 2 lines of the
source code.
(From Jeff Dairiki)
"""
try:
line1 = lines[0]
has_bom = line1.startswith(codecs.BOM_UTF8)
if has_bom:
line1 = line1[len(codecs.BOM_UTF8):]
m = PYTHON_MAGIC_COMMENT_re.match(line1)
if not m:
try:
import parser
parser.suite(line1)
except (ImportError, SyntaxError):
# Either it's a real syntax error, in which case the source is
# not valid python source, or line2 is a continuation of line1,
# in which case we don't want to scan line2 for a magic
# comment.
pass
else:
line2 = lines[1]
m = PYTHON_MAGIC_COMMENT_re.match(line2)
if has_bom:
if m:
raise SyntaxError(
"python refuses to compile code with both a UTF8 "
"byte-order-mark and a magic encoding comment")
return 'utf_8'
elif m:
return m.group(1)
else:
return None
except:
return None
|