1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
|
#!/usr/bin/python
"""
$Id: xmlEncoding.py 988 2008-03-12 18:22:48Z sa3ruby $
This module deals with detecting XML encodings, using both BOMs and
explicit declarations.
"""
__author__ = "Joseph Walton <http://www.kafsemo.org/>"
__version__ = "$Revision: 988 $"
__copyright__ = "Copyright (c) 2004 Joseph Walton"
import codecs
import re
from logging import ObscureEncoding, NonstdEncoding
import logging
class FailingCodec:
    """Placeholder for a codec that could not be found.

    The instance only remembers the codec's name.  Its ``fail`` method
    accepts the same arguments as a decoder function but always raises,
    so the missing codec is only reported when something tries to use it.
    """

    def __init__(self, name):
        self.name = name

    def fail(self, txt, errors='strict'):
        """Raise UnicodeError naming the unavailable codec; never returns."""
        prefix = 'No codec available for '
        suffix = ' in this installation of FeedValidator'
        raise UnicodeError(prefix + self.name + suffix)
# Don't die if the codec can't be found, but return
# a decoder that will fail on use
def getdecoder(codec):
    """Return a decoder function for ``codec`` without failing on lookup.

    codec -- the name of the encoding to look up.

    Returns whatever ``codecs.getdecoder`` returns when the lookup
    succeeds.  When the lookup fails, returns the ``fail`` method of a
    ``FailingCodec`` built for that name instead, so the error surfaces
    as a UnicodeError only when the decoder is actually called.
    """
    try:
        return codecs.getdecoder(codec)
    except Exception:
        # Any lookup problem is deferred until the decoder is used.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return FailingCodec(codec).fail
# Bootstrap decoders.  They are only applied to the XML declaration at
# the top of a document, which is where the real encoding is read from.
_decUTF32BE, _decUTF32LE, _decUTF16BE, _decUTF16LE = map(
    getdecoder, ('UTF-32BE', 'UTF-32LE', 'UTF-16BE', 'UTF-16LE'))
_decEBCDIC = getdecoder('IBM037')  # EBCDIC
_decACE = getdecoder('ISO-8859-1')  # An ASCII-compatible encoding
# Given a character index into a string, calculate its 1-based row and column
def _position(txt, idx):
row = txt.count('\n', 0, idx) + 1
ln = txt.rfind('\n', 0, idx) + 1
column = 0
for c in txt[ln:idx]:
if c == '\t':
column = (column // 8 + 1) * 8
else:
column += 1
column += 1
return (row, column)
def _normaliseNewlines(txt):
return txt.replace('\r\n', '\n').replace('\r', '\n')
def _logEvent(loggedEvents, e, pos=None):
if pos:
e.params['line'], e.params['column'] = pos
loggedEvents.append(e)
def _decodeDeclaration(sig, dec, permitted, loggedEvents):
    """Read the encoding declaration from the start of a document.

    sig          -- the leading octets of the document
    dec          -- decoder function used to turn ``sig`` into text
    permitted    -- encoding names (upper case) consistent with how the
                    document was recognised; the first one is quoted in
                    error messages
    loggedEvents -- list that receives any error events

    Returns whatever ``_encodingFromDecl`` found: an (encoding, position)
    pair, or None when there is no declaration.  Problems are reported by
    logging a UnicodeError event, not through the return value:

    * no declaration at all (so that, e.g., a 4-byte-character XML file
      without one fails now);
    * a declared encoding that is not in ``permitted``, has a codec
      installed, and is not an alias of any permitted encoding.
    """
    text = _normaliseNewlines(dec(sig)[0])
    eo = _encodingFromDecl(text)

    if not eo:
        _logEvent(loggedEvents,
                  logging.UnicodeError({'exception': 'This XML file (apparently ' + permitted[0] + ') requires an encoding declaration'}),
                  (1, 1))
        return eo

    if not permitted or eo[0].upper() in permitted:
        return eo

    if not _hasCodec(eo[0]):
        return eo

    # see if the codec is an alias of one of the permitted encodings
    codec = codecs.lookup(eo[0])
    for candidate in permitted:
        if _hasCodec(candidate) and codecs.lookup(candidate)[-1] == codec[-1]:
            return eo

    _logEvent(loggedEvents,
              logging.UnicodeError({'exception': 'This XML file claims an encoding of ' + eo[0] + ', but looks more like ' + permitted[0]}),
              eo[1])
    return eo
def _decodePostBOMDeclaration(sig, dec, permitted, loggedEvents, fallback=None):
    """Read the encoding declaration that follows a byte order mark.

    sig          -- the octets of the document after the BOM
    dec          -- decoder function used to turn ``sig`` into text
    permitted    -- encoding names (upper case) compatible with the BOM;
                    the first one is quoted in the error message
    loggedEvents -- list that receives any error event
    fallback     -- encoding to report when there is no declaration

    Returns the (encoding, position) pair from the declaration when its
    encoding is permitted, ``(fallback, None)`` when no declaration is
    present, and None (after logging a UnicodeError event) when the
    declared encoding is not in ``permitted``.
    """
    text = _normaliseNewlines(dec(sig)[0])
    eo = _encodingFromDecl(text)

    if not eo:
        return (fallback, None)
    if eo[0].upper() in permitted:
        return eo

    _logEvent(loggedEvents,
              logging.UnicodeError({'exception': 'Document starts with ' + permitted[0] + ' BOM marker but has incompatible declaration of ' + eo[0]}),
              eo[1])
    return None
def isStandard(x):
    """Tell whether ``x`` names an encoding that the XML 1.0
    Specification, section 4.3.3, requires (compared case-insensitively)."""
    name = x.upper()
    return name in ('UTF-8', 'UTF-16')
def isCommon(x):
    """Tell whether ``x`` names a commonly used encoding.

    Every standard encoding (see ``isStandard``) counts as common.  The
    remaining names follow
    <http://www.syndic8.com/stats.php?Section=feeds#XMLEncodings>
    (as of 2004-03-27).  The comparison is case-insensitive."""
    if isStandard(x):
        return True
    return x.upper() in (
        'US-ASCII', 'ISO-8859-1',
        'EUC-JP', 'ISO-8859-2', 'ISO-8859-15', 'ISO-8859-7',
        'KOI8-R', 'SHIFT_JIS', 'WINDOWS-1250', 'WINDOWS-1251',
        'WINDOWS-1252', 'WINDOWS-1254', 'WINDOWS-1255', 'WINDOWS-1256',
        # This doesn't seem to be popular, but is the Chinese
        # government's mandatory standard
        'GB18030',
    )
# Inspired by xmlproc's autodetect_encoding, but rewritten
def _detect(doc_start, loggedEvents=[], fallback='UTF-8'):
"""This is the logic from appendix F.1 of the XML 1.0 specification.
Pass in the start of a document (>= 256 octets), and receive the encoding to
use, or None if there is a problem with the document."""
sig = doc_start[:4]
# With a BOM. We also check for a declaration, and make sure
# it doesn't contradict (for 4-byte encodings, it's required)
if sig == '\x00\x00\xFE\xFF': # UTF-32 BE
eo = _decodeDeclaration(doc_start[4:], _decUTF32BE, ['UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
elif sig == '\xFF\xFE\x00\x00': # UTF-32 LE
eo = _decodeDeclaration(doc_start[4:], _decUTF32LE, ['UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
elif sig == '\x00\x00\xFF\xFE' or sig == '\xFE\xFF\x00\x00':
raise UnicodeError('Unable to process UCS-4 with unusual octet ordering')
elif sig[:2] == '\xFE\xFF': # UTF-16 BE
eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16BE, ['UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents, fallback='UTF-16')
elif sig[:2] == '\xFF\xFE': # UTF-16 LE
eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16LE, ['UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents, fallback='UTF-16')
elif sig[:3] == '\xEF\xBB\xBF':
eo = _decodePostBOMDeclaration(doc_start[3:], _decACE, ['UTF-8'], loggedEvents, fallback='UTF-8')
# Without a BOM; we must read the declaration
elif sig == '\x00\x00\x00\x3C':
eo = _decodeDeclaration(doc_start, _decUTF32BE, ['UTF-32BE', 'UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
elif sig == '\x3C\x00\x00\x00':
eo = _decodeDeclaration(doc_start, _decUTF32LE, ['UTF-32LE', 'UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
elif sig == '\x00\x3C\x00\x3F':
eo = _decodeDeclaration(doc_start, _decUTF16BE, ['UTF-16BE', 'UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents)
elif sig == '\x3C\x00\x3F\x00':
eo = _decodeDeclaration(doc_start, _decUTF16LE, ['UTF-16LE', 'UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents)
elif sig == '\x3C\x3F\x78\x6D':
eo = _encodingFromDecl(_normaliseNewlines(_decACE(doc_start)[0])) or ('UTF-8', None)
elif sig == '\x4C\x6F\xA7\x94':
eo = _decodeDeclaration(doc_start, _decEBCDIC, ['IBM037', 'CP037', 'IBM038', 'EBCDIC-INT'], loggedEvents)
# There's no BOM, and no declaration. It's UTF-8, or mislabelled.
else:
eo = (fallback, None)
return eo
def detect(doc_start, loggedEvents=None, fallback='UTF-8'):
    """Return the name of the encoding detected for a document.

    doc_start    -- the start of the document, passed on to ``_detect``
    loggedEvents -- list that receives any events logged during
                    detection; a new list is used when omitted
    fallback     -- encoding to report when the document itself gives no
                    indication

    Returns the encoding name, or None when ``_detect`` reports a problem
    with the document.
    """
    # Build the list here instead of using a mutable default, which would
    # be shared by every call that omits the argument.
    if loggedEvents is None:
        loggedEvents = []

    eo = _detect(doc_start, loggedEvents, fallback)
    if eo:
        return eo[0]
    return None
# Pattern for an XML declaration that carries an encoding pseudo-attribute:
#   <?xml version="..." encoding="..."
# Either quote style is accepted for both values, and 'version' must come
# first.  Group 1 spans the whole `encoding = "..."` text; the encoding
# name itself is group 2 when double-quoted and group 3 when single-quoted.
# The users in this module apply it with .match(), i.e. at the start of the text.
_encRe = re.compile(r'<\?xml\s+version\s*=\s*(?:"[-a-zA-Z0-9_.:]+"|\'[-a-zA-Z0-9_.:]+\')\s+(encoding\s*=\s*(?:"([-A-Za-z0-9._]+)"|\'([-A-Za-z0-9._]+)\'))')
def _encodingFromDecl(x):
    """Extract the declared encoding from the start of document text ``x``.

    Returns an (encoding, position) pair, where position is the
    (row, column) computed by ``_position`` for the first character of
    the encoding name.  Returns None when ``x`` does not begin with an
    XML declaration containing an encoding.
    """
    m = _encRe.match(x)
    if not m:
        return None
    # Group 2 holds a double-quoted name, group 3 a single-quoted one
    which = 2 if m.group(2) else 3
    return m.group(which), _position(x, m.start(which))
def removeDeclaration(x):
    """Blank out the encoding declaration of an XML document string.

    The `encoding="..."` part of the XML declaration is overwritten with
    the same number of spaces, so every other character keeps its offset.
    A string without such a declaration is returned unchanged.  Some XML
    parsers don't allow the encoding to be overridden, and this is a
    workaround."""
    m = _encRe.match(x)
    if not m:
        return x
    begin, finish = m.span(1)
    return x[:begin] + ' ' * (finish - begin) + x[finish:]
def _hasCodec(enc):
try:
return codecs.lookup(enc) is not None
except:
return False
def decode(mediaType, charset, bs, loggedEvents, fallback=None):
    """Decode the octets of an XML document, logging encoding problems.

    mediaType    -- media type of the document (only checked for a
                    "text/" prefix), or None
    charset      -- externally supplied charset, or None; it takes
                    precedence over the encoding found in the document
    bs           -- the document as a byte string
    loggedEvents -- list that receives every event logged here
    fallback     -- encoding to use when none can be determined, or when
                    the chosen one has no codec; may be None

    Returns an (encoding, text) pair.  The pair is (None, None) when no
    encoding could be settled on.  When the octets are not valid in the
    chosen encoding, a UnicodeError event is logged (with the position of
    the first bad octet when the exception provides it) and ``text`` is
    the result of decoding with the 'replace' error handler.
    """
    # Our own fallback is deliberately not passed on: with fallback=None
    # the detected encoding stays None when the document gives no
    # indication, which is what the checks below rely on.
    eo = _detect(bs, loggedEvents, fallback=None)

    # Check declared encodings
    if eo and eo[1] and _hasCodec(eo[0]):
        if not isCommon(eo[0]):
            _logEvent(loggedEvents, ObscureEncoding({"encoding": eo[0]}), eo[1])
        elif not isStandard(eo[0]):
            _logEvent(loggedEvents, NonstdEncoding({"encoding": eo[0]}), eo[1])

    if eo:
        encoding = eo[0]
    else:
        encoding = None

    if charset and encoding and charset.lower() != encoding.lower():
        # RFC 3023 requires us to use 'charset', but a number of aggregators
        # ignore this recommendation, so we should warn.
        loggedEvents.append(logging.EncodingMismatch({"charset": charset, "encoding": encoding}))

    if mediaType and mediaType.startswith("text/") and charset is None:
        loggedEvents.append(logging.TextXml({}))

        # RFC 3023 requires text/* to default to US-ASCII.  Issue a warning
        # if this occurs, but continue validation using the detected encoding
        try:
            bs.decode("US-ASCII")
        except UnicodeError:
            if not encoding:
                try:
                    bs.decode(fallback)
                    encoding = fallback
                except Exception:
                    # Best effort only: the fallback may be None, unknown
                    # or unable to decode the document.  Leave 'encoding'
                    # unset in all of those cases.
                    pass
            if encoding and encoding.lower() != 'us-ascii':
                loggedEvents.append(logging.EncodingMismatch({"charset": "US-ASCII", "encoding": encoding}))

    enc = charset or encoding

    if enc is None:
        loggedEvents.append(logging.MissingEncoding({}))
        enc = fallback
    elif not _hasCodec(enc):
        if eo:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}), eo[1])
        else:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}))
        enc = fallback

    if enc is None:
        return enc, None

    dec = getdecoder(enc)
    try:
        return enc, dec(bs)[0]
    except UnicodeError as ue:
        salvage = dec(bs, 'replace')[0]
        # 'start' is an attribute of codec-raised errors but is not kept
        # in the instance __dict__, so it has to be read with getattr().
        start = getattr(ue, 'start', None)
        if start is not None:
            # XXX 'start' is in bytes, not characters. This is wrong for
            # multibyte encodings
            pos = _position(salvage, start)
        else:
            pos = None

        _logEvent(loggedEvents, logging.UnicodeError({"exception": ue}), pos)

        return enc, salvage
_encUTF8 = codecs.getencoder('UTF-8')


def asUTF8(x):
    """Turn ``x`` into a UTF-8 encoded string suitable for parsing.

    ``x`` is converted to a Unicode string, its encoding declaration is
    blanked out by ``removeDeclaration``, and the result is encoded as
    UTF-8."""
    undeclared = removeDeclaration(unicode(x))
    encoded, _consumed = _encUTF8(undeclared)
    return encoded
if __name__ == '__main__':
    # Command-line use: for every argument that is an existing file,
    # print the file name and its detected encoding, or the logged events
    # when no encoding could be detected.
    from sys import argv
    from os.path import isfile

    for path in argv[1:]:
        if isfile(path):
            # Binary mode: detection looks at the raw octets.  The 'with'
            # block closes the file, which the handle was never given a
            # chance to do before.
            with open(path, 'rb') as f:
                head = f.read(1024)

            log = []
            eo = detect(head, log)
            if eo:
                print("%s %s" % (path, eo))
            else:
                print(repr(log))
|