File: xmlEncoding.py

package info (click to toggle)
python-feedvalidator 0~svn1022-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd, squeeze, wheezy
  • size: 652 kB
  • ctags: 2,452
  • sloc: python: 9,481; makefile: 27; sh: 8
file content (288 lines) | stat: -rw-r--r-- 10,243 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/python

"""
$Id: xmlEncoding.py 988 2008-03-12 18:22:48Z sa3ruby $
This module deals with detecting XML encodings, using both BOMs and
explicit declarations.
"""

__author__ = "Joseph Walton <http://www.kafsemo.org/>"
__version__ = "$Revision: 988 $"
__copyright__ = "Copyright (c) 2004 Joseph Walton"

import codecs
import re
from logging import ObscureEncoding, NonstdEncoding
import logging

class FailingCodec:
  """Stand-in for a codec that is missing from this Python installation.

  The instance remembers the requested codec name; its 'fail' method has
  the call shape of a decoder function but always raises UnicodeError.
  """

  def __init__(self, name):
    # Name of the codec that could not be found; used in the error text.
    self.name = name

  def fail(self, txt, errors='strict'):
    """Raise UnicodeError naming the unavailable codec.

    'txt' and 'errors' are accepted only so that this method can be
    called like a real decoder; neither is inspected.
    """
    message = ''.join(['No codec available for ', self.name,
                       ' in this installation of FeedValidator'])
    raise UnicodeError(message)

def getdecoder(codec):
  """Return a decoder function for 'codec' without ever failing up front.

  When the codec lookup succeeds this is exactly codecs.getdecoder(codec).
  When it fails, the 'fail' method of a FailingCodec is returned instead:
  it can be called like a decoder, but raises UnicodeError on use. This
  lets the module be imported on installations that lack some codecs.
  """
  try:
    return codecs.getdecoder(codec)
  except Exception:
    # Deliberately broad (any lookup problem means "codec unusable"), but
    # no longer a bare 'except:' so that KeyboardInterrupt and SystemExit
    # are not swallowed.
    return FailingCodec(codec).fail

# These are generic decoders that are only used
#  to decode the XML declaration, from which we can read
#  the real encoding.
# Each one is obtained through this module's getdecoder(), so a codec
#  that is missing from the installation does not break the import; the
#  returned object only raises when it is actually called.
_decUTF32BE = getdecoder('UTF-32BE')
_decUTF32LE = getdecoder('UTF-32LE')
_decUTF16BE = getdecoder('UTF-16BE')
_decUTF16LE = getdecoder('UTF-16LE')
_decEBCDIC = getdecoder('IBM037') # EBCDIC
_decACE = getdecoder('ISO-8859-1') # An ASCII-compatible encoding

def _position(txt, idx):
  """Translate a character index in 'txt' into a 1-based (row, column).

  Rows are counted by the '\\n' characters that precede 'idx'. Columns
  are counted from the start of the current line, with each tab
  advancing to the next multiple of eight.
  """
  # Index of the first character of the line containing idx
  # (rfind gives -1 when there is no earlier newline, hence 0).
  lineStart = txt.rfind('\n', 0, idx) + 1

  col = 0
  for ch in txt[lineStart:idx]:
    if ch != '\t':
      col += 1
    else:
      # Jump to the next tab stop.
      col += 8 - col % 8

  return (txt.count('\n', 0, idx) + 1, col + 1)

def _normaliseNewlines(txt):
  """Return 'txt' with every CR LF pair and every lone CR turned into LF."""
  # Collapse CR LF first so that the remaining CRs are all stand-alone.
  withoutCrLf = txt.replace('\r\n', '\n')
  return '\n'.join(withoutCrLf.split('\r'))

def _logEvent(loggedEvents, e, pos=None):
  """Append event 'e' to 'loggedEvents', tagging it with a position.

  When 'pos' is a (line, column) pair it is stored in e.params under the
  keys 'line' and 'column' before the event is appended; a false 'pos'
  leaves e.params untouched.
  """
  if pos:
    line, column = pos
    e.params['line'] = line
    e.params['column'] = column
  loggedEvents.append(e)

def _decodeDeclaration(sig, dec, permitted, loggedEvents):
  """Read the encoding declaration from 'sig' after decoding it with 'dec'.

  Returns whatever _encodingFromDecl returns for the decoded text: an
  (encoding, position) pair, or None when there is no declaration.

  Problems are reported by appending to 'loggedEvents'; they do not
  change the return value:
   - no declaration at all: an error naming permitted[0] as the
     apparent encoding, positioned at (1, 1);
   - a declared encoding that is not in 'permitted' (compared in upper
     case), has a codec in this installation, and is not an alias of any
     permitted encoding: an error positioned at the declared name. This
     is what catches, e.g., a 4-byte-character XML file that claims to
     be US-ASCII.
  """
  text = _normaliseNewlines(dec(sig)[0])
  eo = _encodingFromDecl(text)

  if not eo:
    _logEvent(loggedEvents,
      logging.UnicodeError({'exception': 'This XML file (apparently ' + permitted[0] + ') requires an encoding declaration'}), (1, 1))
    return eo

  declared, where = eo

  # Nothing to verify, or the declared name is literally permitted.
  if not permitted or declared.upper() in permitted:
    return eo

  # An encoding we have no codec for cannot be compared against aliases.
  if not _hasCodec(declared):
    return eo

  # See if the codec is an alias of one of the permitted encodings, by
  # comparing the last element of each codec lookup result.
  declaredCodec = codecs.lookup(declared)
  isAlias = False
  for candidate in permitted:
    if not _hasCodec(candidate):
      continue
    if codecs.lookup(candidate)[-1] == declaredCodec[-1]:
      isAlias = True
      break

  if not isAlias:
    _logEvent(loggedEvents,
      logging.UnicodeError({'exception': 'This XML file claims an encoding of ' + declared + ', but looks more like ' + permitted[0]}), where)

  return eo

def _decodePostBOMDeclaration(sig, dec, permitted, loggedEvents, fallback=None):
  """Read the encoding declaration that follows a byte-order mark.

  'sig' is decoded with 'dec' and searched for a declaration.
   - No declaration: returns (fallback, None).
   - Declared encoding found in 'permitted' (compared in upper case):
     returns the (encoding, position) pair.
   - Declared encoding not in 'permitted': logs an error, naming
     permitted[0] as the BOM's encoding, and returns None.
  """
  text = _normaliseNewlines(dec(sig)[0])
  eo = _encodingFromDecl(text)

  if not eo:
    return (fallback, None)

  if eo[0].upper() in permitted:
    return eo

  _logEvent(loggedEvents,
    logging.UnicodeError({'exception': 'Document starts with ' + permitted[0] + ' BOM marker but has incompatible declaration of ' + eo[0]}), eo[1])
  return None

def isStandard(x):
  """Tell whether 'x' names an encoding that the XML 1.0 Specification,
  section 4.3.3, requires. The comparison ignores case."""
  required = ['UTF-8', 'UTF-16']
  name = x.upper()
  return name in required

def isCommon(x):
  """Tell whether 'x' names a commonly used encoding, going by
  <http://www.syndic8.com/stats.php?Section=feeds#XMLEncodings>
  (as of 2004-03-27). Every standard encoding counts as common; the
  comparison ignores case."""

  if isStandard(x):
    return True

  widespread = [
    'US-ASCII', 'ISO-8859-1',
    'EUC-JP', 'ISO-8859-2', 'ISO-8859-15', 'ISO-8859-7',
    'KOI8-R', 'SHIFT_JIS', 'WINDOWS-1250', 'WINDOWS-1251',
    'WINDOWS-1252', 'WINDOWS-1254', 'WINDOWS-1255', 'WINDOWS-1256',

    # This doesn't seem to be popular, but is the Chinese
    #  government's mandatory standard
    'GB18030'
    ]
  return x.upper() in widespread

# Inspired by xmlproc's autodetect_encoding, but rewritten
def _detect(doc_start, loggedEvents=None, fallback='UTF-8'):
  """This is the logic from appendix F.1 of the XML 1.0 specification.

  Pass in the start of a document (>= 256 octets) as a byte string.

  Returns an (encoding, position) pair, where 'position' is the
  (row, column) of the declared encoding name, or None when the encoding
  was not taken from a declaration. Returns None instead of a pair when
  a required declaration is missing or contradicts the byte-order mark.
  When the first octets match none of the known signatures the result is
  (fallback, None).

  Problems are appended to 'loggedEvents'; a fresh list is used when the
  argument is omitted. Raises UnicodeError for UCS-4 byte-order marks
  with unusual octet ordering.
  """
  # A mutable default would be shared by, and grow across, every call.
  if loggedEvents is None:
    loggedEvents = []

  # Names accepted for the 4-octet and 2-octet Unicode forms. The first
  # entry of the list handed to a helper is the one quoted in messages.
  ucs4 = ['UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4']
  ucs2 = ['UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2']

  sig = doc_start[:4]

  # With a BOM. We also check for a declaration, and make sure
  #  it doesn't contradict (for 4-byte encodings, it's required)
  if sig == '\x00\x00\xFE\xFF':  # UTF-32 BE
    eo = _decodeDeclaration(doc_start[4:], _decUTF32BE, ucs4, loggedEvents)
  elif sig == '\xFF\xFE\x00\x00':  # UTF-32 LE
    eo = _decodeDeclaration(doc_start[4:], _decUTF32LE, ucs4, loggedEvents)
  elif sig == '\x00\x00\xFF\xFE'  or sig == '\xFE\xFF\x00\x00':
    raise UnicodeError('Unable to process UCS-4 with unusual octet ordering')
  elif sig[:2] == '\xFE\xFF':  # UTF-16 BE
    eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16BE, ucs2, loggedEvents, fallback='UTF-16')
  elif sig[:2] == '\xFF\xFE':  # UTF-16 LE
    eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16LE, ucs2, loggedEvents, fallback='UTF-16')
  elif sig[:3] == '\xEF\xBB\xBF':  # UTF-8
    eo = _decodePostBOMDeclaration(doc_start[3:], _decACE, ['UTF-8'], loggedEvents, fallback='UTF-8')

  # Without a BOM; we must read the declaration. Each signature below is
  #  the start of '<?xml' (or a lone '<') in the encoding family named.
  elif sig == '\x00\x00\x00\x3C':  # UTF-32 BE
    eo = _decodeDeclaration(doc_start, _decUTF32BE, ['UTF-32BE'] + ucs4, loggedEvents)
  elif sig == '\x3C\x00\x00\x00':  # UTF-32 LE
    eo = _decodeDeclaration(doc_start, _decUTF32LE, ['UTF-32LE'] + ucs4, loggedEvents)
  elif sig == '\x00\x3C\x00\x3F':  # UTF-16 BE
    eo = _decodeDeclaration(doc_start, _decUTF16BE, ['UTF-16BE'] + ucs2, loggedEvents)
  elif sig == '\x3C\x00\x3F\x00':  # UTF-16 LE
    eo = _decodeDeclaration(doc_start, _decUTF16LE, ['UTF-16LE'] + ucs2, loggedEvents)
  elif sig == '\x3C\x3F\x78\x6D':  # '<?xm' in an ASCII-compatible encoding
    # Note: a missing declaration here yields 'UTF-8', not 'fallback'.
    eo = _encodingFromDecl(_normaliseNewlines(_decACE(doc_start)[0])) or ('UTF-8', None)
  elif sig == '\x4C\x6F\xA7\x94':  # EBCDIC
    eo = _decodeDeclaration(doc_start, _decEBCDIC, ['IBM037', 'CP037', 'IBM038', 'EBCDIC-INT'], loggedEvents)

  # There's no BOM, and no declaration. It's UTF-8, or mislabelled.
  else:
    eo = (fallback, None)

  return eo

def detect(doc_start, loggedEvents=None, fallback='UTF-8'):
  """Return the name of the encoding detected for a document, or None.

  'doc_start' is the start of the document. Problems found along the way
  are appended to 'loggedEvents'; a fresh list is used when the argument
  is omitted. 'fallback' is passed through to _detect, which uses it when
  the opening octets give no indication of the encoding.
  """
  # A mutable default would be shared by, and grow across, every call.
  if loggedEvents is None:
    loggedEvents = []

  eo = _detect(doc_start, loggedEvents, fallback)

  if eo:
    return eo[0]
  return None

# Matches an XML declaration that carries an encoding pseudo-attribute.
#  Group 1 is the whole 'encoding=...' span; group 2 is the encoding name
#  when double-quoted, group 3 when single-quoted.
_encRe = re.compile(r'<\?xml\s+version\s*=\s*(?:"[-a-zA-Z0-9_.:]+"|\'[-a-zA-Z0-9_.:]+\')\s+(encoding\s*=\s*(?:"([-A-Za-z0-9._]+)"|\'([-A-Za-z0-9._]+)\'))')

def _encodingFromDecl(x):
  """Pull the declared encoding out of the XML declaration opening 'x'.

  Returns (name, (row, column)), the position being that of the encoding
  name within 'x', or None when 'x' does not start with a declaration
  that includes an encoding.
  """
  found = _encRe.match(x)
  if not found:
    return None

  # Exactly one of the two quoting alternatives took part in the match.
  if found.group(2):
    which = 2
  else:
    which = 3
  return found.group(which), _position(x, found.start(which))

def removeDeclaration(x):
  """Blank out the encoding declaration of an XML document string.

  The 'encoding=...' part of the XML declaration is overwritten with the
  same number of spaces, so the string keeps its length. Some XML parsers
  don't allow the encoding to be overridden, and this is a workaround.
  A string without such a declaration is returned unchanged.
  """
  found = _encRe.match(x)
  if not found:
    return x

  begin, finish = found.span(1)
  padding = ' ' * (finish - begin)
  return x[:begin] + padding + x[finish:]

def _hasCodec(enc):
  """Tell whether this Python installation has a codec named 'enc'.

  Any failure of the lookup counts as "no": an unknown name, or a value
  that is not usable as an encoding name at all (such as None).
  """
  try:
    codecs.lookup(enc)
  except Exception:
    # Deliberately broad, since 'enc' may come straight from the document
    # or its transport headers, but no longer a bare 'except:' so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    return False
  return True

def decode(mediaType, charset, bs, loggedEvents, fallback=None):
  """Decode the byte string 'bs' of an XML document to Unicode.

  mediaType    -- the document's media type, or None
  charset      -- an externally supplied charset, or None; when given it
                  takes precedence over the encoding found in the
                  document
  bs           -- the raw document
  loggedEvents -- list that receives an event for every problem found
  fallback     -- encoding to use when none is known or the chosen one
                  has no codec; may be None

  Returns (encoding, text). 'text' is None when no encoding could be
  settled on. When decoding fails part-way the error is logged and 'text'
  is the document decoded with undecodable input replaced.
  """
  # Sniff with no fallback, so "nothing declared" stays distinguishable.
  eo = _detect(bs, loggedEvents, fallback=None)

  # Check declared encodings
  if eo and eo[1] and _hasCodec(eo[0]):
    if not(isCommon(eo[0])):
      _logEvent(loggedEvents, ObscureEncoding({"encoding": eo[0]}), eo[1])
    elif not(isStandard(eo[0])):
      _logEvent(loggedEvents, NonstdEncoding({"encoding": eo[0]}), eo[1])

  if eo:
    encoding = eo[0]
  else:
    encoding = None

  if charset and encoding and charset.lower() != encoding.lower():
    # RFC 3023 requires us to use 'charset', but a number of aggregators
    # ignore this recommendation, so we should warn.
    loggedEvents.append(logging.EncodingMismatch({"charset": charset, "encoding": encoding}))

  if mediaType and mediaType.startswith("text/") and charset is None:
    loggedEvents.append(logging.TextXml({}))

    # RFC 3023 requires text/* to default to US-ASCII.  Issue a warning
    # if this occurs, but continue validation using the detected encoding
    try:
      bs.decode("US-ASCII")
    except UnicodeError:
      if not encoding:
        try:
          bs.decode(fallback)
          encoding=fallback
        except (LookupError, TypeError, ValueError):
          # Best effort only: the fallback is missing (None), unknown,
          # or does not fit the data, so leave 'encoding' unset.
          pass
      if encoding and encoding.lower() != 'us-ascii':
        loggedEvents.append(logging.EncodingMismatch({"charset": "US-ASCII", "encoding": encoding}))

  enc = charset or encoding

  if enc is None:
    loggedEvents.append(logging.MissingEncoding({}))
    enc = fallback
  elif not(_hasCodec(enc)):
    if eo:
      _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}), eo[1])
    else:
      _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}))
    enc = fallback

  if enc is None:
    return enc, None

  dec = getdecoder(enc)
  try:
    return enc, dec(bs)[0]
  except UnicodeError as ue:
    salvage = dec(bs, 'replace')[0]

    # Read 'start' with getattr: on current Pythons it is not stored in
    # the exception's __dict__, so a membership test there never finds
    # it. A plain UnicodeError has no 'start' at all.
    start = getattr(ue, 'start', None)
    if start is not None:
      # 'start' is an offset in bytes. Decode the bytes before it to get
      # the matching character index, which is what _position expects.
      pos = _position(salvage, len(dec(bs[:start], 'replace')[0]))
    else:
      pos = None

    _logEvent(loggedEvents, logging.UnicodeError({"exception":ue}), pos)

    return enc, salvage


# Encoder function used by asUTF8, looked up once at import time.
_encUTF8 = codecs.getencoder('UTF-8')

def asUTF8(x):
  """Turn a Unicode string into a UTF-8 encoded string whose encoding
  declaration has been blanked out, ready to be handed to a parser."""
  text = unicode(x)
  undeclared = removeDeclaration(text)
  encoded = _encUTF8(undeclared)
  # The encoder returns (output, length consumed); only the output matters.
  return encoded[0]


if __name__ == '__main__':
  # Command-line use: for every argument that is an existing file, print
  # the file name and its detected encoding, or the logged events when no
  # encoding could be determined.
  from sys import argv
  from os.path import isfile

  for x in argv[1:]:
    if isfile(x):
      # Detection works on raw octets, so read in binary mode; the 'with'
      # block makes sure the handle is closed again.
      with open(x, 'rb') as f:
        l = f.read(1024)
      log = []
      eo = detect(l, log)
      # Single-argument print(...) gives the same output whether print
      # is a statement or a function.
      if eo:
        print('%s %s' % (x, eo))
      else:
        print(repr(log))