1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
# filters.py
# Copyright (C) 2006, 2007, 2008 Geoffrey T. Dairiki <dairiki@dairiki.org>
# and Michael Bayer <mike_mp@zzzcomputing.com> and Ben Bangert <ben@groovie.org>
#
# This module heavily based on the one from Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
import re, cgi, urllib, htmlentitydefs, codecs
from StringIO import StringIO
xml_escapes = {
'&' : '&',
'>' : '>',
'<' : '<',
'"' : '"', # also " in html-only
"'" : ''' # also ' in html-only
}
# XXX: " is valid in HTML and XML
# ' is not valid HTML, but is valid XML
def html_escape(string):
return cgi.escape(string, True)
def xml_escape(string):
return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
def url_escape(string):
# convert into a list of octets
string = string.encode("utf8")
return urllib.quote_plus(string)
def url_unescape(string):
text = urllib.unquote_plus(string)
if not is_ascii_str(text):
text = text.decode("utf8")
return text
def trim(string):
return string.strip()
class Decode(object):
def __getattr__(self, key):
def decode(x):
if isinstance(x, unicode):
return x
elif not isinstance(x, str):
return unicode(str(x), encoding=key)
else:
return unicode(x, encoding=key)
return decode
decode = Decode()
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
def is_ascii_str(text):
return isinstance(text, str) and _ASCII_re.match(text)
################################################################
class XMLEntityEscaper(object):
def __init__(self, codepoint2name, name2codepoint):
self.codepoint2entity = dict([(c, u'&%s;' % n)
for c,n in codepoint2name.iteritems()])
self.name2codepoint = name2codepoint
def escape_entities(self, text):
"""Replace characters with their character entity references.
Only characters corresponding to a named entity are replaced.
"""
return unicode(text).translate(self.codepoint2entity)
def __escape(self, m):
codepoint = ord(m.group())
try:
return self.codepoint2entity[codepoint]
except (KeyError, IndexError):
return '&#x%X;' % codepoint
__escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
def escape(self, text):
"""Replace characters with their character references.
Replace characters by their named entity references.
Non-ASCII characters, if they do not have a named entity reference,
are replaced by numerical character references.
The return value is guaranteed to be ASCII.
"""
return self.__escapable.sub(self.__escape, unicode(text)
).encode('ascii')
# XXX: This regexp will not match all valid XML entity names__.
# (It punts on details involving involving CombiningChars and Extenders.)
#
# .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
__characterrefs = re.compile(r'''& (?:
\#(\d+)
| \#x([\da-f]+)
| ( (?!\d) [:\w] [-.:\w]+ )
) ;''',
re.X | re.UNICODE)
def __unescape(self, m):
dval, hval, name = m.groups()
if dval:
codepoint = int(dval)
elif hval:
codepoint = int(hval, 16)
else:
codepoint = self.name2codepoint.get(name, 0xfffd)
# U+FFFD = "REPLACEMENT CHARACTER"
if codepoint < 128:
return chr(codepoint)
return unichr(codepoint)
def unescape(self, text):
"""Unescape character references.
All character references (both entity references and numerical
character references) are unescaped.
"""
return self.__characterrefs.sub(self.__unescape, text)
_html_entities_escaper = XMLEntityEscaper(htmlentitydefs.codepoint2name,
htmlentitydefs.name2codepoint)
html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape
def htmlentityreplace_errors(ex):
"""An encoding error handler.
This python `codecs`_ error handler replaces unencodable
characters with HTML entities, or, if no HTML entity exists for
the character, XML character references.
>>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
'The cost was €12.'
"""
if isinstance(ex, UnicodeEncodeError):
# Handle encoding errors
bad_text = ex.object[ex.start:ex.end]
text = _html_entities_escaper.escape(bad_text)
return (unicode(text), ex.end)
raise ex
codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
# TODO: options to make this dynamic per-compilation will be added in a later release
DEFAULT_ESCAPES = {
'x':'filters.xml_escape',
'h':'filters.html_escape',
'u':'filters.url_escape',
'trim':'filters.trim',
'entity':'filters.html_entities_escape',
'unicode':'unicode',
'decode':'decode',
'str':'str',
'n':'n'
}
# regexps used by _translateCdata(),
# made global to compile once.
# see http://www.xml.com/axml/target.html#dt-character
ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
# Note: Prolly fuzzy on this, but it looks as if characters from the
# surrogate block are allowed if in scalar form, which is encoded in UTF8 the
# same was as in surrogate block form
XML_ILLEGAL_CHAR_PATTERN = re.compile(
'%s|%s' % (ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
# the characters that we will want to turn into entrefs
# We must do so for &, <, and > following ]].
# The xml parser has more leeway, but we're not the parser.
# http://www.xml.com/axml/target.html#dt-chardata
# characters that we must *always* turn to entrefs:
g_cdataCharPatternReq = re.compile('[&<]|]]>')
g_charToEntityReq = {
'&': '&',
'<': '<',
']]>': ']]>',
}
# characters that we must turn to entrefs in attr values:
g_cdataCharPattern = re.compile('[&<>"\']|]]>')
g_charToEntity = {
'&': '&',
'<': '<',
'>': '>',
'"': '"',
"'": ''',
']]>': ']]>',
}
def removeIllegalChars(characters):
if XML_ILLEGAL_CHAR_PATTERN.search(characters):
characters = XML_ILLEGAL_CHAR_PATTERN.subn(
lambda m: '&#%i;' % ord(m.group()),
characters)[0]
return characters
def translateCdata(characters, allEntRefs = None):
"""Translate characters into a legal format."""
if not characters:
return ''
if allEntRefs: # translate all chars to entrefs; for attr value
if g_cdataCharPattern.search(characters):
new_string = g_cdataCharPattern.subn(
lambda m, d=g_charToEntity: d[m.group()],
characters)[0]
else:
new_string = characters
else: # translate only required chars to entrefs
if g_cdataCharPatternReq.search(characters):
new_string = g_cdataCharPatternReq.subn(
lambda m, d=g_charToEntityReq: d[m.group()],
characters)[0]
else:
new_string = characters
if XML_ILLEGAL_CHAR_PATTERN.search(new_string):
new_string = XML_ILLEGAL_CHAR_PATTERN.subn(
lambda m: '&#%i;' % ord(m.group()),
new_string)[0]
return new_string
|