File: escaping.py

package info (click to toggle)
python-weberror 0.13.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster, stretch
  • size: 408 kB
  • ctags: 441
  • sloc: python: 2,774; makefile: 18
file content (232 lines) | stat: -rw-r--r-- 7,763 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# filters.py
# Copyright (C) 2006, 2007, 2008 Geoffrey T. Dairiki <dairiki@dairiki.org> 
# and Michael Bayer <mike_mp@zzzcomputing.com> and Ben Bangert <ben@groovie.org>
#
# This module heavily based on the one from Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php


import re, cgi, urllib, htmlentitydefs, codecs
from StringIO import StringIO

xml_escapes = {
    '&' : '&amp;',
    '>' : '&gt;',
    '<' : '&lt;',
    '"' : '&#34;',   # also &quot; in html-only
    "'" : '&#39;'    # also &apos; in html-only   
}
# XXX: &quot; is valid in HTML and XML
#      &apos; is not valid HTML, but is valid XML

def html_escape(string):
    return cgi.escape(string, True)

def xml_escape(string):
    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)

def url_escape(string):
    # convert into a list of octets
    string = string.encode("utf8")
    return urllib.quote_plus(string)

def url_unescape(string):
    text = urllib.unquote_plus(string)
    if not is_ascii_str(text):
        text = text.decode("utf8")
    return text

def trim(string):
    return string.strip()


class Decode(object):
    def __getattr__(self, key):
        def decode(x):
            if isinstance(x, unicode):
                return x
            elif not isinstance(x, str):
                return unicode(str(x), encoding=key)
            else:
                return unicode(x, encoding=key)
        return decode
decode = Decode()
       
           
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')

def is_ascii_str(text):
    return isinstance(text, str) and _ASCII_re.match(text)

################################################################   

class XMLEntityEscaper(object):
    def __init__(self, codepoint2name, name2codepoint):
        self.codepoint2entity = dict([(c, u'&%s;' % n)
                                      for c,n in codepoint2name.iteritems()])
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return unicode(text).translate(self.codepoint2entity)

    def __escape(self, m):
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return '&#x%X;' % codepoint


    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(self.__escape, unicode(text)
                                    ).encode('ascii')

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    __characterrefs = re.compile(r'''& (?:
                                          \#(\d+)
                                          | \#x([\da-f]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;''',
                                 re.X | re.UNICODE)
   
    def __unescape(self, m):
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xfffd)
            # U+FFFD = "REPLACEMENT CHARACTER"
        if codepoint < 128:
            return chr(codepoint)
        return unichr(codepoint)
   
    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)


_html_entities_escaper = XMLEntityEscaper(htmlentitydefs.codepoint2name,
                                          htmlentitydefs.name2codepoint)

html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape


def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python `codecs`_ error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start:ex.end]
        text = _html_entities_escaper.escape(bad_text)
        return (unicode(text), ex.end)
    raise ex

codecs.register_error('htmlentityreplace', htmlentityreplace_errors)


# TODO: options to make this dynamic per-compilation will be added in a later release
DEFAULT_ESCAPES = {
    'x':'filters.xml_escape',
    'h':'filters.html_escape',
    'u':'filters.url_escape',
    'trim':'filters.trim',
    'entity':'filters.html_entities_escape',
    'unicode':'unicode',
    'decode':'decode',
    'str':'str',
    'n':'n'
}

# regexps used by _translateCdata(),
# made global to compile once.
# see http://www.xml.com/axml/target.html#dt-character
ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
# Note: Prolly fuzzy on this, but it looks as if characters from the
# surrogate block are allowed if in scalar form, which is encoded in UTF8 the
# same was as in surrogate block form
XML_ILLEGAL_CHAR_PATTERN = re.compile(
    '%s|%s' % (ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
# the characters that we will want to turn into entrefs
# We must do so for &, <,  and > following ]].
# The xml parser has more leeway, but we're not the parser.
# http://www.xml.com/axml/target.html#dt-chardata
# characters that we must *always* turn to entrefs:
g_cdataCharPatternReq = re.compile('[&<]|]]>')
g_charToEntityReq = {
    '&': '&amp;',
    '<': '&lt;',
    ']]>': ']]&gt;',
    }
# characters that we must turn to entrefs in attr values:
g_cdataCharPattern = re.compile('[&<>"\']|]]>')
g_charToEntity = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
    ']]>': ']]&gt;',
    }

def removeIllegalChars(characters):
    if XML_ILLEGAL_CHAR_PATTERN.search(characters):
        characters = XML_ILLEGAL_CHAR_PATTERN.subn(
            lambda m: '&#%i;' % ord(m.group()),
            characters)[0]
    return characters

def translateCdata(characters, allEntRefs = None):
    """Translate characters into a legal format."""
    if not characters:
        return ''
    if allEntRefs: # translate all chars to entrefs; for attr value
        if g_cdataCharPattern.search(characters):
            new_string = g_cdataCharPattern.subn(
                lambda m, d=g_charToEntity: d[m.group()],
                characters)[0]
        else:
            new_string = characters
    else: # translate only required chars to entrefs
        if g_cdataCharPatternReq.search(characters):
            new_string = g_cdataCharPatternReq.subn(
                lambda m, d=g_charToEntityReq: d[m.group()],
                characters)[0]
        else:
            new_string = characters
    if XML_ILLEGAL_CHAR_PATTERN.search(new_string):
        new_string = XML_ILLEGAL_CHAR_PATTERN.subn(
            lambda m: '&#%i;' % ord(m.group()),
            new_string)[0]
    return new_string