File: html2safehtml.py

package info (click to toggle)
zope-stripogram 1.4-5
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 88 kB
  • ctags: 126
  • sloc: python: 649; makefile: 29; sh: 2
file content (128 lines) | stat: -rw-r--r-- 3,788 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Copyright (c) 2001 Chris Withers
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
# See license.txt for more details.
#
# $Id: html2safehtml.py,v 1.7 2002/11/21 12:59:43 fresh Exp $

from HTMLParser import HTMLParser,HTMLParseError,piclose, charref, entityref
from string import lower,find

class HTML2SafeHTML(HTMLParser):
    
    can_close   = ['li','p','dd','dt','option']
    never_close = ['br','wbr','hr','input','isindex','base','meta','img']
    
    def __init__(self,valid_tags):
        HTMLParser.__init__(self)
        self.valid_tags = valid_tags
        self.result = ""
        self.openTags = []
        
    def end_tag(self,tag):
        self.result = "%s</%s>" % (self.result, tag)
            
    def handle_data(self, data):
        if data:
            self.result = self.result + data

    def handle_charref(self, name):
        self.result = "%s&#%s;" % (self.result, name)
        
    from htmlentitydefs import entitydefs # our entity defs list to use
    
    def handle_entityref(self, name):
        # this quotes non-standard entities
        if self.entitydefs.has_key(name): 
            amp = '&'
        else:
            amp = '&amp;'
        self.result = "%s%s%s;" % (self.result, amp, name)

    def handle_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones """

        if tag in self.valid_tags:
            
            self.result = self.result + '<' + tag
            
            for k, v in attrs:
                if v is None:
                    self.result += ' ' + k
                else:
                    if lower(k[0:2]) != 'on' and lower(v[0:10]) != 'javascript':
                        self.result += ' %s="%s"' % (k, v)
                    
            if tag not in self.never_close:
                self.openTags.append(tag)
                
            self.result = self.result + '>'
                
    def handle_endtag(self, tag):

        try:            

            while tag != self.openTags[-1] and self.openTags[-1] in self.can_close:
                self.openTags.pop()
            
            if tag==self.openTags[-1]:
                self.end_tag(self.openTags.pop())                
                
        except IndexError:
            pass

    def cleanup(self):
        """ Append missing closing tags """
        while self.openTags:
            tag = self.openTags.pop()
            if tag not in self.can_close:
                self.end_tag(tag)

    def parse_starttag(self,i):
        try:
            return HTMLParser.parse_starttag(self,i)
        except HTMLParseError:
            try:
                return piclose.search(self.rawdata,i).end()
            except AttributeError:
                return -1
        
    def parse_endtag(self,i):
        try:
            return HTMLParser.parse_endtag(self,i)
        except HTMLParseError:
            try:
                return piclose.search(self.rawdata,i).end()
            except:
                return -1

    def goahead(self,end):

        # fix incomplete entity and char refs        
        rawdata = self.rawdata
        
        i = 0
        n = len(rawdata)
        newdata=''
        
        while i < n:
            j = find(rawdata,'&',i)
            if j==-1:
                break
            newdata = newdata + rawdata[i:j]
            if charref.match(rawdata, j) or entityref.match(rawdata, j):
                newdata = newdata + '&'
            else:
                newdata = newdata + '&amp;'
            i = j+1
            
        self.rawdata = newdata + rawdata[i:]

        # do normal parsing
        try:
            return HTMLParser.goahead(self,end)
        except HTMLParseError:
            pass