# Copyright (c) 2001 Chris Withers
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
# See license.txt for more details.
#
# $Id: html2safehtml.py,v 1.7 2002/11/21 12:59:43 fresh Exp $
from HTMLParser import HTMLParser,HTMLParseError,piclose, charref, entityref
from string import lower,find
class HTML2SafeHTML(HTMLParser):
    """Filter HTML down to a whitelist of tags.

    Construct with the list of tag names that may pass through, feed the
    markup to the parser, then read the filtered markup from ``result``.
    Call ``cleanup()`` afterwards to append end tags for anything that is
    still open.

    What the handlers do:

    * start tags not in ``valid_tags`` are dropped (their text content is
      still passed through by ``handle_data``);
    * attributes whose name starts with ``on`` or whose value starts with
      ``javascript`` are dropped; all other attribute values are
      re-escaped before being written out;
    * entity references that are not in ``entitydefs`` and bare ``&``
      characters are quoted as ``&amp;``;
    * tags the base parser cannot parse are skipped up to the next ``>``.
    """

    # Tags that may be left unclosed: they are silently popped off the
    # open-tag stack when an enclosing tag is closed or at cleanup().
    can_close = ['li', 'p', 'dd', 'dt', 'option']

    # Tags that never take an end tag; they are never pushed on the stack.
    never_close = ['br', 'wbr', 'hr', 'input', 'isindex', 'base', 'meta',
                   'img']

    # Entity table consulted by handle_entityref(), bound as the class
    # attribute `entitydefs`.  The Python 2 module name is tried first; the
    # fallback only keeps this class body importable where the table lives
    # in html.entities.
    try:
        from htmlentitydefs import entitydefs
    except ImportError:
        from html.entities import entitydefs

    def __init__(self, valid_tags):
        """Create a filter that lets only the tags in `valid_tags` through."""
        HTMLParser.__init__(self)
        self.valid_tags = valid_tags
        self.result = ""      # filtered markup accumulated so far
        self.openTags = []    # stack of emitted tags awaiting an end tag

    def end_tag(self, tag):
        """Append the end tag for `tag` to the result."""
        self.result = "%s</%s>" % (self.result, tag)

    def handle_data(self, data):
        """Copy text content to the result unchanged."""
        if data:
            self.result = self.result + data

    def handle_charref(self, name):
        """Re-emit a numeric character reference as ``&#name;``."""
        self.result = "%s&#%s;" % (self.result, name)

    def handle_entityref(self, name):
        """Re-emit a known entity as is; quote the ``&`` of unknown ones."""
        if name in self.entitydefs:
            amp = '&'
        else:
            # Not a standard entity: escape the ampersand so the text is
            # shown literally instead of being treated as an entity.
            amp = '&amp;'
        self.result = "%s%s%s;" % (self.result, amp, name)

    def handle_starttag(self, tag, attrs):
        """Emit `tag` with its permitted attributes if it is whitelisted.

        `attrs` is a sequence of ``(name, value)`` pairs; a value of None
        means the attribute was given without a value.
        """
        if tag not in self.valid_tags:
            return
        pieces = ['<' + tag]
        for name, value in attrs:
            if value is None:
                pieces.append(' ' + name)
            elif self._attr_allowed(name, value):
                pieces.append(' %s="%s"' % (name, self._escape_attr(value)))
        pieces.append('>')
        self.result = self.result + ''.join(pieces)
        if tag not in self.never_close:
            self.openTags.append(tag)

    def _attr_allowed(self, name, value):
        """Return False for event handlers and ``javascript`` values."""
        if name[0:2].lower() == 'on':
            return False
        # Leading whitespace is stripped first so that a padded value such
        # as " javascript:..." cannot slip past the prefix check.
        return value.lstrip()[0:10].lower() != 'javascript'

    def _escape_attr(self, value):
        """Escape `value` for use inside a double-quoted attribute."""
        # '&' must be replaced first so the entities added below are not
        # escaped a second time.
        value = value.replace('&', '&amp;')
        value = value.replace('<', '&lt;')
        value = value.replace('>', '&gt;')
        return value.replace('"', '&quot;')

    def handle_endtag(self, tag):
        """Close `tag` if it is the innermost open tag.

        Open tags listed in `can_close` that sit on top of the stack are
        discarded first, without emitting an end tag for them.  If `tag`
        is then not on top of the stack, nothing is emitted.
        """
        stack = self.openTags
        while stack and stack[-1] != tag and stack[-1] in self.can_close:
            stack.pop()
        if stack and stack[-1] == tag:
            self.end_tag(stack.pop())

    def cleanup(self):
        """Append missing end tags for every tag that is still open.

        Tags listed in `can_close` are dropped from the stack without an
        end tag.  The stack is empty afterwards.
        """
        while self.openTags:
            tag = self.openTags.pop()
            if tag not in self.can_close:
                self.end_tag(tag)

    def _skip_broken_tag(self, i):
        """Return the index just past the next ``>`` at or after `i`.

        Returns -1 when there is no ``>`` in the remaining raw data.
        """
        match = piclose.search(self.rawdata, i)
        if match is None:
            return -1
        return match.end()

    def parse_starttag(self, i):
        """Parse a start tag; skip over it if the base parser rejects it."""
        try:
            return HTMLParser.parse_starttag(self, i)
        except HTMLParseError:
            return self._skip_broken_tag(i)

    def parse_endtag(self, i):
        """Parse an end tag; skip over it if the base parser rejects it."""
        try:
            return HTMLParser.parse_endtag(self, i)
        except HTMLParseError:
            return self._skip_broken_tag(i)

    def _quote_bare_ampersands(self, rawdata):
        """Return `rawdata` with every bare ``&`` replaced by ``&amp;``.

        An ampersand is left alone when the `charref` or `entityref`
        pattern matches at its position.
        """
        chunks = []
        pos = 0
        while True:
            amp_at = rawdata.find('&', pos)
            if amp_at == -1:
                break
            chunks.append(rawdata[pos:amp_at])
            if charref.match(rawdata, amp_at) or \
               entityref.match(rawdata, amp_at):
                chunks.append('&')
            else:
                chunks.append('&amp;')
            pos = amp_at + 1
        chunks.append(rawdata[pos:])
        return ''.join(chunks)

    def goahead(self, end):
        """Quote bare ampersands in the buffer, then parse it as usual.

        An HTMLParseError raised by the base parser is deliberately
        swallowed (None is returned) so that malformed input does not
        abort the filter.
        """
        # fix incomplete entity and char refs
        self.rawdata = self._quote_bare_ampersands(self.rawdata)
        # do normal parsing
        try:
            return HTMLParser.goahead(self, end)
        except HTMLParseError:
            pass
|