1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
|
# Copyright (c) 2001 Chris Withers
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
# See license.txt for more details.
#
# $Id: html2text.py,v 1.2 2001/08/17 15:33:43 fresh Exp $
# Version string derived from the "$Revision: ... $" keyword literal below
# (presumably expanded by the revision-control system, like the $Id$ line
# above -- confirm).  The [11:-2] slice drops the leading "$Revision: "
# (11 characters) and the trailing " $", leaving just the number ("1.2").
__version__='$Revision: 1.2 $'[11:-2]
import sgmllib, string
class HTML2Text(sgmllib.SGMLParser):
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.result = ""
self.indent = 0
self.ol_number = 0
def add_line(self,text,newline='\n'):
self.result = self.result + self.indent*' ' + text + newline
def mod_indent(self,i):
self.indent = self.indent + i
if self.indent < 0:
self.indent = 0
def handle_data(self, data):
if data:
data = string.split(string.strip(data), '\n')
if len(data)>1:
map(self.add_line, data)
else:
self.add_line(data[0], newline='')
def unknown_starttag(self, tag, attrs):
""" Convert HTML to something meaningful in plain text """
tag = string.lower(tag)
if tag[0]=='h' or tag in ['br','pre','p','hr']:
# insert a blank line
self.add_line('')
elif tag =='img':
# newline, text, newline
src = ''
for k, v in attrs:
if string.lower(k) == 'src':
src = v
self.add_line('')
self.add_line('Image: %s' % src)
elif tag =='li':
self.add_line('')
if self.ol_number:
# num - text
self.add_line('%s - ' % self.ol_number,'')
self.ol_number = self.ol_number + 1
else:
# - text
self.add_line('- ','')
elif tag in ['dd','dt']:
self.add_line('')
# increase indent
self.mod_indent(+1)
elif tag in ['ul','dl','ol']:
# blank line
#self.add_line('')
# increase indent
self.mod_indent(+1)
if tag=='ol':
self.ol_number = 1
def unknown_endtag(self, tag):
""" Convert HTML to something meaningful in plain text """
tag = string.lower(tag)
if tag[0]=='h' or tag in ['pre']:
# newline, text, newline
self.add_line('')
elif tag =='li':
#self.add_line('')
pass
elif tag in ['dd','dt']:
#self.add_line('')
# descrease indent
self.mod_indent(-1)
elif tag in ['ul','dl','ol']:
# blank line
#self.add_line('')
# decrease indent
self.mod_indent(-1)
self.ol_number = 0
|