File: html2text.py

package info (click to toggle)
squishdot 1.3.0-1
  • links: PTS
  • area: main
  • in suites: woody
  • size: 896 kB
  • ctags: 349
  • sloc: python: 2,313; makefile: 56; sh: 54
file content (104 lines) | stat: -rw-r--r-- 3,037 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright (c) 2001 Chris Withers
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
# See license.txt for more details.
#
# $Id: html2text.py,v 1.2 2001/08/17 15:33:43 fresh Exp $

# Module version, sliced out of the CVS keyword string below:
# '$Revision: 1.2 $'[11:-2] drops the leading '$Revision: ' and the
# trailing ' $', leaving '1.2'.
__version__='$Revision: 1.2 $'[11:-2]  

import sgmllib, string

class HTML2Text(sgmllib.SGMLParser):
    """Render HTML markup as indented plain text.

    The tag and data callbacks below append to ``self.result``; read that
    attribute once the markup has been fed to the parser.

    Rendering rules:
      * headings (h1-h6), br, pre, p and hr break the current line
      * img becomes a line of its own reading ``Image: <src>``
      * li starts a new line with a ``- `` marker, or ``N - `` inside ol
      * ul / ol / dl and dd / dt each add one level of indentation

    Attributes:
      result    -- the plain text produced so far
      indent    -- current indentation level (never negative)
      ol_number -- number for the next item of the innermost list when that
                   list is an ``ol``; 0 otherwise
    """

    # replace entitydefs from sgmllib
    try:
        from htmlentitydefs import entitydefs
    except ImportError:
        # Same table under its Python 3 name, for setups that provide
        # sgmllib there.
        from html.entities import entitydefs

    _INDENT_UNIT = '   '
    _BREAK_TAGS = ('br', 'pre', 'p', 'hr')
    _DEFINITION_TAGS = ('dd', 'dt')
    _LIST_TAGS = ('ul', 'dl', 'ol')

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""
        self.indent = 0
        self.ol_number = 0
        # ol_number values of the enclosing lists, innermost last, so that a
        # nested list cannot clobber the numbering of the list around it.
        self._ol_stack = []
        # True when the last text chunk ended in whitespace that was stripped
        # and may still be needed to separate it from the next chunk.
        self._pending_space = False

    def _mid_line(self):
        """Return true if the result currently ends inside an unfinished line."""
        return self.result != '' and self.result[-1:] != '\n'

    def _is_heading(self, tag):
        """Return true for the heading tags h1 .. h6 only (not html, head, hr)."""
        return len(tag) == 2 and tag[0] == 'h' and tag[1] in '123456'

    def add_line(self, text, newline='\n'):
        """Append *text* followed by *newline* to the result.

        Indentation is written only when *text* is non-empty and begins a
        new line: text continuing an open line (for example after a list
        marker) is appended as is, and blank lines carry no trailing spaces.
        """
        prefix = ''
        if text and not self._mid_line():
            prefix = self.indent * self._INDENT_UNIT
        self.result = self.result + prefix + text + newline
        # Anything written explicitly supersedes a remembered separator.
        self._pending_space = False

    def mod_indent(self, i):
        """Change the indentation level by *i*, clamping it at zero."""
        self.indent = max(self.indent + i, 0)

    def handle_data(self, data):
        """Append a chunk of character data to the result.

        Surrounding whitespace is stripped.  A chunk that still spans
        several lines is written line by line, each line terminated; a
        single-line chunk is appended without a newline so that following
        inline content continues on the same line.  Whitespace-only chunks
        write nothing, but whitespace at a chunk boundary is remembered so
        that words on either side of an inline tag or entity stay separated
        by one space.
        """
        if not data:
            return

        text = data.strip()
        if not text:
            # Nothing to print; just note that a separator was seen.
            self._pending_space = True
            return

        wants_space = self._pending_space or data[:1].isspace()
        if wants_space and self._mid_line() and not self.result[-1:].isspace():
            self.result = self.result + ' '

        lines = text.split('\n')
        if len(lines) > 1:
            for line in lines:
                self.add_line(line)
        else:
            self.add_line(lines[0], newline='')

        # Set after add_line(), which clears the flag.
        self._pending_space = data[-1:].isspace()

    def unknown_starttag(self, tag, attrs):
        """Emit the plain-text equivalent of an opening tag.

        *attrs* is a sequence of (name, value) pairs; it is only consulted
        for the ``src`` of an ``img``.  Tags not listed in the class
        docstring produce no output.
        """
        tag = tag.lower()

        if self._is_heading(tag) or tag in self._BREAK_TAGS:
            # insert a blank line
            self.add_line('')

        elif tag == 'img':
            # newline, text, newline
            src = ''
            for name, value in attrs:
                if name.lower() == 'src':
                    src = value
            self.add_line('')
            self.add_line('Image: %s' % src)

        elif tag == 'li':
            self.add_line('')
            if self.ol_number:
                # num - text
                self.add_line('%s - ' % self.ol_number, '')
                self.ol_number = self.ol_number + 1
            else:
                # - text
                self.add_line('- ', '')

        elif tag in self._DEFINITION_TAGS:
            self.add_line('')
            self.mod_indent(+1)

        elif tag in self._LIST_TAGS:
            self.mod_indent(+1)
            # Save the enclosing list's counter and start afresh: numbered
            # from 1 for ol, unnumbered for ul and dl.
            self._ol_stack.append(self.ol_number)
            if tag == 'ol':
                self.ol_number = 1
            else:
                self.ol_number = 0

    def unknown_endtag(self, tag):
        """Emit the plain-text equivalent of a closing tag.

        Headings and ``pre`` end the current line; list and definition
        tags undo the indentation added by their opening tag, and closing
        a list restores the numbering of the list around it.
        """
        tag = tag.lower()

        if self._is_heading(tag) or tag == 'pre':
            self.add_line('')

        elif tag in self._DEFINITION_TAGS:
            # decrease indent
            self.mod_indent(-1)

        elif tag in self._LIST_TAGS:
            # decrease indent
            self.mod_indent(-1)
            if self._ol_stack:
                self.ol_number = self._ol_stack.pop()
            else:
                # Unbalanced closing tag: fall back to "not numbered".
                self.ol_number = 0