# Copyright (c) 2001 Chris Withers
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
# See license.txt for more details.
#
# $Id: html2text.py,v 1.2 2001/08/17 15:33:43 fresh Exp $

# Strip the leading "$Revision: " (11 characters) and the trailing " $" from
# the RCS-style keyword string, leaving only the bare revision number.
__version__='$Revision: 1.2 $'[11:-2]  

import sgmllib, string

class HTML2Text(sgmllib.SGMLParser):
    """Convert HTML into roughly formatted plain text.

    The parser callbacks below append to ``self.result`` as markup is
    processed; read ``result`` once parsing is finished.

    Attributes:
        result: the plain text accumulated so far.
        indent: current indentation level (three spaces per level).
        ol_number: number to use for the next ``<li>`` when the innermost
            open list is an ``<ol>``; 0 when items should be bulleted.
    """

    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""
        self.indent = 0
        self.ol_number = 0
        # Saved ``ol_number`` values of the enclosing lists, so that closing
        # a nested list restores the numbering of the list around it.
        self._list_counters = []

    def add_line(self, text, newline='\n'):
        """Append the current indentation, ``text`` and ``newline`` to the result."""
        self.result = self.result + self.indent*'   ' + text + newline

    def mod_indent(self, i):
        """Change the indentation level by ``i``, never letting it drop below 0."""
        self.indent = self.indent + i
        if self.indent < 0:
            self.indent = 0

    def handle_data(self, data):
        """Add character data to the result with surrounding whitespace removed.

        Multi-line data is emitted one indented line per input line; a single
        line is emitted without a trailing newline so that following text
        continues on the same output line.
        """
        text = data.strip()
        if not text:
            # Whitespace-only runs between tags carry no content; emitting
            # them would only add stray indentation to the output.
            return

        lines = text.split('\n')
        if len(lines) > 1:
            # Explicit loop: the calls are made purely for their side effect.
            for line in lines:
                self.add_line(line)
        else:
            self.add_line(lines[0], newline='')

    def unknown_starttag(self, tag, attrs):
        """ Convert HTML to something meaningful in plain text """
        tag = tag.lower()

        # NOTE: the first-letter test matches every tag starting with 'h'
        # (h1..h6, but also hr, html and head).
        if tag[0] == 'h' or tag in ['br', 'pre', 'p', 'hr']:
            # insert a blank line
            self.add_line('')

        elif tag == 'img':
            # newline, then a line naming the image source
            src = ''
            for k, v in attrs:
                if k.lower() == 'src':
                    src = v

            self.add_line('')
            self.add_line('Image: %s' % src)

        elif tag == 'li':
            self.add_line('')
            if self.ol_number:
                # num - text
                self.add_line('%s - ' % self.ol_number, '')
                self.ol_number = self.ol_number + 1
            else:
                # - text
                self.add_line('- ', '')

        elif tag in ['dd', 'dt']:
            self.add_line('')
            # increase indent
            self.mod_indent(+1)

        elif tag in ['ul', 'dl', 'ol']:
            # Remember the enclosing list's counter before starting this one,
            # so nested lists do not disturb the outer numbering.
            self._list_counters.append(self.ol_number)
            # increase indent
            self.mod_indent(+1)
            if tag == 'ol':
                self.ol_number = 1
            else:
                self.ol_number = 0

    def unknown_endtag(self, tag):
        """ Convert HTML to something meaningful in plain text """
        tag = tag.lower()

        if tag[0] == 'h' or tag in ['pre']:
            # finish the line after the element's text
            self.add_line('')

        elif tag == 'li':
            pass

        elif tag in ['dd', 'dt']:
            # decrease indent
            self.mod_indent(-1)

        elif tag in ['ul', 'dl', 'ol']:
            # decrease indent
            self.mod_indent(-1)
            # Resume the enclosing list's numbering; an unmatched end tag
            # leaves nothing to restore, so fall back to "not numbered".
            if self._list_counters:
                self.ol_number = self._list_counters.pop()
            else:
                self.ol_number = 0

