File: HtmlLib.py

package info (click to toggle)
python-xml 0.8.4-10.1%2Blenny1
links: PTS
area: main
in suites: lenny
size: 4,972 kB
ctags: 10,628
sloc: python: 46,730; ansic: 14,354; xml: 968; makefile: 201; sh: 20
file content (88 lines) | stat: -rw-r--r-- 3,174 bytes
parent folder | download | duplicates (3)
########################################################################
#
# File Name:            HtmlLib.py
#
#
"""
Components for reading HTML files using the Sgmlop HTML parser (Sgmlop.py).
WWW: http://4suite.com/4DOM         e-mail: support@4suite.com

Copyright (c) 2000 Fourthought Inc, USA.   All Rights Reserved.
See  http://4suite.com/COPYRIGHT  for license and copyright information
"""

import os, urllib
from xml.dom.ext import reader
from xml.dom import Node

import Sgmlop

class Reader(reader.Reader):
    """Reader that builds DOM nodes from HTML input via Sgmlop.HtmlParser."""

    def __init__(self):
        # A single parser instance is kept; fromStream() re-initialises it
        # before every parse.
        self.parser = Sgmlop.HtmlParser()

    def fromStream(self, stream, ownerDoc=None, charset=''):
        """
        Parse HTML from the stream and return the resulting DOM node.

        stream   -- object handed to the parser's parse() method
        ownerDoc -- optional owner document; when given, the parsed
                    document fragment is returned untouched
        charset  -- forwarded to the parser's initState()

        When no ownerDoc is supplied, the document owning the parsed
        fragment is returned, populated with the parsed content.
        """
        html_parser = self.parser
        html_parser.initParser()
        html_parser.initState(ownerDoc, charset)
        html_parser.parse(stream)
        fragment = html_parser.rootNode

        if ownerDoc is not None:
            # The caller supplied the owner document: hand back the fragment
            # exactly as parsed.
            return fragment

        # No owner document was given, so the parse created a new document
        # together with a fragment belonging to it.  That document already
        # holds '<HTML><HEAD><TITLE></TITLE></HEAD><BODY></BODY></HTML>',
        # so look for an HTML element among the fragment's children.
        document = fragment.ownerDocument

        for node in fragment.childNodes:
            if node.nodeType != Node.ELEMENT_NODE or node.tagName != 'HTML':
                continue

            root = document.documentElement

            # Discard the skeleton generated automatically by the
            # HTMLDomImplementation.
            stale = root.firstChild
            while stale:
                self.releaseNode(root.removeChild(stale))
                stale = root.firstChild

            # Move the parsed HTML element's children under the document
            # element.
            moved = node.firstChild
            while moved:
                root.appendChild(moved)
                moved = node.firstChild

            self.releaseNode(fragment)
            return document

        # No HTML element was found in the fragment: put everything under
        # the last child of the document element (the BODY of the skeleton).
        document.documentElement.lastChild.appendChild(fragment)
        self.releaseNode(fragment)
        return document

    def fromUri(self, uri, ownerDoc=None, charset=''):
        """
        Resolve uri with reader.BASIC_RESOLVER and parse the stream it yields.

        The stream is closed whether or not parsing succeeds.  The return
        value is that of fromStream().
        """
        source = reader.BASIC_RESOLVER.resolve(uri)
        try:
            result = self.fromStream(source, ownerDoc, charset)
        finally:
            source.close()
        return result

    def fromString(self, str, ownerDoc=None, charset=''):
        """
        Parse HTML text by wrapping it in a reader.StrStream.

        The stream is closed whether or not parsing succeeds.  The return
        value is that of fromStream().
        """
        source = reader.StrStream(str)
        try:
            result = self.fromStream(source, ownerDoc, charset)
        finally:
            source.close()
        return result

########################## Deprecated ##############################

def FromHtmlStream(fp, ownerDoc=None, charset=''):
    """Deprecated: parse the stream fp with a new Reader (see Reader.fromStream)."""
    html_reader = Reader()
    return html_reader.fromStream(fp, ownerDoc, charset)


def FromHtmlFile(fileName, ownerDoc=None, charset=''):
    """Deprecated: pass fileName to Reader.fromUri on a new Reader."""
    html_reader = Reader()
    return html_reader.fromUri(fileName, ownerDoc, charset)


def FromHtmlUrl(url, ownerDoc=None, charset=''):
    """Deprecated: pass url to Reader.fromUri on a new Reader."""
    html_reader = Reader()
    return html_reader.fromUri(url, ownerDoc, charset)


def FromHtml(text, ownerDoc=None, charset=''):
    """Deprecated: parse the HTML string text with a new Reader (see Reader.fromString)."""
    html_reader = Reader()
    return html_reader.fromString(text, ownerDoc, charset)