File: HtmlLib.py

package info (click to toggle)
python-xml 0.8.4-10.1%2Blenny1
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 4,972 kB
  • ctags: 10,628
  • sloc: python: 46,730; ansic: 14,354; xml: 968; makefile: 201; sh: 20
file content (88 lines) | stat: -rw-r--r-- 3,174 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
########################################################################
#
# File Name:            HtmlLib.py
#
#
"""
Components for reading HTML files using htmllib.py.
WWW: http://4suite.com/4DOM         e-mail: support@4suite.com

Copyright (c) 2000 Fourthought Inc, USA.   All Rights Reserved.
See  http://4suite.com/COPYRIGHT  for license and copyright information
"""

import os, urllib
from xml.dom.ext import reader
from xml.dom import Node

import Sgmlop

class Reader(reader.Reader):
    def __init__(self):
        self.parser = Sgmlop.HtmlParser()

    def fromStream(self, stream, ownerDoc=None, charset=''):
        self.parser.initParser()
        self.parser.initState(ownerDoc, charset)
        self.parser.parse(stream)
        frag = self.parser.rootNode

        if ownerDoc is None:
            # Use the created document
            doc = frag.ownerDocument

            # we have created a new document, and a documentFragment that belongs to this document.
            # However, the document is already '<HTML><HEAD><TITLE></TITLE></HEAD><BODY></BODY></HTML>'
            # so we need to find out if we have an HTML node in the fragment.
            for child in frag.childNodes:
                if child.nodeType == Node.ELEMENT_NODE and child.tagName == 'HTML':
                    # clean up the junk automatically generated by the HTMLDomImplementation
                    while doc.documentElement.firstChild:
                        c = doc.documentElement.removeChild(doc.documentElement.firstChild)
                        self.releaseNode(c)
                    # copy stuff
                    while child.firstChild:
                        doc.documentElement.appendChild(child.firstChild)
                    self.releaseNode(frag)
                    return doc

            # We are here if we could not find an HTML element in the fragment.
            # In this case, we should append everything under the BODY element in the document
            body = doc.documentElement.lastChild
            body.appendChild(frag)
            self.releaseNode(frag)
            return doc
        else:
            # an owner document was passed, we return the fragment, as is.
            return frag

    def fromUri(self, uri, ownerDoc=None, charset=''):
        stream = reader.BASIC_RESOLVER.resolve(uri)
        try:
            return self.fromStream(stream, ownerDoc, charset)
        finally:
            stream.close()

    def fromString(self, str, ownerDoc=None, charset=''):
        stream = reader.StrStream(str)
        try:
            return self.fromStream(stream, ownerDoc, charset)
        finally:
            stream.close()

########################## Deprecated ##############################

def FromHtmlStream(fp, ownerDoc=None, charset=''):
    return Reader().fromStream(fp, ownerDoc, charset)


def FromHtmlFile(fileName, ownerDoc=None, charset=''):
    return Reader().fromUri(fileName, ownerDoc, charset)


def FromHtmlUrl(url, ownerDoc=None, charset=''):
    return Reader().fromUri(url, ownerDoc, charset)


def FromHtml(text, ownerDoc=None, charset=''):
    return Reader().fromString(text, ownerDoc, charset)