#!/usr/bin/python
#
# This example program converts a chunk of HTML to a DOM tree.
# It then prints the tree as HTML, as XML, and it prints a list of all
# the hyperlinks in the document by using getElementsByTagName() to
# retrieve all the A elements.
from xml.dom.html_builder import HtmlBuilder
from xml.dom.writer import HtmlWriter
from xml.dom import core
# Sample input for the demo: a small French-language HTML page (an index
# of Linux HOWTOs) containing three A elements, each with an HREF attribute.
# The markup is deliberately loose: the <P> element is never closed and one
# closing tag is written in lowercase (</a>).
#
# NOTE(review): this literal contains non-ASCII characters.  Python 2.5 and
# later reject such a source file unless an encoding declaration (PEP 263)
# appears on line 1 or 2 -- confirm the file's actual encoding before
# adding one.
HTML_DATA = """<HTML>
<HEAD><TITLE>Les HOWTO Linux</TITLE></HEAD>
<BODY>
<HR> <H1>Les HOWTO Linux</H1>
<P>Les Howto que vous trouverez ci-dessous sont en français.
Ils peuvent etre trouvés dans les formats suivants
sur le site
<A HREF="ftp://ftp.lip6.fr/pub/linux/french/docs/HOWTO">ftp.lip6.fr</a>
dans le répertoire /pub/linux/french/docs/HOWTO :
<UL>
<LI><A HREF="Access-HOWTO.html">Access-HOWTO</A> (Version <A
HREF="Access-HOWTO.ps">Postscript</A>)</LI>
</UL></BODY></HTML>
"""
# Step 1: parse the sample HTML into a DOM tree.
# Construct an HtmlBuilder object and feed the whole HTML string to it.
b = HtmlBuilder()
b.feed(HTML_DATA)
# Keep a reference to the document object the builder exposes; the
# XML dump and the link listing further down both work from `doc`.
doc = b.document
# Step 2: output the tree as HTML.
# Print a banner, then hand the document to an HtmlWriter.
# (presumably HtmlWriter writes to standard output by default -- confirm
# against the xml.dom.writer documentation)
print "============"
print "HTML version"
w = HtmlWriter()
w.write(b.document)
# Step 3: output the same tree as XML.
# The leading "\n" in the banner separates it from the HTML dump above;
# toxml() returns the serialized form, which is then printed.
print "\n==========="
print "XML version"
print doc.toxml()
print "\n==========="
print "Links in the document"
# Retrieve all the link objects
links = doc.getElementsByTagName('A')
for node in links:
# Collect any children of the A element that are Text nodes
# (Note that this won't work on invalid HTML, like
# <a href="xxx"><b>Text</b></a>. You could fix this by actually
# traversing all the child nodes of the A element.)
linktext = ""
for child in node.childNodes:
if child.nodeType == core.TEXT_NODE:
linktext = linktext + child.value
# Get the HREF attribute, if present
url = node.getAttribute('HREF')
if url != "":
print "HREF=", url, linktext
print links
|