### PAGE #############################################################################################
# Code for querying the HTML DOM.
# It wraps BeautifulSoup by Leonard Richardson.
# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.
from BeautifulSoup import BeautifulSoup, Tag
from url import URLAccumulator, URLParser
from html import replace_entities, plain
from cache import Cache
def clear_cache():
    Cache("html").clear()
### PAGE ERRORS ######################################################################################
class PageUnicodeError(Exception):
    def __str__(self): return str(self.__class__)

class PageParseError(Exception):
    def __str__(self): return str(self.__class__)
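
# A hedged sketch of checking for these errors; it assumes a failed parse
# stores the exception in self.error (as Page.load() below does),
# following the URLAccumulator convention:
#   p = parse("http://nodebox.net")
#   if isinstance(p.error, PageParseError):
#       print "the HTML could not be parsed"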
### PAGE #############################################################################################
# Alias BeautifulSoup's findAll() under the underscore naming used here.
Tag.find_all = Tag.findAll
class Page(BeautifulSoup, URLAccumulator):

    """ DOM tree of a HTML page.

    Page is essentially an asynchronous download of a BeautifulSoup page.
    It has the following properties and methods:
    description - returns the meta description
    keywords - returns the meta keywords
    links() - by default, returns external links
    find(tag, attribute=value) - finds the first tag with the given attributes
    find_all(tag, attribute=value) - finds all tags with the given attributes

    find() returns a single Tag; find_all() returns a list of Tag objects.
    Both results have find() and find_all() of their own.
    Alternatively, get tags directly as properties, e.g.
    page.body.p - returns the first p Tag (it has find() and find_all() too).

    To get attributes from a Tag:
    p["id"]

    """
    def __init__(self, url, wait=10, asynchronous=False, cached=True):

        if cached:
            cache = "html"
        else:
            cache = None
        URLAccumulator.__init__(self, url, wait, asynchronous, cache)
    def load(self, data):

        data = replace_entities(data)
        try:
            BeautifulSoup.__init__(self, data)
        except UnicodeEncodeError:
            self.error = PageUnicodeError()
            BeautifulSoup.__init__(self, "")
        except:
            # Any other parser failure is stored on the page
            # so the caller can inspect page.error afterwards.
            self.error = PageParseError()
            BeautifulSoup.__init__(self, "")
    def _title(self):

        """ Returns the page title.
        """

        return self.find("title").string

    title = property(_title)
    def _description(self):

        """ Returns the meta description in the page.
        """

        meta = self.find("meta", {"name": "description"})
        # find() returns a Tag (or None), not a dict,
        # so check for the tag and its content attribute.
        if meta is not None and meta.has_key("content"):
            return meta["content"]
        else:
            return u""

    description = property(_description)
    def _keywords(self):

        """ Returns the meta keywords in the page.
        """

        meta = self.find("meta", {"name": "keywords"})
        if meta is not None and meta.has_key("content"):
            keywords = [k.strip() for k in meta["content"].split(",")]
        else:
            keywords = []
        return keywords

    keywords = property(_keywords)
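
    # For example (a hedged sketch; the URL is illustrative and the
    # output depends on the page's meta tags):
    #   page = parse("http://nodebox.net")
    #   print page.title        # contents of the <title> tag
    #   print page.description  # meta description, or u"" if missing
    #   print page.keywords     # list of keyword strings, or []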
    def links(self, external=True):

        """ Retrieves links in the page.

        Returns a list of URLs.
        By default, only external URLs are returned.
        External URLs start with http:// and point to a different
        domain than the one the page is on.

        """

        domain = URLParser(self.url).domain
        links = []
        for a in self("a"):
            for attribute, value in a.attrs:
                if attribute == "href":
                    if not external \
                    or (value.startswith("http://") and value.find("http://" + domain) < 0):
                        links.append(value)
        return links
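
    # For example (a hedged sketch; the URL is illustrative):
    #   page = parse("http://nodebox.net")
    #   print page.links()                # links to other domains only
    #   print page.links(external=False)  # every href in the page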
    def find_class(self, classname, tag=""):

        return self(tag, {"class": classname})
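
# find_class() is shorthand for matching on the class attribute.
# For example (a hedged sketch; the URL and class name are illustrative):
#   page = parse("http://nodebox.net")
#   for div in page.find_class("post", tag="div"):
#       print plain(div)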
def parse(url, wait=10, asynchronous=False, cached=True):
    return Page(url, wait, asynchronous, cached)
"""
import url
url = url.create("http://nodebox.net/code/index.php/Share")
url.query["p"] = 2
print url
page = parse(url)
print page.title
print page.title.string
print page.description()
print page.keywords()
print page.find(id="content")["id"]
# find() returns a list of Tags and has a find() method
for p in page.body.find("div", id="content").find_all("p"):
print ">>>", plain(p)
print page.links()
print page.find_all("h2")
print page.contents[0].name
# .div returns a list of Tags
print page.body.div(id="content")[0].p
"""