# page.py (nodebox-web 1.9.4.6)
### PAGE #############################################################################################
# Code for querying the HTML DOM.
# It wraps BeautifulSoup by Leonard Richardson.

# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.

from BeautifulSoup import BeautifulSoup, Tag

from url import URLAccumulator, URLParser
from html import replace_entities, plain
from cache import Cache

def clear_cache():
    """ Empties the cache of downloaded HTML pages.
    """
    html_cache = Cache("html")
    html_cache.clear()

### PAGE ERRORS ######################################################################################

class PageUnicodeError(Exception):
    # The textual form of the error is simply the class reference.
    def __str__(self):
        return str(self.__class__)

class PageParseError(Exception):
    # The textual form of the error is simply the class reference.
    def __str__(self):
        return str(self.__class__)

### PAGE #############################################################################################

Tag.find_all = Tag.findAll

class Page(BeautifulSoup, URLAccumulator):
    
    """ DOM tree of a HTML page.
    
    Page is essentially an asynchronous download of a BeautifulSoup page.
    It has the following members:
    title - the page title (or None when the page has no <title> tag)
    description - the meta description
    keywords - list of meta keywords
    links() - by default, returns external links
    find(tag, attribute=value) - find the first tag with given attributes
    find_all(tag, attribute=value) - find all tags with given attributes
    
    find() and find_all() return objects that have find() and find_all() too.
    They're essentially lists of Tag objects.
    
    Alternatively, get tags directly as properties, e.g.
    page.body.p - returns a list of all p Tag objects (each has find() and find_all() )
    
    To get attributes from a Tag:
    p["id"]
    
    """
    
    def __init__(self, url, wait=10, asynchronous=False, cached=True):
        
        """ Creates the DOM tree for the page at the given URL.
        
        The download is handled by URLAccumulator; when cached is True
        the raw HTML is stored in (and served from) the "html" cache.
        
        """
        
        if cached: 
            cache = "html"
        else:
            cache = None
        URLAccumulator.__init__(self, url, wait, asynchronous, cache)

    def load(self, data):
        
        """ Parses the downloaded HTML data into this BeautifulSoup tree.
        
        On failure the page degrades to an empty tree and the error
        is kept in self.error instead of being raised.
        
        """
        
        data = replace_entities(data)
        try:
            BeautifulSoup.__init__(self, data)
        except UnicodeEncodeError:
            self.error = PageUnicodeError()
            BeautifulSoup.__init__(self, "")
        except Exception:
            # Narrowed from a bare except: still best-effort,
            # but no longer swallows SystemExit/KeyboardInterrupt.
            self.error = PageParseError()
            BeautifulSoup.__init__(self, "")            

    def _title(self):
        
        """ Returns the page title, or None when there is no <title> tag.
        """    
        
        # find() returns None when the tag is absent; the previous code
        # then raised AttributeError on .string.
        tag = self.find("title")
        if tag is not None:
            return tag.string
        return None
        
    title = property(_title)    

    def _description(self):
        
        """ Returns the meta description in the page (u"" when absent).
        """        

        meta = self.find("meta", {"name":"description"})
        # find() returns a Tag or None, never a dict; the old
        # isinstance(meta, dict) check always failed, so the
        # description was always empty.
        if meta is not None and \
           meta.has_key("content"):
            return meta["content"]
        else:
            return u""
            
    description = property(_description)

    def _keywords(self):
        
        """ Returns the meta keywords in the page as a list of strings.
        """
        
        meta = self.find("meta", {"name":"keywords"})
        # Same fix as _description: check for a found Tag,
        # not for a dict.
        if meta is not None and \
           meta.has_key("content"):
            keywords = [k.strip() for k in meta["content"].split(",")]
        else:
            keywords = []
            
        return keywords
        
    keywords = property(_keywords)

    def links(self, external=True):
        
        """ Retrieves links in the page.
        
        Returns a list of URL's.
        By default, only external URL's are returned.
        External URL's starts with http:// and point to another
        domain than the domain the page is on.
        
        """
        
        domain = URLParser(self.url).domain
        
        links = []
        for a in self("a"):
            # BeautifulSoup 3 stores attributes as (name, value) tuples.
            for attribute, value in a.attrs:
                if attribute == "href":
                    if not external \
                    or (value.startswith("http://") and value.find("http://"+domain) < 0):
                        links.append(value)
                        
        return links
    
    def find_class(self, classname, tag=""):
        
        """ Returns all tags (of the given type) with the given class attribute.
        """
        
        return self( tag, {"class": classname} )
        
def parse(url, wait=10, asynchronous=False, cached=True):
    """ Convenience factory that builds a Page DOM tree for the given URL.
    """
    return Page(url, wait=wait, asynchronous=asynchronous, cached=cached)

"""
import url
url = url.create("http://nodebox.net/code/index.php/Share")
url.query["p"] = 2
print url

page = parse(url)
print page.title
print page.title.string
print page.description()
print page.keywords()

print page.find(id="content")["id"]
# find() returns a list of Tags and has a find() method
for p in page.body.find("div", id="content").find_all("p"):
    print ">>>", plain(p)

print page.links()
print page.find_all("h2")

print page.contents[0].name

# .div returns a list of Tags
print page.body.div(id="content")[0].p
"""