File: 13-crawler.py

import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Crawler, DEPTH, BREADTH, FIFO, LIFO

# This example demonstrates how to use the Crawler class for web crawling.

# -------------------------------------------------------------------------------------------------
# First, we need a subclass of Crawler with its own Crawler.visit() method.
# The visit() method takes two parameters: the visited link and the HTML source.
# We could parse the HTML DOM to extract the information we need (see the sketch after this class).
# Anything that is not HTML (e.g., a JPEG file) is passed to Crawler.fail().

class SimpleCrawler1(Crawler):
    
    def visit(self, link, source=None):
        print "visiting:", link.url, "from:", link.referrer
        
    def fail(self, link):
        print "failed:", link.url

# Create a new crawler.
# 1) The links parameter is a list of URLs to visit.
#    The crawler will visit the first link, extract new links from the HTML, and queue these for a visit too.
# 2) The domains parameter is a list of allowed domains.
#    The crawler will never leave these domains.
# 3) The delay parameter specifies a number of seconds to wait before revisiting the same domain.
#    In the meantime, other queued links will be crawled if possible.

crawler1 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], domains=["ua.ac.be"], delay=0.0)

print "CRAWLER 1 " + "-" * 50
while len(crawler1.visited) < 5: # Crawler.visited is a dictionary of all URLs visited so far.
    # The Crawler.crawl() method has the same optional parameters as URL.download(),
    # for example: cached=True, proxy=("proxy.com", "https"), ...
    crawler1.crawl(cached=False)

# -------------------------------------------------------------------------------------------------
# Typically, you'll want a crawler that runs in an endless loop as a background process,
# and just keeps on visiting new URLs. In this case, it is rude to use a delay of 0.0,
# because you will keep hammering servers with automated requests.
# A higher delay (in a real-world scenario, say 30 seconds) is better:

crawler2 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], domains=["ua.ac.be"], delay=0.1)

print
print "CRAWLER 2 " + "-" * 50
while True:
    crawler2.crawl(cached=False)
    print "wait..."
    # Of course we don't want this example to run forever,
    # so we still add a stop condition:
    if len(crawler2.visited) > 2:
        break

# -------------------------------------------------------------------------------------------------
# If you create a crawler without a domains=[..] restriction, it is free to roam the entire web.
# What to visit first? You can use Crawler.crawl() with an optional "method" parameter.
# When set to DEPTH, it prefers to visit links in the same domain.
# When set to BREADTH, it prefers to visit links to other domains.
# Observe the difference between crawler3 and crawler4,
# which use DEPTH and BREADTH respectively.

crawler3 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.0)

print
print "CRAWLER 3 " + "-" * 50
while len(crawler3.visited) < 3:
    crawler3.crawl(method=DEPTH)
    
crawler4 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.0)

print
print "CRAWLER 4 " + "-" * 50
while len(crawler4.visited) < 3:
    crawler4.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# With Crawler.crawl(method=DEPTH) and a delay,
# the crawler will wait between requests to the same domain.
# In the meantime, it will visit other links.
# Usually this means that it will alternate between a couple of domains:

crawler5 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.1)

print
print "CRAWLER 5 " + "-" * 50
while len(crawler5.visited) < 4:
    crawler5.crawl(method=DEPTH)

# -------------------------------------------------------------------------------------------------
# A BREADTH-crawler in an endless crawl loop will eventually queue the entire web for a visit.
# Of course this is not possible: we can't keep the entire web in memory.
# When the number of queued links exceeds Crawler.QUEUE (10,000 by default),
# less relevant queued links will be discarded.
# "Less relevant" depends on two settings:
# 1) First, there is the Crawler.priority() method that returns a number between 0.0 and 1.0 for a link.
#    Links with a higher priority are more relevant and will be visited sooner.
# 2) Links with an equal priority are queued either FIFO or LIFO.
#    FIFO means first-in-first-out: the earliest queued links will be visited sooner.
#    LIFO means last-in-first-out: more recently queued links will be visited sooner.

class SimpleCrawler2(Crawler):
    
    def visit(self, link, source=None):
        print "visiting:", link.url, "from:", link.referrer
    
    def priority(self, link, method=DEPTH):
        if "?" in link.url:
            # This ignores links with a querystring.
            return 0.0
        else:
            # Otherwise use the default priority ranker,
            # i.e. the priority depends on DEPTH or BREADTH crawl mode.
            return Crawler.priority(self, link, method)

# Note the LIFO sort order. 
# This will make more recently queued links more relevant.
# If you open the given URL in a browser,
# you'll notice that the last external link at the bottom of the page is now visited first.
crawler6 = SimpleCrawler2(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.1, sort=LIFO)

print
print "CRAWLER 6 " + "-" * 50
while len(crawler6.visited) < 4:
    crawler6.crawl(method=BREADTH)
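
# For contrast, a sketch of the same crawl with sort=FIFO, so that the earliest queued
# links are visited first (typically the links that appear near the top of the page).
# The crawler below (crawler7) is added here for illustration only.
crawler7 = SimpleCrawler2(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.1, sort=FIFO)

print
print "CRAWLER 7 " + "-" * 50
while len(crawler7.visited) < 4:
    crawler7.crawl(method=BREADTH)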

# -------------------------------------------------------------------------------------------------
# In the long run, the Crawler.visited dictionary will start filling up memory too.
# If you want a single crawler that runs forever, you should empty the dictionary every now and then,
# and instead use a strategy with a persistent database of visited links,
# in combination with Crawler.follow().
# Another strategy would be to use different DEPTH-crawlers for different domains,
# and delete them when they are done.
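
# A minimal sketch of such a persistent strategy, assuming that Crawler.follow(link)
# returns True when a link should be queued for a visit, and using the standard
# library's sqlite3 module as the "persistent database of visited links".
# The class name, database file name and threshold below are illustrative.

import sqlite3

class PersistentCrawler(Crawler):
    
    def __init__(self, db="visited.db", **kwargs):
        Crawler.__init__(self, **kwargs)
        self._db = sqlite3.connect(db)
        self._db.execute("create table if not exists visited (url text primary key)")
    
    def follow(self, link):
        # Don't queue links that were already visited in this run or a previous one.
        q = self._db.execute("select 1 from visited where url=?", (link.url,))
        if q.fetchone() is not None:
            return False
        return Crawler.follow(self, link)
    
    def visit(self, link, source=None):
        print "visiting:", link.url
        self._db.execute("insert or ignore into visited (url) values (?)", (link.url,))
        self._db.commit()
        # Keep the in-memory dictionary small; the database is the long-term record.
        if len(self.visited) > 10000:
            self.visited.clear()

# For example (commented out so that this script terminates):
#crawler8 = PersistentCrawler(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=10.0)
#while True:
#    crawler8.crawl(method=DEPTH)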