File: 13-crawler.py

from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Crawler, DEPTH, BREADTH, FIFO, LIFO, crawl, asynchronous

# This example demonstrates how to use the Crawler class for web crawling.

# -------------------------------------------------------------------------------------------------
# First, we need a subclass of Crawler with its own Crawler.visit() method.
# The visit() method takes two parameters: the visited link and the HTML source.
# We could parse the HTML DOM to extract information we need, for example.
# Anything that is not HTML (e.g., a JPEG file) is passed to Crawler.fail().


# class Polly(Crawler):
#    def visit(self, link, source=None):
#        print("visited:", link.url, "from:", link.referrer)
#    def fail(self, link):
#        print("failed:", link.url)
#
# p = Polly(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=5)
# while not p.done:
#    p.crawl(method=DEPTH, cached=True, throttle=5)


# for link, source in crawl("http://www.clips.ua.ac.be/", delay=0, throttle=1, cached=False):
#    print(link)
#
# g = crawl("http://www.clips.ua.ac.be/")
# for i in range(10):
#    p = asynchronous(lambda: next(g))  # next(g) instead of g.next, for Python 3 compatibility.
#    while not p.done:
#        print("zzz...")
#        time.sleep(0.1)
#    link, source = p.value
#    print(link)



class SimpleCrawler1(Crawler):

    def visit(self, link, source=None):
        print("visiting: %s from: %s" % (link.url, link.referrer))

    def fail(self, link):
        print("failed: %s" % link.url)

# Create a new crawler.
# 1) The links parameter is a list of URLs to visit.
#    The crawler will visit the first link, extract new links from the HTML, and queue these for a visit too.
# 2) The domains parameter is a list of allowed domains.
#    The crawler will never leave these domains.
# 3) The delay parameter specifies a number of seconds to wait before revisiting the same domain.
#    In the meantime, other queued links will be crawled if possible.

crawler1 = SimpleCrawler1(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=1)

print("CRAWLER 1 " + "-" * 50)
while len(crawler1.visited) < 5:  # Crawler.visited is a dictionary of all URLs visited so far.
    # The Crawler.crawl() method has the same optional parameters as URL.download(),
    # for example: cached=True, proxy=("proxy.com", "https"), ... (a short sketch follows after this loop).
    crawler1.crawl(cached=True, throttle=5)
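
# A hedged sketch of passing one of those optional parameters through crawl(): the
# ("host", "protocol") tuple format follows the comment above, and "proxy.example.com"
# is a hypothetical host, so the line is left commented out:
#
# crawler1.crawl(cached=True, throttle=5, proxy=("proxy.example.com", "https"))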

# -------------------------------------------------------------------------------------------------
# Typically, you'll want a crawler that runs in an endless loop as a background process,
# and just keeps on visiting new URLs. In this case, it is rude to use a delay of 0.0,
# because you will keep hammering servers with automated requests.
# A higher delay (in a real-world scenario, say 30 seconds) is better:

crawler2 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    domains=["nodebox.net"],
    delay=0.1
)

print("")
print("CRAWLER 2 " + "-" * 50)
while True:
    crawler2.crawl(cached=False)
    print("wait...")
    # Of course we don't want this example to run forever,
    # so we still add a stop condition:
    if len(crawler2.visited) > 2:
        break

# -------------------------------------------------------------------------------------------------
# If you create a crawler without a domains=[..] restriction, it is free to roam the entire web.
# What to visit first? You can use Crawler.crawl() with an optional "method" parameter.
# When set to DEPTH, it prefers to visit links in the same domain.
# When set to BREADTH, it prefers to visit links to other domains.
# Observe the difference between crawler3 and crawler4,
# which use DEPTH and BREADTH respectively.

crawler3 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.0
)

print("")
print("CRAWLER 3 " + "-" * 50)
while len(crawler3.visited) < 3:
    crawler3.crawl(method=DEPTH)

crawler4 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.0
)

print("")
print("CRAWLER 4 " + "-" * 50)
while len(crawler4.visited) < 3:
    crawler4.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# With Crawler.crawl(method=DEPTH) and a delay,
# the crawler will wait between requests to the same domain.
# In the meantime, it will visit other links.
# Usually this means that it will alternate between a couple of domains:

crawler5 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.1
)

print("")
print("CRAWLER 5 " + "-" * 50)
while len(crawler5.visited) < 4:
    crawler5.crawl(method=DEPTH)

# -------------------------------------------------------------------------------------------------
# A BREADTH-crawler in an endless crawl loop will eventually queue the entire web for a visit.
# Of course this is not possible: we can't keep the entire web's link queue in memory.
# When the number of queued links exceeds Crawler.QUEUE (10,000 by default),
#    less relevant queued links will be discarded (a sketch of raising this limit follows below).
# "Less relevant" depends on two settings:
# 1) First, there is the Crawler.priority() method that returns a number between 0.0 and 1.0 for a link.
#    Links with a higher priority are more relevant and will be visited sooner.
# 2) Links with an equal priority are queued either FIFO or LIFO.
#    FIFO means first-in-first-out: the earliest queued links will be visited sooner.
#    LIFO means last-in-first-out: more recently queued links will be visited sooner.
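
# A minimal sketch of raising the Crawler.QUEUE limit mentioned above, assuming QUEUE is a
# plain class attribute that subclasses can override (the value 20000 is arbitrary):
#
# class BigQueueCrawler(Crawler):
#     QUEUE = 20000  # Keep up to 20,000 queued links instead of 10,000.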


class SimpleCrawler2(Crawler):

    def visit(self, link, source=None):
        print("visiting: %s from: %s" % (link.url, link.referrer))

    def priority(self, link, method=DEPTH):
        if "?" in link.url:
            # Give links with a querystring the lowest priority,
            # so they are visited last (or discarded when the queue is full).
            return 0.0
        else:
            # Otherwise use the default priority ranker,
            # i.e. the priority depends on DEPTH or BREADTH crawl mode.
            return Crawler.priority(self, link, method)

# Note the LIFO sort order.
# This will make more recently queued links more relevant.
# If you open the given URL in a browser,
# you'll notice that the last external link at the bottom of the page is now visited first.
crawler6 = SimpleCrawler2(
    links=["http://nodebox.net/"],
    delay=0.1,
    sort=LIFO
)

print("")
print("CRAWLER 6 " + "-" * 50)
while len(crawler6.visited) < 4:
    crawler6.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# In the long run, the Crawler.visited dictionary will start filling up memory too.
# If you want a single crawler that runs forever, you should empty the dictionary every now and then
# and instead keep a persistent database of visited links, checked in Crawler.follow()
# (a sketch of this idea follows below).
# Another strategy would be to use different DEPTH-crawlers for different domains,
# and delete them when they are done.
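
# A minimal, commented-out sketch of the first strategy. It assumes that returning False from
# Crawler.follow() is enough to keep a link out of the queue, and it uses a plain text file
# ("visited.txt", a hypothetical name) as the persistent store; a real crawler might use a
# database instead, and would clear self.visited only every now and then:
#
# class PersistentCrawler(SimpleCrawler1):
#
#     def __init__(self, *args, **kwargs):
#         self.store = kwargs.pop("store", "visited.txt")
#         SimpleCrawler1.__init__(self, *args, **kwargs)
#         self.seen = set()
#         if os.path.exists(self.store):
#             with open(self.store) as f:
#                 self.seen = set(line.strip() for line in f)
#
#     def follow(self, link):
#         # Don't queue links that were visited in this run or in a previous one.
#         return link.url not in self.seen and Crawler.follow(self, link)
#
#     def visit(self, link, source=None):
#         SimpleCrawler1.visit(self, link, source)
#         self.seen.add(link.url)
#         with open(self.store, "a") as f:
#             f.write(link.url + "\n")
#         self.visited.clear()  # Keep the in-memory dictionary small.
#
# crawler7 = PersistentCrawler(links=["http://nodebox.net/"], delay=0.1)
# while len(crawler7.seen) < 4:
#     crawler7.crawl(method=DEPTH)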