"""A simple web-spider that crawls all the pages in http://tornadoweb.org.
``spider()`` downloads the page at `base_url` and any pages it links to,
recursively. It ignores pages that are not beneath `base_url` hierarchically.
This function demos two Toro classes: :class:`~toro.JoinableQueue` and
:class:`~toro.BoundedSemaphore`.
The :class:`~toro.JoinableQueue` is a work queue; it begins containing only
`base_url`, and each discovered URL is added to it. We wait for
:meth:`~toro.JoinableQueue.join` to complete before exiting. This ensures that
the function as a whole ends when all URLs have been downloaded.
The :class:`~toro.BoundedSemaphore` regulates concurrency. We block trying to
decrement the semaphore before each download, and increment it after each
download completes.
"""
# start-file
import HTMLParser
import time
import urlparse
from datetime import timedelta

from tornado import httpclient, gen, ioloop

import toro


@gen.coroutine
def spider(base_url, concurrency):
    q = toro.JoinableQueue()
    sem = toro.BoundedSemaphore(concurrency)

    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            print 'fetching', current_url
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            q.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            # Launch a subtask
            fetch_url()

    q.put(base_url)

    # Start worker, then wait for the work queue to be empty.
    worker()
    yield q.join(deadline=timedelta(seconds=300))
    assert fetching == fetched
    print 'Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched))
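

# A stripped-down sketch of the queue-and-semaphore pattern described in the
# module docstring. This demo is an illustrative addition, not part of the
# original example; it assumes only the toro methods already used above
# (put / get / task_done / join on JoinableQueue, acquire / release on
# BoundedSemaphore).
@gen.coroutine
def pattern_demo(items, concurrency):
    q = toro.JoinableQueue()
    sem = toro.BoundedSemaphore(concurrency)

    @gen.coroutine
    def handle_item():
        item = yield q.get()
        try:
            print 'handling', item  # real work (e.g. a download) goes here
        finally:
            q.task_done()  # let join() account for this item
            sem.release()  # free a concurrency slot

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()  # at most `concurrency` handlers run at once
            handle_item()        # launch a subtask without waiting on it

    for item in items:
        q.put(item)

    worker()
    yield q.join()  # resolves once every queued item has been task_done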
@gen.coroutine
def get_links_from_url(url):
    """Download the page at `url` and parse it for links. Returned links have
    had the fragment after `#` removed and have been made absolute, so e.g.
    the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print 'fetched', url
        urls = [urlparse.urljoin(url, remove_fragment(new_url))
                for new_url in get_links(response.body)]
    except Exception as e:
        print e, url
        raise gen.Return([])

    raise gen.Return(urls)
def remove_fragment(url):
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    return urlparse.urlunparse((scheme, netloc, path, params, query, ''))
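# Worked example (an illustrative note, not part of the original code), using
# the URL from get_links_from_url's docstring:
#   remove_fragment('gen.html#tornado.gen.coroutine') returns 'gen.html', and
#   urlparse.urljoin('http://www.tornadoweb.org/en/stable/', 'gen.html') then
#   yields 'http://www.tornadoweb.org/en/stable/gen.html'.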
def get_links(html):
    class URLSeeker(HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls
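# For instance (illustrative note, not in the original):
#   get_links('<a href="gen.html">gen</a> <img src="logo.png">')
# returns ['gen.html']; only <a> tags with an href attribute are collected.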
if __name__ == '__main__':
    import logging
    logging.basicConfig()
    loop = ioloop.IOLoop.current()

    def stop(future):
        loop.stop()
        future.result()  # Raise error if there is one

    future = spider('http://www.tornadoweb.org/en/stable/', 10)
    future.add_done_callback(stop)
    loop.start()
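    # A roughly equivalent, more compact way to drive the crawl (an
    # illustrative note, not part of the original example) is Tornado's
    # run_sync, which starts the loop, waits for the coroutine to finish,
    # and re-raises any error:
    #
    #     ioloop.IOLoop.current().run_sync(
    #         lambda: spider('http://www.tornadoweb.org/en/stable/', 10))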