File: webcrawler.py

"""Recursive webcrawler example.

For asynchronous DNS lookups install the `dnspython` package:

    $ pip install dnspython

Requires the `pybloom-live` package (imported as `pybloom_live`) for the
Bloom filter, which is used to reduce the chance of recrawling a URL
that has already been seen.

Since the bloom filter is not shared, but only passed as an argument
to each subtask, it would be much better to have this as a centralized
service.  Redis sets could also be a practical solution; see the
`url_already_seen` sketch further down in this file.

A BloomFilter with a capacity of 100_000 members and an error rate
of 0.001 is 2.8MB pickled, but compressed with zlib it only takes
up 2.9kB(!), since the mostly empty bit array compresses extremely
well (see the size-measurement sketch at the bottom of this file).

We don't have to do the compression manually: just set the task's
compression to "zlib", and the serializer to "pickle".

"""

import re
from urllib.parse import urlsplit

import requests
from eventlet import Timeout
from pybloom_live import BloomFilter

from celery import group, shared_task

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
# NOTE: Python's re module does not support POSIX character classes, so the
# '[:punct:]' below is not interpreted as "punctuation"; the pattern is kept
# verbatim from the upstream example.
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def domain(url):
    """Return the domain part of a URL."""
    return urlsplit(url)[1].split(':')[0]
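

# The module docstring suggests keeping the "seen" set in a centralized
# service instead of shipping a Bloom filter to every subtask.  The helper
# below is a minimal sketch of that idea using a Redis set; the helper name,
# connection URL and key are assumptions for illustration, not part of the
# original example.
def url_already_seen(url, redis_url='redis://localhost:6379/0',
                     key='crawler:seen'):
    """Return True if ``url`` has been seen before, marking it as seen.

    Hypothetical helper: it relies on SADD's return value (1 = newly added,
    0 = already a member), so check-and-mark is a single round trip.
    """
    import redis  # local import keeps the sketch optional
    client = redis.Redis.from_url(redis_url)
    return client.sadd(key, url) == 0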


@shared_task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    print(f'crawling: {url}')
    if seen is None:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    # Guard the request with an eventlet timeout; if it expires the block is
    # exited silently, so ``response`` may still be ``None`` afterwards.
    response = None
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return
    if response is None:
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks.delay()
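

# A minimal sketch (not part of the original example) that checks the size
# claim from the module docstring: a pickled BloomFilter weighs in at a few
# megabytes, while zlib shrinks its mostly empty bit array dramatically.
# Exact numbers depend on the pybloom-live version and how full the filter is.
#
# To kick off a crawl instead (assuming a running worker and broker), call
# e.g. ``crawl.delay('https://example.com/')``.
if __name__ == '__main__':
    import pickle
    import zlib

    bloom = BloomFilter(capacity=100_000, error_rate=0.001)
    raw = pickle.dumps(bloom)
    packed = zlib.compress(raw)
    print(f'pickled: {len(raw)} bytes, zlib-compressed: {len(packed)} bytes')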