File: perf_crawl.py

package info (click to toggle)
mpi4py 4.1.0-4
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,540 kB
  • sloc: python: 34,465; ansic: 16,475; makefile: 614; sh: 325; cpp: 193; f90: 178
file content (104 lines) | stat: -rw-r--r-- 2,711 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Compare the speed of downloading URLs sequentially vs. using futures.
"""

import contextlib
import functools
import sys
import time

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

try:
    from concurrent.futures import ThreadPoolExecutor
except ImportError:
    ThreadPoolExecutor = lambda _: None  # noqa: E731
try:
    from concurrent.futures import ProcessPoolExecutor
except ImportError:
    ProcessPoolExecutor = lambda _: None  # noqa: E731

from mpi4py.futures import MPIPoolExecutor, as_completed

# Benchmark workload: a handful of well-known sites, plus one hostname
# that presumably fails to resolve (thisurlprobablydoesnotexist.com) so
# the error-suppression path in the downloaders is exercised too.
URLS = [
    "http://www.google.com/",
    "http://www.apple.com/",
    "http://www.ibm.com",
    "http://www.thisurlprobablydoesnotexist.com",
    "http://www.slashdot.org/",
    "http://www.python.org/",
    "http://www.bing.com/",
    "http://www.facebook.com/",
    "http://www.github.com/",
    "http://www.youtube.com/",
    "http://www.blogger.com/",
]


def load_url(url, timeout):
    """Fetch *url* and return the response body as bytes.

    ``timeout`` is the socket timeout in seconds, forwarded to
    :func:`urlopen`.  The response object is explicitly closed
    (via ``contextlib.closing``, which also covers the legacy
    ``urllib2`` response used on the Python 2 fallback path) so the
    underlying socket is not leaked — the original left it to GC.
    """
    with contextlib.closing(urlopen(url, timeout=timeout)) as response:
        return response.read()


def download_urls_sequential(urls, timeout=60):
    """Download every URL one after another.

    Best-effort: any failure (DNS error, timeout, HTTP error, ...) is
    silently skipped so the benchmark keeps going.  Returns a mapping
    of url -> response body containing only the successful fetches.
    """
    fetched = {}
    for target in urls:
        try:
            fetched[target] = load_url(target, timeout=timeout)
        except Exception:  # deliberate best-effort: drop failed URLs
            continue
    return fetched


def download_urls_with_executor(executor, urls, timeout=60):
    """Download all URLs concurrently through *executor*.

    A ``None`` executor (backend unavailable on this interpreter)
    short-circuits to an empty mapping.  Fetch failures are ignored;
    the result maps url -> response body for successful downloads
    only.  The executor is always shut down, even if submission or
    result collection raises.
    """
    if executor is None:
        return {}
    try:
        fetched = {}
        pending = {}
        for target in urls:
            pending[executor.submit(load_url, target, timeout)] = target
        for done in as_completed(pending):
            try:
                fetched[pending[done]] = done.result()
            except Exception:  # deliberate best-effort: drop failed URLs
                pass
        return fetched
    finally:
        executor.shutdown()


def main():
    """Time each download strategy over URLS and print a summary line.

    Note: every executor is constructed up front, before any timing
    starts, so executor startup cost is deliberately excluded from
    each measured interval (matching the original behavior).
    """
    benchmarks = [
        ("sequential", functools.partial(download_urls_sequential, URLS)),
        (
            "threads",
            functools.partial(
                download_urls_with_executor, ThreadPoolExecutor(10), URLS
            ),
        ),
        (
            "processes",
            functools.partial(
                download_urls_with_executor, ProcessPoolExecutor(10), URLS
            ),
        ),
        (
            "mpi4py",
            functools.partial(
                download_urls_with_executor, MPIPoolExecutor(10), URLS
            ),
        ),
    ]
    for label, run in benchmarks:
        sys.stdout.write(f"{label.ljust(11)}: ")
        sys.stdout.flush()
        t0 = time.time()
        fetched = run()
        elapsed = time.time() - t0
        got, total = len(fetched), len(URLS)
        sys.stdout.write(
            f"{elapsed:5.2f} seconds ({got:2d} of {total:d} downloaded)\n",
        )
        sys.stdout.flush()


if __name__ == "__main__":
    main()