File: test_downloaderslotssettings.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (102 lines) | stat: -rw-r--r-- 3,153 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import time

from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy import Request
from scrapy.core.downloader import Downloader, Slot
from scrapy.crawler import CrawlerRunner
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MetaSpider


class DownloaderSlotsSettingsTestSpider(MetaSpider):
    name = "downloader_slots"

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RANDOMIZE_DOWNLOAD_DELAY": False,
        "DOWNLOAD_SLOTS": {
            "quotes.toscrape.com": {
                "concurrency": 1,
                "delay": 2,
                "randomize_delay": False,
                "throttle": False,
            },
            "books.toscrape.com": {"delay": 3, "randomize_delay": False},
        },
    }

    async def start(self):
        self.times = {None: []}

        slots = [*self.custom_settings.get("DOWNLOAD_SLOTS", {}), None]

        for slot in slots:
            url = self.mockserver.url(f"/?downloader_slot={slot}")
            self.times[slot] = []
            yield Request(url, callback=self.parse, meta={"download_slot": slot})

    def parse(self, response):
        slot = response.meta.get("download_slot", None)
        self.times[slot].append(time.time())
        url = self.mockserver.url(f"/?downloader_slot={slot}&req=2")
        yield Request(url, callback=self.not_parse, meta={"download_slot": slot})

    def not_parse(self, response):
        slot = response.meta.get("download_slot", None)
        self.times[slot].append(time.time())


class CrawlTestCase(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    def setUp(self):
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_delay(self):
        crawler = get_crawler(DownloaderSlotsSettingsTestSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        slots = crawler.engine.downloader.slots
        times = crawler.spider.times
        tolerance = 0.3

        delays_real = {k: v[1] - v[0] for k, v in times.items()}
        error_delta = {
            k: 1 - min(delays_real[k], v.delay) / max(delays_real[k], v.delay)
            for k, v in slots.items()
        }

        assert max(list(error_delta.values())) < tolerance


def test_params():
    params = {
        "concurrency": 1,
        "delay": 2,
        "randomize_delay": False,
    }
    settings = {
        "DOWNLOAD_SLOTS": {
            "example.com": params,
        },
    }
    crawler = get_crawler(settings_dict=settings)
    downloader = Downloader(crawler)
    downloader._slot_gc_loop.stop()  # Prevent an unclean reactor.
    request = Request("https://example.com")
    _, actual = downloader._get_slot(request, spider=None)
    expected = Slot(**params)
    for param in params:
        assert getattr(expected, param) == getattr(actual, param), (
            f"Slot.{param}: {getattr(expected, param)!r} != {getattr(actual, param)!r}"
        )