# Tests for per-slot download settings (DOWNLOAD_SLOTS).
import time
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy import Request
from scrapy.core.downloader import Downloader, Slot
from scrapy.crawler import CrawlerRunner
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MetaSpider
class DownloaderSlotsSettingsTestSpider(MetaSpider):
    """Spider that issues two sequential requests per configured download
    slot and records response arrival times, so a test can compare the
    observed inter-request gap against the configured per-slot delay."""

    name = "downloader_slots"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RANDOMIZE_DOWNLOAD_DELAY": False,
        "DOWNLOAD_SLOTS": {
            "quotes.toscrape.com": {
                "concurrency": 1,
                "delay": 2,
                "randomize_delay": False,
                "throttle": False,
            },
            "books.toscrape.com": {"delay": 3, "randomize_delay": False},
        },
    }

    async def start(self):
        # One request per configured slot, plus one with slot ``None``
        # (which exercises the default, domain-derived slot).
        self.times = {None: []}
        for slot_name in [*self.custom_settings.get("DOWNLOAD_SLOTS", {}), None]:
            self.times[slot_name] = []
            yield Request(
                self.mockserver.url(f"/?downloader_slot={slot_name}"),
                callback=self.parse,
                meta={"download_slot": slot_name},
            )

    def parse(self, response):
        # Record the first response time, then issue a follow-up request
        # on the same slot so the inter-request delay can be measured.
        slot_name = response.meta.get("download_slot", None)
        self.times[slot_name].append(time.time())
        yield Request(
            self.mockserver.url(f"/?downloader_slot={slot_name}&req=2"),
            callback=self.not_parse,
            meta={"download_slot": slot_name},
        )

    def not_parse(self, response):
        # Record the second response time; no further requests are made.
        slot_name = response.meta.get("download_slot", None)
        self.times[slot_name].append(time.time())
class CrawlTestCase(TestCase):
    """Integration test: the per-slot delays configured via DOWNLOAD_SLOTS
    must match the delays actually observed between responses at runtime."""

    @classmethod
    def setUpClass(cls):
        # One mock HTTP server shared by every test in this class.
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    def setUp(self):
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_delay(self):
        crawler = get_crawler(DownloaderSlotsSettingsTestSpider)
        yield crawler.crawl(mockserver=self.mockserver)

        tolerance = 0.3
        # Observed delay per slot: the gap between the two timestamps the
        # spider recorded for that slot.
        measured = {
            name: stamps[1] - stamps[0]
            for name, stamps in crawler.spider.times.items()
        }
        # Relative error between observed and configured delay; the worst
        # slot must stay within tolerance.
        worst_error = max(
            1 - min(measured[name], slot.delay) / max(measured[name], slot.delay)
            for name, slot in crawler.engine.downloader.slots.items()
        )
        assert worst_error < tolerance
def test_params():
    """DOWNLOAD_SLOTS per-slot parameters must be copied verbatim onto the
    Slot object the downloader creates for the matching domain."""
    slot_params = {
        "concurrency": 1,
        "delay": 2,
        "randomize_delay": False,
    }
    crawler = get_crawler(
        settings_dict={"DOWNLOAD_SLOTS": {"example.com": slot_params}}
    )
    downloader = Downloader(crawler)
    downloader._slot_gc_loop.stop()  # Prevent an unclean reactor.
    _, actual = downloader._get_slot(Request("https://example.com"), spider=None)
    expected = Slot(**slot_params)
    for name in slot_params:
        exp_val = getattr(expected, name)
        act_val = getattr(actual, name)
        assert exp_val == act_val, f"Slot.{name}: {exp_val!r} != {act_val!r}"