1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
import random
import warnings
from time import time
from collections import deque
from twisted.internet import reactor, defer, task
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.httpobj import urlparse_cached
from scrapy.resolver import dnscache
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy import signals
from .middleware import DownloaderMiddlewareManager
from .handlers import DownloadHandlers
class Slot(object):
"""Downloader slot"""
def __init__(self, concurrency, delay, settings):
self.concurrency = concurrency
self.delay = delay
self.randomize_delay = settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
self.active = set()
self.queue = deque()
self.transferring = set()
self.lastseen = 0
self.latercall = None
def free_transfer_slots(self):
return self.concurrency - len(self.transferring)
def download_delay(self):
if self.randomize_delay:
return random.uniform(0.5 * self.delay, 1.5 * self.delay)
return self.delay
def close(self):
if self.latercall and self.latercall.active():
self.latercall.cancel()
def _get_concurrency_delay(concurrency, spider, settings):
delay = settings.getfloat('DOWNLOAD_DELAY')
if hasattr(spider, 'DOWNLOAD_DELAY'):
warnings.warn("%s.DOWNLOAD_DELAY attribute is deprecated, use %s.download_delay instead" %
(type(spider).__name__, type(spider).__name__))
delay = spider.DOWNLOAD_DELAY
if hasattr(spider, 'download_delay'):
delay = spider.download_delay
# TODO: remove for Scrapy 0.15
c = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
if c:
warnings.warn("CONCURRENT_REQUESTS_PER_SPIDER setting is deprecated, "
"use CONCURRENT_REQUESTS_PER_DOMAIN instead", ScrapyDeprecationWarning)
concurrency = c
# ----------------------------
if hasattr(spider, 'max_concurrent_requests'):
concurrency = spider.max_concurrent_requests
return concurrency, delay
class Downloader(object):
def __init__(self, crawler):
self.settings = crawler.settings
self.signals = crawler.signals
self.slots = {}
self.active = set()
self.handlers = DownloadHandlers(crawler)
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
self._slot_gc_loop.start(60)
def fetch(self, request, spider):
def _deactivate(response):
self.active.remove(request)
return response
self.active.add(request)
dfd = self.middleware.download(self._enqueue_request, request, spider)
return dfd.addBoth(_deactivate)
def needs_backout(self):
return len(self.active) >= self.total_concurrency
def _get_slot(self, request, spider):
key = self._get_slot_key(request, spider)
if key not in self.slots:
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
self.slots[key] = Slot(conc, delay, self.settings)
return key, self.slots[key]
def _get_slot_key(self, request, spider):
if 'download_slot' in request.meta:
return request.meta['download_slot']
key = urlparse_cached(request).hostname or ''
if self.ip_concurrency:
key = dnscache.get(key, key)
return key
def _enqueue_request(self, request, spider):
key, slot = self._get_slot(request, spider)
request.meta['download_slot'] = key
def _deactivate(response):
slot.active.remove(request)
return response
slot.active.add(request)
deferred = defer.Deferred().addBoth(_deactivate)
slot.queue.append((request, deferred))
self._process_queue(spider, slot)
return deferred
def _process_queue(self, spider, slot):
if slot.latercall and slot.latercall.active():
return
# Delay queue processing if a download_delay is configured
now = time()
delay = slot.download_delay()
if delay:
penalty = delay - now + slot.lastseen
if penalty > 0:
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
return
# Process enqueued requests if there are free slots to transfer for this slot
while slot.queue and slot.free_transfer_slots() > 0:
slot.lastseen = now
request, deferred = slot.queue.popleft()
dfd = self._download(slot, request, spider)
dfd.chainDeferred(deferred)
# prevent burst if inter-request delays were configured
if delay:
self._process_queue(spider, slot)
break
def _download(self, slot, request, spider):
# The order is very important for the following deferreds. Do not change!
# 1. Create the download deferred
dfd = mustbe_deferred(self.handlers.download_request, request, spider)
# 2. Notify response_downloaded listeners about the recent download
# before querying queue for next request
def _downloaded(response):
self.signals.send_catch_log(signal=signals.response_downloaded,
response=response,
request=request,
spider=spider)
return response
dfd.addCallback(_downloaded)
# 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader
# middleware itself)
slot.transferring.add(request)
def finish_transferring(_):
slot.transferring.remove(request)
self._process_queue(spider, slot)
return _
return dfd.addBoth(finish_transferring)
def close(self):
self._slot_gc_loop.stop()
for slot in self.slots.itervalues():
slot.close()
def _slot_gc(self, age=60):
mintime = time() - age
for key, slot in self.slots.items():
if not slot.active and slot.lastseen + slot.delay < mintime:
self.slots.pop(key).close()
|