1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
|
"""
Depth Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import warnings
from scrapy import log
from scrapy.http import Request
from scrapy.exceptions import ScrapyDeprecationWarning
class DepthMiddleware(object):
    """Spider middleware that tracks how deep each request is in the crawl.

    Every ``Request`` yielded by a spider callback gets
    ``meta['depth']`` set to the depth of the response that produced it
    plus one. Depending on configuration the middleware also adjusts the
    request priority, drops requests that exceed a maximum depth, and
    records depth statistics.
    """

    def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
        """Store the middleware configuration.

        :param maxdepth: maximum depth allowed for a request; a falsy value
            (e.g. ``0``) disables the limit.
        :param stats: object exposing ``inc_value`` and ``max_value`` used to
            record depth statistics, or ``None`` to record nothing.
        :param verbose_stats: when true (and ``stats`` is set), count the
            requests seen at every depth level.
        :param prio: factor multiplied by the depth and subtracted from the
            request priority; a falsy value leaves priorities untouched.
        """
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_settings(cls, settings):
        """Build the middleware from the ``DEPTH_*`` settings.

        Reads ``DEPTH_LIMIT``, ``DEPTH_STATS``, ``DEPTH_STATS_VERBOSE`` and
        ``DEPTH_PRIORITY``. The global stats object is only imported when
        ``DEPTH_STATS`` is enabled.
        """
        maxdepth = settings.getint('DEPTH_LIMIT')
        usestats = settings.getbool('DEPTH_STATS')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        prio = settings.getint('DEPTH_PRIORITY')
        if usestats:
            from scrapy.stats import stats
        else:
            stats = None
        return cls(maxdepth, stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        """Annotate outgoing requests with their depth and filter deep ones.

        :param response: the response being processed; its request's ``meta``
            holds the current depth.
        :param result: iterable of objects returned by the spider, or
            ``None``.
        :param spider: the spider that produced ``result``.
        :returns: a generator over ``result`` without the requests whose
            depth exceeds ``maxdepth``. Objects that are not ``Request``
            instances are passed through unchanged.
        """
        def _filter(request):
            # Only requests carry a depth; everything else passes through.
            if not isinstance(request, Request):
                return True

            depth = response.request.meta['depth'] + 1
            request.meta['depth'] = depth
            if self.prio:
                request.priority -= depth * self.prio

            if self.maxdepth and depth > self.maxdepth:
                log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url),
                        level=log.DEBUG, spider=spider)
                return False

            if self.stats:
                if self.verbose_stats:
                    self.stats.inc_value('request_depth_count/%s' % depth, spider=spider)
                self.stats.max_value('request_depth_max', depth, spider=spider)
            return True

        # base case (depth=0)
        # The depth must be seeded even when stats are disabled: _filter reads
        # it unconditionally, and would otherwise raise KeyError on the first
        # request yielded from a response whose request has no depth yet.
        meta = response.request.meta
        if 'depth' not in meta:
            meta['depth'] = 0
            if self.stats and self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
|