1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
|
"""
Url Length Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
from scrapy import log
from scrapy.http import Request
from scrapy.exceptions import NotConfigured
class UrlLengthMiddleware(object):
    """Spider middleware that drops requests whose URL is too long.

    Every ``Request`` yielded by a spider callback is checked against
    ``maxlength``; requests with a longer URL are discarded and reported
    at DEBUG level. Anything that is not a ``Request`` passes through
    untouched.
    """

    def __init__(self, maxlength):
        """Store the maximum URL length (in characters) that is accepted."""
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Build the middleware from the ``URLLENGTH_LIMIT`` setting.

        Raises ``NotConfigured`` when the setting resolves to a falsy
        value (e.g. ``0``).
        """
        limit = settings.getint('URLLENGTH_LIMIT')
        if limit:
            return cls(limit)
        raise NotConfigured

    def process_spider_output(self, response, result, spider):
        """Return a generator over ``result`` without over-long requests.

        A falsy ``result`` (such as ``None``) is treated as an empty
        iterable.
        """
        def _is_allowed(item):
            # Only Request objects are subject to the length check.
            if not isinstance(item, Request):
                return True
            if len(item.url) <= self.maxlength:
                return True
            log.msg(format="Ignoring link (url length > %(maxlength)d): %(url)s ",
                    level=log.DEBUG, spider=spider,
                    maxlength=self.maxlength, url=item.url)
            return False

        # A generator expression keeps the output lazy while resolving
        # ``result`` at call time.
        return (item for item in (result or ()) if _is_allowed(item))
|