1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
|
"""
DuplicatesFilterMiddleware: Filter out already visited urls
See documentation in docs/topics/scheduler-middleware.rst
"""
from scrapy.core.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.misc import load_object
from scrapy.conf import settings
class DuplicatesFilterMiddleware(object):
    """Scheduler middleware that rejects requests its dupefilter reports as seen.

    The dupefilter implementation is chosen through the ``DUPEFILTER_CLASS``
    setting and is instantiated once, when the middleware is created.
    """

    def __init__(self):
        """Load and instantiate the dupefilter named by ``DUPEFILTER_CLASS``.

        Raises:
            NotConfigured: if the ``DUPEFILTER_CLASS`` setting is empty or unset.
        """
        dupefilter_path = settings.get('DUPEFILTER_CLASS')
        if dupefilter_path:
            dupefilter_cls = load_object(dupefilter_path)
            # The dupefilter class is instantiated with no arguments.
            self.dupefilter = dupefilter_cls()
        else:
            raise NotConfigured

    def enqueue_request(self, spider, request):
        """Reject ``request`` if the dupefilter has already seen it.

        The dupefilter is always consulted, even for requests flagged with
        ``dont_filter``; that flag only suppresses the rejection.

        Raises:
            IgnoreRequest: if the request was already seen and
                ``request.dont_filter`` is not set.
        """
        already_seen = self.dupefilter.request_seen(spider, request)
        if not already_seen:
            return
        if request.dont_filter:
            # Explicitly exempted from duplicate filtering.
            return
        raise IgnoreRequest('Skipped (request already seen)')

    def open_spider(self, spider):
        """Forward the spider-opened notification to the dupefilter."""
        dupefilter = self.dupefilter
        dupefilter.open_spider(spider)

    def close_spider(self, spider):
        """Forward the spider-closed notification to the dupefilter."""
        dupefilter = self.dupefilter
        dupefilter.close_spider(spider)
|