File: duplicatesfilter.py

package info (click to toggle)
python-scrapy 0.8-3
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (29 lines) | stat: -rw-r--r-- 875 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""
DuplicatesFilterMiddleware: Filter out already visited urls

See documentation in docs/topics/scheduler-middleware.rst
"""

from scrapy.core.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.misc import load_object
from scrapy.conf import settings

class DuplicatesFilterMiddleware(object):
    """Scheduler middleware that drops requests already seen by the
    configured duplicates filter.

    The filter implementation is loaded from the ``DUPEFILTER_CLASS``
    setting; if that setting is empty this middleware is disabled
    (``NotConfigured`` is raised).

    See documentation in docs/topics/scheduler-middleware.rst
    """

    def __init__(self):
        # Resolve the dupefilter class lazily from settings; an empty
        # setting disables the middleware entirely.
        clspath = settings.get('DUPEFILTER_CLASS')
        if not clspath:
            raise NotConfigured

        self.dupefilter = load_object(clspath)()

    def enqueue_request(self, spider, request):
        """Raise IgnoreRequest for requests whose URL was already seen.

        ``dont_filter`` requests must bypass the filter completely, so
        the flag is checked *before* calling ``request_seen()``.  The
        original order called ``request_seen()`` unconditionally, which
        recorded dont_filter requests in the seen set as a side effect
        and could wrongly filter a later, filterable request for the
        same URL.
        """
        if not request.dont_filter and \
                self.dupefilter.request_seen(spider, request):
            raise IgnoreRequest('Skipped (request already seen)')

    def open_spider(self, spider):
        # Let the dupefilter set up its per-spider state.
        self.dupefilter.open_spider(spider)

    def close_spider(self, spider):
        # Let the dupefilter release its per-spider state.
        self.dupefilter.close_spider(spider)