File: dupefilter.py

package: python-scrapy 0.8-3 (Debian squeeze, main)
"""
Dupe Filter classes implement a mechanism for filtering duplicate requests.
They must implement the following methods:

* open_spider(spider)
  open a spider for tracking duplicates (typically used to reserve resources)

* close_spider(spider)
  close a spider (typically used for freeing resources)

* request_seen(spider, request, dont_record=False)
  return ``True`` if the request was seen before, or ``False`` otherwise. If
  ``dont_record`` is ``True`` the request must not be recorded as seen.

"""

from scrapy.utils.request import request_fingerprint


class NullDupeFilter(object):
    """A no-op filter that never reports any request as seen."""

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def request_seen(self, spider, request, dont_record=False):
        return False


class RequestFingerprintDupeFilter(object):
    """Duplicate filter using scrapy.utils.request.request_fingerprint"""

    def __init__(self):
        self.fingerprints = {}

    def open_spider(self, spider):
        self.fingerprints[spider] = set()

    def close_spider(self, spider):
        del self.fingerprints[spider]

    def request_seen(self, spider, request, dont_record=False):
        fp = request_fingerprint(request)
        if fp in self.fingerprints[spider]:
            return True
        if not dont_record:
            # record the fingerprint so identical requests are caught later
            self.fingerprints[spider].add(fp)
        return False
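

if __name__ == "__main__":
    # Usage sketch (not part of the original module): two requests for the
    # same URL share a fingerprint, so the second is reported as seen.
    # Assumes scrapy.http.Request is importable in this Scrapy version.
    from scrapy.http import Request

    dupefilter = RequestFingerprintDupeFilter()
    spider = object()  # stand-in for a real spider instance
    dupefilter.open_spider(spider)

    first = Request("http://example.com/page")
    second = Request("http://example.com/page")  # same URL, same fingerprint

    print(dupefilter.request_seen(spider, first))   # False: first sighting, recorded
    print(dupefilter.request_seen(spider, second))  # True: duplicate fingerprint

    # dont_record=True peeks without recording the request as seen:
    probe = Request("http://example.com/other")
    print(dupefilter.request_seen(spider, probe, dont_record=True))  # False
    print(dupefilter.request_seen(spider, probe))  # still False: never recorded

    dupefilter.close_spider(spider)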