1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
|
import hashlib
import tempfile
import unittest
import shutil
from scrapy.dupefilters import RFPDupeFilter
from scrapy.http import Request
from scrapy.utils.python import to_bytes
class RFPDupeFilterTest(unittest.TestCase):
def test_filter(self):
dupefilter = RFPDupeFilter()
dupefilter.open()
r1 = Request('http://scrapytest.org/1')
r2 = Request('http://scrapytest.org/2')
r3 = Request('http://scrapytest.org/2')
assert not dupefilter.request_seen(r1)
assert dupefilter.request_seen(r1)
assert not dupefilter.request_seen(r2)
assert dupefilter.request_seen(r3)
dupefilter.close('finished')
def test_dupefilter_path(self):
r1 = Request('http://scrapytest.org/1')
r2 = Request('http://scrapytest.org/2')
path = tempfile.mkdtemp()
try:
df = RFPDupeFilter(path)
df.open()
assert not df.request_seen(r1)
assert df.request_seen(r1)
df.close('finished')
df2 = RFPDupeFilter(path)
df2.open()
assert df2.request_seen(r1)
assert not df2.request_seen(r2)
assert df2.request_seen(r2)
df2.close('finished')
finally:
shutil.rmtree(path)
def test_request_fingerprint(self):
"""Test if customization of request_fingerprint method will change
output of request_seen.
"""
r1 = Request('http://scrapytest.org/index.html')
r2 = Request('http://scrapytest.org/INDEX.html')
dupefilter = RFPDupeFilter()
dupefilter.open()
assert not dupefilter.request_seen(r1)
assert not dupefilter.request_seen(r2)
dupefilter.close('finished')
class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):
def request_fingerprint(self, request):
fp = hashlib.sha1()
fp.update(to_bytes(request.url.lower()))
return fp.hexdigest()
case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
case_insensitive_dupefilter.open()
assert not case_insensitive_dupefilter.request_seen(r1)
assert case_insensitive_dupefilter.request_seen(r2)
case_insensitive_dupefilter.close('finished')
|