File: test_spidermiddleware_offsite.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (105 lines) | stat: -rw-r--r-- 3,763 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import warnings
from urllib.parse import urlparse

from scrapy.http import Request, Response
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware, PortWarning, URLWarning
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler


class TestOffsiteMiddleware:
    """Check that OffsiteMiddleware keeps on-domain requests and drops the rest."""

    def setup_method(self):
        """Create a crawler, its spider, and an opened OffsiteMiddleware."""
        c = get_crawler(Spider)
        c.spider = c._create_spider(**self._get_spiderargs())
        self.spider = c.spider
        self.mw = OffsiteMiddleware.from_crawler(c)
        self.mw.spider_opened(self.spider)

    def _get_spiderargs(self):
        """Spider construction arguments; subclasses override to vary allowed_domains."""
        return dict(
            name="foo",
            allowed_domains=["scrapytest.org", "scrapy.org", "scrapy.test.org"],
        )

    def test_process_spider_output(self):
        """Allowed-domain, dont_filter, and allow_offsite requests pass; offsite ones are dropped."""
        response = Response("http://scrapytest.org")

        # Requests that must survive filtering, in order.
        passing = [
            Request("http://scrapytest.org/1"),
            Request("http://scrapy.org/1"),
            Request("http://sub.scrapy.org/1"),
            Request("http://offsite.tld/letmepass", dont_filter=True),
            Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}),
            Request("http://scrapy.test.org/"),
            Request("http://scrapy.test.org:8000/"),
        ]
        # URLs that the middleware must filter out.
        dropped_urls = [
            "http://scrapy2.org",
            "http://offsite.tld/",
            "http://offsite.tld/scrapytest.org",
            "http://offsite.tld/rogue.scrapytest.org",
            "http://rogue.scrapytest.org.haha.com",
            "http://roguescrapytest.org",
            "http://test.org/",
            "http://notscrapy.test.org/",
        ]
        dropped = [Request(url) for url in dropped_urls]

        result = list(
            self.mw.process_spider_output(response, passing + dropped, self.spider)
        )
        assert result == passing


class TestOffsiteMiddleware2(TestOffsiteMiddleware):
    """With allowed_domains set to None, nothing is filtered."""

    def _get_spiderargs(self):
        """Spider args with filtering explicitly disabled."""
        return dict(name="foo", allowed_domains=None)

    def test_process_spider_output(self):
        """Every request passes through unchanged when no domains are restricted."""
        response = Response("http://scrapytest.org")
        requests = [Request("http://a.com/b.html"), Request("http://b.com/1")]
        result = list(
            self.mw.process_spider_output(response, requests, self.spider)
        )
        assert result == requests


class TestOffsiteMiddleware3(TestOffsiteMiddleware2):
    """Omitting allowed_domains entirely behaves like allowed_domains=None."""

    def _get_spiderargs(self):
        """Spider args without any allowed_domains key."""
        return dict(name="foo")


class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
    """Invalid entries (None, empty hostname) in allowed_domains are tolerated."""

    def _get_spiderargs(self):
        """Spider args mixing a valid domain with None and a parse-produced bad hostname."""
        # urlparse on a URL with an empty authority yields a degenerate hostname.
        broken = urlparse("http:////scrapytest.org").hostname
        return dict(
            name="foo",
            allowed_domains=["scrapytest.org", None, broken],
        )

    def test_process_spider_output(self):
        """The valid domain still works despite the bogus entries alongside it."""
        response = Response("http://scrapytest.org")
        requests = [Request("http://scrapytest.org/1")]
        result = list(
            self.mw.process_spider_output(response, requests, self.spider)
        )
        assert result == requests


class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
    """A full URL in allowed_domains triggers a URLWarning."""

    def test_get_host_regex(self):
        """get_host_regex warns (URLWarning) when a domain entry contains a scheme."""
        self.spider.allowed_domains = [
            "http://scrapytest.org",
            "scrapy.org",
            "scrapy.test.org",
        ]
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            self.mw.get_host_regex(self.spider)
            latest = caught[-1]
            assert issubclass(latest.category, URLWarning)


class TestOffsiteMiddleware6(TestOffsiteMiddleware4):
    """A port in allowed_domains triggers a PortWarning."""

    def test_get_host_regex(self):
        """get_host_regex warns (PortWarning) when a domain entry carries a port."""
        self.spider.allowed_domains = [
            "scrapytest.org:8000",
            "scrapy.org",
            "scrapy.test.org",
        ]
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            self.mw.get_host_regex(self.spider)
            latest = caught[-1]
            assert issubclass(latest.category, PortWarning)