File: test_command_genspider.py

package info (click to toggle)
python-scrapy 2.14.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,308 kB
  • sloc: python: 55,321; xml: 199; makefile: 25; sh: 7
file content (205 lines) | stat: -rw-r--r-- 8,771 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from __future__ import annotations

import re
from pathlib import Path

import pytest

from tests.test_commands import TestProjectBase
from tests.utils.cmdline import call, proc


def find_in_file(filename: Path, regex: str) -> re.Match | None:
    """Find first pattern occurrence in file"""
    pattern = re.compile(regex)
    with filename.open("r", encoding="utf-8") as f:
        for line in f:
            match = pattern.search(line)
            if match is not None:
                return match
    return None


class TestGenspiderCommand(TestProjectBase):
    def test_arguments(self, proj_path: Path) -> None:
        spider = proj_path / self.project_name / "spiders" / "test_name.py"
        # only pass one argument. spider script shouldn't be created
        assert call("genspider", "test_name", cwd=proj_path) == 2
        assert not spider.exists()
        # pass two arguments <name> <domain>. spider script should be created
        assert call("genspider", "test_name", "test.com", cwd=proj_path) == 0
        assert spider.exists()

    @pytest.mark.parametrize(
        "tplname",
        [
            "basic",
            "crawl",
            "xmlfeed",
            "csvfeed",
        ],
    )
    def test_template(self, tplname: str, proj_path: Path) -> None:
        args = [f"--template={tplname}"] if tplname else []
        spname = "test_spider"
        spmodule = f"{self.project_name}.spiders.{spname}"
        spfile = proj_path / self.project_name / "spiders" / f"{spname}.py"
        _, out, _ = proc("genspider", spname, "test.com", *args, cwd=proj_path)
        assert (
            f"Created spider {spname!r} using template {tplname!r} in module:\n  {spmodule}"
            in out
        )
        assert spfile.exists()
        modify_time_before = spfile.stat().st_mtime
        _, out, _ = proc("genspider", spname, "test.com", *args, cwd=proj_path)
        assert f"Spider {spname!r} already exists in module" in out
        modify_time_after = spfile.stat().st_mtime
        assert modify_time_after == modify_time_before

    def test_list(self, proj_path: Path) -> None:
        assert call("genspider", "--list", cwd=proj_path) == 0

    def test_dump(self, proj_path: Path) -> None:
        assert call("genspider", "--dump=basic", cwd=proj_path) == 0
        assert call("genspider", "-d", "basic", cwd=proj_path) == 0

    def test_same_name_as_project(self, proj_path: Path) -> None:
        assert call("genspider", self.project_name, cwd=proj_path) == 2
        assert not (
            proj_path / self.project_name / "spiders" / f"{self.project_name}.py"
        ).exists()

    @pytest.mark.parametrize("force", [True, False])
    def test_same_filename_as_existing_spider(
        self, force: bool, proj_path: Path
    ) -> None:
        file_name = "example"
        file_path = proj_path / self.project_name / "spiders" / f"{file_name}.py"
        assert call("genspider", file_name, "example.com", cwd=proj_path) == 0
        assert file_path.exists()

        # change name of spider but not its file name
        with file_path.open("r+", encoding="utf-8") as spider_file:
            file_data = spider_file.read()
            file_data = file_data.replace('name = "example"', 'name = "renamed"')
            spider_file.seek(0)
            spider_file.write(file_data)
            spider_file.truncate()
        modify_time_before = file_path.stat().st_mtime
        file_contents_before = file_data

        if force:
            _, out, _ = proc(
                "genspider", "--force", file_name, "example.com", cwd=proj_path
            )
            assert (
                f"Created spider {file_name!r} using template 'basic' in module" in out
            )
            modify_time_after = file_path.stat().st_mtime
            assert modify_time_after != modify_time_before
            file_contents_after = file_path.read_text(encoding="utf-8")
            assert file_contents_after != file_contents_before
        else:
            _, out, _ = proc("genspider", file_name, "example.com", cwd=proj_path)
            assert f"{file_path.resolve()} already exists" in out
            modify_time_after = file_path.stat().st_mtime
            assert modify_time_after == modify_time_before
            file_contents_after = file_path.read_text(encoding="utf-8")
            assert file_contents_after == file_contents_before

    @pytest.mark.parametrize(
        ("url", "domain"),
        [
            ("test.com", "test.com"),
            ("https://test.com", "test.com"),
        ],
    )
    def test_url(self, url: str, domain: str, proj_path: Path) -> None:
        assert call("genspider", "--force", "test_name", url, cwd=proj_path) == 0
        spider = proj_path / self.project_name / "spiders" / "test_name.py"
        m = find_in_file(spider, r"allowed_domains\s*=\s*\[['\"](.+)['\"]\]")
        assert m is not None
        assert m.group(1) == domain
        m = find_in_file(spider, r"start_urls\s*=\s*\[['\"](.+)['\"]\]")
        assert m is not None
        assert m.group(1) == f"https://{domain}"

    @pytest.mark.parametrize(
        ("url", "expected", "template"),
        [
            # basic
            ("https://test.com", "https://test.com", "basic"),
            ("http://test.com", "http://test.com", "basic"),
            ("http://test.com/other/path", "http://test.com/other/path", "basic"),
            ("test.com/other/path", "https://test.com/other/path", "basic"),
            # crawl
            ("https://test.com", "https://test.com", "crawl"),
            ("http://test.com", "http://test.com", "crawl"),
            ("http://test.com/other/path", "http://test.com/other/path", "crawl"),
            ("test.com/other/path", "https://test.com/other/path", "crawl"),
            ("test.com", "https://test.com", "crawl"),
            # xmlfeed
            ("https://test.com/feed.xml", "https://test.com/feed.xml", "xmlfeed"),
            ("http://test.com/feed.xml", "http://test.com/feed.xml", "xmlfeed"),
            ("test.com/feed.xml", "https://test.com/feed.xml", "xmlfeed"),
            # csvfeed
            ("https://test.com/feed.csv", "https://test.com/feed.csv", "csvfeed"),
            ("http://test.com/feed.xml", "http://test.com/feed.xml", "csvfeed"),
            ("test.com/feed.csv", "https://test.com/feed.csv", "csvfeed"),
        ],
    )
    def test_template_start_urls(
        self, url: str, expected: str, template: str, proj_path: Path
    ) -> None:
        assert (
            call(
                "genspider", "-t", template, "--force", "test_name", url, cwd=proj_path
            )
            == 0
        )
        spider = proj_path / self.project_name / "spiders" / "test_name.py"
        m = find_in_file(spider, r"start_urls\s*=\s*\[['\"](.+)['\"]\]")
        assert m is not None
        assert m.group(1) == expected


class TestGenspiderStandaloneCommand:
    def test_generate_standalone_spider(self, tmp_path: Path) -> None:
        call("genspider", "example", "example.com", cwd=tmp_path)
        assert Path(tmp_path, "example.py").exists()

    @pytest.mark.parametrize("force", [True, False])
    def test_same_name_as_existing_file(self, force: bool, tmp_path: Path) -> None:
        file_name = "example"
        file_path = Path(tmp_path, file_name + ".py")
        _, out, _ = proc("genspider", file_name, "example.com", cwd=tmp_path)
        assert f"Created spider {file_name!r} using template 'basic' " in out
        assert file_path.exists()
        modify_time_before = file_path.stat().st_mtime
        file_contents_before = file_path.read_text(encoding="utf-8")

        if force:
            # use different template to ensure contents were changed
            _, out, _ = proc(
                "genspider",
                "--force",
                "-t",
                "crawl",
                file_name,
                "example.com",
                cwd=tmp_path,
            )
            assert f"Created spider {file_name!r} using template 'crawl' " in out
            modify_time_after = file_path.stat().st_mtime
            assert modify_time_after != modify_time_before
            file_contents_after = file_path.read_text(encoding="utf-8")
            assert file_contents_after != file_contents_before
        else:
            _, out, _ = proc("genspider", file_name, "example.com", cwd=tmp_path)
            assert (
                f"{Path(tmp_path, file_name + '.py').resolve()} already exists" in out
            )
            modify_time_after = file_path.stat().st_mtime
            assert modify_time_after == modify_time_before
            file_contents_after = file_path.read_text(encoding="utf-8")
            assert file_contents_after == file_contents_before