File: genspider.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (223 lines) | stat: -rw-r--r-- 7,327 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
from __future__ import annotations

import os
import shutil
import string
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
from urllib.parse import urlparse

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.template import render_templatefile, string_camelcase

if TYPE_CHECKING:
    import argparse


def sanitize_module_name(module_name: str) -> str:
    """Sanitize the given module name.

    Dashes and dots are replaced with underscores, and the result is
    prefixed with the letter ``"a"`` if it does not start with an ASCII
    letter. An empty name has no first character, so it is treated as "does
    not start with a letter" and yields ``"a"`` instead of raising
    ``IndexError``.
    """
    module_name = module_name.replace("-", "_").replace(".", "_")
    # Guard the empty string explicitly: indexing [0] would raise, and
    # ``"" in string.ascii_letters`` is always True, so slicing is no help.
    if not module_name or module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


def extract_domain(url: str) -> str:
    """Return the ``netloc`` component of the given URL string.

    When the string parses with neither a scheme nor a netloc (for example
    a bare ``example.com/page``), it is re-parsed as a scheme-relative URL
    so that its leading segment is reported as the netloc.
    """
    parts = urlparse(url)
    nothing_recognised = parts.scheme == "" and parts.netloc == ""
    if nothing_recognised:
        # Strip leading slashes first so exactly two precede the host.
        bare = url.lstrip("/")
        parts = urlparse("//" + bare)
    return parts.netloc


def verify_url_scheme(url: str) -> str:
    """Return the URL, adding an ``https`` scheme when none is present.

    A URL that already parses with a scheme or a netloc is returned as
    re-assembled by ``urlparse``; otherwise it is re-parsed as a
    scheme-relative URL and given the ``https`` scheme.
    """
    pieces = urlparse(url)
    if pieces.scheme or pieces.netloc:
        return pieces.geturl()
    with_https = urlparse("//" + url)._replace(scheme="https")
    return with_https.geturl()


class Command(ScrapyCommand):
    """``genspider`` command: generate a new spider file from a template.

    When the ``NEWSPIDER_MODULE`` setting is set, the spider is written into
    the directory of that module; otherwise it is written to the current
    working directory.
    """

    requires_project = False
    default_settings = {"LOG_ENABLED": False}

    def syntax(self) -> str:
        """Return the argument synopsis for this command."""
        return "[options] <name> <domain>"

    def short_desc(self) -> str:
        """Return a one-line description of this command."""
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser: argparse.ArgumentParser) -> None:
        """Register the command-specific options on *parser*."""
        super().add_options(parser)
        parser.add_argument(
            "-l",
            "--list",
            dest="list",
            action="store_true",
            help="List available templates",
        )
        parser.add_argument(
            "-e",
            "--edit",
            dest="edit",
            action="store_true",
            help="Edit spider after creating it",
        )
        parser.add_argument(
            "-d",
            "--dump",
            dest="dump",
            metavar="TEMPLATE",
            help="Dump template to standard output",
        )
        parser.add_argument(
            "-t",
            "--template",
            dest="template",
            default="basic",
            help="Uses a custom template.",
        )
        parser.add_argument(
            "--force",
            dest="force",
            action="store_true",
            help="If the spider already exists, overwrite it with the template",
        )

    def run(self, args: list[str], opts: argparse.Namespace) -> None:
        """Execute the command.

        ``--list`` and ``--dump`` are handled first and return without
        generating anything. Otherwise exactly two positional arguments
        (spider name and URL/domain) are required.

        Raises:
            UsageError: if the number of positional arguments is not two.
        """
        if opts.list:
            self._list_templates()
            return
        if opts.dump:
            template_file = self._find_template(opts.dump)
            if template_file:
                print(template_file.read_text(encoding="utf-8"))
            return
        if len(args) != 2:
            raise UsageError

        name, url = args[0:2]
        url = verify_url_scheme(url)
        module = sanitize_module_name(name)

        if self.settings.get("BOT_NAME") == module:
            print("Cannot create a spider with the same name as your project")
            return

        # --force skips the existence check entirely, allowing an overwrite.
        if not opts.force and self._spider_exists(name):
            return

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, url, opts.template, template_file)
            if opts.edit:
                self.exitcode = os.system(f'scrapy edit "{name}"')  # noqa: S605

    def _generate_template_variables(
        self,
        module: str,
        name: str,
        url: str,
        template_name: str,
    ) -> dict[str, Any]:
        """Build the mapping of variables substituted into the template.

        *template_name* is accepted but not used by this implementation.
        """
        # "my_spider" -> "MySpider"; used as the prefix of the class name.
        capitalized_module = "".join(s.capitalize() for s in module.split("_"))
        return {
            "project_name": self.settings.get("BOT_NAME"),
            "ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
            "module": module,
            "name": name,
            "url": url,
            "domain": extract_domain(url),
            "classname": f"{capitalized_module}Spider",
        }

    def _genspider(
        self,
        module: str,
        name: str,
        url: str,
        template_name: str,
        template_file: str | os.PathLike,
    ) -> None:
        """Generate the spider module, based on the given template.

        The template is copied to ``<spiders_dir>/<module>.py`` and then
        rendered in place with the variables from
        ``_generate_template_variables``.
        """
        tvars = self._generate_template_variables(module, name, url, template_name)
        if self.settings.get("NEWSPIDER_MODULE"):
            spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
            assert spiders_module.__file__
            spiders_dir = Path(spiders_module.__file__).parent.resolve()
        else:
            # No target module configured: write to the current directory.
            spiders_module = None
            spiders_dir = Path()
        spider_file = f"{spiders_dir / module}.py"
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        # When a target module is known, the message continues on the same
        # line with the "in module:" part printed below.
        print(
            f"Created spider {name!r} using template {template_name!r} ",
            end=("" if spiders_module else "\n"),
        )
        if spiders_module:
            print(f"in module:\n  {spiders_module.__name__}.{module}")

    def _find_template(self, template: str) -> Path | None:
        """Return the path of ``<template>.tmpl`` in the templates directory.

        Prints a hint and returns ``None`` when the file does not exist.
        """
        template_file = Path(self.templates_dir, f"{template}.tmpl")
        if template_file.exists():
            return template_file
        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')
        return None

    def _list_templates(self) -> None:
        """Print the names of all ``.tmpl`` files in the templates directory."""
        print("Available templates:")
        for file in sorted(Path(self.templates_dir).iterdir()):
            if file.suffix == ".tmpl":
                print(f"  {file.stem}")

    def _spider_exists(self, name: str) -> bool:
        """Return ``True`` (after printing why) if generating *name* would clash.

        A clash is either an already-loadable spider with the same name (only
        checked when ``NEWSPIDER_MODULE`` is set) or an existing file at the
        location the new spider would be written to.
        """
        # The spider is written to "<sanitized module name>.py", so the file
        # check must use the sanitized name; checking "<name>.py" would miss
        # an existing file whenever the name contains "-" or "." or does not
        # start with a letter.
        filename = sanitize_module_name(name) + ".py"

        if not self.settings.get("NEWSPIDER_MODULE"):
            # if run as a standalone command and file with same filename already exists
            path = Path(filename)
            if path.exists():
                print(f"{path.resolve()} already exists")
                return True
            return False

        assert self.crawler_process is not None, (
            "crawler_process must be set before calling run"
        )

        try:
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            # No spider registered under this name; fall through to the
            # file check below.
            pass
        else:
            # if spider with same name exists
            print(f"Spider {name!r} already exists in module:")
            print(f"  {spidercls.__module__}")
            return True

        # a file with the same name exists in the target directory
        spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
        spiders_dir = Path(cast(str, spiders_module.__file__)).parent
        spiders_dir_abs = spiders_dir.resolve()
        path = spiders_dir_abs / filename
        if path.exists():
            print(f"{path} already exists")
            return True

        return False

    @property
    def templates_dir(self) -> str:
        """Directory holding the spider templates.

        This is the ``spiders`` subdirectory of the ``TEMPLATES_DIR`` setting
        or, when that setting is empty, of the ``templates`` directory inside
        the installed ``scrapy`` package.
        """
        return str(
            Path(
                self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
                "spiders",
            )
        )