from __future__ import annotations
import os
import shutil
import string
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
from urllib.parse import urlparse
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.template import render_templatefile, string_camelcase
if TYPE_CHECKING:
import argparse
def sanitize_module_name(module_name: str) -> str:
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one.

    An empty name is also prefixed (returning ``"a"``) instead of raising
    ``IndexError`` on the first-character check.
    """
    module_name = module_name.replace("-", "_").replace(".", "_")
    # Guard the empty string before indexing; a valid Python module name
    # must start with a letter (or underscore, but we keep the original
    # letter-only policy).
    if not module_name or module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name
def extract_domain(url: str) -> str:
    """Extract domain name from URL string"""
    parsed = urlparse(url)
    # A bare host like "example.com" parses entirely into the path; force
    # netloc parsing by prepending "//" and retry.
    if not (parsed.scheme or parsed.netloc):
        parsed = urlparse("//" + url.lstrip("/"))
    return parsed.netloc
def verify_url_scheme(url: str) -> str:
    """Check url for scheme and insert https if none found."""
    parts = urlparse(url)
    has_scheme_or_host = bool(parts.scheme) or bool(parts.netloc)
    if not has_scheme_or_host:
        # Re-parse with a netloc marker so the host lands in .netloc,
        # then default the scheme to https.
        parts = urlparse("//" + url)._replace(scheme="https")
    return parts.geturl()
class Command(ScrapyCommand):
    """Implementation of the ``scrapy genspider`` command.

    Creates a new spider module from one of the ``.tmpl`` templates found
    in :attr:`templates_dir`.  Inside a project the file is written next to
    the other spiders (the ``NEWSPIDER_MODULE`` package); outside a project
    it is written to the current working directory.
    """

    # The command also works outside a Scrapy project (standalone mode).
    requires_project = False
    default_settings = {"LOG_ENABLED": False}

    def syntax(self) -> str:
        return "[options] <name> <domain>"

    def short_desc(self) -> str:
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser: argparse.ArgumentParser) -> None:
        """Register the genspider-specific flags on top of the common options."""
        super().add_options(parser)
        parser.add_argument(
            "-l",
            "--list",
            dest="list",
            action="store_true",
            help="List available templates",
        )
        parser.add_argument(
            "-e",
            "--edit",
            dest="edit",
            action="store_true",
            help="Edit spider after creating it",
        )
        parser.add_argument(
            "-d",
            "--dump",
            dest="dump",
            metavar="TEMPLATE",
            help="Dump template to standard output",
        )
        parser.add_argument(
            "-t",
            "--template",
            dest="template",
            default="basic",
            help="Uses a custom template.",
        )
        parser.add_argument(
            "--force",
            dest="force",
            action="store_true",
            help="If the spider already exists, overwrite it with the template",
        )

    def run(self, args: list[str], opts: argparse.Namespace) -> None:
        """Entry point: dispatch on the parsed options and generate the spider.

        Raises:
            UsageError: if ``<name>`` and ``<domain>`` are not both supplied
                (and neither ``--list`` nor ``--dump`` was requested).
        """
        # --list and --dump are informational modes that need no positional args.
        if opts.list:
            self._list_templates()
            return
        if opts.dump:
            template_file = self._find_template(opts.dump)
            if template_file:
                print(template_file.read_text(encoding="utf-8"))
            return
        if len(args) != 2:
            raise UsageError
        name, url = args[0:2]
        url = verify_url_scheme(url)
        module = sanitize_module_name(name)
        # A spider module named like the project would shadow the project package.
        if self.settings.get("BOT_NAME") == module:
            print("Cannot create a spider with the same name as your project")
            return
        # --force skips the clash check and overwrites any existing spider.
        if not opts.force and self._spider_exists(name):
            return
        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, url, opts.template, template_file)
            if opts.edit:
                # Re-enter scrapy to open the freshly created spider in $EDITOR.
                self.exitcode = os.system(f'scrapy edit "{name}"')  # noqa: S605

    def _generate_template_variables(
        self,
        module: str,
        name: str,
        url: str,
        template_name: str,
    ) -> dict[str, Any]:
        """Build the substitution mapping handed to ``render_templatefile``.

        NOTE(review): ``template_name`` is unused in this body; presumably
        kept for signature symmetry with ``_genspider`` — confirm before
        removing.
        """
        # my_module -> MyModule; used to derive the spider class name below.
        capitalized_module = "".join(s.capitalize() for s in module.split("_"))
        return {
            "project_name": self.settings.get("BOT_NAME"),
            "ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
            "module": module,
            "name": name,
            "url": url,
            "domain": extract_domain(url),
            "classname": f"{capitalized_module}Spider",
        }

    def _genspider(
        self,
        module: str,
        name: str,
        url: str,
        template_name: str,
        template_file: str | os.PathLike,
    ) -> None:
        """Generate the spider module, based on the given template"""
        tvars = self._generate_template_variables(module, name, url, template_name)
        if self.settings.get("NEWSPIDER_MODULE"):
            # Inside a project: resolve the spiders package directory on disk.
            spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
            assert spiders_module.__file__
            spiders_dir = Path(spiders_module.__file__).parent.resolve()
        else:
            # Standalone mode: write into the current working directory.
            spiders_module = None
            spiders_dir = Path()
        spider_file = f"{spiders_dir / module}.py"
        # Copy the raw template first, then substitute the variables in place.
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print(
            f"Created spider {name!r} using template {template_name!r} ",
            end=("" if spiders_module else "\n"),
        )
        if spiders_module:
            print(f"in module:\n {spiders_module.__name__}.{module}")

    def _find_template(self, template: str) -> Path | None:
        """Return the path of ``<template>.tmpl`` under :attr:`templates_dir`,
        or ``None`` (after printing a hint) when it does not exist."""
        template_file = Path(self.templates_dir, f"{template}.tmpl")
        if template_file.exists():
            return template_file
        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')
        return None

    def _list_templates(self) -> None:
        """Print the stem of every ``.tmpl`` file in :attr:`templates_dir`."""
        print("Available templates:")
        for file in sorted(Path(self.templates_dir).iterdir()):
            if file.suffix == ".tmpl":
                print(f" {file.stem}")

    def _spider_exists(self, name: str) -> bool:
        """Return ``True`` (after printing why) if creating *name* would clash.

        Checks, in order: a same-named ``.py`` file in the cwd (standalone
        mode only), a spider already registered under that name, and a
        same-named file in the project's spiders directory.
        """
        if not self.settings.get("NEWSPIDER_MODULE"):
            # if run as a standalone command and file with same filename already exists
            path = Path(name + ".py")
            if path.exists():
                print(f"{path.resolve()} already exists")
                return True
            return False
        assert self.crawler_process is not None, (
            "crawler_process must be set before calling run"
        )
        try:
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            # No registered spider by that name; fall through to the file check.
            pass
        else:
            # if spider with same name exists
            print(f"Spider {name!r} already exists in module:")
            print(f" {spidercls.__module__}")
            return True
        # a file with the same name exists in the target directory
        spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
        spiders_dir = Path(cast(str, spiders_module.__file__)).parent
        spiders_dir_abs = spiders_dir.resolve()
        path = spiders_dir_abs / (name + ".py")
        if path.exists():
            print(f"{path} already exists")
            return True
        return False

    @property
    def templates_dir(self) -> str:
        """Directory holding the spider templates: the ``TEMPLATES_DIR``
        setting when set, otherwise the templates bundled with scrapy."""
        return str(
            Path(
                self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
                "spiders",
            )
        )