1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
from __future__ import annotations
import json
from typing import TYPE_CHECKING, Any
from itemadapter import ItemAdapter, is_item
from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request
if TYPE_CHECKING:
from collections.abc import Callable
# contracts
class UrlContract(Contract):
    """Contract that provides the url of the request (mandatory).

    Example:
    @url http://scrapy.org
    """

    name = "url"

    def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
        """Store the first contract argument under the ``url`` key.

        The ``args`` dict is modified in place and also returned.
        """
        target_url = self.args[0]
        args["url"] = target_url
        return args
class CallbackKeywordArgumentsContract(Contract):
    """Contract that provides the callback keyword arguments of the request.

    The value must be a JSON-encoded dictionary, for example:
    @cb_kwargs {"arg1": "some value"}
    """

    name = "cb_kwargs"

    def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
        """Decode the contract arguments as JSON into ``args["cb_kwargs"]``.

        The contract arguments are joined with single spaces before decoding
        (presumably because the raw value was split on whitespace upstream;
        verify against the caller). ``args`` is modified in place and
        returned.
        """
        encoded = " ".join(self.args)
        decoded = json.loads(encoded)
        args["cb_kwargs"] = decoded
        return args
class MetadataContract(Contract):
    """Contract that provides the metadata arguments of the request.

    The value must be a JSON-encoded dictionary, for example:
    @meta {"arg1": "some value"}
    """

    name = "meta"

    def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
        """Decode the contract arguments as JSON into ``args["meta"]``.

        The contract arguments are joined with single spaces before decoding
        (presumably because the raw value was split on whitespace upstream;
        verify against the caller). ``args`` is modified in place and
        returned.
        """
        encoded = " ".join(self.args)
        decoded = json.loads(encoded)
        args["meta"] = decoded
        return args
class ReturnsContract(Contract):
    """Contract that checks how many objects of a kind a callback outputs.

    General form:
    @returns request(s)/item(s) [min=1 [max]]

    Examples:
    @returns request
    @returns request 2
    @returns request 2 10
    @returns request 0 10

    When omitted, the lower bound is 1 and the upper bound is infinity.
    Both bounds are inclusive.
    """

    name = "returns"

    # Maps the object name given in the contract to the predicate used to
    # recognise matching output objects; singular and plural are synonyms.
    object_type_verifiers: dict[str | None, Callable[[Any], bool]] = {
        "request": lambda x: isinstance(x, Request),
        "requests": lambda x: isinstance(x, Request),
        "item": is_item,
        "items": is_item,
    }

    def __init__(self, *args: Any, **kwargs: Any):
        """Parse the object name and the optional min/max bounds.

        Raises:
            ValueError: if the number of contract arguments is not 1, 2 or 3
                (``int()`` also raises it for a bound it cannot parse).
            KeyError: if the object name has no entry in
                ``object_type_verifiers``.
        """
        super().__init__(*args, **kwargs)

        arg_count = len(self.args)
        if not 1 <= arg_count <= 3:
            raise ValueError(
                f"Incorrect argument quantity: expected 1, 2 or 3, got {arg_count}"
            )

        # An empty name is normalised to None before the verifier lookup.
        self.obj_name = self.args[0] or None
        self.obj_type_verifier = self.object_type_verifiers[self.obj_name]

        # Bounds are optional positional arguments; fall back to the defaults
        # when the corresponding argument is absent.
        self.min_bound: float = int(self.args[1]) if arg_count > 1 else 1
        self.max_bound: float = int(self.args[2]) if arg_count > 2 else float("inf")

    def post_process(self, output: list[Any]) -> None:
        """Count matching objects in ``output`` and enforce the bounds.

        Raises:
            ContractFail: if the number of matching objects is lower than
                ``min_bound`` or higher than ``max_bound``.
        """
        occurrences = sum(1 for obj in output if self.obj_type_verifier(obj))

        if self.min_bound <= occurrences <= self.max_bound:
            return

        # Report a single number for an exact expectation, a range otherwise.
        expected = (
            str(self.min_bound)
            if self.min_bound == self.max_bound
            else f"{self.min_bound}..{self.max_bound}"
        )
        raise ContractFail(
            f"Returned {occurrences} {self.obj_name}, expected {expected}"
        )
class ScrapesContract(Contract):
    """Contract to check presence of fields in scraped items

    @scrapes page_name page_body
    """

    name = "scrapes"

    def post_process(self, output: list[Any]) -> None:
        """Verify that every item in ``output`` contains all required fields.

        The required field names are the contract arguments. Output objects
        for which ``is_item`` is false are skipped.

        Raises:
            ContractFail: on the first item that lacks one or more of the
                required fields; the message lists the missing field names.
        """
        for x in output:
            if not is_item(x):
                continue
            # Wrap the item once instead of once per required field.
            adapter = ItemAdapter(x)
            missing = [arg for arg in self.args if arg not in adapter]
            if missing:
                missing_fields = ", ".join(missing)
                raise ContractFail(f"Missing fields: {missing_fields}")
|