File: default.py

package info (click to toggle)
python-scrapy 2.4.1-2%2Bdeb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,748 kB
  • sloc: python: 32,888; xml: 199; makefile: 90; sh: 7
file content (108 lines) | stat: -rw-r--r-- 2,966 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json

from itemadapter import is_item, ItemAdapter

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request


# contracts
class UrlContract(Contract):
    """ Mandatory contract that supplies the url of the request.

        The first argument given to the contract is used as the url, e.g.:

        @url http://scrapy.org
    """

    name = 'url'

    def adjust_request_args(self, args):
        """ Store the contract's first argument under the ``url`` key of
            ``args`` and hand the same mapping back.
        """
        target_url = self.args[0]
        args['url'] = target_url
        return args


class CallbackKeywordArgumentsContract(Contract):
    """ Contract that provides the callback keyword arguments of the request.

        Everything after the contract name must form a JSON-encoded
        dictionary, e.g.:

        @cb_kwargs {"arg1": "some value"}
    """

    name = 'cb_kwargs'

    def adjust_request_args(self, args):
        """ Parse the contract arguments as JSON and store the result under
            the ``cb_kwargs`` key of ``args``, returning the same mapping.
        """
        # The arguments arrive as separate tokens; glue them back together
        # with single spaces so they can be decoded as one JSON document.
        raw_json = ' '.join(self.args)
        decoded = json.loads(raw_json)
        args['cb_kwargs'] = decoded
        return args


class ReturnsContract(Contract):
    """ Contract to check the output of a callback

        general form:
        @returns request(s)/item(s) [min=1 [max]]

        e.g.:
        @returns request
        @returns request 2
        @returns request 2 10
        @returns request 0 10

        When ``min`` is omitted it defaults to 1; when ``max`` is omitted
        there is no upper bound.
    """

    name = 'returns'
    # Maps every accepted object-type keyword (singular and plural spelling)
    # to the predicate used to recognise such an object in the output.
    object_type_verifiers = {
        'request': lambda x: isinstance(x, Request),
        'requests': lambda x: isinstance(x, Request),
        'item': is_item,
        'items': is_item,
    }

    def __init__(self, *args, **kwargs):
        """ Parse and validate the contract arguments.

            Raises ``ValueError`` when the number of arguments is not 1, 2
            or 3, or when a bound cannot be converted with ``int()``.
            Raises ``KeyError`` when the object type is not one of the keys
            of ``object_type_verifiers``.
        """
        super().__init__(*args, **kwargs)

        arg_count = len(self.args)
        if arg_count not in [1, 2, 3]:
            raise ValueError(
                f"Incorrect argument quantity: expected 1, 2 or 3, got {arg_count}"
            )
        self.obj_name = self.args[0] or None
        try:
            self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
        except KeyError:
            # Still a KeyError (the type raised before), but now it tells
            # the user what was wrong and which values are accepted.
            accepted = ", ".join(sorted(map(str, self.object_type_verifiers)))
            raise KeyError(
                f"Unknown object type {self.obj_name!r}, expected one of: {accepted}"
            ) from None

        # Explicit length checks keep a failed int() conversion (ValueError)
        # clearly separate from "argument omitted, use the default".
        self.min_bound = int(self.args[1]) if arg_count > 1 else 1
        self.max_bound = int(self.args[2]) if arg_count > 2 else float('inf')

    def post_process(self, output):
        """ Count the objects in ``output`` accepted by the configured
            verifier and raise ``ContractFail`` when that count is outside
            the inclusive range ``min_bound..max_bound``.
        """
        occurrences = sum(1 for x in output if self.obj_type_verifier(x))

        if self.min_bound <= occurrences <= self.max_bound:
            return

        # Report a single number when both bounds coincide, a range otherwise.
        if self.min_bound == self.max_bound:
            expected = self.min_bound
        else:
            expected = f'{self.min_bound}..{self.max_bound}'

        raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")


class ScrapesContract(Contract):
    """ Contract to check that scraped items contain the listed fields

        @scrapes page_name page_body
    """

    name = 'scrapes'

    def post_process(self, output):
        """ Inspect every item in ``output`` and raise ``ContractFail`` for
            the first one lacking any of the field names given as contract
            arguments. Objects that are not items are ignored.
        """
        for candidate in output:
            if not is_item(candidate):
                continue
            adapter = ItemAdapter(candidate)
            absent = [field for field in self.args if field not in adapter]
            if absent:
                missing_fields = ", ".join(absent)
                raise ContractFail(f"Missing fields: {missing_fields}")