File: parse.py

import json
import logging

from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url

from scrapy.commands import BaseRunSpiderCommand
from scrapy.http import Request
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)


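# Implements the "scrapy parse" command: fetch a URL with the spider that
# handles it and print the scraped items and extracted requests, level by level.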
class Command(BaseRunSpiderCommand):
    requires_project = True

    spider = None
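    # Scraped items and extracted requests collected so far, keyed by depth level.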
    items = {}
    requests = {}

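    # First response received; CrawlSpider rules are only resolved against it.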
    first_response = None

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        BaseRunSpiderCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
                          help="use this spider without looking for one")
        parser.add_option("--pipelines", action="store_true",
                          help="process items through pipelines")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
                          help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
                          help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
                          help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
                          help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
                          help="use this callback for parsing, instead looking for a callback")
        parser.add_option("-m", "--meta", dest="meta",
                          help="inject extra meta into the Request, it must be a valid raw json string")
        parser.add_option("--cbkwargs", dest="cbkwargs",
                          help="inject extra callback kwargs into the Request, it must be a valid raw json string")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                          help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                          help="print each depth level one by one")

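    # Deepest depth level reached so far across collected items and requests.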
    @property
    def max_level(self):
        max_items, max_requests = 0, 0
        if self.items:
            max_items = max(self.items)
        if self.requests:
            max_requests = max(self.requests)
        return max(max_items, max_requests)

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])

        print("# Scraped Items ", "-" * 60)
        display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)

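    # Note: unlike print_items(), when no level is given only the requests
    # from the deepest level are shown (the remaining "links to follow").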
    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            if self.requests:
                requests = self.requests[max(self.requests)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])

        print("# Requests ", "-" * 65)
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in range(1, self.max_level + 1):
                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)

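    # Invoke the spider callback and split its output into items and
    # requests; any other yielded values are ignored.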
    def run_callback(self, response, callback, cb_kwargs=None):
        cb_kwargs = cb_kwargs or {}
        items, requests = [], []

        for x in iterate_spider_output(callback(response, **cb_kwargs)):
            if is_item(x):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

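        # Replace the spider's start_requests so the crawl begins at the
        # given URL, with the instrumented callback from prepare_request().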
        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})

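    # Wrap the request's callback so that, as the crawl runs, scraped items
    # and follow-up requests are recorded per depth level.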
    def prepare_request(self, spider, request, opts):
        def callback(response, **cb_kwargs):
            # memorize the first response
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.callback:
                    cb = opts.callback
                elif opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        return
                else:
                    cb = 'parse'

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    return

            # parse items and requests
            depth = response.meta['_depth']

            items, requests = self.run_callback(response, cb, cb_kwargs)
            if opts.pipelines:
                itemproc = self.pcrawler.engine.scraper.itemproc
                for item in items:
                    itemproc.process_item(item, spider)
            self.add_items(depth, items)
            self.add_requests(depth, requests)

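            # Items are handed back to the engine only when an output feed is
            # requested (opts.output is presumably set by the base command's
            # -o/--output option); requests below the depth limit are
            # re-yielded with this wrapped callback to collect deeper levels.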
            scraped_data = items if opts.output else []
            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
                scraped_data += requests

            return scraped_data

        # update request meta if any extra meta was passed through the --meta/-m opts.
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were passed through the --cbkwargs option.
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def process_options(self, args, opts):
        BaseRunSpiderCommand.process_options(self, args, opts)

        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_request_meta(self, opts):
        if opts.meta:
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)

    def run(self, args, opts):
        # parse arguments
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
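

# Example invocation, from inside a Scrapy project ("myspider" and
# "parse_item" are placeholder names):
#
#   scrapy parse https://example.com/ --spider=myspider -c parse_item -d 2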