File: parse.py

Package: python-scrapy 0.8-3
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
File content: 106 lines
from scrapy.command import ScrapyCommand
from scrapy.utils.fetch import fetch
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.spider import spiders
from scrapy.utils import display
from scrapy import log

class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse the given URL (using the spider) and print the results"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--nolinks", dest="nolinks", action="store_true", \
            help="don't show extracted links")
        parser.add_option("--noitems", dest="noitems", action="store_true", \
            help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true", \
            help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true", \
            help="try to match and parse the url with the defined rules (if any)")
        parser.add_option("-c", "--callbacks", dest="callbacks", action="store", \
            help="use the provided callback(s) for parsing the url (separated with commas)")

    def process_options(self, args, opts):
        super(Command, self).process_options(args, opts)
        self.callbacks = opts.callbacks.split(',') if opts.callbacks else []

    def pipeline_process(self, item, spider, opts):
        return item

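    # A spider callback normally returns an iterable mixing Request objects
    # (links to follow) and item objects (scraped data), e.g. (hypothetical
    # spider code, not part of this file; SomeItem is a placeholder):
    #
    #     def parse(self, response):
    #         return [Request('http://example.com/next'), SomeItem()]
    #
    # run_callback() splits such a result into the two lists that
    # print_results() displays.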
    def run_callback(self, spider, response, callback, args, opts):
        spider = spiders.fromurl(response.url)
        if not spider:
            log.msg('Cannot find spider for url: %s' % response.url, level=log.ERROR)
            return (), ()

        if callback:
            callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
            if not callback_fcn:
                log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name))
                return (), ()

            result = callback_fcn(response)
            links = [i for i in result if isinstance(i, Request)]
            items = [self.pipeline_process(i, spider, opts) for i in result if \
                     isinstance(i, BaseItem)]
            return items, links

        return (), ()

    def print_results(self, items, links, cb_name, opts):
        display.nocolour = opts.nocolour
        if not opts.noitems:
            for item in items:
                for key in item.__dict__.keys():
                    if key.startswith('_'):
                        item.__dict__.pop(key, None)
            print "# Scraped Items - callback: %s" % cb_name, "-"*60
            display.pprint(list(items))

        if not opts.nolinks:
            print "# Links - callback: %s" % cb_name, "-"*68
            display.pprint(list(links))

    def run(self, args, opts):
        if not args:
            print "An URL is required"
            return

        for response in fetch(args):
            spider = spiders.fromurl(response.url)
            if not spider:
                log.msg('Cannot find spider for "%s"' % response.url)
                continue

            if self.callbacks:
                for callback in self.callbacks:
                    items, links = self.run_callback(spider, response, callback, args, opts)
                    self.print_results(items, links, callback, opts)

            elif opts.rules:
                rules = getattr(spider, 'rules', None)
                if rules:
                    items, links = [], []
                    for rule in rules:
                        if rule.callback and rule.link_extractor.matches(response.url):
                            items, links = self.run_callback(spider, response, rule.callback, args, opts)
                            self.print_results(items, links, rule.callback, opts)
                            break
                else:
                    log.msg('No rules found for spider "%s", please specify a callback for parsing' \
                        % spider.domain_name)
                    continue

            else:
                items, links = self.run_callback(spider, response, 'parse', args, opts)
                self.print_results(items, links, 'parse', opts)
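
# Example usage, as a sketch only: assuming the project control script shipped
# with this Scrapy release is named scrapy-ctl.py (name assumed, not taken from
# this file) and is run from inside a project directory, with placeholder
# callback names and URL:
#
#   ./scrapy-ctl.py parse --rules http://example.com/some/page
#   ./scrapy-ctl.py parse -c parse_item,parse_other http://example.com/some/page
#
# The first form matches the URL against the spider's rules (if any are
# defined) and runs the matching rule's callback; the second runs the named
# callbacks directly on the fetched response, printing the scraped items and
# extracted links for each one.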