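"""parse command: fetch the given URLs with their matching spiders, run the
selected callback(s) on each response, and print the resulting items and
links.
"""
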
from scrapy.command import ScrapyCommand
from scrapy.utils.fetch import fetch
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.spider import spiders
from scrapy.utils import display
from scrapy import log

class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse the given URL (using the spider) and print the results"
    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
            help="don't show extracted links")
        parser.add_option("--noitems", dest="noitems", action="store_true",
            help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
            help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
            help="try to match and parse the url with the defined rules (if any)")
        parser.add_option("-c", "--callbacks", dest="callbacks", action="store",
            help="use the provided callback(s) for parsing the url (separated with commas)")
    def process_options(self, args, opts):
        super(Command, self).process_options(args, opts)
        self.callbacks = opts.callbacks.split(',') if opts.callbacks else []
    def pipeline_process(self, item, spider, opts):
        # Extension hook: subclasses may post-process scraped items here
        return item
    def run_callback(self, spider, response, callback, args, opts):
        # The spider has already been resolved by the caller (run)
        if callback:
            # Accept a callable directly, or look the callback up by name on the spider
            callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
            if not callback_fcn:
                log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name))
                return (), ()
            # Materialize the result so generator callbacks can be scanned twice
            result = list(callback_fcn(response))
            links = [i for i in result if isinstance(i, Request)]
            items = [self.pipeline_process(i, spider, opts) for i in result
                     if isinstance(i, BaseItem)]
            return items, links
        return (), ()
    def print_results(self, items, links, cb_name, opts):
        display.nocolour = opts.nocolour
        if not opts.noitems:
            for item in items:
                # Drop private attributes so the printed items stay readable
                for key in item.__dict__.keys():
                    if key.startswith('_'):
                        item.__dict__.pop(key, None)
            print "# Scraped Items - callback: %s" % cb_name, "-"*60
            display.pprint(list(items))
        if not opts.nolinks:
            print "# Links - callback: %s" % cb_name, "-"*68
            display.pprint(list(links))
    def run(self, args, opts):
        if not args:
            print "A URL is required"
            return
        for response in fetch(args):
            spider = spiders.fromurl(response.url)
            if not spider:
                log.msg('Cannot find spider for "%s"' % response.url)
                continue

            if self.callbacks:
                # Parse with each callback requested via -c/--callbacks
                for callback in self.callbacks:
                    items, links = self.run_callback(spider, response, callback, args, opts)
                    self.print_results(items, links, callback, opts)
            elif opts.rules:
                # Parse with the first crawling rule that matches the response URL
                rules = getattr(spider, 'rules', None)
                if rules:
                    items, links = [], []
                    for rule in rules:
                        if rule.callback and rule.link_extractor.matches(response.url):
                            items, links = self.run_callback(spider, response, rule.callback, args, opts)
                            self.print_results(items, links, rule.callback, opts)
                            break
                else:
                    log.msg('No rules found for spider "%s", please specify a callback for parsing'
                            % spider.domain_name)
                    continue
            else:
                # Fall back to the spider's standard parse() callback
                items, links = self.run_callback(spider, response, 'parse', args, opts)
                self.print_results(items, links, 'parse', opts)
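
# Example usage, as a sketch (assumes a Scrapy 0.x project where commands are
# run through scrapy-ctl.py; the URL and callback name below are illustrative):
#
#   scrapy-ctl.py parse http://www.example.com/some/page.html
#   scrapy-ctl.py parse --rules http://www.example.com/some/page.html
#   scrapy-ctl.py parse -c parse_item http://www.example.com/some/page.html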