1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
|
from twisted.internet import task
from scrapy.exceptions import NotConfigured
from scrapy import log, signals
class LogStats(object):
"""Log basic scraping stats periodically"""
def __init__(self, stats, interval=60.0):
self.stats = stats
self.interval = interval
self.multiplier = 60.0 / self.interval
@classmethod
def from_crawler(cls, crawler):
interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
if not interval:
raise NotConfigured
o = cls(crawler.stats, interval)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_opened(self, spider):
self.pagesprev = 0
self.itemsprev = 0
self.task = task.LoopingCall(self.log, spider)
self.task.start(self.interval)
def log(self, spider):
items = self.stats.get_value('item_scraped_count', 0)
pages = self.stats.get_value('response_received_count', 0)
irate = (items - self.itemsprev) * self.multiplier
prate = (pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = pages, items
msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
% (pages, prate, items, irate)
log.msg(msg, spider=spider)
def spider_closed(self, spider, reason):
if self.task.running:
self.task.stop()
|