1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
|
"""
Scrapy extension for collecting scraping stats
"""
import os
import getpass
import socket
import datetime
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.stats import stats
from scrapy.stats.signals import stats_spider_opened, stats_spider_closing
from scrapy.conf import settings
class CoreStats(object):
"""Scrapy core stats collector"""
def __init__(self):
stats.set_value('envinfo/user', getpass.getuser())
stats.set_value('envinfo/host', socket.gethostname())
stats.set_value('envinfo/logfile', settings['LOG_FILE'])
stats.set_value('envinfo/pid', os.getpid())
dispatcher.connect(self.stats_spider_opened, signal=stats_spider_opened)
dispatcher.connect(self.stats_spider_closing, signal=stats_spider_closing)
dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
dispatcher.connect(self.item_passed, signal=signals.item_passed)
dispatcher.connect(self.item_dropped, signal=signals.item_dropped)
def stats_spider_opened(self, spider):
stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
stats.inc_value('spider_count/opened')
def stats_spider_closing(self, spider, reason):
stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
stats.inc_value('spider_count/%s' % reason, spider=spider)
def item_scraped(self, item, spider):
stats.inc_value('item_scraped_count', spider=spider)
stats.inc_value('item_scraped_count')
def item_passed(self, item, spider):
stats.inc_value('item_passed_count', spider=spider)
stats.inc_value('item_passed_count')
def item_dropped(self, item, spider, exception):
reason = exception.__class__.__name__
stats.inc_value('item_dropped_count', spider=spider)
stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
stats.inc_value('item_dropped_count')
|