1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
|
"""
SpiderProfiler is an extension that hooks itself into every Request callback
returned from spiders to measure the processing time and memory allocation
caused by spider code.

The results are collected using the StatsCollector.

This extension has a significant impact on crawling performance, so enable it
only for debugging.
"""
from time import time
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.core.exceptions import NotConfigured
from scrapy.utils.memory import get_vmvalue_from_procfs
from scrapy.stats import stats
from scrapy.conf import settings
class SpiderProfiler(object):
    """Extension that times spider callbacks and tracks their memory growth.

    On every ``request_received`` signal the first callback entry of the
    request's deferred is replaced by a wrapper which records, per spider and
    through the ``stats`` collector:

    * ``profiling/total_callback_time`` -- accumulated wall-clock time spent
      inside callbacks.
    * ``profiling/slowest_callback_time``, ``profiling/slowest_callback_name``
      and ``profiling/slowest_callback_url`` -- the slowest callback so far.
    * ``profiling/total_mem_allocated_in_callbacks`` -- accumulated ``VmSize``
      difference across callbacks (only when memory tracking is available).
    """

    def __init__(self):
        """Check the enabling setting and subscribe to ``request_received``.

        Raises:
            NotConfigured: if the ``SPIDERPROFILER_ENABLED`` setting is false.
        """
        if not settings.getbool('SPIDERPROFILER_ENABLED'):
            raise NotConfigured

        # Probe once: a RuntimeError from the helper is taken to mean that
        # memory usage cannot be read here, so memory tracking is disabled.
        try:
            get_vmvalue_from_procfs('VmSize')
        except RuntimeError:
            self._mem_tracking = False
        else:
            self._mem_tracking = True

        dispatcher.connect(self._request_received, signals.request_received)

    def _request_received(self, request, spider):
        """Swap the first callback of ``request.deferred`` for a profiled one.

        Only the callable at ``callbacks[0][0][0]`` is replaced; the two
        values stored next to it and the second element of the entry are
        carried over untouched.
        """
        # NOTE(review): presumably Twisted's
        # ((callback, args, kwargs), errback_entry) layout -- confirm.
        old_cbs = request.deferred.callbacks[0]
        profiled = self._profiled_callback(old_cbs[0][0], spider)
        new_cbs = ((profiled, old_cbs[0][1], old_cbs[0][2]), old_cbs[1])
        request.deferred.callbacks[0] = new_cbs

    def _profiled_callback(self, function, spider):
        """Return a wrapper around ``function`` that records profiling stats.

        The wrapper forwards its arguments and return value unchanged. Stats
        are written only when ``function`` returns normally; if it raises,
        the exception propagates before anything is recorded.
        """
        def new_callback(*args, **kwargs):
            tbefore = time()
            mbefore = self._memusage()
            r = function(*args, **kwargs)
            mafter = self._memusage()
            ct = time() - tbefore

            tcc = stats.get_value('profiling/total_callback_time', 0,
                                  spider=spider)
            sct = stats.get_value('profiling/slowest_callback_time', 0,
                                  spider=spider)
            stats.set_value('profiling/total_callback_time', tcc + ct,
                            spider=spider)

            if ct > sct:
                stats.set_value('profiling/slowest_callback_time', ct,
                                spider=spider)
                stats.set_value('profiling/slowest_callback_name',
                                function.__name__, spider=spider)
                # The first positional argument must expose ``.url``
                # (presumably the response handed to the callback).
                stats.set_value('profiling/slowest_callback_url',
                                args[0].url, spider=spider)

            # Check the tracking flag, not the ``_memusage`` method: a bound
            # method is always truthy, so testing it never skipped this
            # branch and a meaningless 0.0 stat was recorded when memory
            # tracking was unavailable.
            if self._mem_tracking:
                stats.inc_value('profiling/total_mem_allocated_in_callbacks',
                                count=mafter - mbefore, spider=spider)
            return r
        return new_callback

    def _memusage(self):
        """Return the current ``VmSize`` value, or 0.0 if tracking is off."""
        return get_vmvalue_from_procfs('VmSize') if self._mem_tracking else 0.0
|