File: spiderprofiler.py (python-scrapy 0.8-3)

"""
SpiderProfiler is an extension that hooks itself into every Request callback
returned from spiders to measure the processing time and memory allocation
caused by spiders code.

The results are collected using the StatsCollector.

This extension introduces a big impact on crawling performance, so enable only
for debugging.
"""

from time import time

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core import signals
from scrapy.core.exceptions import NotConfigured
from scrapy.utils.memory import get_vmvalue_from_procfs
from scrapy.stats import stats
from scrapy.conf import settings

class SpiderProfiler(object):
    
    def __init__(self):
        if not settings.getbool('SPIDERPROFILER_ENABLED'):
            raise NotConfigured
        try:
            get_vmvalue_from_procfs('VmSize')
        except RuntimeError:
            self._mem_tracking = False
        else:
            self._mem_tracking = True
        dispatcher.connect(self._request_received, signals.request_received)

    def _request_received(self, request, spider):
        # Twisted deferreds store callbacks as a list of
        # ((callback, args, kwargs), (errback, args, kwargs)) pairs. Replace
        # the first callback with a profiled wrapper, keeping its original
        # args/kwargs and the errback untouched.
        old_cbs = request.deferred.callbacks[0]
        new_cbs = ((self._profiled_callback(old_cbs[0][0], spider),
            old_cbs[0][1], old_cbs[0][2]), old_cbs[1])
        request.deferred.callbacks[0] = new_cbs

    def _profiled_callback(self, function, spider):
        def new_callback(*args, **kwargs):
            tbefore = time()
            mbefore = self._memusage()
            r = function(*args, **kwargs)
            mafter = self._memusage()
            ct = time() - tbefore
            tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
            sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
            stats.set_value('profiling/total_callback_time', tcc+ct, spider=spider)
            if ct > sct:
                stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
                stats.set_value('profiling/slowest_callback_name', function.__name__,
                    spider=spider)
                stats.set_value('profiling/slowest_callback_url', args[0].url,
                    spider=spider)
            # Only record memory deltas when /proc-based tracking is available.
            if self._mem_tracking:
                stats.inc_value('profiling/total_mem_allocated_in_callbacks',
                    count=mafter - mbefore, spider=spider)
            return r
        return new_callback

    def _memusage(self):
        return get_vmvalue_from_procfs('VmSize') if self._mem_tracking else 0.0
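
# Reading the results (sketch): the values accumulate in the shared stats
# collector, per spider, under the keys written above. After a crawl one might
# inspect them with something like:
#
#     total = stats.get_value('profiling/total_callback_time', 0, spider=spider)
#     name = stats.get_value('profiling/slowest_callback_name', None, spider=spider)
#
# The exact way to reach the stats object from outside this module depends on
# how the project wires up Scrapy's stats collection.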