"""
Scrapy extension for collecting scraping stats
"""
from __future__ import annotations

import logging
import pprint
from typing import TYPE_CHECKING, Any

from scrapy.utils.decorators import _warn_spider_arg

if TYPE_CHECKING:
    from scrapy import Spider
    from scrapy.crawler import Crawler

logger = logging.getLogger(__name__)

StatsT = dict[str, Any]
class StatsCollector:
    """Collect scraping stats in an in-memory mapping of keys to values."""

    def __init__(self, crawler: Crawler):
        self._dump: bool = crawler.settings.getbool("STATS_DUMP")
        self._stats: StatsT = {}
        self._crawler: Crawler = crawler

    def __getattribute__(self, name):
        # Wrap the public stat methods with _warn_spider_arg so that callers
        # still passing the legacy ``spider`` argument get a warning.  The
        # wrapped bound method is cached on the instance to avoid re-wrapping
        # it on every attribute access.
        cached_name = f"_cached_{name}"
        try:
            return super().__getattribute__(cached_name)
        except AttributeError:
            pass
        original_attr = super().__getattribute__(name)
        if name in {
            "get_value",
            "get_stats",
            "set_value",
            "set_stats",
            "inc_value",
            "max_value",
            "min_value",
            "clear_stats",
            "open_spider",
            "close_spider",
        } and callable(original_attr):
            wrapped = _warn_spider_arg(original_attr)
            setattr(self, cached_name, wrapped)
            return wrapped
        return original_attr
    def get_value(
        self, key: str, default: Any = None, spider: Spider | None = None
    ) -> Any:
        return self._stats.get(key, default)

    def get_stats(self, spider: Spider | None = None) -> StatsT:
        return self._stats

    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        self._stats[key] = value

    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
        self._stats = stats

    def inc_value(
        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
    ) -> None:
        d = self._stats
        d[key] = d.setdefault(key, start) + count

    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        self._stats[key] = max(self._stats.setdefault(key, value), value)

    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        self._stats[key] = min(self._stats.setdefault(key, value), value)

    def clear_stats(self, spider: Spider | None = None) -> None:
        self._stats.clear()

    def open_spider(self, spider: Spider | None = None) -> None:
        pass

    def close_spider(
        self, spider: Spider | None = None, reason: str | None = None
    ) -> None:
        # Dump the collected stats to the log when STATS_DUMP is enabled,
        # then hand them to _persist_stats() for subclass-specific storage.
        if self._dump:
            logger.info(
                "Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
                extra={"spider": self._crawler.spider},
            )
        self._persist_stats(self._stats)

    def _persist_stats(self, stats: StatsT) -> None:
        pass
class MemoryStatsCollector(StatsCollector):
    """Keep the stats of each finished spider in memory, keyed by spider name."""

    def __init__(self, crawler: Crawler):
        super().__init__(crawler)
        self.spider_stats: dict[str, StatsT] = {}

    def _persist_stats(self, stats: StatsT) -> None:
        if self._crawler.spider:
            self.spider_stats[self._crawler.spider.name] = stats
class DummyStatsCollector(StatsCollector):
    """A no-op collector: lookups return the default and writes are discarded."""

    def get_value(
        self, key: str, default: Any = None, spider: Spider | None = None
    ) -> Any:
        return default

    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        pass

    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
        pass

    def inc_value(
        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
    ) -> None:
        pass

    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        pass

    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        pass
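# A minimal sketch of a custom collector (illustrative; the class name and
# file name are hypothetical).  A project would enable it through the
# STATS_CLASS setting, e.g. STATS_CLASS = "myproject.stats.JsonFileStatsCollector":
#
#     import json
#
#     class JsonFileStatsCollector(StatsCollector):
#         def _persist_stats(self, stats: StatsT) -> None:
#             with open("spider-stats.json", "w", encoding="utf-8") as f:
#                 json.dump(stats, f, default=str)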