File: statscollectors.py

package info (click to toggle)
python-scrapy 2.14.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,316 kB
  • sloc: python: 55,421; xml: 199; makefile: 25; sh: 7
file content (134 lines) | stat: -rw-r--r-- 3,889 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Scrapy extension for collecting scraping stats
"""

from __future__ import annotations

import logging
import pprint
from typing import TYPE_CHECKING, Any

from scrapy.utils.decorators import _warn_spider_arg

if TYPE_CHECKING:
    from scrapy import Spider
    from scrapy.crawler import Crawler


logger = logging.getLogger(__name__)


# Type alias for the flat stats mapping: stat name -> arbitrary value.
StatsT = dict[str, Any]


class StatsCollector:
    """Default in-memory stats collector.

    Stats live in a single flat dict (``self._stats``).  The ``spider``
    parameter accepted by the public methods is deprecated;
    ``__getattribute__`` lazily wraps those methods with
    ``_warn_spider_arg`` so passing a spider triggers the warning.
    """

    def __init__(self, crawler: Crawler):
        # STATS_DUMP controls whether stats are logged in close_spider().
        self._dump: bool = crawler.settings.getbool("STATS_DUMP")
        self._stats: StatsT = {}
        self._crawler: Crawler = crawler

    def __getattribute__(self, name: str) -> Any:
        """Wrap selected public methods with the spider-arg deprecation shim.

        The wrapped callable is cached on the instance as ``_cached_<name>``
        so the wrapping cost is paid only on first access; later lookups hit
        the cache and return immediately.
        """
        cached_name = f"_cached_{name}"
        try:
            # Fast path: a previously wrapped method cached on the instance.
            return super().__getattribute__(cached_name)
        except AttributeError:
            pass

        original_attr = super().__getattribute__(name)

        # Only the public stats API takes the deprecated ``spider``
        # argument; every other attribute is returned untouched.
        if name in {
            "get_value",
            "get_stats",
            "set_value",
            "set_stats",
            "inc_value",
            "max_value",
            "min_value",
            "clear_stats",
            "open_spider",
            "close_spider",
        } and callable(original_attr):
            wrapped = _warn_spider_arg(original_attr)
            setattr(self, cached_name, wrapped)
            return wrapped

        return original_attr

    def get_value(
        self, key: str, default: Any = None, spider: Spider | None = None
    ) -> Any:
        """Return the stat stored under *key*, or *default* when absent."""
        return self._stats.get(key, default)

    def get_stats(self, spider: Spider | None = None) -> StatsT:
        """Return the underlying stats dict (a live reference, not a copy)."""
        return self._stats

    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Store *value* under *key*, overwriting any previous value."""
        self._stats[key] = value

    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
        """Replace the whole stats dict with *stats* (kept by reference)."""
        self._stats = stats

    def inc_value(
        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
    ) -> None:
        """Add *count* to the stat *key*, initializing it to *start* if absent."""
        d = self._stats
        d[key] = d.setdefault(key, start) + count

    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Keep the maximum of *value* and the current value of *key*."""
        self._stats[key] = max(self._stats.setdefault(key, value), value)

    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Keep the minimum of *value* and the current value of *key*."""
        self._stats[key] = min(self._stats.setdefault(key, value), value)

    def clear_stats(self, spider: Spider | None = None) -> None:
        """Remove all stats, keeping the same underlying dict object."""
        self._stats.clear()

    def open_spider(self, spider: Spider | None = None) -> None:
        """Hook called when the spider opens; no-op in the base class."""
        pass

    def close_spider(
        self, spider: Spider | None = None, reason: str | None = None
    ) -> None:
        """Hook called when the spider closes: optionally dump, then persist."""
        if self._dump:
            # ``spider`` in ``extra`` routes the record through Scrapy's
            # per-spider log formatting.
            logger.info(
                "Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
                extra={"spider": self._crawler.spider},
            )
        self._persist_stats(self._stats)

    def _persist_stats(self, stats: StatsT) -> None:
        """Persistence hook for subclasses; the base class discards stats."""
        pass


class MemoryStatsCollector(StatsCollector):
    """Stats collector that additionally keeps each finished crawl's stats
    in memory after the spider closes, indexed by spider name.
    """

    def __init__(self, crawler: Crawler):
        super().__init__(crawler)
        # spider name -> final stats dict of that spider's crawl
        self.spider_stats: dict[str, StatsT] = {}

    def _persist_stats(self, stats: StatsT) -> None:
        """Record *stats* under the crawling spider's name, if there is one."""
        spider = self._crawler.spider
        if spider:
            self.spider_stats[spider.name] = stats


class DummyStatsCollector(StatsCollector):
    """A stats collector that records nothing.

    Useful for disabling stats collection: every write is a no-op and
    every read yields the caller-supplied default.
    """

    def get_value(
        self, key: str, default: Any = None, spider: Spider | None = None
    ) -> Any:
        """Always return *default*; nothing is ever stored."""
        return default

    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Discard the value."""

    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
        """Discard the stats dict."""

    def inc_value(
        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
    ) -> None:
        """Discard the increment."""

    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Discard the value."""

    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
        """Discard the value."""