File: test_spidermiddleware_urllength.py

Package: python-scrapy 2.14.0-1
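"""Tests for UrlLengthMiddleware, which drops requests whose URLs exceed URLLENGTH_LIMIT."""
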
from __future__ import annotations

from logging import INFO
from typing import TYPE_CHECKING

import pytest

from scrapy.http import Request, Response
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler

if TYPE_CHECKING:
    from scrapy.crawler import Crawler
    from scrapy.statscollectors import StatsCollector


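# Shared test objects: a 25-character URL length limit, one request below the
# limit and one above it, and a response to pass to process_spider_output().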
maxlength = 25
response = Response("http://scrapytest.org")
short_url_req = Request("http://scrapytest.org/")
long_url_req = Request("http://scrapytest.org/this_is_a_long_url")
reqs: list[Request] = [short_url_req, long_url_req]


@pytest.fixture
def crawler() -> Crawler:
    return get_crawler(Spider, {"URLLENGTH_LIMIT": maxlength})


@pytest.fixture
def stats(crawler: Crawler) -> StatsCollector:
    assert crawler.stats is not None
    return crawler.stats


@pytest.fixture
def mw(crawler: Crawler) -> UrlLengthMiddleware:
    return UrlLengthMiddleware.from_crawler(crawler)


def process_spider_output(mw: UrlLengthMiddleware) -> list[Request]:
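    """Run the canned requests through the middleware and return those it keeps."""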
    return list(mw.process_spider_output(response, reqs))


def test_middleware_works(mw: UrlLengthMiddleware) -> None:
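    """Only the request whose URL fits within the limit passes through."""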
    assert process_spider_output(mw) == [short_url_req]


def test_logging(
    stats: StatsCollector, mw: UrlLengthMiddleware, caplog: pytest.LogCaptureFixture
) -> None:
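    """The ignored request is counted in stats and reported at INFO level."""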
    with caplog.at_level(INFO):
        process_spider_output(mw)
    ric = stats.get_value("urllength/request_ignored_count")
    assert ric == 1
    assert f"Ignoring link (url length > {maxlength})" in caplog.text