File: pages.py

package info (click to toggle)
python-web-poet 0.23.2-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 908 kB
  • sloc: python: 6,112; makefile: 19
file content (177 lines) | stat: -rw-r--r-- 5,242 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import abc
import inspect
from contextlib import suppress
from functools import wraps
from types import GenericAlias
from typing import Any, Generic, TypeVar, overload

import attr
import parsel

from web_poet.fields import FieldsMixin, item_from_fields
from web_poet.mixins import ResponseShortcutsMixin, SelectorShortcutsMixin
from web_poet.page_inputs import BrowserResponse, HttpResponse
from web_poet.utils import (
    CallableT,
    cached_method,
    get_generic_param,
)


class Injectable(abc.ABC, FieldsMixin):
    """Base Page Object class, which all Page Objects should inherit from
    (probably through Injectable subclasses).

    The class defines no members of its own; it is an abstract base class
    that serves as a marker, combined with :class:`FieldsMixin`.

    Frameworks which are using ``web-poet`` Page Objects should use the
    :func:`is_injectable` function to detect whether an object is an
    Injectable and, if it is, allow building it automatically
    through dependency injection, using the
    https://github.com/scrapinghub/andi library.

    Instead of inheriting you can also use ``Injectable.register(MyWebPage)``.
    ``Injectable.register`` can also be used as a decorator.
    """


def is_injectable(cls: Any) -> bool:
    """Tell whether ``cls`` is a class derived from :class:`~.Injectable`.

    Anything that is not a class yields ``False``. Parameterized generics
    (:class:`types.GenericAlias` instances such as ``list[int]``) are
    rejected explicitly before the subclass check is attempted.
    """
    if not isinstance(cls, type):
        return False
    if isinstance(cls, GenericAlias):
        # Some Python versions report aliases like ``list[int]`` as ``type``
        # instances, yet ``issubclass`` does not accept them.
        return False
    return issubclass(cls, Injectable)


# Type variable for the item class produced by ``to_item``; it parameterizes
# ``Returns`` and the page object / extractor classes built on top of it.
ItemT = TypeVar("ItemT")


class Returns(Generic[ItemT]):
    """Generic mixin which declares the item class of a page object.

    Inherit from ``Returns[SomeItem]`` to change the item class used by
    :class:`~.ItemPage`.
    """

    @property
    def item_cls(self) -> type:
        """Item class declared for this instance's class.

        Falls back to :class:`dict` when :func:`get_item_cls` finds no
        declared item class.
        """
        owner = self.__class__
        return get_item_cls(owner, default=dict)


@overload
def get_item_cls(cls: type, default: type) -> type: ...


# ``default`` is optional in the implementation, so the ``None`` overload
# must allow omitting it too; otherwise ``get_item_cls(cls)`` matches no
# overload for a static type checker.
@overload
def get_item_cls(cls: type, default: None = ...) -> type | None: ...


def get_item_cls(cls: type, default: type | None = None) -> type | None:
    """Return the item class declared for ``cls`` through :class:`Returns`.

    :param cls: class to inspect; it is passed to ``get_generic_param``
        together with :class:`Returns`.
    :param default: value returned when ``get_generic_param`` yields a
        falsy result (e.g. ``None``); defaults to ``None``.
    :return: the generic parameter found for ``cls``, or ``default``.
    """
    param = get_generic_param(cls, Returns)
    return param or default


# Unique sentinel meaning "no value supplied". It is compared by identity, so
# it stays distinguishable from every real ``skip_nonitem_fields`` value.
_NOT_SET = object()


def validates_input(to_item: CallableT) -> CallableT:
    """Decorator which adds input validation to a custom ``to_item`` method
    of a :class:`~web_poet.pages.ItemPage` subclass.

    The wrapper first calls ``self._validate_input()``. If that returns
    anything other than ``None``, the value is returned as is and the
    wrapped method is not called; otherwise the call is forwarded to the
    wrapped method. Both regular and coroutine functions are supported, and
    the wrapper is of the same kind as the wrapped function.
    """

    if not inspect.iscoroutinefunction(to_item):

        @wraps(to_item)
        def _to_item(self, *args, **kwargs):
            outcome = self._validate_input()
            if outcome is None:
                return to_item(self, *args, **kwargs)
            return outcome

        return _to_item  # type: ignore[return-value]

    @wraps(to_item)
    async def _to_item(self, *args, **kwargs):
        outcome = self._validate_input()
        if outcome is None:
            return await to_item(self, *args, **kwargs)
        return outcome

    return _to_item  # type: ignore[return-value]


class Extractor(Returns[ItemT], FieldsMixin):
    """Base class for field support.

    Subclasses may pass ``skip_nonitem_fields=...`` as a class keyword
    argument; the value is stored on the subclass and forwarded to
    ``item_from_fields`` by :meth:`to_item`.
    """

    _skip_nonitem_fields = _NOT_SET

    def _get_skip_nonitem_fields(self) -> bool:
        """Return the effective flag: ``False`` when it was never set,
        otherwise the stored value converted to ``bool``."""
        configured = self._skip_nonitem_fields
        if configured is _NOT_SET:
            return False
        return bool(configured)

    def __init_subclass__(
        cls, skip_nonitem_fields: Any = _NOT_SET, **kwargs: Any
    ) -> None:
        """Record ``skip_nonitem_fields`` on the subclass when it is given."""
        super().__init_subclass__(**kwargs)
        # Leaving the attribute untouched when the argument is absent is a
        # workaround for an attrs issue.
        # See: https://github.com/scrapinghub/web-poet/issues/141
        if skip_nonitem_fields is not _NOT_SET:
            cls._skip_nonitem_fields = skip_nonitem_fields

    async def to_item(self) -> ItemT:
        """Extract an item by awaiting ``item_from_fields`` for this object."""
        skip_flag = self._get_skip_nonitem_fields()
        item = await item_from_fields(
            self,
            item_cls=self.item_cls,
            skip_nonitem_fields=skip_flag,
        )
        return item


class ItemPage(Extractor[ItemT], Injectable):
    """Base class for page objects.

    Adds optional input validation on top of :class:`Extractor`: when the
    instance has a ``validate_input`` attribute, :meth:`to_item` calls it
    first (through :meth:`_validate_input`) and returns its result instead
    of extracting an item whenever that result is not ``None``.
    """

    @cached_method
    def _validate_input(self) -> Any:
        """Run ``self.validate_input`` if defined and return its result.

        Returns ``None`` when the instance has no ``validate_input``
        attribute, or when this method is re-entered while a
        ``validate_input`` call is still in progress. Exceptions raised by
        ``validate_input`` propagate to the caller.
        """
        if not hasattr(self, "validate_input"):
            return None
        # The flag does not exist before the first call, hence the
        # AttributeError suppression.
        with suppress(AttributeError):
            if self.__validating_input:
                # We are in a recursive call, i.e. _validate_input is being
                # called from _validate_input itself (likely through a @field
                # method).
                return None

        self.__validating_input: bool = True
        try:
            return self.validate_input()
        finally:
            # Reset the re-entrancy flag even if validate_input raises.
            # Otherwise a later call on this instance would be mistaken for
            # a recursive one and would skip validation, returning None.
            self.__validating_input = False

    @validates_input
    async def to_item(self) -> ItemT:
        """Extract an item from a web page"""
        return await super().to_item()


@attr.s(auto_attribs=True)
class WebPage(ItemPage[ItemT], ResponseShortcutsMixin):
    """Base Page Object which requires :class:`~.HttpResponse`
    and provides XPath / CSS shortcuts.

    This is an ``attrs`` class: the annotated ``response`` attribute is
    turned into a field (``auto_attribs=True``), so it is a required
    argument when building an instance.
    """

    # The response this page object works on. The shortcuts presumably come
    # from ResponseShortcutsMixin and operate on this attribute -- confirm
    # against web_poet.mixins.
    response: HttpResponse


@attr.s(auto_attribs=True)
class BrowserPage(ItemPage[ItemT], ResponseShortcutsMixin):
    """Base Page Object which requires :class:`~.BrowserResponse`
    and provides XPath / CSS shortcuts.

    This is an ``attrs`` class: the annotated ``response`` attribute is
    turned into a field (``auto_attribs=True``), so it is a required
    argument when building an instance.
    """

    # The browser response this page object works on. The shortcuts
    # presumably come from ResponseShortcutsMixin and operate on this
    # attribute -- confirm against web_poet.mixins.
    response: BrowserResponse


@attr.s(auto_attribs=True)
class SelectorExtractor(Extractor[ItemT], SelectorShortcutsMixin):
    """Extractor that takes a :class:`parsel.Selector` and provides shortcuts
    for its methods.

    This is an ``attrs`` class: the annotated ``selector`` attribute is
    turned into a field (``auto_attribs=True``), so it is a required
    argument when building an instance. Unlike :class:`ItemPage`, this class
    does not inherit from :class:`Injectable`.
    """

    # The selector that extraction is performed on. The shortcuts presumably
    # come from SelectorShortcutsMixin -- confirm against web_poet.mixins.
    selector: parsel.Selector