1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
|
import abc
import inspect
from contextlib import suppress
from functools import wraps
from types import GenericAlias
from typing import Any, Generic, TypeVar, overload
import attr
import parsel
from web_poet.fields import FieldsMixin, item_from_fields
from web_poet.mixins import ResponseShortcutsMixin, SelectorShortcutsMixin
from web_poet.page_inputs import BrowserResponse, HttpResponse
from web_poet.utils import (
CallableT,
cached_method,
get_generic_param,
)
class Injectable(abc.ABC, FieldsMixin):
    """Base Page Object class, which all Page Objects should inherit from
    (probably through Injectable subclasses).

    Frameworks which are using ``web-poet`` Page Objects should use the
    :func:`is_injectable` function to detect whether a class is an
    Injectable, and if it is, allow building it automatically through
    dependency injection, using the https://github.com/scrapinghub/andi
    library.

    Instead of inheriting you can also use ``Injectable.register(MyWebPage)``
    (the ``register`` method comes from :class:`abc.ABC`).
    ``Injectable.register`` can also be used as a decorator.
    """
def is_injectable(cls: Any) -> bool:
    """Tell whether ``cls`` is a class deriving from :class:`~.Injectable`.

    Anything that is not a class, as well as parametrized generic aliases
    such as ``list[int]``, is reported as not injectable.
    """
    # Non-classes must be rejected first: issubclass() raises TypeError
    # when its first argument is not a class.
    if not isinstance(cls, type):
        return False
    # Generic aliases can pass the ``type`` check on some Python versions,
    # so they are excluded explicitly.
    if isinstance(cls, GenericAlias):
        return False
    return issubclass(cls, Injectable)
ItemT = TypeVar("ItemT")


class Returns(Generic[ItemT]):
    """Generic mixin declaring the item class of a page object.

    Inherit from ``Returns[SomeItem]`` to change the item class used by
    :class:`~.ItemPage`.
    """

    @property
    def item_cls(self) -> type:
        """Item class resolved for this instance's class.

        Falls back to ``dict`` when :func:`get_item_cls` finds nothing.
        """
        owner = self.__class__
        return get_item_cls(owner, default=dict)
@overload
def get_item_cls(cls: type, default: type) -> type: ...


# ``default`` is optional in the implementation, so the overload that covers
# the "no fallback" case must declare it optional too; otherwise a call such
# as ``get_item_cls(SomePage)`` matches no overload for a type checker.
@overload
def get_item_cls(cls: type, default: None = ...) -> type | None: ...


def get_item_cls(cls: type, default: type | None = None) -> type | None:
    """Return the item class that ``cls`` declares through :class:`Returns`.

    :param cls: class to inspect.
    :param default: value returned when the lookup yields nothing.
    :return: the result of ``get_generic_param(cls, Returns)`` when it is
        truthy, otherwise ``default``.
    """
    param = get_generic_param(cls, Returns)
    # ``or`` (not an ``is None`` test): any falsy lookup result falls back
    # to ``default``.
    return param or default
# Sentinel meaning "no value was supplied"; always compared by identity
# (``is``), which lets any other value, including falsy ones, count as set.
_NOT_SET = object()
def validates_input(to_item: CallableT) -> CallableT:
    """Decorator adding input validation to a custom ``to_item`` method
    of an :class:`~web_poet.pages.ItemPage` subclass.

    The wrapper first calls ``self._validate_input()``. If that returns
    anything other than ``None``, that value is returned and the wrapped
    method is not called. Otherwise the call is forwarded to the wrapped
    method with the same arguments.

    Coroutine functions get an ``async`` wrapper, regular functions get a
    synchronous one.
    """
    if not inspect.iscoroutinefunction(to_item):

        @wraps(to_item)
        def _checked_sync(self, *args, **kwargs):
            rejected = self._validate_input()
            if rejected is None:
                return to_item(self, *args, **kwargs)
            return rejected

        return _checked_sync  # type: ignore[return-value]

    @wraps(to_item)
    async def _checked_async(self, *args, **kwargs):
        rejected = self._validate_input()
        if rejected is None:
            return await to_item(self, *args, **kwargs)
        return rejected

    return _checked_async  # type: ignore[return-value]
class Extractor(Returns[ItemT], FieldsMixin):
    """Base class for field support.

    Subclasses may pass ``skip_nonitem_fields=...`` as a class keyword
    argument. The value is stored on the class and forwarded to
    ``item_from_fields`` by :meth:`to_item`.
    """

    # Class-level setting; stays at the sentinel until a subclass passes
    # ``skip_nonitem_fields`` explicitly.
    _skip_nonitem_fields = _NOT_SET

    def _get_skip_nonitem_fields(self) -> bool:
        """Return the effective setting as a bool; ``False`` when never set."""
        configured = self._skip_nonitem_fields
        if configured is _NOT_SET:
            return False
        return bool(configured)

    def __init_subclass__(
        cls, skip_nonitem_fields: Any = _NOT_SET, **kwargs: Any
    ) -> None:
        """Record ``skip_nonitem_fields`` on the subclass when it is given."""
        super().__init_subclass__(**kwargs)
        # Only assign when a value was actually passed, so an inherited
        # setting is not overwritten. This is a workaround for attrs issue.
        # See: https://github.com/scrapinghub/web-poet/issues/141
        if skip_nonitem_fields is not _NOT_SET:
            cls._skip_nonitem_fields = skip_nonitem_fields

    async def to_item(self) -> ItemT:
        """Extract an item by delegating to ``item_from_fields``."""
        target_cls = self.item_cls
        skip = self._get_skip_nonitem_fields()
        return await item_from_fields(
            self,
            item_cls=target_cls,
            skip_nonitem_fields=skip,
        )
class ItemPage(Extractor[ItemT], Injectable):
    """Base class for page objects."""

    @cached_method
    def _validate_input(self) -> Any:
        """Run ``self.validate_input`` if defined.

        :return: whatever ``validate_input()`` returns, or ``None`` when the
            page object defines no ``validate_input`` or when this call is
            re-entered while validation is already in progress.
        :raises: any exception raised by ``validate_input()``.
        """
        if not hasattr(self, "validate_input"):
            return None
        # The guard attribute does not exist until the first validation run;
        # a missing attribute simply means "not validating".
        with suppress(AttributeError):
            if self.__validating_input:
                # We are in a recursive call, i.e. _validate_input is being
                # called from _validate_input itself (likely through a @field
                # method).
                return None
        self.__validating_input: bool = True
        try:
            return self.validate_input()
        finally:
            # Always clear the guard. If validate_input() raised and the
            # guard stayed set, later calls would be mistaken for recursive
            # ones and return None, silently skipping validation.
            self.__validating_input = False

    @validates_input
    async def to_item(self) -> ItemT:
        """Extract an item from a web page"""
        return await super().to_item()
@attr.s(auto_attribs=True)
class WebPage(ItemPage[ItemT], ResponseShortcutsMixin):
    """Base Page Object which requires :class:`~.HttpResponse`
    and provides XPath / CSS shortcuts.

    The class is an ``attrs`` class with a single ``response`` field.
    """

    # Turned into an attrs field (and constructor argument) by
    # ``auto_attribs=True``.
    response: HttpResponse
@attr.s(auto_attribs=True)
class BrowserPage(ItemPage[ItemT], ResponseShortcutsMixin):
    """Base Page Object which requires :class:`~.BrowserResponse`
    and provides XPath / CSS shortcuts.

    The class is an ``attrs`` class with a single ``response`` field.
    """

    # Turned into an attrs field (and constructor argument) by
    # ``auto_attribs=True``.
    response: BrowserResponse
@attr.s(auto_attribs=True)
class SelectorExtractor(Extractor[ItemT], SelectorShortcutsMixin):
    """Extractor that takes a :class:`parsel.Selector` and provides shortcuts
    for its methods.

    The class is an ``attrs`` class with a single ``selector`` field.
    """

    # Turned into an attrs field (and constructor argument) by
    # ``auto_attribs=True``.
    selector: parsel.Selector
|