File: client.py

package info (click to toggle)
python-web-poet 0.23.2-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 908 kB
  • sloc: python: 6,112; makefile: 19
file content (287 lines) | stat: -rw-r--r-- 10,347 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
from __future__ import annotations

import asyncio
import logging
from dataclasses import dataclass
from http import HTTPStatus
from typing import TYPE_CHECKING, TypeAlias, cast

from web_poet.exceptions import HttpError, HttpResponseError
from web_poet.exceptions.core import NoSavedHttpResponse
from web_poet.page_inputs.http import (
    HttpRequest,
    HttpRequestBody,
    HttpRequestHeaders,
    HttpResponse,
    request_fingerprint,
)
from web_poet.requests import RequestDownloaderT, _perform_request
from web_poet.utils import as_list

if TYPE_CHECKING:
    from collections.abc import Iterable

from web_poet.page_inputs.url import _Url

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Plain ``str`` -> ``str`` dictionary; one of the accepted forms of headers.
_StrMapping: TypeAlias = dict[str, str]
# Headers accepted by the ``HttpClient`` request helpers: a plain dict or an
# ``HttpRequestHeaders`` instance.
_Headers: TypeAlias = _StrMapping | HttpRequestHeaders
# Request body accepted by the ``HttpClient`` request helpers: raw bytes or an
# ``HttpRequestBody`` instance.
_Body: TypeAlias = bytes | HttpRequestBody
# Value accepted by ``allow_status`` parameters: a single status given as
# ``str`` or ``int``, or a list mixing both.
_StatusList: TypeAlias = str | int | list[str | int]


@dataclass
class _SavedResponseData:
    """A recorded request paired with the outcome of executing it.

    The outcome is either a response or the exception raised while
    executing the request.
    """

    # The request that was executed.
    request: HttpRequest
    # The response that was received, or ``None`` when there is none to store.
    response: HttpResponse | None
    # The error raised for the request, if any.
    exception: HttpError | None = None

    def fingerprint(self) -> str:
        """Return the fingerprint of the stored request."""
        stored_request = self.request
        return request_fingerprint(stored_request)


class HttpClient:
    """Async HTTP client to be used in Page Objects.

    See :ref:`additional-requests` for the usage information.

    HttpClient doesn't make HTTP requests by itself. It uses either the
    request function assigned to the ``web_poet.request_downloader_var``
    :mod:`contextvar <contextvars>`, or a function passed via
    ``request_downloader`` argument of the :meth:`~.HttpClient.__init__` method.

    Either way, this function should be an ``async def`` function which
    receives an :class:`~.HttpRequest` instance, and either returns a
    :class:`~.HttpResponse` instance, or raises a subclass of
    :class:`~.HttpError`. You can read more in the
    :ref:`advanced-downloader-impl` documentation.

    The client can also record the outcome of every executed request
    (``save_responses``) and replay previously recorded outcomes instead of
    performing requests (``return_only_saved_responses``).
    """

    def __init__(
        self,
        request_downloader: RequestDownloaderT | None = None,
        *,
        save_responses: bool = False,
        return_only_saved_responses: bool = False,
        responses: Iterable[_SavedResponseData] | None = None,
    ):
        """Create a client.

        :param request_downloader: coroutine function used to perform
            requests; when omitted, the default ``_perform_request``
            implementation is used.
        :param save_responses: if true, :meth:`~.HttpClient.execute` stores
            each request together with its response, or with the
            :class:`~.HttpError` it raised.
        :param return_only_saved_responses: if true,
            :meth:`~.HttpClient.execute` never calls the downloader and only
            replays saved data.
        :param responses: previously saved request/response data to preload;
            entries are indexed by their request fingerprint.
        """
        self._request_downloader = request_downloader or _perform_request
        self.save_responses = save_responses
        self.return_only_saved_responses = return_only_saved_responses
        # Keyed by request fingerprint so that replay is a direct lookup.
        self._saved_responses: dict[str, _SavedResponseData] = {
            data.fingerprint(): data for data in responses or []
        }

    @staticmethod
    def _handle_status(
        response: HttpResponse,
        request: HttpRequest,
        *,
        allow_status: _StatusList | None = None,
    ) -> None:
        """Raise :class:`~.HttpResponseError` for a disallowed error status.

        Nothing is raised when the response has no status, when the status
        is below 400, when the status is listed in ``allow_status``, or when
        ``allow_status`` contains the ``"*"`` wildcard.

        :param response: response whose status is checked.
        :param request: request attached to the raised exception.
        :param allow_status: status code(s) that must not raise.
        :raises HttpResponseError: for any other status.
        """
        allow_status_normalized = list(map(str, as_list(allow_status)))
        # The wildcard is recognised even with surrounding whitespace.
        allow_all_status = any(s.strip() == "*" for s in allow_status_normalized)

        if (
            allow_all_status
            or response.status is None  # allows serialized responses from tests
            or response.status < 400
            or str(response.status) in allow_status_normalized
        ):
            return

        status_name = _http_status_name(response.status)
        msg = f"{response.status} {status_name} response for {response.url}"
        raise HttpResponseError(msg, request=request, response=response)

    async def request(
        self,
        url: str | _Url,
        *,
        method: str = "GET",
        headers: _Headers | None = None,
        body: _Body | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """This is a shortcut for creating an :class:`~.HttpRequest` instance and
        executing that request.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        An :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in ``allow_status``,
        because :class:`~.HttpResponseError` is not raised for them.
        """
        # Missing (or empty) headers/body are sent as an empty dict / bytes.
        headers = headers or {}
        body = body or b""
        req = HttpRequest(url=url, method=method, headers=headers, body=body)
        return await self.execute(req, allow_status=allow_status)

    async def get(
        self,
        url: str | _Url,
        *,
        headers: _Headers | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``GET``
        request.
        """
        return await self.request(
            url=url,
            method="GET",
            headers=headers,
            allow_status=allow_status,
        )

    async def post(
        self,
        url: str | _Url,
        *,
        headers: _Headers | None = None,
        body: _Body | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``POST``
        request.
        """
        return await self.request(
            url=url,
            method="POST",
            headers=headers,
            body=body,
            allow_status=allow_status,
        )

    async def execute(
        self, request: HttpRequest, *, allow_status: _StatusList | None = None
    ) -> HttpResponse:
        """Execute the specified :class:`~.HttpRequest` instance using the
        request implementation configured in the :class:`~.HttpClient`
        instance.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in ``allow_status``,
        because :class:`~.HttpResponseError` is not raised for them.

        When ``return_only_saved_responses`` is set, no request is performed:
        the saved outcome for the request fingerprint is replayed, and
        :class:`~.NoSavedHttpResponse` is raised if there is none.
        """
        if self.return_only_saved_responses:
            # Saved data is keyed by fingerprint, so look it up directly
            # instead of scanning every saved entry.
            saved_data = self._saved_responses.get(request_fingerprint(request))
            if saved_data is None:
                raise NoSavedHttpResponse(request=request)
            if saved_data.exception:
                raise saved_data.exception
            # An entry without an exception is expected to hold a response.
            assert saved_data.response
            self._handle_status(
                saved_data.response,
                saved_data.request,
                allow_status=allow_status,
            )
            return saved_data.response

        try:
            response = await self._request_downloader(request)
        except HttpError as ex:
            if self.save_responses:
                # Record the failure so that it can be replayed later.
                self._saved_responses[request_fingerprint(request)] = (
                    _SavedResponseData(request, None, ex)
                )
            raise

        if self.save_responses:
            # Saved before the status check, so error responses are kept too.
            self._saved_responses[request_fingerprint(request)] = _SavedResponseData(
                request, response
            )
        self._handle_status(response, request, allow_status=allow_status)
        return response

    async def batch_execute(
        self,
        *requests: HttpRequest,
        return_exceptions: bool = False,
        allow_status: _StatusList | None = None,
    ) -> list[HttpResponse | HttpResponseError]:
        """Similar to :meth:`~.HttpClient.execute` but accepts a collection of
        :class:`~.HttpRequest` instances that would be batch executed.

        The order of the :class:`~.HttpResponse` instances would correspond to
        the order of :class:`~.HttpRequest` passed.

        If any of the :class:`~.HttpRequest` raises an exception upon execution,
        the exception is raised.

        To prevent this, the actual exception can be returned alongside any
        successful :class:`~.HttpResponse`. This enables salvaging any usable
        responses despite any possible failures. This can be done by setting
        ``True`` to the ``return_exceptions`` parameter.

        Like :meth:`~.HttpClient.execute`, :class:`~.HttpResponseError`
        will be raised for responses with status codes in the ``400-5xx`` range.
        The ``allow_status`` parameter could be used the same way here to prevent
        these exceptions from being raised.

        You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``.
        However, it would be returning :class:`~.HttpResponseError`
        instead of :class:`~.HttpResponse`.

        Lastly, a :class:`~.HttpRequestError` may be raised
        on cases like *connection errors*, *connection and read timeouts*, etc.
        """

        coroutines = [self.execute(r, allow_status=allow_status) for r in requests]
        # gather() keeps results in the same order as the given coroutines.
        responses = await asyncio.gather(
            *coroutines, return_exceptions=return_exceptions
        )
        return cast("list[HttpResponse | HttpResponseError]", responses)

    def get_saved_responses(self) -> Iterable[_SavedResponseData]:
        """Return saved requests and responses."""
        return self._saved_responses.values()


def _http_status_name(status: int) -> str:
    """Return the :class:`http.HTTPStatus` member name for a status code.

    Codes that :class:`http.HTTPStatus` does not define map to ``"UNKNOWN"``.

    >>> _http_status_name(200)
    'OK'
    >>> _http_status_name(404)
    'NOT_FOUND'
    >>> _http_status_name(999)
    'UNKNOWN'
    """
    try:
        known_status = HTTPStatus(status)
    except ValueError:
        # HTTPStatus rejects values it has no member for.
        return "UNKNOWN"
    return known_status.name