1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
|
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass
from http import HTTPStatus
from typing import TYPE_CHECKING, TypeAlias, cast
from web_poet.exceptions import HttpError, HttpResponseError
from web_poet.exceptions.core import NoSavedHttpResponse
from web_poet.page_inputs.http import (
HttpRequest,
HttpRequestBody,
HttpRequestHeaders,
HttpResponse,
request_fingerprint,
)
from web_poet.requests import RequestDownloaderT, _perform_request
from web_poet.utils import as_list
if TYPE_CHECKING:
from collections.abc import Iterable
from web_poet.page_inputs.url import _Url
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type aliases used in the HttpClient method signatures.
# Plain string-to-string mapping (the dict form of request headers).
_StrMapping: TypeAlias = dict[str, str]
# Accepted types for the ``headers`` argument.
_Headers: TypeAlias = _StrMapping | HttpRequestHeaders
# Accepted types for the ``body`` argument.
_Body: TypeAlias = bytes | HttpRequestBody
# Accepted types for ``allow_status``: one status code (as ``str`` or
# ``int``) or a list of them.
_StatusList: TypeAlias = str | int | list[str | int]
@dataclass
class _SavedResponseData:
    """A request bundled with the outcome of executing it.

    The outcome is either a response or the error that was raised instead.
    """

    # The request that was executed.
    request: HttpRequest
    # The response that was received, or ``None`` when there is none.
    response: HttpResponse | None
    # The error raised while executing the request, if any.
    exception: HttpError | None = None

    def fingerprint(self) -> str:
        """Return the fingerprint of the stored request."""
        stored_request = self.request
        return request_fingerprint(stored_request)
class HttpClient:
    """Async HTTP client to be used in Page Objects.

    See :ref:`additional-requests` for the usage information.

    HttpClient doesn't make HTTP requests by itself. It uses either the
    request function assigned to the ``web_poet.request_downloader_var``
    :mod:`contextvar <contextvars>`, or a function passed via the
    ``request_downloader`` argument of the :meth:`~.HttpClient.__init__`
    method.

    Either way, this function should be an ``async def`` function which
    receives an :class:`~.HttpRequest` instance, and either returns a
    :class:`~.HttpResponse` instance, or raises a subclass of
    :class:`~.HttpError`. You can read more in the
    :ref:`advanced-downloader-impl` documentation.
    """

    def __init__(
        self,
        request_downloader: RequestDownloaderT | None = None,
        *,
        save_responses: bool = False,
        return_only_saved_responses: bool = False,
        responses: Iterable[_SavedResponseData] | None = None,
    ):
        """Create a client.

        :param request_downloader: the function used to perform requests;
            when omitted, the default ``_perform_request`` is used.
        :param save_responses: when true, every executed request is stored
            together with its response (or the :class:`~.HttpError` it
            raised) and can be retrieved with :meth:`get_saved_responses`.
        :param return_only_saved_responses: when true, :meth:`execute`
            never calls the downloader; it only replays saved data and
            raises ``NoSavedHttpResponse`` for unknown requests.
        :param responses: previously saved data to preload; entries are
            indexed by the fingerprint of their request.
        """
        self._request_downloader = request_downloader or _perform_request
        self.save_responses = save_responses
        self.return_only_saved_responses = return_only_saved_responses
        # Keyed by request fingerprint so a request can be looked up directly.
        self._saved_responses: dict[str, _SavedResponseData] = {
            data.fingerprint(): data for data in responses or []
        }

    @staticmethod
    def _handle_status(
        response: HttpResponse,
        request: HttpRequest,
        *,
        allow_status: _StatusList | None = None,
    ) -> None:
        """Raise :class:`~.HttpResponseError` for a disallowed error status.

        Nothing is raised when the status is below 400, when it is listed
        in ``allow_status``, when ``allow_status`` contains ``"*"``, or
        when the response has no status at all. Entries of ``allow_status``
        are compared as strings with surrounding whitespace ignored.
        """
        # Normalise once so the wildcard and explicit codes are matched
        # the same way (e.g. both " * " and " 404 " are accepted).
        allowed = {str(status).strip() for status in as_list(allow_status)}
        if (
            "*" in allowed
            or response.status is None  # allows serialized responses from tests
            or response.status < 400
            or str(response.status) in allowed
        ):
            return
        status_name = _http_status_name(response.status)
        msg = f"{response.status} {status_name} response for {response.url}"
        raise HttpResponseError(msg, request=request, response=response)

    async def request(
        self,
        url: str | _Url,
        *,
        method: str = "GET",
        headers: _Headers | None = None,
        body: _Body | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """This is a shortcut for creating an :class:`~.HttpRequest` instance
        and executing that request.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        An :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in
        ``allow_status``, because :class:`~.HttpResponseError` is not raised
        for them.
        """
        # Missing headers/body are sent as empty values.
        headers = headers or {}
        body = body or b""
        req = HttpRequest(url=url, method=method, headers=headers, body=body)
        return await self.execute(req, allow_status=allow_status)

    async def get(
        self,
        url: str | _Url,
        *,
        headers: _Headers | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``GET``
        request.
        """
        return await self.request(
            url=url,
            method="GET",
            headers=headers,
            allow_status=allow_status,
        )

    async def post(
        self,
        url: str | _Url,
        *,
        headers: _Headers | None = None,
        body: _Body | None = None,
        allow_status: _StatusList | None = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``POST``
        request.
        """
        return await self.request(
            url=url,
            method="POST",
            headers=headers,
            body=body,
            allow_status=allow_status,
        )

    async def execute(
        self, request: HttpRequest, *, allow_status: _StatusList | None = None
    ) -> HttpResponse:
        """Execute the specified :class:`~.HttpRequest` instance using the
        request implementation configured in the :class:`~.HttpClient`
        instance.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        An :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in
        ``allow_status``, because :class:`~.HttpResponseError` is not raised
        for them.

        When the client was created with ``return_only_saved_responses``,
        no request is performed: the saved response for the request is
        returned (or its saved exception is raised), and
        ``NoSavedHttpResponse`` is raised when nothing was saved for it.
        """
        if self.return_only_saved_responses:
            # The saved data is keyed by fingerprint: look it up directly.
            saved_data = self._saved_responses.get(request_fingerprint(request))
            if saved_data is None:
                raise NoSavedHttpResponse(request=request)
            if saved_data.exception:
                raise saved_data.exception
            assert saved_data.response
            self._handle_status(
                saved_data.response,
                saved_data.request,
                allow_status=allow_status,
            )
            return saved_data.response

        try:
            response = await self._request_downloader(request)
        except HttpError as ex:
            if self.save_responses:
                # Remember the failure so it can be replayed later.
                self._saved_responses[request_fingerprint(request)] = (
                    _SavedResponseData(request, None, ex)
                )
            raise

        # The response is saved before the status check, so responses with
        # error statuses are stored as well.
        if self.save_responses:
            self._saved_responses[request_fingerprint(request)] = _SavedResponseData(
                request, response
            )
        self._handle_status(response, request, allow_status=allow_status)
        return response

    async def batch_execute(
        self,
        *requests: HttpRequest,
        return_exceptions: bool = False,
        allow_status: _StatusList | None = None,
    ) -> list[HttpResponse | HttpResponseError]:
        """Similar to :meth:`~.HttpClient.execute` but accepts a collection of
        :class:`~.HttpRequest` instances that would be batch executed.

        The order of the :class:`~.HttpResponse` instances would correspond
        to the order of :class:`~.HttpRequest` passed.

        If any of the :class:`~.HttpRequest` raises an exception upon
        execution, the exception is raised.

        To prevent this, the actual exception can be returned alongside any
        successful :class:`~.HttpResponse`. This enables salvaging any usable
        responses despite any possible failures. This can be done by setting
        ``True`` to the ``return_exceptions`` parameter.

        Like :meth:`~.HttpClient.execute`, :class:`~.HttpResponseError`
        will be raised for responses with status codes in the ``400-5xx``
        range. The ``allow_status`` parameter could be used the same way here
        to prevent these exceptions from being raised.

        You can omit ``allow_status="*"`` if you're passing
        ``return_exceptions=True``. However, it would be returning
        :class:`~.HttpResponseError` instead of :class:`~.HttpResponse`.

        Lastly, a :class:`~.HttpRequestError` may be raised
        on cases like *connection errors*, *connection and read timeouts*,
        etc.
        """
        coroutines = [self.execute(r, allow_status=allow_status) for r in requests]
        responses = await asyncio.gather(
            *coroutines, return_exceptions=return_exceptions
        )
        return cast("list[HttpResponse | HttpResponseError]", responses)

    def get_saved_responses(self) -> Iterable[_SavedResponseData]:
        """Return saved requests and responses."""
        return self._saved_responses.values()
def _http_status_name(status: int) -> str:
"""
>>> _http_status_name(200)
'OK'
>>> _http_status_name(404)
'NOT_FOUND'
>>> _http_status_name(999)
'UNKNOWN'
"""
try:
return HTTPStatus(status).name
except ValueError:
return "UNKNOWN"
|