"""Deprecated HTTP/1.0 helper classes used by HTTP10DownloadHandler."""
from __future__ import annotations
import warnings
from time import time
from typing import TYPE_CHECKING
from urllib.parse import urldefrag, urlparse, urlunparse
from twisted.internet import defer
from twisted.internet.protocol import ClientFactory
from twisted.web.http import HTTPClient
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes, to_unicode
if TYPE_CHECKING:
from scrapy import Request
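# Usage sketch (illustration only, roughly mirroring how
# HTTP10DownloadHandler drove these classes; ``handle_response`` is a
# hypothetical callback, not part of this module):
#
#   factory = ScrapyHTTPClientFactory(Request("http://example.com/"))
#   from twisted.internet import reactor
#   reactor.connectTCP(to_unicode(factory.host), factory.port, factory)
#   factory.deferred.addCallback(handle_response)  # fires with a Response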
class ScrapyHTTPPageGetter(HTTPClient):
delimiter = b"\n"
def __init__(self):
warnings.warn(
"ScrapyHTTPPageGetter is deprecated and will be removed in a future Scrapy version.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
super().__init__()
def connectionMade(self):
self.headers = Headers() # bucket for response headers
        # Send the request line (method and path)
self.sendCommand(self.factory.method, self.factory.path)
# Headers
for key, values in self.factory.headers.items():
for value in values:
self.sendHeader(key, value)
self.endHeaders()
# Body
if self.factory.body is not None:
self.transport.write(self.factory.body)
    def lineReceived(self, line):
        # the delimiter is b"\n", so strip the trailing b"\r" (and any
        # other trailing whitespace) to handle both CRLF and bare-LF endings
        return HTTPClient.lineReceived(self, line.rstrip())
def handleHeader(self, key, value):
self.headers.appendlist(key, value)
def handleStatus(self, version, status, message):
self.factory.gotStatus(version, status, message)
def handleEndHeaders(self):
self.factory.gotHeaders(self.headers)
def connectionLost(self, reason):
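        # remember the reason so handleResponse() can report it when the
        # body turns out to be truncated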
self._connection_lost_reason = reason
HTTPClient.connectionLost(self, reason)
self.factory.noPage(reason)
    def handleResponse(self, response):
        if self.factory.method.upper() == b"HEAD":
            # HEAD responses carry no body, regardless of Content-Length
            self.factory.page(b"")
        elif self.length is not None and self.length > 0:
            # fewer bytes arrived than Content-Length announced: the body
            # is truncated, so fail with the connection-loss reason
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()
def timeout(self):
self.transport.loseConnection()
# transport cleanup needed for HTTPS connections
if self.factory.url.startswith(b"https"):
self.transport.stopProducing()
self.factory.noPage(
defer.TimeoutError(
f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."
)
)
# This class used to inherit from Twisted’s
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
protocol = ScrapyHTTPPageGetter
waiting = 1
noisy = False
followRedirect = False
afterFoundGet = False
def _build_response(self, body, request):
request.meta["download_latency"] = self.headers_time - self.start_time
status = int(self.status)
headers = Headers(self.response_headers)
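        # responsetypes.from_args() picks the most specific Response
        # subclass (e.g. HtmlResponse, TextResponse) based on the
        # Content-Type header, the URL and the body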
respcls = responsetypes.from_args(headers=headers, url=self._url, body=body)
return respcls(
url=self._url,
status=status,
headers=headers,
body=body,
protocol=to_unicode(self.version),
)
def _set_connection_attributes(self, request):
proxy = request.meta.get("proxy")
if proxy:
proxy_parsed = urlparse(to_bytes(proxy, encoding="ascii"))
self.scheme = proxy_parsed.scheme
self.host = proxy_parsed.hostname
self.port = proxy_parsed.port
self.netloc = proxy_parsed.netloc
if self.port is None:
self.port = 443 if proxy_parsed.scheme == b"https" else 80
self.path = self.url
else:
parsed = urlparse_cached(request)
path_str = urlunparse(
("", "", parsed.path or "/", parsed.params, parsed.query, "")
)
self.path = to_bytes(path_str, encoding="ascii")
assert parsed.hostname is not None
self.host = to_bytes(parsed.hostname, encoding="ascii")
self.port = parsed.port
self.scheme = to_bytes(parsed.scheme, encoding="ascii")
self.netloc = to_bytes(parsed.netloc, encoding="ascii")
if self.port is None:
self.port = 443 if self.scheme == b"https" else 80
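    # Example of the attributes computed above: a non-proxied request for
    # "https://example.com/a?x=1" yields scheme=b"https",
    # host=b"example.com", port=443, netloc=b"example.com" and
    # path=b"/a?x=1"; with a proxy, path is the full request URL instead.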
def __init__(self, request: Request, timeout: float = 180):
warnings.warn(
"ScrapyHTTPClientFactory is deprecated and will be removed in a future Scrapy version.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
self._url: str = urldefrag(request.url)[0]
        # convert to bytes to comply with the Twisted interface
self.url: bytes = to_bytes(self._url, encoding="ascii")
self.method: bytes = to_bytes(request.method, encoding="ascii")
self.body: bytes | None = request.body or None
self.headers: Headers = Headers(request.headers)
self.response_headers: Headers | None = None
self.timeout: float = request.meta.get("download_timeout") or timeout
self.start_time: float = time()
self.deferred: defer.Deferred[Response] = defer.Deferred().addCallback(
self._build_response, request
)
        # Fixes Twisted 11.1.0+ support, as HTTPClientFactory is expected
        # to have a _disconnectedDeferred attribute. See Twisted r32329.
        # As Scrapy implements its own redirect handling, there is no need
        # to add the _waitForDisconnect callback.
        # Specifically, this avoids an AttributeError when the
        # clientConnectionFailed method is called.
self._disconnectedDeferred: defer.Deferred[None] = defer.Deferred()
self._set_connection_attributes(request)
        # set the Host header based on the URL
        self.headers.setdefault("Host", self.netloc)
        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers["Content-Length"] = len(self.body)
            # just in case a broken HTTP/1.1 server decides to keep the
            # connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified for POST requests even when
        # there is no body
        elif self.method == b"POST":
            self.headers["Content-Length"] = 0
def __repr__(self) -> str:
return f"<{self.__class__.__name__}: {self._url}>"
def _cancelTimeout(self, result, timeoutCall):
if timeoutCall.active():
timeoutCall.cancel()
return result
    def buildProtocol(self, addr):
        p = ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            from twisted.internet import reactor

            # schedule the protocol's timeout() and cancel the delayed call
            # once the response deferred fires, successfully or not
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p
def gotHeaders(self, headers):
self.headers_time = time()
self.response_headers = headers
def gotStatus(self, version, status, message):
"""
Set the status of the request on us.
@param version: The HTTP version.
@type version: L{bytes}
@param status: The HTTP status code, an integer represented as a
bytestring.
@type status: L{bytes}
@param message: The HTTP status message.
@type message: L{bytes}
"""
self.version, self.status, self.message = version, status, message
def page(self, page):
if self.waiting:
self.waiting = 0
self.deferred.callback(page)
def noPage(self, reason):
if self.waiting:
self.waiting = 0
self.deferred.errback(reason)
def clientConnectionFailed(self, _, reason):
"""
When a connection attempt fails, the request cannot be issued. If no
result has yet been provided to the result Deferred, provide the
connection failure reason as an error result.
"""
if self.waiting:
self.waiting = 0
# If the connection attempt failed, there is nothing more to
# disconnect, so just fire that Deferred now.
self._disconnectedDeferred.callback(None)
self.deferred.errback(reason)