1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from twisted.internet import defer
from twisted.internet.base import ReactorBase, ThreadedResolver
from twisted.internet.interfaces import (
IAddress,
IHostnameResolver,
IHostResolution,
IResolutionReceiver,
IResolverSimple,
)
from zope.interface.declarations import implementer, provider
from scrapy.utils.datatypes import LocalCache
if TYPE_CHECKING:
from collections.abc import Sequence
from twisted.internet.defer import Deferred
# typing.Self requires Python 3.11
from typing_extensions import Self
from scrapy.crawler import Crawler
# Module-level DNS cache used by the resolvers below, keyed by host name.
# The resolvers overwrite ``dnscache.limit`` with their configured cache size
# when they are constructed, so 10000 is only the initial limit.
# TODO: cache misses
dnscache: LocalCache[str, Any] = LocalCache(10000)
@implementer(IResolverSimple)
class CachingThreadedResolver(ThreadedResolver):
    """
    Default caching resolver built on Twisted's ``ThreadedResolver``.

    IPv4 only. Supports a configurable timeout for DNS requests and stores
    successful lookups in the module-level ``dnscache``.
    """

    def __init__(self, reactor: ReactorBase, cache_size: int, timeout: float):
        """
        :param reactor: reactor handed to ``ThreadedResolver``.
        :param cache_size: value assigned to ``dnscache.limit``; a falsy value
            makes :meth:`getHostByName` skip storing results.
        :param timeout: timeout applied to every DNS request.
        """
        super().__init__(reactor)
        self.timeout = timeout
        dnscache.limit = cache_size

    @classmethod
    def from_crawler(cls, crawler: Crawler, reactor: ReactorBase) -> Self:
        """
        Build a resolver from the crawler settings ``DNSCACHE_ENABLED``,
        ``DNSCACHE_SIZE`` and ``DNS_TIMEOUT``.

        The cache size is 0 when ``DNSCACHE_ENABLED`` is false.
        """
        settings = crawler.settings
        cache_size = (
            settings.getint("DNSCACHE_SIZE")
            if settings.getbool("DNSCACHE_ENABLED")
            else 0
        )
        return cls(reactor, cache_size, settings.getfloat("DNS_TIMEOUT"))

    def install_on_reactor(self) -> None:
        """Install this object as the resolver of its reactor."""
        self.reactor.installResolver(self)

    def getHostByName(self, name: str, timeout: Sequence[int] = ()) -> Deferred[str]:
        """
        Resolve *name*, answering from ``dnscache`` when possible.

        The *timeout* argument is ignored: in Twisted<=16.6 getHostByName()
        is always called with a default timeout of 60s (actually passed as
        a (1, 3, 11, 45) tuple), so it is replaced with the value of
        Scrapy's DNS_TIMEOUT setting.
        """
        if name in dnscache:
            # Cache hit: hand back an already-fired Deferred.
            return defer.succeed(dnscache[name])

        # The timeout is typed as Sequence[int] upstream but supports floats.
        lookup = super().getHostByName(
            name,
            (self.timeout,),  # type: ignore[arg-type]
        )
        if dnscache.limit:
            # Only remember the answer when the cache has a non-zero limit.
            lookup.addCallback(self._cache_result, name)
        return lookup

    def _cache_result(self, result: Any, name: str) -> Any:
        """Deferred callback: store *result* under *name* and pass it on."""
        dnscache[name] = result
        return result
@implementer(IHostResolution)
class HostResolution:
    """Bare ``IHostResolution`` that only carries the host name being resolved."""

    def __init__(self, name: str):
        """Remember *name* as the host name of this resolution."""
        self.name = name

    def cancel(self) -> None:
        """Cancellation is not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError
# ``implementer`` (not ``provider``): it is the *instances* of this class that
# are handed to the resolver as receivers, so they must provide the interface.
@implementer(IResolutionReceiver)
class _CachingResolutionReceiver:
    """
    Resolution receiver that wraps another receiver.

    Every event is forwarded to the wrapped receiver first; the resolved
    addresses are also collected and, once resolution completes, stored in
    the module-level ``dnscache`` under the host name.
    """

    def __init__(self, resolutionReceiver: IResolutionReceiver, hostName: str):
        """
        :param resolutionReceiver: receiver that every event is forwarded to.
        :param hostName: cache key used when storing the collected addresses.
        """
        self.resolutionReceiver: IResolutionReceiver = resolutionReceiver
        self.hostName: str = hostName
        self.addresses: list[IAddress] = []
        # Filled in by resolutionBegan(); defined here so the attribute
        # always exists on the instance.
        self.resolution: IHostResolution | None = None

    def resolutionBegan(self, resolution: IHostResolution) -> None:
        """Forward the event and keep a reference to *resolution*."""
        self.resolutionReceiver.resolutionBegan(resolution)
        self.resolution = resolution

    def addressResolved(self, address: IAddress) -> None:
        """Forward *address* and record it for caching."""
        self.resolutionReceiver.addressResolved(address)
        self.addresses.append(address)

    def resolutionComplete(self) -> None:
        """Forward the event, then cache the collected addresses if any."""
        self.resolutionReceiver.resolutionComplete()
        # Like CachingThreadedResolver, only store results when the cache has
        # a non-zero limit: from_crawler() sets the limit to 0 when
        # DNSCACHE_ENABLED is false. Empty results are never cached.
        if self.addresses and dnscache.limit:
            dnscache[self.hostName] = self.addresses
@implementer(IHostnameResolver)
class CachingHostnameResolver:
    """
    Experimental caching resolver. Resolves IPv4 and IPv6 addresses,
    does not support setting a timeout value for DNS requests.
    """

    def __init__(self, reactor: ReactorBase, cache_size: int):
        """
        :param reactor: reactor whose current ``nameResolver`` is kept as the
            fallback for cache misses.
        :param cache_size: value assigned to ``dnscache.limit``.
        """
        self.reactor: ReactorBase = reactor
        self.original_resolver: IHostnameResolver = reactor.nameResolver
        dnscache.limit = cache_size

    @classmethod
    def from_crawler(cls, crawler: Crawler, reactor: ReactorBase) -> Self:
        """
        Build a resolver from the crawler settings ``DNSCACHE_ENABLED`` and
        ``DNSCACHE_SIZE``; the cache size is 0 when the cache is disabled.
        """
        if crawler.settings.getbool("DNSCACHE_ENABLED"):
            cache_size = crawler.settings.getint("DNSCACHE_SIZE")
        else:
            cache_size = 0
        return cls(reactor, cache_size)

    def install_on_reactor(self) -> None:
        """Install this object as the name resolver of its reactor."""
        self.reactor.installNameResolver(self)

    def resolveHostName(
        self,
        resolutionReceiver: IResolutionReceiver,
        hostName: str,
        portNumber: int = 0,
        addressTypes: Sequence[type[IAddress]] | None = None,
        transportSemantics: str = "TCP",
    ) -> IHostResolution:
        """
        Resolve *hostName*, replaying cached addresses when available.

        On a cache miss the request is delegated to the resolver that was
        installed on the reactor when this object was created, with
        *resolutionReceiver* wrapped so that the answer can be cached.

        On a cache hit the receiver is driven synchronously
        (began -> one ``addressResolved`` per cached address -> complete).

        :return: the resolution object describing this lookup.
        """
        try:
            addresses = dnscache[hostName]
        except KeyError:
            return self.original_resolver.resolveHostName(
                _CachingResolutionReceiver(resolutionReceiver, hostName),
                hostName,
                portNumber,
                addressTypes,
                transportSemantics,
            )

        # NOTE(review): cached addresses are replayed exactly as stored;
        # portNumber, addressTypes and transportSemantics are not applied on
        # this path - confirm callers do not depend on them for cache hits.
        resolution = HostResolution(hostName)
        resolutionReceiver.resolutionBegan(resolution)
        for addr in addresses:
            resolutionReceiver.addressResolved(addr)
        resolutionReceiver.resolutionComplete()
        # Return the resolution (as annotated), not the receiver we were given.
        return resolution
|