1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver
from twisted.internet.interfaces import IHostResolution, IHostnameResolver, IResolutionReceiver, IResolverSimple
from zope.interface.declarations import implementer, provider
from scrapy.utils.datatypes import LocalCache
# Module-level DNS cache shared by every resolver defined in this module.
# The resolver constructors overwrite ``dnscache.limit`` with the configured
# cache size, so 10000 is only the value the cache is created with.
# TODO: cache misses
dnscache = LocalCache(10000)
@implementer(IResolverSimple)
class CachingThreadedResolver(ThreadedResolver):
    """
    Default caching resolver built on Twisted's threaded resolver.

    Resolves IPv4 addresses only and lets the timeout of DNS requests be
    configured. Results are kept in the module-level ``dnscache``.
    """

    def __init__(self, reactor, cache_size, timeout):
        """
        Bind the resolver to *reactor*, remember *timeout* (used for every
        lookup) and set the limit of the shared cache to *cache_size*.
        """
        super().__init__(reactor)
        self.timeout = timeout
        dnscache.limit = cache_size

    @classmethod
    def from_crawler(cls, crawler, reactor):
        """Create a resolver configured from the DNS settings of *crawler*."""
        settings = crawler.settings
        # A cache size of zero is used when the DNS cache is switched off.
        size = settings.getint('DNSCACHE_SIZE') if settings.getbool('DNSCACHE_ENABLED') else 0
        return cls(reactor, size, settings.getfloat('DNS_TIMEOUT'))

    def install_on_reactor(self):
        """Register this object as the resolver of its reactor."""
        self.reactor.installResolver(self)

    def getHostByName(self, name, timeout=None):
        """
        Return a Deferred firing with the address of *name*.

        A cached answer is returned immediately; otherwise the lookup is
        delegated to the parent class and, when the cache limit is non-zero,
        its result is stored. The *timeout* argument is ignored.
        """
        try:
            known = dnscache[name]
        except KeyError:
            pass
        else:
            return defer.succeed(known)
        # Twisted<=16.6 always calls getHostByName() with its own default
        # timeout of 60s (passed as the tuple (1, 3, 11, 45)), so whatever
        # was received is discarded and Scrapy's DNS_TIMEOUT value is
        # enforced instead.
        lookup = super().getHostByName(name, (self.timeout,))
        if dnscache.limit:
            lookup.addCallback(self._cache_result, name)
        return lookup

    def _cache_result(self, result, name):
        """Deferred callback: store *result* under *name* and pass it on."""
        dnscache[name] = result
        return result
@implementer(IHostResolution)
class HostResolution:
    """Bare ``IHostResolution`` object that only records the host name."""

    def __init__(self, name):
        """Remember *name*, the host name this resolution refers to."""
        self.name = name

    def cancel(self):
        """Cancellation is not supported: always raises NotImplementedError."""
        raise NotImplementedError
# ``implementer`` (not ``provider``) is the right declaration here: it is the
# *instances* of this class that are handed to the resolver as receivers,
# whereas ``provider`` would declare the interface on the class object itself.
@implementer(IResolutionReceiver)
class _CachingResolutionReceiver:
    """
    Resolution receiver that forwards every event to a wrapped receiver and
    stores the collected addresses in the module-level ``dnscache``.
    """

    def __init__(self, resolutionReceiver, hostName):
        """
        :param resolutionReceiver: the receiver every event is forwarded to
        :param hostName: host name being resolved; used as the cache key
        """
        self.resolutionReceiver = resolutionReceiver
        self.hostName = hostName
        # Addresses reported so far for this resolution.
        self.addresses = []

    def resolutionBegan(self, resolution):
        """Forward the start of the resolution and keep a reference to it."""
        self.resolutionReceiver.resolutionBegan(resolution)
        self.resolution = resolution

    def addressResolved(self, address):
        """Forward *address* to the wrapped receiver and record it."""
        self.resolutionReceiver.addressResolved(address)
        self.addresses.append(address)

    def resolutionComplete(self):
        """
        Forward the completion event, then cache the collected addresses.

        Nothing is stored when no address was resolved, or when the cache
        limit is falsy. The resolvers set the limit to 0 when the DNS cache
        is disabled, and ``CachingThreadedResolver.getHostByName`` applies
        the same ``dnscache.limit`` guard before caching.
        """
        self.resolutionReceiver.resolutionComplete()
        if self.addresses and dnscache.limit:
            dnscache[self.hostName] = self.addresses
@implementer(IHostnameResolver)
class CachingHostnameResolver:
    """
    Experimental caching resolver. Resolves IPv4 and IPv6 addresses,
    does not support setting a timeout value for DNS requests.
    """

    def __init__(self, reactor, cache_size):
        """
        Wrap the name resolver currently installed on *reactor* and set the
        limit of the module-level ``dnscache`` to *cache_size*.
        """
        self.reactor = reactor
        # Resolver that performs the real lookups on a cache miss.
        self.original_resolver = reactor.nameResolver
        dnscache.limit = cache_size

    @classmethod
    def from_crawler(cls, crawler, reactor):
        """
        Create a resolver from the crawler settings: the cache size is
        ``DNSCACHE_SIZE`` when ``DNSCACHE_ENABLED`` is true, otherwise 0.
        """
        if crawler.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = crawler.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return cls(reactor, cache_size)

    def install_on_reactor(self):
        """Install this object as the name resolver of its reactor."""
        self.reactor.installNameResolver(self)

    def resolveHostName(
        self, resolutionReceiver, hostName, portNumber=0, addressTypes=None, transportSemantics="TCP"
    ):
        """
        Resolve *hostName*, reporting the outcome to *resolutionReceiver*.

        On a cache miss the lookup is delegated to the original resolver,
        with the receiver wrapped so the resulting addresses get cached.
        On a cache hit the cached addresses are replayed to the receiver
        synchronously.

        :return: the resolution object. On a miss this is whatever the
            original resolver returns; on a hit it is the ``HostResolution``
            passed to ``resolutionBegan``, so both branches return a
            resolution rather than the receiver.
        """
        try:
            addresses = dnscache[hostName]
        except KeyError:
            return self.original_resolver.resolveHostName(
                _CachingResolutionReceiver(resolutionReceiver, hostName),
                hostName,
                portNumber,
                addressTypes,
                transportSemantics,
            )
        # Cache hit: drive the receiver through the full event sequence.
        resolution = HostResolution(hostName)
        resolutionReceiver.resolutionBegan(resolution)
        for addr in addresses:
            resolutionReceiver.addressResolved(addr)
        resolutionReceiver.resolutionComplete()
        return resolution
|