# ip_address.py — standalone example from the python-scrapy test suite
# (demonstrates Response.ip_address with a mock HTTP + DNS server).
# ruff: noqa: E402

from scrapy.utils.reactor import install_reactor
from tests.mockserver.dns import MockDNSServer
from tests.mockserver.http import MockServer

install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")


from twisted.names import cache, resolve
from twisted.names import hosts as hostsModule
from twisted.names.client import Resolver
from twisted.python.runtime import platform

from scrapy import Request, Spider
from scrapy.crawler import CrawlerRunner
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import configure_logging


# https://stackoverflow.com/a/32784190
def createResolver(servers=None, resolvconf=None, hosts=None):
    """Build a resolver chain: local hosts file, then cache, then upstream DNS.

    ``servers`` is a list of ``(host, port)`` tuples for the upstream
    resolver; ``hosts`` defaults to the platform's hosts-file location.
    """
    if hosts is None:
        # Pick the conventional hosts-file path for the current platform.
        hosts = r"c:\windows\hosts" if platform.getType() != "posix" else b"/etc/hosts"
    upstream = Resolver(resolvconf, servers)
    local_hosts = hostsModule.Resolver(hosts)
    return resolve.ResolverChain([local_hosts, cache.CacheResolver(), upstream])


class LocalhostSpider(Spider):
    """Spider that fetches a single URL and logs the resolved IP address."""

    name = "localhost_spider"

    async def start(self):
        # Single seed request; ``self.url`` is supplied by the crawler kwargs.
        yield Request(self.url)

    def parse(self, response):
        # The host is the netloc with any ":port" suffix stripped.
        host, _, _ = urlparse_cached(response).netloc.partition(":")
        self.logger.info(f"Host: {host}")
        self.logger.info(f"Type: {type(response.ip_address)}")
        self.logger.info(f"IP address: {response.ip_address}")


if __name__ == "__main__":
    from twisted.internet import reactor

    # Run a mock HTTP server and a mock DNS server for the duration of the crawl.
    with MockServer() as http_server, MockDNSServer() as dns_server:
        target_url = f"http://not.a.real.domain:{http_server.http_port}/echo"

        # Route DNS lookups through the mock server so the fake domain resolves.
        reactor.installResolver(
            createResolver(servers=[(dns_server.host, dns_server.port)])
        )

        configure_logging()
        crawl_deferred = CrawlerRunner().crawl(LocalhostSpider, url=target_url)
        # Stop the reactor whether the crawl succeeds or fails.
        crawl_deferred.addBoth(lambda _: reactor.stop())
        reactor.run()