File: caching_hostname_resolver.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (35 lines) | stat: -rw-r--r-- 910 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import sys

import scrapy
from scrapy.crawler import CrawlerProcess


class CachingHostnameResolverSpider(scrapy.Spider):
    """
    Spider that requests one URL repeatedly and logs the resolved IP address.

    It is expected to finish in a finite amount of time, i.e. not to hang
    indefinitely in the DNS resolution.
    """

    name = "caching_hostname_resolver_spider"

    async def start(self):
        """Yield the single initial request for ``self.url``."""
        # NOTE(review): ``self.url`` is not defined in this class; it is
        # presumably set from the ``url`` keyword given when the spider is
        # scheduled -- confirm against the caller.
        first_request = scrapy.Request(self.url)
        yield first_request

    def parse(self, response):
        """Yield ten more requests for the URL of the received response."""
        same_url = response.url
        # dont_filter=True is passed on every request because they all
        # target the identical URL.
        yield from (
            scrapy.Request(
                same_url, callback=self.ignore_response, dont_filter=True
            )
            for _ in range(10)
        )

    def ignore_response(self, response):
        """Log the ``repr`` of the response's ``ip_address`` and yield nothing."""
        ip_text = repr(response.ip_address)
        self.logger.info(ip_text)


if __name__ == "__main__":
    # Settings for this run: retries are switched off and the DNS resolver is
    # set to Scrapy's CachingHostnameResolver.
    resolver_settings = {
        "RETRY_ENABLED": False,
        "DNS_RESOLVER": "scrapy.resolver.CachingHostnameResolver",
    }
    process = CrawlerProcess(settings=resolver_settings)

    # The URL to crawl is the first command-line argument.
    target_url = sys.argv[1]
    process.crawl(CachingHostnameResolverSpider, url=target_url)
    process.start()