File: fetch_robotstxt.py

Package: python-protego 0.4.0+dfsg-1
"""A tool to simplify fetching robots.txt from a large number of websites.

Usage
-----
>>> python fetch_robotstxt.py -l top-10000-websites.txt -d test_data
"""

import argparse
import os
import sys
from urllib.parse import ParseResult, urlparse, urlunparse

import scrapy
from scrapy.crawler import CrawlerProcess

parser = argparse.ArgumentParser(description="Download robots.txt of given websites.")
parser.add_argument(
    "-l",
    "--list",
    action="append",
    dest="websites",
    help="Adds to the list of websites.",
)
parser.add_argument(
    "-d",
    "--destination",
    action="store",
    dest="directory",
    help="Directory to save robots.txt files.",
)
args = parser.parse_args()

if not args.directory or not args.websites:
    # Exit with a non-zero status and a message on stderr.
    sys.exit("Both a website list (-l) and a destination directory (-d) are required.")


class RobotstxtSpider(scrapy.Spider):
    name = "robotstxt_spider"

    def start_requests(self):
        # Each entry passed via -l is a path to a file with one domain per line.
        for w in args.websites:
            if os.path.isfile(w):
                with open(w, "r") as f:
                    for domain in f:
                        domain = domain.strip()
                        # Skip blank lines so they do not produce malformed URLs.
                        if not domain:
                            continue
                        yield scrapy.Request(
                            url="https://{}/robots.txt".format(domain),
                            callback=self.parse,
                            errback=self.err_cb,
                        )

    def parse(self, response):
        # Save the robots.txt body under the domain name (netloc) of the response URL.
        filename = urlparse(response.url).netloc
        # makedirs with exist_ok avoids a race when several responses arrive at once.
        os.makedirs(args.directory, exist_ok=True)
        with open(os.path.join(args.directory, filename), "wb") as f:
            f.write(response.body)

    def err_cb(self, failure):
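        # If the HTTPS request failed, retry the same URL over plain HTTP.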
        request = failure.request
        parts = urlparse(request.url)
        parts = ParseResult(
            "http", parts.netloc, parts.path, parts.params, parts.query, parts.fragment
        )
        url = urlunparse(parts)
        yield scrapy.Request(url=url, callback=self.parse)


# ROBOTSTXT_OBEY is disabled so that fetching robots.txt is not itself filtered by robots.txt rules.
process = CrawlerProcess(settings={"ROBOTSTXT_OBEY": False})
process.crawl(RobotstxtSpider)
process.start()
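
For reference, a minimal sketch of how a file saved by this spider could be checked with Protego, the parser this package provides; the path, URL, and user agent below are placeholders:

    from protego import Protego

    # Read a robots.txt file previously saved by the spider (hypothetical path).
    with open("test_data/example.com") as f:
        rp = Protego.parse(f.read())

    # Check whether a given user agent may fetch a URL on that domain.
    print(rp.can_fetch("https://example.com/some/page", "*"))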