"""A tool to simplify fetching robots.txt from a large number of websites.
Usage
-----
>>> python fetch_robotstxt.py -l top-10000-websites.txt -d test_data
"""
import argparse
import os
import sys
from urllib.parse import urlparse, urlunparse

import scrapy
from scrapy.crawler import CrawlerProcess

parser = argparse.ArgumentParser(description="Download the robots.txt file of each given website.")
parser.add_argument(
    "-l",
    "--list",
    action="append",
    dest="websites",
    help="Path to a text file of domains, one per line (may be given more than once).",
)
parser.add_argument(
    "-d",
    "--destination",
    action="store",
    dest="directory",
    help="Directory to save robots.txt files.",
)
args = parser.parse_args()
if not args.directory or not args.websites:
    sys.exit("Insufficient or invalid argument(s) provided.")


class RobotstxtSpider(scrapy.Spider):
    name = "robotstxt_spider"

    def start_requests(self):
        # Each -l argument is treated as a file listing one domain per line.
        for w in args.websites:
            if os.path.isfile(w):
                with open(w, "r") as f:
                    for domain in f:
                        domain = domain.strip()
                        if not domain:
                            continue  # skip blank lines
                        yield scrapy.Request(
                            url="https://{}/robots.txt".format(domain),
                            callback=self.parse,
                            # Fall back to plain HTTP if the HTTPS request fails.
                            errback=self.err_cb,
                        )

    def parse(self, response):
        # Name each saved file after the domain (netloc) it was fetched from.
        filename = urlparse(response.url).netloc
        os.makedirs(args.directory, exist_ok=True)
        with open(os.path.join(args.directory, filename), "wb") as f:
            f.write(response.body)

    def err_cb(self, failure):
        # Retry the same URL over plain HTTP when the HTTPS request fails.
        request = failure.request
        parts = urlparse(request.url)._replace(scheme="http")
        yield scrapy.Request(url=urlunparse(parts), callback=self.parse)
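

# ROBOTSTXT_OBEY is disabled so the crawler itself is not filtered by the
# sites' robots.txt rules while downloading those very files.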
process = CrawlerProcess(settings={"ROBOTSTXT_OBEY": False})
process.crawl(RobotstxtSpider)
process.start()
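
# After a run, the destination directory holds one file per successfully fetched
# site, named after the URL's netloc (e.g. test_data/example.com for the
# illustrative domains above).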