#!/usr/bin/env python3
# Port of Hack 21 from the O'Reilly book "Spidering Hacks" by Tara
# Calishain and Kevin Hemenway. Unlike Perl, where you have to check
# error return values, Python raises exceptions on failure anyway, so
# catching them explicitly isn't strictly necessary; I've kept the
# try/except blocks for the sake of a direct port.
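# Usage: pass --all on the command line to download every matching
# tarball instead of just the first one found.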
from __future__ import print_function
import sys
import os
import re
import mechanize
from mechanize import HTTPError
mech = mechanize.Browser()
# Addition 2005-01-05: Be naughty, since robots.txt asks not to
# access /search now. We're not madly searching for everything, so
# I don't feel too guilty.
mech.set_handle_robots(False)
# mech.set_debug_http(True)
# Get the starting search page
try:
    mech.open("http://search.cpan.org")
except HTTPError as e:
    sys.exit("%d: %s" % (e.code, e.msg))
# Select the form, fill the fields, and submit
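# nr=0 selects the first form on the page.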
mech.select_form(nr=0)
mech["query"] = "Lester"
mech["mode"] = ["author"]
try:
    mech.submit()
except HTTPError as e:
    sys.exit("post failed: %d: %s" % (e.code, e.msg))
# Find the link for "Andy"
try:
    mech.follow_link(text_regex=re.compile("Andy"))
except HTTPError as e:
    sys.exit("follow failed: %d: %s" % (e.code, e.msg))
# Get all the tarballs
urls = [link.absolute_url for link in
        mech.links(url_regex=re.compile(r"\.tar\.gz$"))]
print("Found", len(urls), "tarballs to download")
if "--all" not in sys.argv[1:]:
urls = urls[:1]
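# Stream each tarball to disk in 1 KB chunks and report its size.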
for url in urls:
    filename = os.path.basename(url)
    f = open(filename, "wb")
    print("%s -->" % filename, end=' ')
    r = mech.open(url)
    while True:
        data = r.read(1024)
        if not data:
            break
        f.write(data)
    f.close()
    print(os.stat(filename).st_size, "bytes")