#!/usr/bin/env python3
# Port of Hack 21 from the O'Reilly book "Spidering Hacks" by Tara
# Calishain and Kevin Hemenway. Unlike Perl, where you have to check
# error return values, Python raises exceptions on failure anyway, so
# catching them explicitly isn't strictly necessary; I've kept the
# try/except blocks for the sake of a direct port.
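# Usage: pass --all on the command line to download every matching
# tarball instead of just the first one found.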
from __future__ import print_function
import sys
import os
import re
import mechanize
from mechanize import HTTPError
mech = mechanize.Browser()
# Addition 2005-01-05: Be naughty, since robots.txt asks not to
# access /search now. We're not madly searching for everything, so
# I don't feel too guilty.
mech.set_handle_robots(False)
# mech.set_debug_http(True)
# Get the starting search page
try:
    mech.open("http://search.cpan.org")
except HTTPError as e:
    sys.exit("%d: %s" % (e.code, e.msg))
# Select the form, fill the fields, and submit
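# nr=0 selects the first form on the page.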
mech.select_form(nr=0)
mech["query"] = "Lester"
mech["mode"] = ["author"]
try:
    mech.submit()
except HTTPError as e:
    sys.exit("post failed: %d: %s" % (e.code, e.msg))
# Find the link for "Andy"
try:
    mech.follow_link(text_regex=re.compile("Andy"))
except HTTPError as e:
    sys.exit("follow failed: %d: %s" % (e.code, e.msg))
# Get all the tarballs
urls = [link.absolute_url for link in
        mech.links(url_regex=re.compile(r"\.tar\.gz$"))]
print("Found", len(urls), "tarballs to download")
if "--all" not in sys.argv[1:]:
urls = urls[:1]
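# Stream each tarball to disk in 1 KB chunks and report its size.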
for url in urls:
    filename = os.path.basename(url)
    f = open(filename, "wb")
    print("%s -->" % filename, end=' ')
    r = mech.open(url)
    while True:
        data = r.read(1024)
        if not data:
            break
        f.write(data)
    f.close()
    print(os.stat(filename).st_size, "bytes")