@# This file is processed by EmPy: do not edit
@# http://wwwsearch.sf.net/bits/colorize.py
@{
from colorize import colorize
import time
import release
last_modified = release.svn_id_to_time(
    "$Id: README.html.in 27559 2006-05-21 22:39:21Z jjlee $")
try:
    base
except NameError:
    base = False
}
Stateful programmatic web browsing in Python, after Andy Lester's Perl
module WWW::Mechanize.
- mechanize.Browser is a subclass of mechanize.UserAgent, which is, in
  turn, a subclass of urllib2.OpenerDirector (in fact, of
  mechanize.OpenerDirector), so:
  - any URL can be opened, not just http:
  - mechanize.UserAgent offers easy dynamic configuration of user-agent
    features like protocol, cookie, redirection and robots.txt handling,
    without having to make a new OpenerDirector each time, e.g. by
    calling build_opener() (a short sketch follows this list).
- Browser history (.back() and .reload() methods).
- The Referer HTTP header is added properly (optional).
- Automatic observance of robots.txt.
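Since mechanize.Browser is an OpenerDirector, non-http schemes open
through the same interface, and user-agent features can be switched on
the same object rather than by building a new opener each time. A
minimal sketch (the file: path is hypothetical, and the two toggles are
just examples):

@{colorize("""
import mechanize

br = mechanize.Browser()
# Any supported scheme works, not just http: (this file: path is
# hypothetical).
response = br.open("file:///tmp/example.html")
print response.read()
# Reconfigure the same object: no new OpenerDirector (no build_opener()
# call) is needed.
br.set_handle_robots(False)  # stop fetching and obeying robots.txt
br.set_handle_redirect(False)  # stop following HTTP redirects
""")}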
This documentation is in need of reorganisation and extension!
The examples below are just to give the gist. There are also some
actual working examples.

@{colorize(r"""
import re
from mechanize import Browser

br = Browser()
br.open("http://www.example.com/")
# follow second link with element text matching regular expression
response1 = br.follow_link(text_regex=r"cheese\s*shop", nr=1)
assert br.viewing_html()
print br.title()
print response1.geturl()
print response1.info()  # headers
print response1.read()  # body
response1.close()  # (shown for clarity; in fact Browser does this for you)

br.select_form(name="order")
# Browser passes through unknown attributes (including methods)
# to the selected HTMLForm (from ClientForm).
br["cheeses"] = ["mozzarella", "caerphilly"]  # (the method here is __setitem__)
response2 = br.submit()  # submit current form

# print currently selected form (don't call .submit() on this, use br.submit())
print br.form

response3 = br.back()  # back to cheese shop (same data as response1)
# the history mechanism returns cached response objects
# we can still use the response, even though we closed it:
response3.seek(0)
response3.read()
response4 = br.reload()  # fetches from server

for form in br.forms():
    print form
# .links() optionally accepts the keyword args of .follow_/.find_link()
for link in br.links(url_regex="python.org"):
    print link
    br.follow_link(link)  # takes EITHER Link instance OR keyword args
    br.back()
""")}
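To inspect a link without fetching it, .find_link() accepts the same
keyword arguments as .follow_link() and raises
mechanize.LinkNotFoundError when nothing matches. A minimal sketch (the
URL and the link text are made up):

@{colorize("""
import mechanize

br = mechanize.Browser()
br.open("http://www.example.com/")
try:
    # locate the first matching link without following it
    link = br.find_link(text_regex="download", nr=0)
except mechanize.LinkNotFoundError:
    print "no matching link"
else:
    print link.url, link.text
""")}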
You may control the browser's policy by using the methods of
mechanize.Browser's base class, mechanize.UserAgent.
For example:
@{colorize("""
import logging
import sys

from mechanize import Browser, CookieJar

br = Browser()
# Explicitly configure proxies (Browser will attempt to set good defaults).
# Note the userinfo ("joe:password@") and port number (":3128") are optional.
br.set_proxies({"http": "joe:password@myproxy.example.com:3128",
"ftp": "proxy.example.com",
})
# Add HTTP Basic/Digest auth username and password for HTTP proxy access.
# (equivalent to using "joe:password@..." form above)
br.add_proxy_password("joe", "password")
# Add HTTP Basic/Digest auth username and password for website access.
br.add_password("http://example.com/protected/", "joe", "password")
# Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML).
br.set_handle_equiv(False)
# Ignore robots.txt. Do not do this without thought and consideration.
br.set_handle_robots(False)
# Don't handle cookies
br.set_cookiejar(None)
# Supply your own mechanize.CookieJar (NOTE: cookie handling is ON by
# default: no need to do this unless you have some reason to use a
# particular cookiejar)
cj = CookieJar()  # stands in here for your own cookiejar
br.set_cookiejar(cj)
# Log information about HTTP redirects and Refreshes.
br.set_debug_redirects(True)
# Log HTTP response bodies (i.e. the HTML, most of the time).
br.set_debug_responses(True)
# Print HTTP headers.
br.set_debug_http(True)
# To make sure you're seeing all debug output:
logger = logging.getLogger("mechanize")
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
# Sometimes it's useful to process bad headers or bad HTML:
response = br.response() # this is a copy of response
headers = response.info() # currently, this is a mimetools.Message
del headers["Content-type"] # get rid of (possibly multiple) existing headers
headers["Content-type"] = "text/html; charset=utf-8"
response.set_data(response.get_data().replace("<!---", "<!--"))
br.set_response(response)
""")}
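After .set_response(), the browser treats the mended copy as the
current response, so later calls parse the fixed headers and HTML.
Continuing the example above (the form name "order" is assumed):

@{colorize("""
# The mended copy is now the current response:
print br.response().info()["Content-type"]  # the header set above
br.select_form(name="order")  # form parsing now sees the fixed HTML
""")}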