# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle http links.
"""
import urlparse
import urllib
import re
import zlib
import socket
from cStringIO import StringIO
import Cookie
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
httplib2 as httplib, LinkCheckerError, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
get_url_from)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
PARSE_MIMETYPES, HTML_MIMETYPES
# helper alias
unicode_safe = strformat.unicode_safe
supportHttps = hasattr(httplib, "HTTPSConnection") and \
hasattr(socket, "ssl")
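# Note: this flag gates HTTPS checking in get_http_object() below; it is True
# only when the bundled httplib provides HTTPSConnection and this Python
# interpreter was built with SSL support (socket.ssl).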
_supported_encodings = ('gzip', 'x-gzip', 'deflate')
# Amazon blocks all HEAD requests
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
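# For example, _is_amazon("www.amazon.co.uk") and _is_amazon("www.amazon.de")
# return a match, while _is_amazon("amazon.com") does not (the host must
# start with "www.amazon.").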
# Stolen from Python CVS urllib2.py
# Mapping status codes to official HTTP/1.1 reason phrases (RFC 2616)
httpresponses = {
100: 'Continue',
101: 'Switching Protocols',
200: 'OK',
201: 'Created',
202: 'Accepted',
203: 'Non-Authoritative Information',
204: 'No Content',
205: 'Reset Content',
206: 'Partial Content',
300: 'Multiple Choices',
301: 'Moved Permanently',
302: 'Found',
303: 'See Other',
304: 'Not Modified',
305: 'Use Proxy',
306: '(Unused)',
307: 'Temporary Redirect',
400: 'Bad Request',
401: 'Unauthorized',
402: 'Payment Required',
403: 'Forbidden',
404: 'Not Found',
405: 'Method Not Allowed',
406: 'Not Acceptable',
407: 'Proxy Authentication Required',
408: 'Request Timeout',
409: 'Conflict',
410: 'Gone',
411: 'Length Required',
412: 'Precondition Failed',
413: 'Request Entity Too Large',
414: 'Request-URI Too Long',
415: 'Unsupported Media Type',
416: 'Requested Range Not Satisfiable',
417: 'Expectation Failed',
500: 'Internal Server Error',
501: 'Not Implemented',
502: 'Bad Gateway',
503: 'Service Unavailable',
504: 'Gateway Timeout',
505: 'HTTP Version Not Supported',
}
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
"""
def reset (self):
"""
Initialize HTTP-specific variables.
"""
super(HttpUrl, self).reset()
self.max_redirects = 5
self.has301status = False
# flag if check had to fall back from HEAD to GET method
self.fallback_get = False
# flag if connection is persistent
self.persistent = False
# URLs seen through 301/302 redirections
self.aliases = []
# initialize check data
self.headers = None
self.auth = None
self.cookies = []
# temporary data filled when reading redirections
self._data = None
# flag indicating connection reuse
self.reused_connection = False
def allows_robots (self, url):
"""
Fetch and parse the robots.txt of the given URL and check whether
LinkChecker may access the requested resource.
@param url: the url to be requested
@type url: string
@return: True if access is granted, otherwise False
@rtype: bool
"""
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
rb = self.aggregate.robots_txt
callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, url, self.proxy, user, password,
callback=callback)
def check_connection (self):
"""
Check a URL with HTTP protocol.
Here is an excerpt from RFC 1945 with common response codes:
The first digit of the Status-Code defines the class of response. The
last two digits do not have any categorization role. There are 5
values for the first digit:
- 1xx: Informational - Not used, but reserved for future use
- 2xx: Success - The action was successfully received,
understood, and accepted.
- 3xx: Redirection - Further action must be taken in order to
complete the request
- 4xx: Client Error - The request contains bad syntax or cannot
be fulfilled
- 5xx: Server Error - The server failed to fulfill an apparently
valid request
"""
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
# check robots.txt
if not self.allows_robots(self.url):
# remove all previously stored results
self.add_warning(
_("Access denied by robots.txt, checked only syntax."),
tag=WARN_HTTP_ROBOTS_DENIED)
self.set_result(u"syntax OK")
return
# check for amazon server quirk
if _is_amazon(self.urlparts[1]):
self.add_info(_("Amazon servers block HTTP HEAD requests, "
"using GET instead."))
self.method = "GET"
else:
# first try with HEAD
self.method = "HEAD"
# check the http connection
response = self.check_http_connection()
if self.headers and "Server" in self.headers:
server = self.headers['Server']
else:
server = _("unknown")
if self.fallback_get:
self.add_info(_("Server `%(name)s' did not support HEAD request; "
"a GET request was used instead.") %
{"name": server})
# redirections might have changed the URL
newurl = urlparse.urlunsplit(self.urlparts)
if self.url != newurl:
if self.warn_redirect:
log.warn(LOG_CHECK, _("""URL `%(url)s' has been redirected.
Use URL `%(newurl)s' instead for checking.""") % {
'url': self.url, 'newurl': newurl})
self.url = newurl
# check response
if response:
self.check_response(response)
response.close()
def check_http_connection (self):
"""
Check the HTTP connection and return the response. If the check
algorithm had to fall back to the GET method, self.fallback_get is set.
@return: response, or None if the URL has already been handled
@rtype: HttpResponse or None
"""
response = None
while True:
if response is not None:
response.close()
try:
response = self._try_http_response()
except httplib.BadStatusLine:
# some servers send empty HEAD replies
if self.method == "HEAD":
log.debug(LOG_CHECK, "Empty status line: falling back to GET")
self.method = "GET"
self.aliases = []
self.fallback_get = True
continue
raise
if response.reason:
response.reason = unicode_safe(response.reason)
log.debug(LOG_CHECK,
"Response: %s %s", response.status, response.reason)
log.debug(LOG_CHECK, "Headers: %s", self.headers)
# proxy enforcement (overrides standard proxy)
if response.status == 305 and self.headers:
oldproxy = (self.proxy, self.proxyauth)
newproxy = self.headers.getheader("Location")
self.add_info(_("Enforced proxy `%(name)s'.") %
{"name": newproxy})
self.set_proxy(newproxy)
if not self.proxy:
self.set_result(
_("Enforced proxy `%(name)s' ignored, aborting.") %
{"name": newproxy},
valid=False)
return response
response.close()
response = self._try_http_response()
# restore old proxy settings
self.proxy, self.proxyauth = oldproxy
try:
tries, response = self.follow_redirections(response)
except httplib.BadStatusLine:
# some servers send empty HEAD replies
if self.method == "HEAD":
log.debug(LOG_CHECK, "Empty status line: falling back to GET")
self.method = "GET"
self.aliases = []
self.fallback_get = True
continue
raise
if tries == -1:
log.debug(LOG_CHECK, "already handled")
response.close()
return None
if tries >= self.max_redirects:
if self.method == "HEAD":
# Microsoft servers tend to recurse HEAD requests
self.method = "GET"
self.aliases = []
self.fallback_get = True
continue
self.set_result(_("more than %d redirections, aborting") %
self.max_redirects, valid=False)
return response
# user authentication
if response.status == 401:
if not self.auth:
import base64
_user, _password = self.get_user_password()
self.auth = "Basic " + \
base64.encodestring("%s:%s" % (_user, _password))
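# Example: the credentials "bob:secret" (hypothetical values) base64-encode
# to "Ym9iOnNlY3JldA==", giving the header value "Basic Ym9iOnNlY3JldA==".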
log.debug(LOG_CHECK,
"Authentication %s/%s", _user, _password)
continue
elif response.status >= 400:
# retry with GET (but do not set fallback flag)
if self.method == "HEAD":
self.method = "GET"
self.aliases = []
continue
elif self.headers and self.method == "HEAD":
# test for HEAD support
mime = headers.get_content_type(self.headers)
poweredby = self.headers.get('X-Powered-By', '')
server = self.headers.get('Server', '')
if mime in ('application/octet-stream', 'text/plain') and \
(poweredby.startswith('Zope') or server.startswith('Zope')):
# Zope servers do not report the real Content-Type on HEAD requests
self.method = "GET"
self.aliases = []
self.fallback_get = True
continue
break
return response
def follow_redirections (self, response, set_result=True):
"""
Follow all redirections of http response.
"""
log.debug(LOG_CHECK, "follow all redirections")
redirected = self.url
tries = 0
while response.status in [301, 302] and self.headers and \
tries < self.max_redirects:
newurl = self.headers.getheader("Location",
self.headers.getheader("Uri", ""))
# make new url absolute and unicode
newurl = unicode_safe(newurl)
newurl = urlparse.urljoin(redirected, newurl)
log.debug(LOG_CHECK, "Redirected to %r", newurl)
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
# norm base url - can raise UnicodeError from url.idna_encode()
redirected, is_idn = urlbase.url_norm(newurl)
if is_idn:
pass # XXX warn about idn use
log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
urlparts = strformat.url_unicode_split(redirected)
# check extern filter again
self.set_extern(redirected)
if self.extern[0] and self.extern[1]:
if set_result:
self.check301status(response)
self.add_info(
_("The redirected URL is outside of the domain "
"filter, checked only syntax."))
self.set_result(u"filtered")
return -1, response
# check robots.txt allowance again
if not self.allows_robots(redirected):
if set_result:
self.add_warning(
_("Access to redirected URL denied by robots.txt, "
"checked only syntax."),
tag=WARN_HTTP_ROBOTS_DENIED)
self.set_result(u"syntax OK")
return -1, response
# see about recursive redirect
all_seen = [self.cache_url_key] + self.aliases
if redirected in all_seen:
if self.method == "HEAD":
# Microsoft servers tend to recurse HEAD requests
# fall back to the original url and use GET
return self.max_redirects, response
recursion = all_seen + [redirected]
if set_result:
self.set_result(
_("recursive redirection encountered:\n %(urls)s") %
{"urls": "\n => ".join(recursion)}, valid=False)
return -1, response
if urlparts[0] == self.scheme:
# remember redirected url as alias
self.aliases.append(redirected)
# note: urlparts has to be a list
self.urlparts = urlparts
if set_result:
self.check301status(response)
# check cache again on the changed URL
if self.aggregate.urlqueue.checked_redirect(redirected, self):
return -1, response
# in case of changed scheme make new URL object
if self.urlparts[0] != self.scheme:
if set_result:
self.add_warning(
_("Redirection to different URL type encountered; "
"the original URL was `%(url)s'.") %
{"url": self.url},
tag=WARN_HTTP_WRONG_REDIRECT)
newobj = get_url_from(
redirected, self.recursion_level, self.aggregate,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name)
# append new object to queue
self.aggregate.urlqueue.put(newobj)
# pretend to be finished and logged
return -1, response
# new response data
response.close()
response = self._try_http_response()
tries += 1
return tries, response
def check301status (self, response):
"""If response page has been permanently moved add a warning."""
if response.status == 301 and not self.has301status:
self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
" should update this link."),
tag=WARN_HTTP_MOVED_PERMANENT)
self.has301status = True
def get_alias_cache_data (self):
"""
Return all data values that should be put in the cache,
minus redirection warnings.
"""
data = self.get_cache_data()
data["warnings"] = [
x for x in self.warnings if x[0] != "http-moved-permanent"]
data["info"] = self.info
return data
def check_response (self, response):
"""Check final result and log it."""
if response.status >= 400:
self.set_result(u"%r %s" % (response.status, response.reason),
valid=False)
else:
if response.status == 204:
# no content
self.add_warning(unicode_safe(response.reason),
tag=WARN_HTTP_EMPTY_CONTENT)
# store cookies for valid links
if self.aggregate.config['storecookies']:
for c in self.cookies:
self.add_info(_("Store cookie: %(cookie)s.") %
{"cookie": c})
try:
out = self.aggregate.cookies.add(self.headers,
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
for h in out:
self.add_info(unicode_safe(h))
except Cookie.CookieError, msg:
self.add_warning(_("Could not store cookies: %(msg)s.") %
{'msg': str(msg)},
tag=WARN_HTTP_COOKIE_STORE_ERROR)
if response.status >= 200:
self.set_result(u"%r %s" % (response.status, response.reason))
else:
self.set_result(u"OK")
modified = self.headers.get('Last-Modified', '')
if modified:
self.add_info(_("Last modified %(date)s.") % {"date": modified})
def _try_http_response (self):
"""Try to get a HTTP response object. For reused persistent
connections that the server closed unexpected, a new connection
will be opened.
"""
try:
return self._get_http_response()
except socket.error, msg:
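# errno 32 is EPIPE (broken pipe): writing to a connection that the
# server has already closed on its side.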
if msg.args[0] == 32 and self.reused_connection:
# server closed persistent connection - retry
log.debug(LOG_CHECK, "Server closed connection: retry")
self.persistent = False
return self._get_http_response()
raise
except httplib.BadStatusLine, msg:
if str(msg) == "Empty status line" and self.reused_connection:
# server closed connection - retry
log.debug(LOG_CHECK, "Empty status line: retry")
self.persistent = False
return self._get_http_response()
raise
def _get_http_response (self):
"""
Send HTTP request and get response object.
"""
if self.proxy:
host = self.proxy
scheme = self.proxytype
else:
host = self.urlparts[1]
scheme = self.urlparts[0]
log.debug(LOG_CHECK, "Connecting to %r", host)
# close/release a previous connection
self.close_connection()
self.url_connection = self.get_http_object(host, scheme)
# the anchor fragment is not part of a HTTP URL, see
# http://tools.ietf.org/html/rfc2616#section-3.2.2
anchor = ''
if self.proxy:
path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
self.urlparts[2], self.urlparts[3], anchor))
else:
path = urlparse.urlunsplit(('', '', self.urlparts[2],
self.urlparts[3], anchor))
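# Example (hypothetical URL http://example.com/dir/page?x=1#frag): without a
# proxy the request path is "/dir/page?x=1"; with a proxy the absolute form
# "http://example.com/dir/page?x=1" is sent instead. The fragment is dropped.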
self.url_connection.putrequest(self.method, path, skip_host=True,
skip_accept_encoding=True)
self.url_connection.putheader("Host", host)
# userinfo is from http://user:pass@host/
if self.userinfo:
self.url_connection.putheader("Authorization", self.userinfo)
# auth is the -u and -p configuration options
elif self.auth:
self.url_connection.putheader("Authorization", self.auth)
if self.proxyauth:
self.url_connection.putheader("Proxy-Authorization",
self.proxyauth)
if (self.parent_url and
self.parent_url.startswith(('http://', 'https://'))):
self.url_connection.putheader("Referer", self.parent_url)
self.url_connection.putheader("User-Agent", configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.aggregate.config['sendcookies']:
scheme = self.urlparts[0]
host = self.urlparts[1]
port = urlutil.default_ports.get(scheme, 80)
host, port = urllib.splitnport(host, port)
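# For example, splitnport("example.com:8080", 80) -> ("example.com", 8080);
# the scheme's default port (falling back to 80) is used when the host part
# carries no explicit port.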
path = self.urlparts[2]
self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
for c in self.cookies:
name = c.client_header_name()
value = c.client_header_value()
self.url_connection.putheader(name, value)
self.url_connection.endheaders()
response = self.url_connection.getresponse()
self.timeout = headers.http_timeout(response)
self.headers = response.msg
self.persistent = not response.will_close
if self.persistent and self.method == "HEAD":
# Some servers send page content after a HEAD request,
# but only after making the *next* request. This breaks
# protocol synchronisation. Workaround here is to close
# the connection after HEAD.
# Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
self.persistent = False
if self.persistent and (self.method == "GET" or
self.headers.getheader("Content-Length") != "0"):
# always read content from persistent connections
self._read_content(response)
assert not response.will_close
# If possible, use the official HTTP/1.1 reason phrase
if response.status in httpresponses:
response.reason = httpresponses[response.status]
return response
def get_http_object (self, host, scheme):
"""
Open an HTTP connection.
@param host: the host to connect to
@type host: string of the form <host>[:<port>]
@param scheme: 'http' or 'https'
@type scheme: string
@return: open HTTP(S) connection
@rtype: httplib.HTTP(S)Connection
"""
_user, _password = self.get_user_password()
key = (scheme, self.urlparts[1], _user, _password)
conn = self.aggregate.connections.get(key)
if conn is not None:
log.debug(LOG_CHECK, "reuse cached HTTP(S) connection %s", conn)
self.reused_connection = True
return conn
self.aggregate.connections.wait_for_host(host)
if scheme == "http":
h = httplib.HTTPConnection(host)
elif scheme == "https" and supportHttps:
h = httplib.HTTPSConnection(host)
else:
msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
raise LinkCheckerError(msg)
if log.is_debug(LOG_CHECK):
h.set_debuglevel(1)
h.connect()
return h
def read_content (self):
"""Get content of the URL target. The content data is cached after
the first call to this method.
@return: URL content, decompressed and decoded
@rtype: string
"""
self.method = "GET"
response = self._try_http_response()
response = self.follow_redirections(response, set_result=False)[1]
self.headers = response.msg
if self._data is None:
self._read_content(response)
data = self._data
self._data = None
return data
def _read_content (self, response):
data = response.read()
encoding = headers.get_content_encoding(self.headers)
if encoding in _supported_encodings:
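# "deflate" bodies are decompressed directly with zlib; "gzip"/"x-gzip"
# bodies are unpacked through GzipFile (from the bundled gzip2 module)
# reading a StringIO buffer.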
try:
if encoding == 'deflate':
f = StringIO(zlib.decompress(data))
else:
f = gzip.GzipFile('', 'rb', 9, StringIO(data))
except zlib.error, msg:
self.add_warning(_("Decompress error %(err)s") %
{"err": str(msg)},
tag=WARN_HTTP_DECOMPRESS_ERROR)
f = StringIO(data)
try:
data = f.read()
finally:
f.close()
# store temporary data
self._data = data
def encoding_supported (self):
"""Check if page encoding is supported."""
encoding = headers.get_content_encoding(self.headers)
if encoding and encoding not in _supported_encodings and \
encoding != 'identity':
self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
{"encoding": encoding},
tag=WARN_HTTP_UNSUPPORTED_ENCODING)
return False
return True
def is_html (self):
"""
See if this URL points to an HTML file by looking at the
Content-Type header.
@return: True if URL points to HTML file
@rtype: bool
"""
if not (self.valid and self.headers):
return False
if headers.get_content_type(self.headers) not in HTML_MIMETYPES:
return False
return self.encoding_supported()
def is_css (self):
"""Return True iff content of this url is CSS stylesheet."""
if not (self.valid and self.headers):
return False
if headers.get_content_type(self.headers) != "text/css":
return False
return self.encoding_supported()
def is_http (self):
"""
This is an HTTP file.
@return: True
@rtype: bool
"""
return True
def is_parseable (self):
"""
Check if content is parseable for recursion.
@return: True if content is parseable
@rtype: bool
"""
if not (self.valid and self.headers):
return False
if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
return False
return self.encoding_supported()
def parse_url (self):
"""
Parse file contents for new links to check.
"""
ctype = headers.get_content_type(self.headers)
if self.is_html():
self.parse_html()
elif self.is_css():
self.parse_css()
elif ctype == "application/x-shockwave-flash":
self.parse_swf()
elif ctype == "application/msword":
self.parse_word()
def get_robots_txt_url (self):
"""
Get the corresponding robots.txt URL for this URL.
@return: robots.txt URL
@rtype: string
"""
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
def close_connection (self):
"""
If the connection is persistent, add it to the connection pool;
otherwise close it. Errors on closing are ignored.
"""
if self.url_connection is None:
# no connection is open
return
# add to cached connections
_user, _password = self.get_user_password()
key = ("http", self.urlparts[1], _user, _password)
if self.persistent and self.url_connection.is_idle():
self.aggregate.connections.add(
key, self.url_connection, self.timeout)
else:
try:
self.url_connection.close()
except Exception:
# ignore close errors
pass
self.url_connection = None