File: httpurl.py

package info
linkchecker 5.2-2
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 3,508 kB
  • ctags: 3,805
  • sloc: python: 22,666; lex: 1,114; yacc: 785; makefile: 276; ansic: 95; sh: 68; sql: 19; awk: 4
file content: 706 lines (28,437 bytes)
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle http links.
"""

import urlparse
import urllib
import re
import zlib
import socket
from cStringIO import StringIO
import Cookie

from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
    httplib2 as httplib, LinkCheckerError, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
    get_url_from)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
    WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
    WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
    PARSE_MIMETYPES, HTML_MIMETYPES

# helper alias
unicode_safe = strformat.unicode_safe

supportHttps = hasattr(httplib, "HTTPSConnection") and \
               hasattr(socket, "ssl")

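# Content-Encoding values that _read_content() can decompress
# (gzip/x-gzip via GzipFile, deflate via zlib)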
_supported_encodings = ('gzip', 'x-gzip', 'deflate')

# Amazon blocks all HEAD requests
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search

# Stolen from Python CVS urllib2.py
# Mapping status codes to official W3C names
httpresponses = {
    100: 'Continue',
    101: 'Switching Protocols',

    200: 'OK',
    201: 'Created',
    202: 'Accepted',
    203: 'Non-Authoritative Information',
    204: 'No Content',
    205: 'Reset Content',
    206: 'Partial Content',

    300: 'Multiple Choices',
    301: 'Moved Permanently',
    302: 'Found',
    303: 'See Other',
    304: 'Not Modified',
    305: 'Use Proxy',
    306: '(Unused)',
    307: 'Temporary Redirect',

    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    406: 'Not Acceptable',
    407: 'Proxy Authentication Required',
    408: 'Request Timeout',
    409: 'Conflict',
    410: 'Gone',
    411: 'Length Required',
    412: 'Precondition Failed',
    413: 'Request Entity Too Large',
    414: 'Request-URI Too Long',
    415: 'Unsupported Media Type',
    416: 'Requested Range Not Satisfiable',
    417: 'Expectation Failed',

    500: 'Internal Server Error',
    501: 'Not Implemented',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',
    505: 'HTTP Version Not Supported',
}

class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
    """
    Url link with http scheme.
    """

    def reset (self):
        """
        Initialize HTTP specific variables.
        """
        super(HttpUrl, self).reset()
        self.max_redirects = 5
        self.has301status = False
        # flag if check had to fall back from HEAD to GET method
        self.fallback_get = False
        # flag if connection is persistent
        self.persistent = False
        # URLs seen through 301/302 redirections
        self.aliases = []
        # initialize check data
        self.headers = None
        self.auth = None
        self.cookies = []
        # temporary data filled when reading redirections
        self._data = None
        # flag indicating connection reuse
        self.reused_connection = False

    def allows_robots (self, url):
        """
        Fetch and parse the robots.txt of the given url and check if
        LinkChecker can access the requested resource.

        @param url: the url to be requested
        @type url: string
        @return: True if access is granted, otherwise False
        @rtype: bool
        """
        roboturl = self.get_robots_txt_url()
        user, password = self.get_user_password()
        rb = self.aggregate.robots_txt
        callback = self.aggregate.connections.host_wait
        return rb.allows_url(roboturl, url, self.proxy, user, password,
            callback=callback)

    def check_connection (self):
        """
        Check a URL with HTTP protocol.
        Here is an excerpt from RFC 1945 with common response codes:
        The first digit of the Status-Code defines the class of response. The
        last two digits do not have any categorization role. There are 5
        values for the first digit:
          - 1xx: Informational - Not used, but reserved for future use
          - 2xx: Success - The action was successfully received,
            understood, and accepted.
          - 3xx: Redirection - Further action must be taken in order to
            complete the request
          - 4xx: Client Error - The request contains bad syntax or cannot
            be fulfilled
          - 5xx: Server Error - The server failed to fulfill an apparently
            valid request
        """
        # set the proxy, so a 407 status after this is an error
        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
        # check robots.txt
        if not self.allows_robots(self.url):
            # remove all previously stored results
            self.add_warning(
                       _("Access denied by robots.txt, checked only syntax."),
                       tag=WARN_HTTP_ROBOTS_DENIED)
            self.set_result(u"syntax OK")
            return
        # check for amazon server quirk
        if _is_amazon(self.urlparts[1]):
            self.add_info(_("Amazon servers block HTTP HEAD requests, "
                            "using GET instead."))
            self.method = "GET"
        else:
            # first try with HEAD
            self.method = "HEAD"
        # check the http connection
        response = self.check_http_connection()
        if self.headers and "Server" in self.headers:
            server = self.headers['Server']
        else:
            server = _("unknown")
        if self.fallback_get:
            self.add_info(_("Server `%(name)s' did not support HEAD request; "
                            "a GET request was used instead.") %
                            {"name": server})
        # redirections might have changed the URL
        newurl = urlparse.urlunsplit(self.urlparts)
        if self.url != newurl:
            if self.warn_redirect:
                log.warn(LOG_CHECK, _("""URL `%(url)s' has been redirected.
Use URL `%(newurl)s' instead for checking.""") % {
                'url': self.url, 'newurl': newurl})
            self.url = newurl
        # check response
        if response:
            self.check_response(response)
            response.close()

    def check_http_connection (self):
        """
        Check the HTTP connection and return the response. If the check
        algorithm had to fall back to the GET method, the fallback_get
        flag is set.

        @return: response or None if url is already handled
        @rtype: HttpResponse or None
        """
        response = None
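        # Retry loop: each `continue` below re-sends the request after
        # adjusting the request method (HEAD -> GET), proxy or authentication.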
        while True:
            if response is not None:
                response.close()
            try:
                response = self._try_http_response()
            except httplib.BadStatusLine:
                # some servers send empty HEAD replies
                if self.method == "HEAD":
                    log.debug(LOG_CHECK, "Empty status line: falling back to GET")
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if response.reason:
                response.reason = unicode_safe(response.reason)
            log.debug(LOG_CHECK,
                "Response: %s %s", response.status, response.reason)
            log.debug(LOG_CHECK, "Headers: %s", self.headers)
            # proxy enforcement (overrides standard proxy)
            if response.status == 305 and self.headers:
                oldproxy = (self.proxy, self.proxyauth)
                newproxy = self.headers.getheader("Location")
                self.add_info(_("Enforced proxy `%(name)s'.") %
                              {"name": newproxy})
                self.set_proxy(newproxy)
                if not self.proxy:
                    self.set_result(
                         _("Enforced proxy `%(name)s' ignored, aborting.") %
                         {"name": newproxy},
                         valid=False)
                    return response
                response.close()
                response = self._try_http_response()
                # restore old proxy settings
                self.proxy, self.proxyauth = oldproxy
            try:
                tries, response = self.follow_redirections(response)
            except httplib.BadStatusLine:
                # some servers send empty HEAD replies
                if self.method == "HEAD":
                    log.debug(LOG_CHECK, "Empty status line: falling back to GET")
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if tries == -1:
                log.debug(LOG_CHECK, "already handled")
                response.close()
                return None
            if tries >= self.max_redirects:
                if self.method == "HEAD":
                    # Microsoft servers tend to recurse HEAD requests
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                self.set_result(_("more than %d redirections, aborting") %
                                self.max_redirects, valid=False)
                return response
            # user authentication
            if response.status == 401:
                if not self.auth:
                    import base64
                    _user, _password = self.get_user_password()
                    self.auth = "Basic " + \
                        base64.encodestring("%s:%s" % (_user, _password))
                    log.debug(LOG_CHECK,
                        "Authentication %s/%s", _user, _password)
                    continue
            elif response.status >= 400:
                # retry with GET (but do not set fallback flag)
                if self.method == "HEAD":
                    self.method = "GET"
                    self.aliases = []
                    continue
            elif self.headers and self.method == "HEAD":
                # test for HEAD support
                mime = headers.get_content_type(self.headers)
                poweredby = self.headers.get('X-Powered-By', '')
                server = self.headers.get('Server', '')
                if mime in ('application/octet-stream', 'text/plain') and \
                  (poweredby.startswith('Zope') or server.startswith('Zope')):
                    # Zope server could not get Content-Type with HEAD
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
            break
        return response

    def follow_redirections (self, response, set_result=True):
        """
        Follow all redirections of http response.
        """
        log.debug(LOG_CHECK, "follow all redirections")
        redirected = self.url
        tries = 0
        while response.status in [301, 302] and self.headers and \
              tries < self.max_redirects:
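            # prefer the Location header; fall back to the nonstandard
            # "Uri" header if Location is missing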
            newurl = self.headers.getheader("Location",
                         self.headers.getheader("Uri", ""))
            # make new url absolute and unicode
            newurl = unicode_safe(newurl)
            newurl = urlparse.urljoin(redirected, newurl)
            log.debug(LOG_CHECK, "Redirected to %r", newurl)
            self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
            # norm base url - can raise UnicodeError from url.idna_encode()
            redirected, is_idn = urlbase.url_norm(newurl)
            if is_idn:
                pass # XXX warn about idn use
            log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
            urlparts = strformat.url_unicode_split(redirected)
            # check extern filter again
            self.set_extern(redirected)
            if self.extern[0] and self.extern[1]:
                if set_result:
                    self.check301status(response)
                    self.add_info(
                          _("The redirected URL is outside of the domain "
                            "filter, checked only syntax."))
                    self.set_result(u"filtered")
                return -1, response
            # check robots.txt allowance again
            if not self.allows_robots(redirected):
                if set_result:
                    self.add_warning(
                       _("Access to redirected URL denied by robots.txt, "
                         "checked only syntax."),
                       tag=WARN_HTTP_ROBOTS_DENIED)
                    self.set_result(u"syntax OK")
                return -1, response
            # see about recursive redirect
            all_seen = [self.cache_url_key] + self.aliases
            if redirected in all_seen:
                if self.method == "HEAD":
                    # Microsoft servers tend to recurse HEAD requests
                    # fall back to the original url and use GET
                    return self.max_redirects, response
                recursion = all_seen + [redirected]
                if set_result:
                    self.set_result(
                          _("recursive redirection encountered:\n %(urls)s") %
                            {"urls": "\n  => ".join(recursion)}, valid=False)
                return -1, response
            if urlparts[0] == self.scheme:
                # remember redirected url as alias
                self.aliases.append(redirected)
            # note: urlparts has to be a list
            self.urlparts = urlparts
            if set_result:
                self.check301status(response)
            # check cache again on the changed URL
            if self.aggregate.urlqueue.checked_redirect(redirected, self):
                return -1, response
            # in case of changed scheme make new URL object
            if self.urlparts[0] != self.scheme:
                if set_result:
                    self.add_warning(
                           _("Redirection to different URL type encountered; "
                             "the original URL was `%(url)s'.") %
                             {"url": self.url},
                           tag=WARN_HTTP_WRONG_REDIRECT)
                newobj = get_url_from(
                          redirected, self.recursion_level, self.aggregate,
                          parent_url=self.parent_url, base_ref=self.base_ref,
                          line=self.line, column=self.column, name=self.name)
                # append new object to queue
                self.aggregate.urlqueue.put(newobj)
                # pretend to be finished and logged
                return -1, response
            # new response data
            response.close()
            response = self._try_http_response()
            tries += 1
        return tries, response

    def check301status (self, response):
        """If response page has been permanently moved add a warning."""
        if response.status == 301 and not self.has301status:
            self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
                               " should update this link."),
                             tag=WARN_HTTP_MOVED_PERMANENT)
            self.has301status = True

    def get_alias_cache_data (self):
        """
        Return all data values that should be put in the cache,
        minus redirection warnings.
        """
        data = self.get_cache_data()
        data["warnings"] = [
            x for x in self.warnings if x[0] != "http-moved-permanent"]
        data["info"] = self.info
        return data

    def check_response (self, response):
        """Check final result and log it."""
        if response.status >= 400:
            self.set_result(u"%r %s" % (response.status, response.reason),
                            valid=False)
        else:
            if response.status == 204:
                # no content
                self.add_warning(unicode_safe(response.reason),
                                 tag=WARN_HTTP_EMPTY_CONTENT)
            # store cookies for valid links
            if self.aggregate.config['storecookies']:
                for c in self.cookies:
                    self.add_info(_("Store cookie: %(cookie)s.") %
                                  {"cookie": c})
                try:
                    out = self.aggregate.cookies.add(self.headers,
                                                     self.urlparts[0],
                                                     self.urlparts[1],
                                                     self.urlparts[2])
                    for h in out:
                        self.add_info(unicode_safe(h))
                except Cookie.CookieError, msg:
                    self.add_warning(_("Could not store cookies: %(msg)s.") %
                                     {'msg': str(msg)},
                                     tag=WARN_HTTP_COOKIE_STORE_ERROR)
            if response.status >= 200:
                self.set_result(u"%r %s" % (response.status, response.reason))
            else:
                self.set_result(u"OK")
        modified = self.headers.get('Last-Modified', '')
        if modified:
            self.add_info(_("Last modified %(date)s.") % {"date": modified})

    def _try_http_response (self):
        """Try to get a HTTP response object. For reused persistent
        connections that the server closed unexpected, a new connection
        will be opened.
        """
        try:
            return self._get_http_response()
        except socket.error, msg:
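            # errno 32 (EPIPE): the peer closed the connection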
            if msg.args[0] == 32 and self.reused_connection:
                # server closed persistent connection - retry
                log.debug(LOG_CHECK, "Server closed connection: retry")
                self.persistent = False
                return self._get_http_response()
            raise
        except httplib.BadStatusLine, msg:
            if str(msg) == "Empty status line" and self.reused_connection:
                # server closed connection - retry
                log.debug(LOG_CHECK, "Empty status line: retry")
                self.persistent = False
                return self._get_http_response()
            raise

    def _get_http_response (self):
        """
        Send HTTP request and get response object.
        """
        if self.proxy:
            host = self.proxy
            scheme = self.proxytype
        else:
            host = self.urlparts[1]
            scheme = self.urlparts[0]
        log.debug(LOG_CHECK, "Connecting to %r", host)
        # close/release a previous connection
        self.close_connection()
        self.url_connection = self.get_http_object(host, scheme)
        # the anchor fragment is not part of a HTTP URL, see
        # http://tools.ietf.org/html/rfc2616#section-3.2.2
        anchor = ''
        if self.proxy:
            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
                                 self.urlparts[2], self.urlparts[3], anchor))
        else:
            path = urlparse.urlunsplit(('', '', self.urlparts[2],
                                        self.urlparts[3], anchor))
        self.url_connection.putrequest(self.method, path, skip_host=True,
                                       skip_accept_encoding=True)
        self.url_connection.putheader("Host", host)
        # userinfo is from http://user:pass@host/
        if self.userinfo:
            self.url_connection.putheader("Authorization", self.userinfo)
        # auth is the -u and -p configuration options
        elif self.auth:
            self.url_connection.putheader("Authorization", self.auth)
        if self.proxyauth:
            self.url_connection.putheader("Proxy-Authorization",
                                         self.proxyauth)
        if (self.parent_url and
            self.parent_url.startswith(('http://', 'https://'))):
            self.url_connection.putheader("Referer", self.parent_url)
        self.url_connection.putheader("User-Agent", configuration.UserAgent)
        self.url_connection.putheader("Accept-Encoding",
                                  "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
        if self.aggregate.config['sendcookies']:
            scheme = self.urlparts[0]
            host = self.urlparts[1]
            port = urlutil.default_ports.get(scheme, 80)
            host, port = urllib.splitnport(host, port)
            path = self.urlparts[2]
            self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
            for c in self.cookies:
                name = c.client_header_name()
                value = c.client_header_value()
                self.url_connection.putheader(name, value)
        self.url_connection.endheaders()
        response = self.url_connection.getresponse()
        self.timeout = headers.http_timeout(response)
        self.headers = response.msg
        self.persistent = not response.will_close
        if self.persistent and self.method == "HEAD":
            # Some servers send page content after a HEAD request,
            # but only after making the *next* request. This breaks
            # protocol synchronisation. Workaround here is to close
            # the connection after HEAD.
            # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
            self.persistent = False
        if self.persistent and (self.method == "GET" or
           self.headers.getheader("Content-Length") != "0"):
            # always read content from persistent connections
            self._read_content(response)
            assert not response.will_close
        # If possible, use official W3C HTTP response name
        if response.status in httpresponses:
            response.reason = httpresponses[response.status]
        return response

    def get_http_object (self, host, scheme):
        """
        Open an HTTP connection.

        @param host: the host to connect to
        @type host: string of the form <host>[:<port>]
        @param scheme: 'http' or 'https'
        @type scheme: string
        @return: open HTTP(S) connection
        @rtype: httplib.HTTP(S)Connection
        """
        _user, _password = self.get_user_password()
        key = (scheme, self.urlparts[1], _user, _password)
        conn = self.aggregate.connections.get(key)
        if conn is not None:
            log.debug(LOG_CHECK, "reuse cached HTTP(S) connection %s", conn)
            self.reused_connection = True
            return conn
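        # no cached connection available: open a new one after waiting
        # for the host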
        self.aggregate.connections.wait_for_host(host)
        if scheme == "http":
            h = httplib.HTTPConnection(host)
        elif scheme == "https" and supportHttps:
            h = httplib.HTTPSConnection(host)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            h.set_debuglevel(1)
        h.connect()
        return h

    def read_content (self):
        """Get content of the URL target. The content data is cached after
        the first call to this method.

        @return: URL content, decompressed and decoded
        @rtype: string
        """
        self.method = "GET"
        response = self._try_http_response()
        response = self.follow_redirections(response, set_result=False)[1]
        self.headers = response.msg
        if self._data is None:
            self._read_content(response)
        data = self._data
        self._data = None
        return data

    def _read_content (self, response):
        data = response.read()
        encoding = headers.get_content_encoding(self.headers)
        if encoding in _supported_encodings:
            try:
                if encoding == 'deflate':
                    f = StringIO(zlib.decompress(data))
                else:
                    f = gzip.GzipFile('', 'rb', 9, StringIO(data))
            except zlib.error, msg:
                self.add_warning(_("Decompress error %(err)s") %
                                 {"err": str(msg)},
                                 tag=WARN_HTTP_DECOMPRESS_ERROR)
                f = StringIO(data)
            try:
                data = f.read()
            finally:
                f.close()
        # store temporary data
        self._data = data

    def encoding_supported (self):
        """Check if page encoding is supported."""
        encoding = headers.get_content_encoding(self.headers)
        if encoding and encoding not in _supported_encodings and \
           encoding != 'identity':
            self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
                             {"encoding": encoding},
                             tag=WARN_HTTP_UNSUPPORTED_ENCODING)
            return False
        return True

    def is_html (self):
        """
        See if this URL points to an HTML file by looking at the
        Content-Type header, file extension and file content.

        @return: True if URL points to HTML file
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) not in HTML_MIMETYPES:
            return False
        return self.encoding_supported()

    def is_css (self):
        """Return True iff content of this url is CSS stylesheet."""
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) != "text/css":
            return False
        return self.encoding_supported()

    def is_http (self):
        """
        This is an HTTP file.

        @return: True
        @rtype: bool
        """
        return True

    def is_parseable (self):
        """
        Check if content is parseable for recursion.

        @return: True if content is parseable
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
            return False
        return self.encoding_supported()

    def parse_url (self):
        """
        Parse file contents for new links to check.
        """
        ctype = headers.get_content_type(self.headers)
        if self.is_html():
            self.parse_html()
        elif self.is_css():
            self.parse_css()
        elif ctype == "application/x-shockwave-flash":
            self.parse_swf()
        elif ctype == "application/msword":
            self.parse_word()

    def get_robots_txt_url (self):
        """
        Get the corresponding robots.txt URL for this URL.

        @return: robots.txt URL
        @rtype: string
        """
        return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])

    def close_connection (self):
        """
        If connection is persistent, add it to the connection pool.
        Else close the connection. Errors on closing are ignored.
        """
        if self.url_connection is None:
            # no connection is open
            return
        # add to cached connections
        _user, _password = self.get_user_password()
        key = ("http", self.urlparts[1], _user, _password)
        if self.persistent and self.url_connection.is_idle():
            self.aggregate.connections.add(
                  key, self.url_connection, self.timeout)
        else:
            try:
                self.url_connection.close()
            except Exception:
                # ignore close errors
                pass
        self.url_connection = None