File: urlbase.py

package info (click to toggle)
linkchecker 5.2-2
links: PTS
area: main
in suites: squeeze
size: 3,508 kB
ctags: 3,805
sloc: python: 22,666; lex: 1,114; yacc: 785; makefile: 276; ansic: 95; sh: 68; sql: 19; awk: 4
file content (1071 lines) | stat: -rw-r--r-- 40,753 bytes
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Base URL handler.
"""
import sys
import os
import logging
import urlparse
import urllib2
import urllib
import time
import errno
import socket
import select
import tempfile

from . import absolute_url, StoringHandler, get_url_from
from ..cache import geoip
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
    strformat, LinkCheckerError, url as urlutil, trace, clamav, containers,
    winutil)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
    WARN_URL_UNNORMED, WARN_URL_ERROR_GETTING_CONTENT,
    WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
    WARN_URL_CONTENT_TOO_LARGE, ExcList, ExcSyntaxList, ExcNoCacheList)

# helper alias
unicode_safe = strformat.unicode_safe

def urljoin (parent, url, scheme):
    """
    If url is relative, join parent and url. Else leave url as-is.

    @return joined url
    """
    if url.startswith(scheme+":"):
        return url
    return urlparse.urljoin(parent, url)


def url_norm (url, encoding=None):
    """Wrapper for url.url_norm() to convert UnicodeError in
    LinkCheckerError."""
    try:
        return urlutil.url_norm(url, encoding=encoding)
    except UnicodeError:
        msg = _("URL has unparsable domain name: %(name)s") % \
            {"name": sys.exc_info()[1]}
        raise LinkCheckerError(msg)


class UrlBase (object):
    """An URL with additional information like validity etc."""

    def __init__ (self, base_url, recursion_level, aggregate,
                  parent_url=None, base_ref=None, line=-1, column=-1,
                  name=u"", url_encoding=None):
        """
        Initialize check data, and store given variables.

        @param base_url: unquoted and possibly unnormed url
        @param recursion_level: on what check level lies the base url
        @param aggregate: aggregate instance
        @param parent_url: quoted and normed url of parent or None
        @param base_ref: quoted and normed url of <base href=""> or None
        @param line: line number of url in parent content
        @param column: column number of url in parent content
        @param name: name of url or empty
        @param url_encoding: encoding of URL or None
        """
        self.init(base_ref, base_url, parent_url, recursion_level,
                  aggregate, line, column, name, url_encoding)
        self.reset()
        self.check_syntax()


    def init (self, base_ref, base_url, parent_url, recursion_level,
              aggregate, line, column, name, url_encoding):
        """
        Initialize internal data.
        """
        self.base_ref = base_ref
        # note that self.base_url must not be modified
        self.base_url = base_url
        self.parent_url = parent_url
        self.recursion_level = recursion_level
        self.aggregate = aggregate
        self.line = line
        self.column = column
        self.name = name
        self.encoding = url_encoding
        if self.base_ref:
            assert not urlutil.url_needs_quoting(self.base_ref), \
                   "unquoted base reference URL %r" % self.base_ref
        if self.parent_url:
            assert not urlutil.url_needs_quoting(self.parent_url), \
                   "unquoted parent URL %r" % self.parent_url
        url = absolute_url(base_url, base_ref, parent_url)
        # assume file link if no scheme is found
        self.scheme = url.split(":", 1)[0] or "file"
        # warn if URL is redirected (for commandline client)
        self.warn_redirect = False

    def reset (self):
        """
        Reset all variables to default values.
        """
        # self.url is constructed by self.build_url() out of base_url
        # and (base_ref or parent) as absolute and normed url.
        # This the real url we use when checking so it also referred to
        # as 'real url'
        self.url = None
        # a splitted version of url for convenience
        self.urlparts = None
        # the anchor part of url
        self.anchor = None
        # list of parsed anchors
        self.anchors = []
        # the result message string and flag
        self.result = u""
        self.has_result = False
        # cached or not
        self.cached = False
        # valid or not
        self.valid = True
        # list of warnings (without duplicates)
        self.warnings = []
        # list of infos
        self.info = []
        # download time
        self.dltime = -1
        # download size
        self.dlsize = -1
        # check time
        self.checktime = 0
        # connection object
        self.url_connection = None
        # data of url content,  (data == None) means no data is available
        self.data = None
        # cache keys, are set by build_url() calling set_cache_keys()
        self.cache_url_key = None
        self.cache_content_key = None
        # extern flags (is_extern, is_strict), both enabled as default
        self.extern = (1, 1)
        # flag if the result should be cached
        self.caching = True
        # title is either the URL or parsed from content
        self.title = None

    def set_result (self, msg, valid=True, overwrite=False):
        """
        Set result string and validity.
        """
        if self.has_result and not overwrite:
            log.warn(LOG_CHECK,
              "Double result %r (previous %r) for %s", msg, self.result, self)
        else:
            self.has_result = True
        if not isinstance(msg, unicode):
            log.warn(LOG_CHECK, "Non-unicode result for %s: %r", self, msg)
        elif not msg:
            log.warn(LOG_CHECK, "Empty result for %s", self)
        self.result = msg
        self.valid = valid

    def get_title (self):
        """Return title of page the URL refers to.
        This is per default the filename or the URL."""
        if self.title is None:
            url = u""
            if self.base_url:
                url = self.base_url
            elif self.url:
                url = self.url
            self.title = url
            if "/" in url:
                title = url.rsplit("/", 1)[1]
                if title:
                    self.title = title
        return self.title

    def set_title_from_content (self):
        """Set title of page the URL refers to.from page content."""
        if self.valid and self.is_html():
            try:
                handler = linkparse.TitleFinder(self.get_content())
            except tuple(ExcList):
                return
            parser = htmlsax.parser(handler)
            handler.parser = parser
            # parse
            try:
                parser.feed(self.get_content())
                parser.flush()
            except linkparse.StopParse, msg:
                log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
            # break cyclic dependencies
            handler.parser = None
            parser.handler = None
            if handler.title:
                self.title = handler.title

    def is_parseable (self):
        """
        Return True iff content of this url is parseable.
        """
        return False

    def is_html (self):
        """
        Return True iff content of this url is HTML formatted.
        """
        return False

    def is_css (self):
        """Return True iff content of this url is CSS stylesheet."""
        return False

    def is_http (self):
        """
        Return True for http:// URLs.
        """
        return False

    def is_file (self):
        """
        Return True for file:// URLs.
        """
        return False

    def add_warning (self, s, tag=None):
        """
        Add a warning string.
        """
        item = (tag, s)
        if item not in self.warnings:
            self.warnings.append(item)

    def add_info (self, s):
        """
        Add an info string.
        """
        if s not in self.info:
            self.info.append(s)

    def copy_from_cache (self, cache_data):
        """
        Fill attributes from cache data.
        """
        self.result = cache_data["result"]
        self.has_result = True
        for tag, msg in cache_data["warnings"]:
            # do not copy anchor warnings, since the current anchor
            # might have changed
            if tag != WARN_URL_ANCHOR_NOT_FOUND:
                self.add_warning(msg, tag=tag)
        for info in cache_data["info"]:
            self.add_info(info)
        self.valid = cache_data["valid"]
        self.dltime = cache_data["dltime"]
        self.dlsize = cache_data["dlsize"]
        self.anchors = cache_data["anchors"]
        self.cached = True
        # recheck anchor
        if self.valid and self.anchor:
            self.check_anchor()

    def get_cache_data (self):
        """Return all data values that should be put in the cache."""
        return {"result": self.result,
                "warnings": self.warnings,
                "info": self.info,
                "valid": self.valid,
                "dltime": self.dltime,
                "dlsize": self.dlsize,
                "anchors": self.anchors,
               }

    def get_alias_cache_data (self):
        """Return all data values that should be put in the cache.
        Intended to be overridden by subclasses that handle aliases.
        """
        return self.get_cache_data()

    def set_cache_keys (self):
        """
        Set keys for URL checking and content recursion.
        """
        # remove anchor from content cache key since we assume
        # URLs with different anchors to have the same content
        self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
        assert isinstance(self.cache_content_key, unicode), self
        log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
        # construct cache key
        self.cache_url_key = self.cache_content_key
        assert isinstance(self.cache_url_key, unicode), self
        log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)

    def check_syntax (self):
        """
        Called before self.check(), this function inspects the
        url syntax. Success enables further checking, failure
        immediately logs this url. Syntax checks must not
        use any network resources.
        """
        log.debug(LOG_CHECK, "checking syntax")
        if self.base_url is None:
            self.set_result(_("URL is missing"), valid=False)
            return
        if not (self.base_url or self.parent_url):
            self.set_result(_("URL is empty"), valid=False)
            return
        try:
            self.build_url()
            # check url warnings
            effectiveurl = urlparse.urlunsplit(self.urlparts)
            if self.url != effectiveurl:
                self.add_warning(_("Effective URL %(url)r.") %
                                 {"url": effectiveurl},
                                 tag=WARN_URL_EFFECTIVE_URL)
                self.url = effectiveurl
        except tuple(ExcSyntaxList), msg:
            self.set_result(unicode_safe(msg), valid=False)
            return
        self.set_cache_keys()

    def build_url (self):
        """
        Construct self.url and self.urlparts out of the given base
        url information self.base_url, self.parent_url and self.base_ref.
        """
        # norm base url - can raise UnicodeError from url.idna_encode()
        base_url, is_idn = url_norm(self.base_url, self.encoding)
        if is_idn:
            self.add_warning(_("""URL %(url)r has a unicode domain name which
                          is not yet widely supported. You should use
                          the URL %(idna_url)r instead.""") % \
                          {"url": self.base_url, "idna_url": base_url},
                          tag=WARN_URL_UNICODE_DOMAIN)
        elif self.base_url != base_url:
            self.add_warning(
              _("Base URL is not properly normed. Normed URL is %(url)s.") %
               {'url': base_url}, tag=WARN_URL_UNNORMED)
        # make url absolute
        if self.base_ref:
            # use base reference as parent url
            if ":" not in self.base_ref:
                # some websites have a relative base reference
                self.base_ref = urljoin(self.parent_url, self.base_ref,
                                        self.scheme)
            self.url = urljoin(self.base_ref, base_url, self.scheme)
        elif self.parent_url:
            # strip the parent url query and anchor
            urlparts = list(urlparse.urlsplit(self.parent_url))
            urlparts[3] = urlparts[4] = ""
            parent_url = urlparse.urlunsplit(urlparts)
            self.url = urljoin(parent_url, base_url, self.scheme)
        else:
            self.url = base_url
        # note: urljoin can unnorm the url path, so norm it again
        urlparts = list(urlparse.urlsplit(self.url))
        if urlparts[2]:
            urlparts[2] = urlutil.collapse_segments(urlparts[2])
        self.url = urlparse.urlunsplit(urlparts)
        # split into (modifiable) list
        self.urlparts = strformat.url_unicode_split(self.url)
        # and unsplit again
        self.url = urlparse.urlunsplit(self.urlparts)
        # check userinfo@host:port syntax
        self.userinfo, host = urllib.splituser(self.urlparts[1])
        # set host lowercase
        if self.userinfo:
            self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
        else:
            self.urlparts[1] = host.lower()
        # safe anchor for later checking
        self.anchor = self.urlparts[4]
        self.host, self.port = urllib.splitport(host)
        if self.port is not None:
            if not urlutil.is_numeric_port(self.port):
                raise LinkCheckerError(_("URL has invalid port %(port)r") %
                    {"port": str(self.port)})
            self.port = int(self.port)

    def check (self):
        """Main check function for checking this URL."""
        if self.aggregate.config["trace"]:
            trace.trace_on()
        try:
            self.local_check()
        except (socket.error, select.error):
            # on Unix, ctrl-c can raise
            # error: (4, 'Interrupted system call')
            etype, value = sys.exc_info()[:2]
            if etype == errno.EINTR:
                raise KeyboardInterrupt(value)
            else:
                raise
        finally:
            # close/release possible open connection
            self.close_connection()

    def add_country_info (self):
        """
        Try to ask GeoIP database for country info.
        """
        country = geoip.get_country(self.host)
        if country is not None:
            self.add_info(_("URL is located in %(country)s.") %
                {"country": _(country)})

    def local_check (self):
        """Local check function can be overridden in subclasses."""
        log.debug(LOG_CHECK, "Checking %s", self)
        # start time for check
        check_start = time.time()
        self.set_extern(self.url)
        if self.extern[0] and self.extern[1]:
            self.add_info(_("Outside of domain filter, checked only syntax."))
            return

        # check connection
        log.debug(LOG_CHECK, "checking connection")
        try:
            self.check_connection()
            self.add_country_info()
            self.check_content()
        except tuple(ExcList):
            value = self.handle_exception()
            # make nicer error msg for unknown hosts
            if isinstance(value, socket.error) and value.args[0] == -2:
                value = _('Hostname not found')
            # make nicer error msg for bad status line
            if isinstance(value, httplib.BadStatusLine):
                value = _('Bad HTTP response %(line)r') % {"line": str(value)}
            self.set_result(unicode_safe(value), valid=False)
        self.checktime = time.time() - check_start
        # check recursion
        try:
            if self.allows_recursion():
                self.parse_url()
            # check content size
            self.check_size()
        except tuple(ExcList):
            value = self.handle_exception()
            self.add_warning(_("could not get content: %(msg)r") %
                 {"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)

    def close_connection (self):
        """
        Close an opened url connection.
        """
        if self.url_connection is None:
            # no connection is open
            return
        try:
            self.url_connection.close()
        except Exception:
            # ignore close errors
            pass
        self.url_connection = None

    def handle_exception (self):
        """
        An exception occurred. Log it and set the cache flag.
        """
        etype, value = sys.exc_info()[:2]
        log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, value, exception=True)
        # note: etype must be the exact class, not a subclass
        if (etype in ExcNoCacheList) or \
           (etype == socket.error and value.args[0]==errno.EBADF) or \
            not value:
            # EBADF occurs when operating on an already socket
            self.caching = False
        errmsg = etype.__name__
        if str(value):
            # use Exception class name
            errmsg += ": %s" % str(value)
        # limit length to 240
        return strformat.limit(errmsg, length=240)

    def check_connection (self):
        """
        The basic connection check uses urllib2.urlopen to initialize
        a connection object.
        """
        self.url_connection = urllib2.urlopen(self.url)

    def allows_recursion (self):
        """
        Return True iff we can recurse into the url's content.
        """
        log.debug(LOG_CHECK, "checking recursion of %r ...", self.url)
        # Test self.valid before self.is_parseable().
        if not self.valid:
            log.debug(LOG_CHECK, "... no, invalid.")
            return False
        if not self.is_parseable():
            log.debug(LOG_CHECK, "... no, not parseable.")
            return False
        if not self.can_get_content():
            log.debug(LOG_CHECK, "... no, cannot get content.")
            return False
        rec_level = self.aggregate.config["recursionlevel"]
        if  rec_level >= 0 and self.recursion_level >= rec_level:
            log.debug(LOG_CHECK, "... no, maximum recursion level reached.")
            return False
        if self.extern[0]:
            log.debug(LOG_CHECK, "... no, extern.")
            return False
        if not self.content_allows_robots():
            log.debug(LOG_CHECK, "... no, robots.")
            return False
        log.debug(LOG_CHECK, "... yes, recursion.")
        return True

    def content_allows_robots (self):
        """
        Return True if the content of this URL forbids robots to
        search for recursive links.
        """
        if not self.is_html():
            return True
        if not (self.is_http() or self.is_file()):
            return True
        # construct parser object
        handler = linkparse.MetaRobotsFinder()
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None
        return handler.follow

    def get_anchors (self):
        """Store list of anchors for this URL. Precondition: this URL is
        an HTML resource."""
        log.debug(LOG_CHECK, "Getting HTML anchors %s", self)
        handler = linkparse.LinkFinder(self.get_content(), self.add_anchor,
                                   tags={'a': [u'name'], None: [u'id']})
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None

    def add_anchor (self, url, line, column, name, base):
        """Add anchor URL."""
        self.anchors.append((url, line, column, name, base))

    def check_anchor (self):
        """If URL was valid and has an anchor, check it. A warning is
        logged if the anchor is not found.
        """
        if not self.aggregate.config["anchors"]:
            return
        log.debug(LOG_CHECK, "checking anchor %r", self.anchor)
        if any(x for x in self.anchors if x[0] == self.anchor):
            return
        anchors = u",".join(u"`%s'" % x[0] for x in self.anchors)
        args = {"name": self.anchor, "anchors": anchors}
        msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args,
                          _("Available anchors: %(anchors)s.") % args)
        self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND)

    def set_extern (self, url):
        """
        Match URL against extern and intern link patterns. If no pattern
        matches the URL is extern. Sets self.extern to a tuple (bool,
        bool) with content (is_extern, is_strict).

        @return: None
        """
        for entry in self.aggregate.config["externlinks"]:
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                log.debug(LOG_CHECK, "Extern URL %r", url)
                self.extern = (1, entry['strict'])
                return
        for entry in self.aggregate.config["internlinks"]:
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                log.debug(LOG_CHECK, "Intern URL %r", url)
                self.extern = (0, 0)
                return
        log.debug(LOG_CHECK, "Explicit extern URL %r", url)
        self.extern = (1, 0)
        return

    def can_get_content (self):
        """Indicate wether url get_content() can be called."""
        return True

    def get_content (self):
        """Precondition: url_connection is an opened URL."""
        if self.data is None:
            log.debug(LOG_CHECK, "Get content of %r", self.url)
            t = time.time()
            self.data = self.read_content()
            self.dltime = time.time() - t
            self.dlsize = len(self.data)
        return self.data

    def read_content (self):
        """Return data for this URL. Can be overridden in subclasses."""
        return self.url_connection.read()

    def check_content (self):
        """Check content data for warnings, syntax errors, viruses etc."""
        if not (self.valid and self.can_get_content()):
            return
        if self.is_html():
            self.set_title_from_content()
            if self.aggregate.config["anchors"]:
                self.get_anchors()
        if self.anchor:
            self.check_anchor()
        warningregex = self.aggregate.config["warningregex"]
        if warningregex:
            log.debug(LOG_CHECK, "checking content")
            try:
                match = warningregex.search(self.get_content())
                if match:
                    self.add_warning(_("Found %(match)r in link contents.") %
                       {"match": match.group()}, tag=WARN_URL_WARNREGEX_FOUND)
            except tuple(ExcList):
                value = self.handle_exception()
                self.set_result(unicode_safe(value), valid=False)
        # is it an intern URL?
        if not self.extern[0]:
            # check HTML/CSS syntax
            if self.aggregate.config["checkhtml"] and self.is_html():
                self.check_html()
            if self.aggregate.config["checkcss"] and self.is_css():
                self.check_css()
            if self.aggregate.config["checkhtmlw3"] and self.is_html():
                self.check_html_w3()
            if self.aggregate.config["checkcssw3"] and self.is_css():
                self.check_css_w3()
            # check with clamav
            if self.aggregate.config["scanvirus"]:
                self.scan_virus()

    def check_size (self):
        """
        If a maximum size was given, call this function to check it
        against the content size of this url.
        """
        maxbytes = self.aggregate.config["warnsizebytes"]
        if maxbytes is not None and self.dlsize >= maxbytes:
            self.add_warning(
                   _("Content size %(dlsize)s is larger than %(maxbytes)s.") %
                        {"dlsize": strformat.strsize(self.dlsize),
                         "maxbytes": strformat.strsize(maxbytes)},
                          tag=WARN_URL_CONTENT_TOO_LARGE)

    def check_html (self):
        """Check HTML syntax of this page (which is supposed to be HTML)
        with the local HTML tidy module."""
        try:
            import tidy
        except ImportError:
            log.warn(LOG_CHECK, _("warning: tidy module is not available; " \
                 "download from http://utidylib.berlios.de/"))
            return
        options = dict(output_html=0, show_warnings=1, quiet=True,
            input_encoding='utf8', output_encoding='utf8', tidy_mark=0)
        try:
            doc = tidy.parseString(self.get_content(), **options)
            errors = filter_tidy_errors(doc.errors)
            if errors:
                for err in errors:
                    self.add_warning(u"HTMLTidy: %s" % err)
            else:
                self.add_info(u"HTMLTidy: %s" % _("valid HTML syntax"))
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: tidy HTML parsing caused error: %(msg)s ") %
                {"msg": err})

    def check_css (self):
        """Check CSS syntax of this page (which is supposed to be CSS)
        with the local cssutils module."""
        try:
            import cssutils
        except ImportError:
            log.warn(LOG_CHECK,
                _("warning: cssutils module is not available; " \
                 "download from http://cthedot.de/cssutils/"))
            return
        try:
            csslog = logging.getLogger('cssutils')
            csslog.propagate = 0
            del csslog.handlers[:]
            handler = StoringHandler()
            csslog.addHandler(handler)
            csslog.setLevel(logging.WARN)
            cssparser = cssutils.CSSParser(log=csslog)
            cssparser.parseString(self.get_content(), href=self.url)
            if handler.storage:
                for record in handler.storage:
                    self.add_warning(u"cssutils: %s" % record.getMessage())
            else:
                self.add_info(u"cssutils: %s" % _("valid CSS syntax"))
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: cssutils parsing caused error: %(msg)s") %
                {"msg": err})

    def check_html_w3 (self):
        """Check HTML syntax of this page (which is supposed to be HTML)
        with the online W3C HTML validator documented at
        http://validator.w3.org/docs/api.html
        """
        self.aggregate.check_w3_time()
        try:
            u = urllib2.urlopen('http://validator.w3.org/check',
                urllib.urlencode({
                    'fragment': self.get_content(),
                    'output': 'xml',
                }))
            if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
                self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
                return
            from xml.dom.minidom import parseString
            dom = parseString(u.read())
            elements = dom.getElementsByTagName('messages')[0].getElementsByTagName('msg')
            for msg in [e.firstChild.wholeText for e in elements]:
                self.add_warning(u"W3C HTML validation: %s" % msg)
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: HTML W3C validation caused error: %(msg)s ") %
                {"msg": err})

    def check_css_w3 (self):
        """Check CSS syntax of this page (which is supposed to be CSS)
        with the online W3C CSS validator documented at
        http://jigsaw.w3.org/css-validator/manual.html#expert
        """
        self.aggregate.check_w3_time()
        try:
            host = 'jigsaw.w3.org'
            path = '/css-validator/validator'
            params = {
                'text': "div {}",
                'warning': '2',
                'output': 'soap12',
            }
            fields = params.items()
            content_type, body = httputil.encode_multipart_formdata(fields)
            h = httplib.HTTPConnection(host)
            h.putrequest('POST', path)
            h.putheader('Content-Type', content_type)
            h.putheader('Content-Length', str(len(body)))
            h.endheaders()
            h.send(body)
            r = h.getresponse()
            if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
                self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
                return
            from xml.dom.minidom import parseString
            dom = parseString(r.read())
            elements = dom.getElementsByTagName('m:errors')[0].getElementsByTagName('m:error')
            for msg in [e.firstChild.wholeText for e in elements]:
                self.add_warning(u"W3C HTML validation: %s" % msg)
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: CSS W3C validation caused error: %(msg)s ") %
                {"msg": err})

    def scan_virus (self):
        """Scan content for viruses."""
        infected, errors = clamav.scan(self.get_content())
        for msg in infected:
            self.add_warning(u"Virus scan infection: %s" % msg)
        for msg in errors:
            self.add_warning(u"Virus scan error: %s" % msg)

    def parse_url (self):
        """
        Parse url content and search for recursive links.
        Default parse type is html.
        """
        self.parse_html()

    def get_user_password (self):
        """
        Get tuple (user, password) from configured authentication.
        Both user and password can be None if not specified.
        """
        for auth in self.aggregate.config["authentication"]:
            if auth['pattern'].match(self.url):
                return auth['user'], auth['password']
        return None, None

    def parse_html (self):
        """Parse into HTML content and search for URLs to check.
        Found URLs are added to the URL queue.
        """
        log.debug(LOG_CHECK, "Parsing HTML %s", self)
        # construct parser object
        handler = linkparse.LinkFinder(self.get_content(), self.add_url)
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None

    def add_url (self, url, line, column, name, base):
        """Queue URL data for checking."""
        base_ref = urlutil.url_norm(base)[0]
        url_data = get_url_from(url, self.recursion_level+1, self.aggregate,
            parent_url=self.url, base_ref=base_ref, line=line, column=column,
            name=name)
        self.aggregate.urlqueue.put(url_data)

    def parse_opera (self):
        """Parse an opera bookmark file."""
        log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)
        name = None
        lineno = 0
        for line in self.get_content().splitlines():
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
                name = line[5:]
            elif line.startswith("URL="):
                url = line[4:]
                if url and name is not None:
                    url_data = get_url_from(url, self.recursion_level+1,
                        self.aggregate, parent_url=self.url,
                        line=lineno, name=name)
                    self.aggregate.urlqueue.put(url_data)
            else:
                name = None

    def parse_text (self):
        """
        Parse a text file with on url per line; comment and blank
        lines are ignored.
        """
        log.debug(LOG_CHECK, "Parsing text %s", self)
        lineno = 0
        for line in self.get_content().splitlines():
            lineno += 1
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            url_data = get_url_from(line,
                              self.recursion_level+1, self.aggregate,
                              parent_url=self.url, line=lineno)
            self.aggregate.urlqueue.put(url_data)

    def parse_css (self):
        """
        Parse a CSS file for url() patterns.
        """
        log.debug(LOG_CHECK, "Parsing CSS %s", self)
        lineno = 0
        linkfinder = linkparse.css_url_re.finditer
        strip_comments = linkparse.strip_c_comments
        for line in strip_comments(self.get_content()).splitlines():
            lineno += 1
            for mo in linkfinder(line):
                column = mo.start("url")
                url = strformat.unquote(mo.group("url").strip())
                url_data = get_url_from(url,
                             self.recursion_level+1, self.aggregate,
                             parent_url=self.url, line=lineno, column=column)
                self.aggregate.urlqueue.put(url_data)

    def parse_swf (self):
        """Parse a SWF file for URLs."""
        linkfinder = linkparse.swf_url_re.finditer
        for mo in linkfinder(self.get_content()):
            url = mo.group()
            url_data = get_url_from(url,
                         self.recursion_level+1, self.aggregate,
                         parent_url=self.url)
            self.aggregate.urlqueue.put(url_data)

    def parse_word (self):
        """Parse a word file for hyperlinks."""
        if not winutil.has_word():
            return
        filename = self.get_temp_filename()
        # open word file and parse hyperlinks
        try:
            app = winutil.get_word_app()
            try:
                doc = winutil.open_wordfile(app, filename)
                try:
                    for link in doc.Hyperlinks:
                        url_data = get_url_from(link.Address,
                                 self.recursion_level+1, self.aggregate,
                                 parent_url=self.url, name=link.TextToDisplay)
                        self.aggregate.urlqueue.put(url_data)
                finally:
                    winutil.close_wordfile(doc)
            finally:
                winutil.close_word_app(app)
        except winutil.Error, msg:
            log.warn(LOG_CHECK, "Error parsing word file: %s", msg)

    def get_temp_filename (self):
        """Get temporary filename for content to parse."""
        # store content in temporary file
        fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_')
        fp = os.fdopen(fd)
        fp.write(self.get_content())
        fp.close()

    def serialized (self):
        """
        Return serialized url check data as unicode string.
        """
        sep = unicode_safe(os.linesep)
        if self.base_url is not None:
            assert isinstance(self.base_url, unicode), self
        if self.parent_url is not None:
            assert isinstance(self.parent_url, unicode), self
        if self.base_ref is not None:
            assert isinstance(self.base_ref, unicode), self
        assert isinstance(self.name, unicode), self
        return sep.join([
            u"%s link" % self.scheme,
            u"base_url=%r" % self.base_url,
            u"parent_url=%r" % self.parent_url,
            u"base_ref=%r" % self.base_ref,
            u"recursion_level=%s" % self.recursion_level,
            u"url_connection=%s" % self.url_connection,
            u"line=%d" % self.line,
            u"column=%d" % self.column,
            u"name=%r" % self.name,
           ])

    def get_intern_pattern (self):
        """
        Get pattern for intern URL matching.

        @return non-empty regex pattern or None
        @rtype String or None
        """
        return None

    def __str__ (self):
        """
        Get URL info.

        @return: URL info, encoded with the output logger encoding
        @rtype: string
        """
        s = self.serialized()
        return self.aggregate.config['logger'].encode(s)

    def __repr__ (self):
        """
        Get URL info.

        @return: URL info
        @rtype: unicode
        """
        return u"<%s >" % self.serialized()

    def to_wire (self):
        """Return a simplified transport object for logging.

        The transport object must contain these attributes:
        - url_data.valid: bool
          Indicates if URL is valid
        - url_data.cached: bool
          Indicates if URL data has been loaded from cache.
        - url_data.result: unicode
          Result string
        - url_data.warnings: list of unicode
          List of tagged warnings for this URL.
        - url_data.name: unicode string or None
          name of URL (eg. filename or link name)
        - url_data.parent_url: unicode or None
          Parent URL
        - url_data.base_ref: unicode or None
          HTML base reference URL of parent
        - url_data.url: unicode or None
          Fully qualified URL.
        - url_data.checktime: int
          Number of seconds needed to check this link, default: zero.
        - url_data.dltime: int
          Number of seconds needed to download URL content, default: -1
        - url_data.dlsize: int
          Size of downloaded URL content, default: -1
        - url_data.info: list of unicode
          Additional information about this URL.
        - url_data.line: int
          Line number of this URL at parent document, or -1
        - url_data.column: int
          Column number of this URL at parent document, or -1
        """
        return containers.AttrDict(valid=self.valid,
          extern=self.extern[0],
          cached=self.cached,
          result=self.result,
          warnings=[x[1] for x in self.warnings],
          name=self.name or u"",
          title=self.get_title(),
          parent_url=self.parent_url or u"",
          base_ref=self.base_ref or u"",
          base_url=self.base_url or u"",
          url=self.url or u"",
          checktime=self.checktime,
          dltime=self.dltime,
          dlsize=self.dlsize,
          info=self.info,
          line=self.line,
          column=self.column,
          cache_url_key=self.cache_url_key,
        )


def filter_tidy_errors (errors):
    """Filter certain errors from HTML tidy run."""
    return [x for x in errors if not \
        (x.severity=='W' and x.message=='<table> lacks "summary" attribute')]