"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import re, copy, urllib, htmlentitydefs
from urlparse import urljoin

import _request
from _headersutil import split_header_words, is_html as _is_html

## # XXXX miserable hack
## def urljoin(base, url):
##     if url.startswith("?"):
##         return base+url
##     else:
##         return urlparse.urljoin(base, url)

## def chr_range(a, b):
##     return "".join(map(chr, range(ord(a), ord(b)+1)))

## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
##                       "abcdefghijklmnopqrstuvwxyz"
##                       "-_.~")
## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
# 'safe'-by-default characters that urllib.quote never quotes
URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"

DEFAULT_ENCODING = "latin-1"

class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    >>> i = [1]
    >>> func = CachingGeneratorFunction(i)
    >>> list(func())
    [1]
    >>> list(func())
    [1]

    >>> i = [1, 2, 3]
    >>> func = CachingGeneratorFunction(i)
    >>> list(func())
    [1, 2, 3]

    >>> i = func()
    >>> i.next()
    1
    >>> i.next()
    2
    >>> i.next()
    3

    >>> i = func()
    >>> j = func()
    >>> i.next()
    1
    >>> j.next()
    1
    >>> i.next()
    2
    >>> j.next()
    2
    >>> j.next()
    3
    >>> i.next()
    3
    >>> i.next()
    Traceback (most recent call last):
    ...
    StopIteration
    >>> j.next()
    Traceback (most recent call last):
    ...
    StopIteration
    """
    def __init__(self, iterable):
        def make_gen():
            for item in iterable:
                yield item

        self._cache = []
        self._generator = make_gen()

    def __call__(self):
        cache = self._cache
        i = 0
        while True:
            if i < len(cache):
                item = cache[i]
            else:
                # not yet cached: pull the next item from the shared generator
                try:
                    item = self._generator.next()
                except StopIteration:
                    return
                cache.append(item)
            i += 1
            yield item

def encoding_finder(default_encoding):
    def encoding(response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    return v
        return default_encoding
    return encoding
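
# A rough sketch of how the callable returned by encoding_finder behaves.
# FakeResponse is a hypothetical stub, just enough to show the charset
# lookup; real response objects come from mechanize/urllib2:
#
#     class FakeResponse:
#         def info(self):
#             class Headers:
#                 def getheaders(self, name):
#                     return ["text/html; charset=utf-8"]
#             return Headers()
#
#     get_encoding = encoding_finder(DEFAULT_ENCODING)
#     get_encoding(FakeResponse())  # -> "utf-8"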

def make_is_html(allow_xhtml):
    def is_html(response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, allow_xhtml)
    return is_html

# idea for this argument-processing trick is from Peter Otten
class Args:
    def __init__(self, args_map):
        self.dictionary = dict(args_map)
    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())
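
# Illustrative only: Args gives attribute-style access to the keyword
# arguments collected by form_parser_args, e.g.
#
#     args = form_parser_args(select_default=True)
#     args.select_default   # -> True
#     args.request_class    # -> None (the default above)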


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
    def __cmp__(self, other):
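        # crude comparison: 0 (equal) if url, text, tag and attrs all match,
        # -1 otherwise, so only equality/inequality is meaningful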
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0
    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
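
# Example of Link construction (illustrative values):
#
#     link = Link("http://example.com/a/", "b.html", "click me", "a",
#                 [("href", "b.html")])
#     link.absolute_url  # -> "http://example.com/a/b.html"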


def clean_url(url, encoding):
    # percent-encode illegal URL characters
    # Trying to come up with test cases for this gave me a headache; revisit
    # when we switch to unicode.
    # Somebody else's comments (lost the attribution):
##     - IE will return you the url in the encoding you send it
##     - Mozilla/Firefox will send you latin-1 if there's no non latin-1
##     characters in your link. It will send you utf-8 however if there are...
    if type(url) == type(""):
        url = url.decode(encoding, "replace")
    url = url.strip()
    return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
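
# For instance (illustrative), characters outside URLQUOTE_SAFE_URL_CHARS and
# urllib.quote's always-safe set are percent-encoded:
#
#     clean_url(u"/search?q=a b", "latin-1")  # -> "/search?q=a%20b"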

class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        for token in p.tags(*(self.urltags.keys()+["base"])):
            if token.data == "base":
                base_url = dict(token.attrs).get("href")
                continue
            if token.type == "endtag":
                continue
            attrs = dict(token.attrs)
            tag = token.data
            name = attrs.get("name")
            text = None
            # XXX use attr_encoding for ref'd doc if that doc does not provide
            #  one by other means
            #attr_encoding = attrs.get("charset")
            url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
            if not url:
                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                # For our purposes a link is something with a URL, so ignore
                # this.
                continue

            url = clean_url(url, encoding)
            if tag == "a":
                if token.type != "startendtag":
                    # hmm, this'd break if end tag is missing
                    text = p.get_compressed_text(("endtag", tag))
                # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
                #text = p.get_compressed_text()

            yield Link(base_url, url, text, tag, token.attrs)

class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding

    def forms(self):
        import ClientForm
        encoding = self.encoding
        return ClientForm.ParseResponse(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            backwards_compat=self.backwards_compat,
            encoding=encoding,
            )

class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            p.get_tag("title")
        except _pullparser.NoMoreTokensError:
            return None
        else:
            return p.get_text()


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)

def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl
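
# Character references may be decimal or hex (illustrative):
#
#     unescape_charref("233", None)         # -> u"\xe9"
#     unescape_charref("xe9", "latin-1")    # -> "\xe9"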


try:
    import BeautifulSoup
except ImportError:
    pass
else:
    import sgmllib
    # monkeypatch to fix http://www.python.org/sf/803422 :-(
    sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
    class MechanizeBs(BeautifulSoup.BeautifulSoup):
        _entitydefs = htmlentitydefs.name2codepoint
        # don't want the magic Microsoft-char workaround
        PARSER_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                           lambda match: match.group(1) + ' />'),
                          (re.compile(r'<!\s+([^<>]*)>'),
                           lambda match: '<!' + match.group(1) + '>')
                          ]

        def __init__(self, encoding, text=None, avoidParserProblems=True,
                     initialTextIsEverything=True):
            self._encoding = encoding
            BeautifulSoup.BeautifulSoup.__init__(
                self, text, avoidParserProblems, initialTextIsEverything)

        def handle_charref(self, ref):
            t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
            self.handle_data(t)
        def handle_entityref(self, ref):
            t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
            self.handle_data(t)
        def unescape_attrs(self, attrs):
            unescaped_attrs = []
            for key, val in attrs:
                val = unescape(val, self._entitydefs, self._encoding)
                unescaped_attrs.append((key, val))
            return unescaped_attrs

class RobustLinksFactory:

    compress_re = re.compile(r"\s+")

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import BeautifulSoup
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        import BeautifulSoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, BeautifulSoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_url = attrs_dict.get("href")
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = clean_url(url, encoding)
                text = link.firstText(lambda t: True)
                if text is BeautifulSoup.Null:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", text.strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        import ClientForm
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = ClientForm.RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        import BeautifulSoup
        title = self._bs.first("title")
        if title == BeautifulSoup.Null:
            return None
        else:
            return title.firstText(lambda t: True)


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML

    """

    def __init__(self, forms_factory, links_factory, title_factory,
                 get_encoding=encoding_finder(DEFAULT_ENCODING),
                 is_html_p=make_is_html(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._get_encoding = get_encoding
        self._is_html_p = is_html_p

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must implement the same interface as objects returned by
        urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in ["encoding", "is_html", "title"]:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
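        # encoding, is_html and title are computed lazily on first access and
        # cached as instance attributes; set_response() deletes the cached
        # values so they are recomputed for the next response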
        if name not in ["encoding", "is_html", "title"]:
            return getattr(self.__class__, name)

        try:
            if name == "encoding":
                self.encoding = self._get_encoding(self._response)
                return self.encoding
            elif name == "is_html":
                self.is_html = self._is_html_p(self._response, self.encoding)
                return self.is_html
            elif name == "title":
                if self.is_html:
                    self.title = self._title_factory.title()
                else:
                    self.title = None
                return self.title
        finally:
            self._response.seek(0)

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects."""
        if self._forms_genf is None:
            self._forms_genf = CachingGeneratorFunction(
                self._forms_factory.forms())
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects."""
        if self._links_genf is None:
            self._links_genf = CachingGeneratorFunction(
                self._links_factory.links())
        return self._links_genf()

class DefaultFactory(Factory):
    """Based on sgmllib."""
    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), self._response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)
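
# A rough usage sketch (assuming `response` is a seekable response object of
# the kind mechanize passes to Factory.set_response()):
#
#     factory = DefaultFactory()
#     factory.set_response(response)
#     if factory.is_html:
#         print factory.title
#         for link in factory.links():
#             print link.absolute_url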

class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """
    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        import BeautifulSoup
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(response, self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
