"""RFC 3986 URI parsing and relative reference resolution / absolutization.

(aka splitting and joining)

Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file LICENSE
included with the distribution).

"""

# XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.

from __future__ import absolute_import
import re

from .polyglot import quote
# def chr_range(a, b):
#     return "".join(map(chr, range(ord(a), ord(b)+1)))

# UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#                         "abcdefghijklmnopqrstuvwxyz"
#                         "0123456789"
#                         "-_.~")
# RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
# URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# this re matches any character that's not in URI_CHARS
# NOTE: '%' is part of the allowed set, so text that already contains
# percent-escapes is not flagged; every non-ASCII character is flagged.
BAD_URI_CHARS_RE = re.compile(r"[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")


def clean_url(url, encoding='utf-8'):
    """Return *url* with illegal URI characters percent-encoded.

    *url* may be text or a bytestring.  A bytestring is first decoded
    with *encoding* (undecodable bytes are replaced instead of raising).
    Surrounding whitespace is stripped, the text is re-encoded with
    *encoding* and passed to quote(); existing '%' characters are left
    untouched.

    For text input, a bytes result from quote() is decoded back to text
    with *encoding*; otherwise whatever quote() returned is handed back
    unchanged.
    """
    # Trying to come up with test cases for this gave me a headache; revisit
    # once the switch to unicode has been made.
    # Somebody else's comments (lost the attribution):
    # - IE will return you the url in the encoding you send it
    # - Mozilla/Firefox will send you latin-1 if there are no non-latin-1
    #   characters in your link.  It will send you utf-8 however if there
    #   are...
    was_bytes = isinstance(url, bytes)
    text = url.decode(encoding, "replace") if was_bytes else url
    raw = text.strip().encode(encoding)
    # The "safe" argument is URI_CHARS minus the 'always_safe' characters
    # that urllib's quote() never percent-encodes anyway.
    quoted = quote(raw, "!*'();:@&=+$,/?%#[]~")
    if isinstance(quoted, bytes) and not was_bytes:
        return quoted.decode(encoding)
    return quoted


def is_clean_uri(uri):
    """Return True if *uri* contains only characters that are legal in a URI.

    Both text and bytestrings are accepted; a '%' counts as legal.

    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    >>> is_clean_uri(b"http://example.com/0")
    True
    >>> is_clean_uri(b"ABC|")
    False
    """
    # On Python 2 module re treats bytestrings as though they were decoded as
    # latin-1, but on Python 3 a text pattern cannot be applied to bytes at
    # all (TypeError).  Decode explicitly so both versions behave the same;
    # latin-1 maps every byte to a character, so the decode cannot fail.
    if isinstance(uri, bytes):
        uri = uri.decode("latin-1")
    return not BAD_URI_CHARS_RE.search(uri)


# Bound match() of the URI-splitting regular expression given in RFC 3986,
# Appendix B.  The pieces below concatenate to that single pattern; the
# component values live in groups 2, 4, 5, 7 and 9.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?"   # groups 1-2: "scheme:" / scheme
    r"(//([^/?#]*))?"    # groups 3-4: "//authority" / authority
    r"([^?#]*)"          # group 5:    path (always present, may be empty)
    r"(\?([^#]*))?"      # groups 6-7: "?query" / query
    r"(#(.*))?"          # groups 8-9: "#fragment" / fragment
).match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    The result is a 5-tuple.  Components that are absent from the input
    are None; the path is always a string, possibly empty.  None is
    returned if the splitting pattern does not match.
    """
    found = SPLIT_MATCH(absolute_uri)
    if not found:
        return None
    # Groups 2, 4, 5, 7 and 9 hold the bare components; the other groups
    # include the delimiters (":", "//", "?", "#").
    return found.group(2, 4, 5, 7, 9)


def urlunsplit(parts):
    """Reassemble a (scheme, authority, path, query, fragment) tuple.

    A component that is None is omitted together with its delimiter; an
    empty-string component still emits its delimiter.  The path is
    always included.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces += [scheme, ":"]
    if authority is not None:
        pieces += ["//", authority]
    pieces.append(path)
    if query is not None:
        pieces += ["?", query]
    if fragment is not None:
        pieces += ["#", fragment]
    return "".join(pieces)


def urljoin(base_uri, uri_reference):
    """Resolve *uri_reference* against *base_uri* and return the result.

    Both arguments are split into components, combined by
    urljoin_parts(), and the result is reassembled into a string.

    See RFC 3986.
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    return urlunsplit(urljoin_parts(base_parts, reference_parts))

# oops, this doesn't do the same thing as the literal translation
# from the RFC below
# import posixpath
# def urljoin_parts(base_parts, reference_parts):
#     scheme, authority, path, query, fragment = base_parts
#     rscheme, rauthority, rpath, rquery, rfragment = reference_parts
#
#     # compute target URI path
#     if rpath == "":
#         tpath = path
#     else:
#         tpath = rpath
#         if not tpath.startswith("/"):
#             tpath = merge(authority, path, tpath)
#         tpath = posixpath.normpath(tpath)
#
#     if rscheme is not None:
#         return (rscheme, rauthority, tpath, rquery, rfragment)
#     elif rauthority is not None:
#         return (scheme, rauthority, tpath, rquery, rfragment)
#     elif rpath == "":
#         if rquery is not None:
#             tquery = rquery
#         else:
#             tquery = query
#         return (scheme, authority, tpath, tquery, rfragment)
#     else:
#         return (scheme, authority, tpath, rquery, rfragment)


def urljoin_parts(base_parts, reference_parts):
    """Resolve split reference components against split base components.

    Both arguments are (scheme, authority, path, query, fragment) tuples
    as produced by urlsplit(); the return value is a tuple of the same
    shape.  This follows the reference transformation of RFC 3986
    section 5.2.2, treating a reference whose scheme equals the base
    scheme as if it had no scheme.  The fragment always comes from the
    reference; the base fragment is ignored.
    """
    scheme, authority, path, query, _ = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # The reference carries its own (different) scheme: nothing is
    # inherited from the base.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Network-path reference: only the scheme is inherited.
    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath), rquery,
                rfragment)

    # Empty path: keep the base path, and the base query too unless the
    # reference supplies one.
    if rpath == "":
        tquery = query if rquery is None else rquery
        return (scheme, authority, path, tquery, rfragment)

    # Relative path: interpret it relative to the base path first.
    if not rpath.startswith("/"):
        rpath = merge(authority, path, rpath)
    return (scheme, authority, remove_dot_segments(rpath), rquery, rfragment)

# um, something *vaguely* like this is what I want, but I have to generate
# lots of test cases first, if only to understand what it is that
# remove_dot_segments really does...
# def remove_dot_segments(path):
#     if path == '':
#         return ''
#     comps = path.split('/')
#     new_comps = []
#     for comp in comps:
#         if comp in ['.', '']:
#             if not new_comps or new_comps[-1]:
#                 new_comps.append('')
#             continue
#         if comp != '..':
#             new_comps.append(comp)
#         elif new_comps:
#             new_comps.pop()
#     return '/'.join(new_comps)


def remove_dot_segments(path):
    """Return *path* with its "." and ".." segments resolved.

    Follows the remove_dot_segments algorithm of RFC 3986 section 5.2.4;
    the letters in the comments below refer to the steps of that
    algorithm.  The input is consumed from the left, one rule at a time,
    while finished segments accumulate in an output list.
    """
    segments = []
    remaining = path
    while remaining:
        if remaining.startswith("../"):
            # A: drop a leading "../"
            remaining = remaining[3:]
        elif remaining.startswith("./"):
            # A: drop a leading "./"
            remaining = remaining[2:]
        elif remaining.startswith("/./") or remaining == "/.":
            # B: replace a leading "/./" or a final "/." by "/"
            remaining = "/" + remaining[3:]
        elif remaining.startswith("/../") or remaining == "/..":
            # C: replace a leading "/../" or a final "/.." by "/" and
            # discard the most recently emitted segment, if any
            remaining = "/" + remaining[4:]
            if segments:
                segments.pop()
        elif remaining in (".", ".."):
            # D: the input is nothing but a dot segment
            remaining = ""
        else:
            # E: move the first segment (with its leading "/", if any) to
            # the output
            search_from = 1 if remaining.startswith("/") else 0
            cut = remaining.find("/", search_from)
            if cut < 0:
                segments.append(remaining)
                break
            segments.append(remaining[:cut])
            remaining = remaining[cut:]
    return "".join(segments)


def merge(base_authority, base_path, ref_path):
    """Combine a relative reference path with the base path.

    An empty base path yields "/" + ref_path.  Otherwise ref_path
    replaces everything after the last "/" of base_path, or is returned
    unchanged when base_path contains no "/" at all.

    base_authority is accepted but not consulted.
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    # if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    if "/" not in base_path:
        return ref_path
    parent_dir, slash, _last_segment = base_path.rpartition("/")
    return parent_dir + slash + ref_path


if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()