# -*- coding: utf-8 -*-
"""
Various utilities for pyRdfa.

Most of the utilities are straightforward.

@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}


"""

"""
$Id: utils.py,v 1.18 2016/12/08 10:13:34 ivan Exp $
$Date: 2016/12/08 10:13:34 $
"""
import sys, datetime

from urllib.parse import urljoin, quote
from http.server import BaseHTTPRequestHandler
from urllib.error import HTTPError as urllib_HTTPError

from .extras.httpheader import content_type, parse_http_datetime


from .host import preferred_suffixes

#########################################################################################################
# Handling URIs
class URIOpener:
    """A wrapper around C{requests.get} to open a resource. Beyond accessing the data itself, the class
    sets a number of instance variables that might be relevant for processing.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
    common suffixes are used (see L{host.preferred_suffixes}) to set the content type. If none of
    these works, the content type is the empty string.

    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

    @ivar data: the body of the response, as returned by the C{content} attribute of the C{requests} response (ie, bytes)
    @ivar headers: the return headers as sent back by the server
    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
    @ivar charset: the C{charset} parameter of the returned content type, None if there is no such parameter or no content type header
    @ivar location: the real location of the data: the C{Content-Location} header resolved against the final URL of the response if that header is present, the original C{name} otherwise
    @ivar last_modified_date: sets the last modified date if set (and parsable) in the header, None otherwise
    @ivar expiration_date: sets the expiration date if set (and parsable) in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
    """
    CONTENT_LOCATION = 'Content-Location'
    CONTENT_TYPE = 'Content-Type'
    LAST_MODIFIED = 'Last-Modified'
    EXPIRES = 'Expires'

    def __init__(self, name, additional_headers=None, verify=True):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call; the mapping
        passed by the caller is never modified
        @keyword verify: passed on unchanged to C{requests.get}; set it to False to disable the certificate verification
        @raise HTTPError: if a C{urllib.error.HTTPError} is raised while retrieving the resource
        @raise RDFaError: for any other problem that occurs while retrieving or interpreting the response
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            url = name.split('#')[0]

            # Work on a private copy: neither a shared default nor the caller's own
            # dictionary should silently acquire an 'Accept' entry.
            headers = dict(additional_headers) if additional_headers else {}
            if 'Accept' not in headers:
                headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            # For security reasons certificate verification is done by default. It can be
            # disabled for sites whose certificates are not accepted by requests...
            # NOTE(review): no status check is made on the response, ie, the body of an
            # error response (4xx/5xx) is processed like any other content -- confirm
            # that this is the intended behavior.
            r = requests.get(url, headers=headers, verify=verify)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                self.charset = ct.parmdict['charset'] if 'charset' in ct.parmdict else None
            else:
                # Check if the suffix can be used for the content type; this may be important
                # if the server is not properly set up to return the right mime type
                self.charset = None
                self.content_type = ""
                for suffix in preferred_suffixes:
                    if name.endswith(suffix):
                        self.content_type = preferred_suffixes[suffix]
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                # The header value may be relative: resolve it against the final URL of the response
                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            # Artificial default, used for caching purposes: one day from now
            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except Exception:
                    # The Expires date format was wrong, sorry, forget it and keep the default...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except Exception:
                    # The last modified date format was wrong, sorry, forget it...
                    pass

        except urllib_HTTPError as e:
            # The package level import is done here, at call time, and not at the top of the module
            from . import HTTPError
            # Each entry of the responses table is a (short message, long message) pair
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code)
        except Exception as e:
            from . import RDFaError
            raise RDFaError('%s' % e) from e

#########################################################################################################

# Characters that the URI quoting leaves untouched (the 'safe' set handed over to
# urllib's quote); every other special character is replaced by its %.. equivalent.
# Note that the backslash is part of this set.
_unquotedChars = ':/\\?=#~'
# Whitespace characters whose presence in a URI is worth a warning
_warnChars = [' ', '\n', '\r', '\t']

def quote_URI(uri, options=None):
    """
    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the uri (after the leading and trailing white spaces have been removed), an extra
    warning is also generated.
    @param uri: URI
    @param options: if not None, the object whose C{add_warning} method receives the possible warning
    @type options: L{Options<pyRdfa.Options>}
    @return: the quoted version of the stripped URI
    """
    suri = uri.strip()
    # At most one warning is issued, regardless of the number of unusual characters
    if options is not None and any(c in suri for c in _warnChars):
        # The message template is only needed (hence imported) when a warning is really issued
        from . import err_unusual_char_in_URI
        options.add_warning(err_unusual_char_in_URI % suri)
    return quote(suri, _unquotedChars)
    
#########################################################################################################
    
def create_file_name(uri):
    """
    Turn an (absolute) URI into a string usable as a file name; used, eg, to name the cached copy of a vocabulary file.

    The URI is stripped and quoted first (keeping the characters of L{_unquotedChars} as they are); then
    a number of potentially dangerous characters are replaced by an underscore.
    @param uri: URI
    @return: the generated file name
    """
    # Every character listed here ends up as '_' in the file name
    to_underscore = str.maketrans(dict.fromkeys(' %-+/?:=#', '_'))
    quoted = quote(uri.strip(), _unquotedChars)
    return quoted.translate(to_underscore)

#########################################################################################################
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.

    The attribute names can be given either as separate arguments or as one single tuple or list; in
    the latter case only that first argument is considered.
    @param node: DOM element node
    @param args: possible attribute names
    @return: True if at least one of the attributes is present, False otherwise (including the case
    when no attribute name is provided)
    @rtype: Boolean
    """
    if not args:
        # Nothing to look for; return a real boolean, just like for an empty list of names
        return False
    names = args[0] if isinstance(args[0], (tuple, list)) else args
    # any() stops at the first attribute that is found
    return any(node.hasAttribute(attr) for attr in names)

#########################################################################################################
def traverse_tree(node, func):
    """Walk through the element tree rooted at C{node}, calling C{func} on each element on the way
    (a parent is always handled before its children).
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a true value, the children of that node are not visited.
    """
    prune = func(node)
    if not prune:
        for child in node.childNodes:
            # Text, comment, etc, nodes are of no interest
            if child.nodeType != node.ELEMENT_NODE:
                continue
            traverse_tree(child, func)

#########################################################################################################
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    The node itself is left untouched: all the extra attributes are set on a deep copy of it.

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    # Work on a deep copy, the original tree must not be decorated
    node = inode.cloneNode(True)
    # Decorate the element with namespaces value and, optionally, base
    if base:
        node.setAttribute("xml:base", state.base)
    if xmlns:
        namespaces = state.term_or_curie.xmlns
        for prefix in namespaces:
            attr_name = "xmlns:%s" % prefix
            # A declaration that is already on the node takes precedence
            if not node.hasAttribute(attr_name):
                node.setAttribute(attr_name, "%s" % namespaces[prefix])
        # Set the default namespace, if not done (and is available)
        if state.defaultNS is not None and not node.getAttribute("xmlns"):
            node.setAttribute("xmlns", state.defaultNS)
    return node.toxml()

#########################################################################################################

def dump(node):
    """
    Debugging aid: print the XML serialization of the tree starting at node (generated without any
    extra indentation or line breaks).

    @param node: DOM node
    """
    serialized = node.toprettyxml(indent="", newl="")
    print(serialized)