1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
|
# -*- coding: utf-8 -*-
"""
Various utilities for pyRdfa.
Most of the utilities are straightforward.
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
"""
"""
$Id: utils.py,v 1.18 2016/12/08 10:13:34 ivan Exp $
$Date: 2016/12/08 10:13:34 $
"""
import os, os.path, sys, datetime, socket
# Python 3 vs. 2 switch
if sys.version_info[0] >= 3 :
from urllib.request import Request
from urllib.parse import urljoin, quote
from http.server import BaseHTTPRequestHandler
from urllib.error import HTTPError as urllib_HTTPError
else :
from urllib2 import Request
from urllib2 import HTTPError as urllib_HTTPError
from urlparse import urljoin
from urllib import quote
from BaseHTTPServer import BaseHTTPRequestHandler
from .extras.httpheader import content_type, parse_http_datetime
import rdflib
if rdflib.__version__ >= "3.0.0" :
from rdflib import RDF as ns_rdf
else :
from rdflib.RDF import RDFNS as ns_rdf
from .host import HostLanguage, preferred_suffixes
#########################################################################################################
# Handling URIs
class URIOpener:
    """A wrapper around an HTTP C{GET} request (issued through the C{requests} package) to open a resource.
    Beyond accessing the data itself, the class sets a number of instance variables that might be relevant
    for processing.

    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
    for C{file:///} URI-s). If none of these works, the content type is empty.

    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

    @ivar data: the body of the response, ie, the C{content} attribute of the C{requests} response
    @ivar headers: the return headers as sent back by the server
    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
    @ivar charset: the C{charset} parameter of the returned content type, None if there is none
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
    """
    CONTENT_LOCATION = 'Content-Location'
    CONTENT_TYPE = 'Content-Type'
    LAST_MODIFIED = 'Last-Modified'
    EXPIRES = 'Expires'

    def __init__(self, name, additional_headers=None):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call. The mapping is
        copied before the default C{Accept} header is added, ie, the caller's object is never modified.
        @raise HTTPError: if the underlying call raises a urllib HTTP error
        @raise RDFaError: for any other error raised while retrieving or interpreting the resource
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            url = name.split('#')[0]
            if socket.getfqdn().endswith('.w3.org'):
                # Extra URL safety check, only performed when running on a W3C host
                import checkremote
                checkremote.check_url_safety(url)

            # Work on a private copy: adding the default 'Accept' header must leak neither into the
            # caller's dictionary nor into a default value shared by all calls
            headers = dict(additional_headers) if additional_headers else {}
            if 'Accept' not in headers:
                headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            # Switching off the verification is not cool. But, at least for now, too many
            # sites still go wrong because the certificates are not o.k. with request...
            # NOTE(review): verify=False disables TLS certificate validation for every retrieval
            r = requests.get(url, headers=headers, verify=False)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                if 'charset' in ct.parmdict:
                    self.charset = ct.parmdict['charset']
                else:
                    self.charset = None
            else:
                # check if the suffix can be used for the content type; this may be important
                # for file:// type URI or if the server is not properly set up to return the right
                # mime type
                self.charset = None
                self.content_type = ""
                for suffix in preferred_suffixes:
                    if name.endswith(suffix):
                        self.content_type = preferred_suffixes[suffix]
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                # The header may be relative; resolve it against the final URL of the response
                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            # Artificial default (used for caching): now, in UTC, plus one day
            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except Exception:
                    # The Expires date format was wrong, sorry, forget it; the default above stays
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except Exception:
                    # The last modified date format was wrong, sorry, forget it...
                    pass
        except urllib_HTTPError as e:
            # NOTE(review): the retrieval goes through requests, which has its own exception classes;
            # this clause presumably survives from an earlier urllib based version - confirm before removing
            from . import HTTPError
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code)
        except Exception as e:
            # Everything else (including a missing 'requests' package) is reported as a generic RDFa error
            from . import RDFaError
            raise RDFaError('%s' % e)
#########################################################################################################
# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
# special characters are converted to their %.. equivalents for namespace prefixes
_unquotedChars = ':/\?=#~'
_warnChars = [' ','\n','\r','\t']
def quote_URI(uri, options=None):
    """
    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the uri, an extra warning is also generated.

    Leading and trailing white space is stripped before anything else, ie, only white space I{inside}
    the URI leads to a warning.

    @param uri: URI
    @param options: if not None, the object on which the warning is registered (via C{add_warning})
    @type options: L{Options<pyRdfa.Options>}
    @return: the stripped and quoted URI
    """
    from . import err_unusual_char_in_URI
    suri = uri.strip()
    # At most one warning per URI, however many suspicious characters it contains
    if options is not None and any(c in suri for c in _warnChars):
        options.add_warning(err_unusual_char_in_URI % suri)
    return quote(suri, _unquotedChars)
#########################################################################################################
def create_file_name(uri):
    """
    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.

    The URI is stripped and quoted first (keeping the characters of L{_unquotedChars}); then a fixed set
    of potentially dangerous characters is replaced, one by one, by an underscore.

    @param uri: (absolute) URI
    @return: string usable as a file name
    """
    file_name = quote(uri.strip(), _unquotedChars)
    # Remove some potentially dangerous characters
    for unsafe in (' ', '%', '-', '+', '/', '?', ':', '=', '#'):
        file_name = file_name.replace(unsafe, '_')
    return file_name
#########################################################################################################
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.

    The attribute names can be given either as separate positional arguments or as one single tuple
    or list in the first positional argument. Without any attribute name the answer is False.

    @param node: DOM element node
    @param args: possible attribute names
    @return: True or False
    @rtype: Boolean
    """
    if not args:
        # Nothing to look for; a real Boolean is returned, as for an empty list of names
        return False
    # A tuple/list as first argument holds the names; otherwise the arguments themselves are the names
    names = args[0] if isinstance(args[0], (tuple, list)) else args
    return any(node.hasAttribute(attr) for attr in names)
#########################################################################################################
def traverse_tree(node, func):
    """Walk through the element tree rooted at C{node}, parents before children, calling C{func} on each element.

    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function
    returns a true value, the children of that node are not visited.
    """
    if not func(node):
        for child in node.childNodes:
            # Only element children are followed (text, comment, etc, nodes are skipped)
            if child.nodeType != node.ELEMENT_NODE:
                continue
            traverse_tree(child, func)
#########################################################################################################
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    The serialization is done on a deep copy of the node, ie, C{inode} itself is not modified.

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    node = inode.cloneNode(True)
    # Decorate the element with namespaces value and, optionally, base
    if base:
        node.setAttribute("xml:base", state.base)
    if xmlns:
        for prefix in state.term_or_curie.xmlns:
            # A declaration already present on the node takes precedence
            if not node.hasAttribute("xmlns:%s" % prefix):
                node.setAttribute("xmlns:%s" % prefix, "%s" % state.term_or_curie.xmlns[prefix])
        # Set the default namespace, if not done (and is available)
        if not node.getAttribute("xmlns") and state.defaultNS is not None:
            node.setAttribute("xmlns", state.defaultNS)
    if sys.version_info[0] >= 3:
        return node.toxml()
    else:
        # Python 2: serialize into utf-8 encoded bytes and decode, to get a unicode string back
        q = node.toxml(encoding='utf-8')
        return unicode(q, encoding='utf-8')
#########################################################################################################
def dump(node):
    """
    Debugging aid: print the serialization of the tree starting at C{node}, as produced by
    C{node.toprettyxml} with an empty indentation and an empty newline string.

    @param node: DOM node
    """
    serialized = node.toprettyxml(indent="", newl="")
    print(serialized)
|