1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
|
"""
Remote document loader using aiohttp.
.. module:: jsonld.documentloader.aiohttp
:synopsis: Remote document loader using aiohttp
.. moduleauthor:: Olaf Conradi <olaf@conradi.org>
"""
import string
import urllib.parse as urllib_parse
from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL)
def aiohttp_document_loader(loop=None, secure=False, **kwargs):
    """
    Create an Asynchronous document loader using aiohttp.

    The returned loader is a synchronous callable: it drives the internal
    coroutine to completion on ``loop`` via ``loop.run_until_complete()``.

    :param loop: the event loop used for processing HTTP requests
        (default: the loop returned by ``asyncio.get_event_loop()``).
    :param secure: require all requests to use HTTPS (default: False).
    :param **kwargs: extra keyword args for the aiohttp request get() call.

    :return: the RemoteDocument loader function.
    """
    import asyncio
    import re

    import aiohttp
    # The module object is needed for prepend_base(); the file-level import
    # only brings in individual names from pyld.jsonld.
    from pyld import jsonld

    if loop is None:
        loop = asyncio.get_event_loop()

    # Request-invariant helpers, built once per loader instead of per call.
    netloc_chars = frozenset(string.ascii_letters + string.digits + '-.:')
    json_content_type = re.compile(r'^application\/(\w*\+)?json$')

    async def async_loader(url, headers):
        """
        Retrieves JSON-LD at the given URL asynchronously.

        :param url: the URL to retrieve.
        :param headers: the HTTP headers to send with the request.

        :return: the RemoteDocument.

        :raises JsonLdError: if the URL is rejected, if more than one
            context link header is present, or if any other error occurs
            while retrieving or parsing the document.
        """
        try:
            # validate URL
            pieces = urllib_parse.urlparse(url)
            # NOTE(review): `>` is a proper-superset test, so the last
            # clause only fires when the netloc contains every allowed
            # character plus at least one other. A `not <=` check was
            # probably intended, but tightening it would start rejecting
            # netlocs with userinfo or IPv6 literals -- confirm before
            # changing.
            if (not all([pieces.scheme, pieces.netloc]) or
                    pieces.scheme not in ['http', 'https'] or
                    set(pieces.netloc) > netloc_chars):
                raise JsonLdError(
                    'URL could not be dereferenced; '
                    'only "http" and "https" URLs are supported.',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            if secure and pieces.scheme != 'https':
                raise JsonLdError(
                    'URL could not be dereferenced; '
                    'secure mode enabled and '
                    'the URL\'s scheme is not "https".',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            async with aiohttp.ClientSession(loop=loop) as session:
                async with session.get(url,
                                       headers=headers,
                                       **kwargs) as response:
                    # Allow any content_type in trying to parse json
                    # similar to requests library
                    json_body = await response.json(content_type=None)
                    content_type = response.headers.get('content-type')
                    if not content_type:
                        content_type = 'application/octet-stream'
                    doc = {
                        'contentType': content_type,
                        'contextUrl': None,
                        'documentUrl': response.url.human_repr(),
                        'document': json_body
                    }
                    link_header = response.headers.get('link')
                    if link_header:
                        # Parse the header once and reuse the result for
                        # both the context and the alternate lookups.
                        links = parse_link_header(link_header)
                        linked_context = links.get(LINK_HEADER_REL)
                        # only 1 related link header permitted
                        if (linked_context and
                                content_type != 'application/ld+json'):
                            if isinstance(linked_context, list):
                                raise JsonLdError(
                                    'URL could not be dereferenced, '
                                    'it has more than one '
                                    'associated HTTP Link Header.',
                                    'jsonld.LoadDocumentError',
                                    {'url': url},
                                    code='multiple context link headers')
                            doc['contextUrl'] = linked_context['target']
                        linked_alternate = links.get('alternate')
                        # if not JSON-LD, alternate may point there
                        if (linked_alternate and
                                linked_alternate.get('type') ==
                                'application/ld+json' and
                                not json_content_type.match(content_type)):
                            doc['contentType'] = 'application/ld+json'
                            doc['documentUrl'] = jsonld.prepend_base(
                                url, linked_alternate['target'])
                    return doc
        except JsonLdError:
            # Already in the caller-facing error type; propagate untouched.
            raise
        except Exception as cause:
            raise JsonLdError(
                'Could not retrieve a JSON-LD document from the URL.',
                'jsonld.LoadDocumentError', code='loading document failed',
                cause=cause) from cause

    def loader(url, options=None):
        """
        Retrieves JSON-LD at the given URL.

        :param url: the URL to retrieve.
        :param options: optional dict; its 'headers' entry, when present,
            replaces the default Accept header sent with the request.

        :return: the RemoteDocument.
        """
        if options is None:
            options = {}
        headers = options.get(
            'headers',
            {'Accept': 'application/ld+json, application/json'})
        return loop.run_until_complete(async_loader(url, headers))

    return loader
|