1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
|
import io
import posixpath
import re
from collections import deque
import xmlschema
from lxml import etree
from tlz.dicttoolz import keymap
# Matches one ``<xsd:include schemaLocation="..."/>`` element together with any
# whitespace that precedes it. The ``location`` group captures the attribute
# value; because the value may contain neither ``"`` nor ``/``, only bare file
# names (no directory components) match.
include_re = re.compile(r'\s*<xsd:include schemaLocation="(?P<location>[^"/]+)"\s?/>')
def remove_includes(text):
    """Strip every ``xsd:include`` element matched by ``include_re``.

    Parameters
    ----------
    text : str
        Schema source text.

    Returns
    -------
    str
        ``text`` with all matches of ``include_re`` replaced by the empty
        string.
    """
    stripped, _ = include_re.subn("", text)
    return stripped
def extract_includes(text):
    """List the ``schemaLocation`` values of all ``xsd:include`` elements.

    Parameters
    ----------
    text : str
        Schema source text.

    Returns
    -------
    list of str
        The ``location`` group of every match of ``include_re``, in the order
        the matches appear in ``text``.
    """
    return [match.group("location") for match in include_re.finditer(text)]
def normalize(root, path):
    """Anchor a bare file name at ``root``; leave every other path untouched.

    Parameters
    ----------
    root : str
        Directory to prepend to bare file names.
    path : str
        POSIX-style path to normalize.

    Returns
    -------
    str
        ``path`` unchanged if it is absolute or contains a directory
        component, otherwise ``root`` joined with ``path``.
    """
    is_bare_name = not posixpath.isabs(path) and not posixpath.dirname(path)
    return posixpath.join(root, path) if is_bare_name else path
def schema_paths(mapper, root_schema):
    """Collect a schema and everything it transitively includes.

    The include graph is walked breadth-first, starting at ``root_schema``.
    Every path is fetched from ``mapper`` and scanned exactly once, even if
    several schemas include it or the includes form a cycle.

    Parameters
    ----------
    mapper : mapping
        Maps a path to the raw content of that file. The values must provide
        a ``decode()`` method returning the text (e.g. ``bytes``).
    root_schema : str
        Path (key of ``mapper``) of the schema to start from.

    Returns
    -------
    list of str
        ``root_schema`` followed by the included schema paths, in the order
        they were first discovered.

    Raises
    ------
    KeyError
        Presumably raised by ``mapper`` when a referenced schema is missing
        (depends on the mapping implementation).
    """
    pending = deque([root_schema])
    # Everything that has ever been queued. Checking at enqueue time keeps
    # duplicates out of the queue, so no file is fetched or parsed twice.
    seen = {root_schema}
    ordered = []

    while pending:
        path = pending.popleft()
        ordered.append(path)

        text = mapper[path].decode()
        # includes are resolved relative to the directory of the including file
        current_root = posixpath.dirname(path)
        for include in extract_includes(text):
            candidate = normalize(current_root, include)
            if candidate not in seen:
                seen.add(candidate)
                pending.append(candidate)

    return ordered
def open_schema(mapper, schema):
    """Open a schema and all schemas it includes from a mapping.

    The paths of the schema and its transitive includes are collected with
    ``schema_paths``. Each file is then read from ``mapper``, stripped of its
    ``xsd:include`` elements, and handed to ``xmlschema`` as an in-memory
    text stream, so ``xmlschema`` does not have to resolve the includes
    itself.

    Parameters
    ----------
    mapper : mapping
        Maps a path to the raw content of that file. The values must provide
        a ``decode()`` method returning the text (presumably an fsspec
        mapper yielding ``bytes`` -- confirm against the callers).
    schema : str
        Path (key of ``mapper``) of the root schema file.

    Returns
    -------
    xmlschema.XMLSchema
        The opened schema object.
    """
    sources = []
    for schema_file in schema_paths(mapper, schema):
        text = mapper[schema_file].decode()
        sources.append(io.StringIO(remove_includes(text)))

    return xmlschema.XMLSchema(sources)
def read_xml(mapper, path):
    """Read an XML document and decode it using the schema it references.

    The schema is located through the ``xsi:schemaLocation`` attribute of the
    document's root element. Its location is interpreted relative to the
    directory of ``path``.

    Parameters
    ----------
    mapper : mapping
        Maps a path to the raw content of that file. Used for both the XML
        document and the schema files.
    path : str
        Path (key of ``mapper``) of the XML document.

    Returns
    -------
    object
        The result of ``schema.decode`` applied to the parsed document
        (presumably nested dicts -- see the ``xmlschema`` documentation).

    Raises
    ------
    IndexError
        If the root element has no ``xsi:schemaLocation`` attribute.
    ValueError
        If ``xsi:schemaLocation`` does not consist of exactly one
        ``<namespace> <location>`` pair.
    """
    tree = etree.fromstring(mapper[path])

    # ``nsmap`` may contain a ``None`` key (the default namespace), which is
    # not usable as an XPath prefix, so it is exposed as "rcm" instead.
    namespaces = keymap(
        lambda prefix: "rcm" if prefix is None else prefix, tree.nsmap
    )
    schema_location = tree.xpath("./@xsi:schemaLocation", namespaces=namespaces)[0]

    # The attribute holds a whitespace-separated "<namespace> <location>"
    # pair. ``split()`` without arguments accepts any run of whitespace
    # (several spaces, or line breaks/tabs normalized to spaces by the XML
    # parser), whereas ``split(" ")`` only handled exactly one space.
    _, relative_schema_path = schema_location.split()

    schema_path = posixpath.normpath(
        posixpath.join(posixpath.dirname(path), relative_schema_path)
    )

    schema = open_schema(mapper, schema_path)
    return schema.decode(tree)
|