File: DataSetValidator.py

package info (click to toggle)
python-pbcore 2.1.2%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 6,476 kB
  • sloc: python: 13,393; xml: 2,504; makefile: 232; sh: 66
file content (100 lines) | stat: -rw-r--r-- 3,203 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Author: Martin D. Smith


"""Validate DataSet XML files"""

from urllib.parse import urlparse, unquote
import xml.etree.ElementTree as ET
import logging
import os.path as op
import os
import re

# NOTE(review): presumably the XML namespace URI of the PacBio data model;
# it is not referenced by any function in this module.
XMLNS = "http://pacificbiosciences.com/PacBioDataModel.xsd"
# Optional path to an XSD schema, read once at import time from the
# PB_DATASET_XSD environment variable. It is None when the variable is unset,
# in which case validateFile/validateString skip schema validation by default.
XSD_FILE = os.environ.get("PB_DATASET_XSD", None)

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


def validateResources(xmlroot, relTo='.'):
    """Validate the resources in an XML file.

    Walks every element of the tree (the root included) and, for each element
    carrying a non-empty ``ResourceId`` attribute, checks that the file it
    names exists. The ``ResourceId`` is treated as a URI: only its path
    component is used, stripped of surrounding whitespace and percent-decoded.
    A resource counts as found if that path exists as given, relative to
    ``relTo``, or relative to the current directory.

    Args:
        xmlroot: The ET root of an xml tree
        relTo: ('.') The path relative to which resources may reside. This will
               work poorly if relTo is not set to the dirname of the incoming
               XML file.

    Raises:
        IOError: If a referenced resource cannot be found and the element's
            tag (namespace removed) is not one of the ignored tags. Missing
            resources on ignored tags only produce a warning in the log.
    """
    # FIXME hacky workaround to avoid crashing on a field that was defined
    # improperly
    IGNORE_RESOURCES = {"BioSamplesCsv"}
    stack = [xmlroot]
    while stack:
        element = stack.pop()
        # Iterating an element yields its direct children, so this queues the
        # whole subtree for inspection.
        stack.extend(element)
        resId = element.get('ResourceId')
        if not resId:
            continue
        # Parse the URI once; only the decoded path component is looked up.
        rfn = unquote(urlparse(resId).path.strip())
        # Short-circuit so later candidates are only built when needed.
        if (os.path.exists(rfn)
                or os.path.exists(os.path.join(relTo, rfn))
                or os.path.exists(os.path.join('.', rfn))):
            continue
        # Drop the "{namespace}" prefix ElementTree puts in front of tags.
        tag_name = re.sub(r"\{.*\}", "", element.tag)
        if tag_name in IGNORE_RESOURCES:
            log.warning("%s not found", rfn)
        else:
            raise IOError("{f} not found".format(f=rfn))


def validateLxml(xml_fn, xsd_fn):
    """Validate an XML document against an XSD schema using lxml.

    The schema is built from ``xsd_fn`` and the document is parsed from
    ``xml_fn``. If the document does not validate, the schema's error log is
    printed to standard output; no exception is raised for that case.

    If an ImportError occurs (typically because lxml is not installed), a
    debug message is logged and the function returns without validating.

    Args:
        xml_fn: The XML document to check, as accepted by ``lxml.etree.parse``
        xsd_fn: The XSD schema, as accepted by ``lxml.etree.parse``
    """
    try:
        from lxml import etree as lxml_etree

        # The schema is loaded before the document, so a bad schema is
        # reported first.
        xsd_tree = lxml_etree.parse(xsd_fn)
        validator = lxml_etree.XMLSchema(xsd_tree)
        document = lxml_etree.parse(xml_fn)
        if validator.validate(document):
            return
        print(validator.error_log)
    except ImportError:
        log.debug('lxml not found, validation disabled')


def validateMiniXsv(xml_fn, xsd_fn):
    """Validate an XML document against an XSD schema using minixsv.

    Delegates to ``pyxsval.parseAndValidate``; any error it raises
    propagates to the caller. If an ImportError occurs (typically because
    minixsv is not installed), a debug message is logged and nothing is
    validated.

    Args:
        xml_fn: The XML document to check
        xsd_fn: The XSD schema to check it against
    """
    try:
        from minixsv import pyxsval as xsv_validator

        xsv_validator.parseAndValidate(xml_fn, xsd_fn)
    except ImportError:
        log.debug('minixsv not found, validation disabled')


def validateXmlschema(xml_src, xsd_file):
    """Validate XML against an XSD schema using the xmlschema package.

    Builds an ``xmlschema.XMLSchema`` from ``xsd_file`` and calls its
    ``validate`` method on ``xml_src``; validation errors raised there
    propagate to the caller. If an ImportError occurs (typically because
    xmlschema is not installed), a debug message is logged and nothing is
    validated.

    Args:
        xml_src: The XML to check, in any form ``XMLSchema.validate`` accepts
        xsd_file: The XSD schema to check it against
    """
    try:
        import xmlschema as xs

        xs.XMLSchema(xsd_file).validate(xml_src)
    except ImportError:
        log.debug("xmlschema not found, validation disabled")


def validateXml(xmlroot, skipResources=False, relTo='.'):
    """Validate a parsed XML tree.

    The only check performed here is resource validation, and it is skipped
    entirely when ``skipResources`` is true.

    Args:
        xmlroot: The ET root of an xml tree
        skipResources: (False) If true, do not check referenced resources
        relTo: ('.') The path relative to which resources may reside
    """
    if skipResources:
        return
    validateResources(xmlroot, relTo)


def validateFile(xmlfn, skipResources=False, xsd_file=XSD_FILE):
    """Validate a DataSet XML file on disk.

    Parses the file, validates its resources relative to the file's own
    directory (unless ``skipResources`` is true) and, when an XSD is
    supplied, validates the file against that schema.

    Args:
        xmlfn: Path to the XML file. If it contains a ':' it is treated as a
               URI and only its path component is used.
        skipResources: (False) If true, do not check referenced resources
        xsd_file: Path to an XSD schema, or None to skip schema validation.
                  Defaults to the module-level XSD_FILE.

    Raises:
        IOError: If the file cannot be opened or a referenced resource is
            missing.
    """
    if ':' in xmlfn:
        xmlfn = urlparse(xmlfn).path.strip()
    # Only the parse needs the file handle; release it before validating.
    with open(xmlfn, 'r') as xmlfile:
        root = ET.parse(xmlfile).getroot()
    validateXml(root,
                skipResources=skipResources,
                relTo=os.path.dirname(xmlfn))
    if xsd_file is not None:
        # Hand the schema validator the file path. The previous code passed
        # an undefined name (xmlString) here and raised NameError.
        validateXmlschema(xmlfn, xsd_file)


def validateString(xmlString, skipResources=False, relTo='.', xsd_file=XSD_FILE):
    """Validate DataSet XML held in a string.

    Parses the string, validates its resources (unless ``skipResources`` is
    true) and, when an XSD is supplied, validates the same string against
    that schema.

    Args:
        xmlString: The XML document as a string
        skipResources: (False) If true, do not check referenced resources
        relTo: ('.') The path relative to which resources may reside
        xsd_file: Path to an XSD schema, or None to skip schema validation.
                  Defaults to the module-level XSD_FILE.
    """
    root = ET.fromstring(xmlString)
    validateXml(root, skipResources, relTo)
    if xsd_file is None:
        return
    validateXmlschema(xmlString, xsd_file)