File: DataSetValidator.py

package info (click to toggle)
python-pbcore 1.6.5%2Bdfsg-1
links: PTS, VCS
area: main
in suites: buster
size: 19,168 kB
sloc: python: 25,497; xml: 2,846; makefile: 251; sh: 24
file content (90 lines) | stat: -rwxr-xr-x 3,182 bytes
# Author: Martin D. Smith


"""Validate DataSet XML files"""
from __future__ import absolute_import
from __future__ import print_function

import os
import re
from urlparse import urlparse
from urllib import unquote
import xml.etree.ElementTree as ET
import logging

XMLNS = "http://pacificbiosciences.com/PacBioDataModel.xsd"

log = logging.getLogger(__name__)

def validateResources(xmlroot, relTo='.'):
    """Validate the resources in an XML file.

    Visits xmlroot and every element below it. For each element carrying a
    non-empty 'ResourceId' attribute, the path component of that id
    (whitespace-stripped, then URL-unquoted) must exist on disk, either as
    given, relative to relTo, or relative to '.'.

    Args:
        xmlroot: The ET root of an xml tree
        relTo: ('.') The path relative to which resources may reside. This will
               work poorly if relTo is not set to the dirname of the incoming
               XML file.

    Raises:
        IOError: if a referenced resource is not found in any of the
                 candidate locations.
    """
    stack = [xmlroot]
    while stack:
        element = stack.pop()
        # Element.getchildren() was removed in Python 3.9; list(element)
        # yields the same direct children on every interpreter version.
        stack.extend(list(element))
        resId = element.get('ResourceId')
        if not resId:
            continue
        rfn = unquote(urlparse(resId).path.strip())
        if os.path.exists(rfn):
            continue
        # Built lazily so the fallbacks are only joined when actually needed.
        fallbacks = (os.path.join(base, rfn) for base in (relTo, '.'))
        if not any(os.path.exists(path) for path in fallbacks):
            raise IOError("{f} not found".format(f=rfn))

def validateLxml(xml_fn, xsd_fn):
    """Validate an XML document against an XSD schema using lxml.

    Schema violations are not raised: the schema's error log is printed
    instead. If lxml cannot be imported, a debug message is logged and no
    validation takes place.

    Args:
        xml_fn: The XML source handed to lxml's etree.parse
        xsd_fn: The XSD source handed to lxml's etree.parse
    """
    try:
        from lxml import etree
        xsd_tree = etree.parse(xsd_fn)
        validator = etree.XMLSchema(xsd_tree)
        document = etree.parse(xml_fn)
        is_valid = validator.validate(document)
        if not is_valid:
            print(validator.error_log)
    except ImportError:
        log.debug('lxml not found, validation disabled')

def validateMiniXsv(xml_fn, xsd_fn):
    """Validate an XML document against an XSD schema using minixsv.

    If minixsv cannot be imported, a debug message is logged and no
    validation takes place.

    Args:
        xml_fn: The XML input passed to pyxsval.parseAndValidate
        xsd_fn: The XSD input passed to pyxsval.parseAndValidate
    """
    try:
        from minixsv import pyxsval
        parse_and_validate = pyxsval.parseAndValidate
        parse_and_validate(xml_fn, xsd_fn)
    except ImportError:
        log.debug('minixsv not found, validation disabled')

def validateXml(xmlroot, skipResources=False, relTo='.'):
    """Validate a parsed DataSet XML tree.

    Unless skipResources is set, the ResourceId references are checked first
    via validateResources. The tree is then serialized and handed to the PyXb
    DataSetXsd bindings; if those bindings cannot be imported, an info
    message is logged and schema validation is skipped.

    Args:
        xmlroot: The ET root of an xml tree
        skipResources: (False) Skip the ResourceId existence checks
        relTo: ('.') Passed to validateResources as the directory relative to
               which resources may reside
    """
    if not skipResources:
        validateResources(xmlroot, relTo)

    try:
        from pbcore.io.dataset.pyxb import DataSetXsd
        log.debug('Validating with PyXb')
        # Conceal the first characters of UniqueIds (and Barcodes/Pointers) if
        # they are legal numbers that would for some odd reason be considered
        # invalid. Let all illegal characters fall through to the validator.
        # NOTE(review): ET.tostring returns bytes by default under Python 3,
        # which these str patterns would reject -- confirm target interpreter.
        leadingDigitMasks = (
            ('UniqueId="[0-9]', 'UniqueId="f'),
            ('Barcode="[0-9]', 'Barcode="f'),
            ('Pointer>[0-9]', 'Pointer>f'),
        )
        maskedXml = ET.tostring(xmlroot)
        for pattern, replacement in leadingDigitMasks:
            maskedXml = re.sub(pattern, replacement, maskedXml)
        DataSetXsd.CreateFromDocument(maskedXml)
    except ImportError:
        log.info('PyXb not found, validation disabled')

def validateFile(xmlfn, skipResources=False):
    """Parse a DataSet XML file from disk and validate it.

    Args:
        xmlfn: Location of the XML file. Any value containing a ':' is treated
               as a URI: only its path component is used, stripped of
               surrounding whitespace and URL-unquoted.
        skipResources: (False) Passed through to validateXml; skips the
                       ResourceId existence checks when true.

    Returns:
        Whatever validateXml returns.

    Raises:
        IOError: if the file cannot be opened.
    """
    if ':' in xmlfn:
        # Percent-escapes (e.g. %20) must be decoded before the path is usable
        # on disk; validateResources decodes ResourceId values the same way.
        xmlfn = unquote(urlparse(xmlfn).path.strip())
    # Binary mode hands raw bytes to the parser so that the document's own
    # encoding declaration is honoured rather than the locale default.
    with open(xmlfn, 'rb') as xmlfile:
        root = ET.parse(xmlfile).getroot()
    # The tree is fully loaded, so the file need not stay open for validation.
    return validateXml(root, skipResources=skipResources,
                       relTo=os.path.dirname(xmlfn))

def validateString(xmlString, skipResources=False, relTo='.'):
    """Parse DataSet XML held in a string and validate it.

    Args:
        xmlString: The XML document text, parsed with ET.fromstring
        skipResources: (False) Passed through to validateXml
        relTo: ('.') Passed through to validateXml
    """
    root = ET.fromstring(xmlString)
    validateXml(root, skipResources=skipResources, relTo=relTo)