File: daps-xmlwellformed

package info (click to toggle)
daps 3.3.2%2Bcleaned1-7
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,540 kB
  • sloc: xml: 9,773; sh: 3,059; python: 1,322; lisp: 380; makefile: 239
file content (233 lines) | stat: -rwxr-xr-x 7,106 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python3
#
"""Performs a well-formedness check on XML.

This script is needed as xmllint cannot ignore duplicated IDs.
This tool:

 * Processes XIncludes (if requested using the option --xinclude)
 * Ignores non-unique IDs (attributes xml:id or id)
 * Outputs errors about undeclared entities
"""

__author__ = "Thomas Schraitle"
__version__ = "0.3.1"

import argparse
import enum
import logging
from io import StringIO
import re
import sys

from lxml import etree


ROOTLOGGER = "lxmlerrors"
# FORMAT refers to the non-standard record fields "file", "line" and "col".
# Every call on this logger therefore has to supply them through the
# ``extra`` keyword, for example:
#   logger.error("...", extra=dict(file="foo.xml", line=1, col=2))
# The named groups of ERROR_PATTERN provide exactly these keys (and more).
FORMAT = "%(file)s:\n    %(levelname)s: %(message)s (line %(line)s column %(col)s)"

# User-facing logger: a stream handler without an explicit stream, limited
# to records of level ERROR and above, formatted with FORMAT.
logger = logging.getLogger(ROOTLOGGER)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(FORMAT))
ch.setLevel(logging.ERROR)
logger.addHandler(ch)

# Refuse to run with an lxml older than 4.4.2 (exit status 10).
if etree.LXML_VERSION < (4, 4, 2):
    logger.fatal(
        "Need a minimum version of 4.4.2 of lxml, got %s",
        ".".join(map(str, etree.LXML_VERSION)),
        # No line/column applies here, but FORMAT still needs the keys.
        extra={"file": __file__, "line": "", "col": ""},
    )
    sys.exit(10)


# Format of the error log lines produced by lxml/libxml2:
#   <file>:<line>:<column>:<level>:<domain>:<type>: <message>
# for example:
#   foo.xml:3:16:FATAL:PARSER:ERR_UNDECLARED_ENTITY: Entity 'x' not defined
#
# * The file part is matched greedily, so file names containing ":" work.
# * Domain and type names may contain digits (e.g. "C14N", "I18N",
#   "I18N_CONV_FAILED"), so the character classes must not be limited to
#   upper-case letters.
# * NOTE(review): lxml appears to report names it cannot resolve as
#   lower-case "unknown"; lower-case letters are accepted for that reason.
# * The group names "origin" (libxml2 domain) and "domain" (libxml2 error
#   type) are historical and kept because they are passed on as log record
#   attributes.
ERROR_PATTERN = re.compile(
    r"(?P<file>.*)"
    r":(?P<line>\d+)"
    r":(?P<col>\d+)"
    r":(?P<level>[A-Za-z]+)"
    r":(?P<origin>[A-Za-z0-9]+)"
    r":(?P<domain>[A-Za-z0-9_]+)"
    r":\s*"
    r"(?P<message>.*)$"
)


class ExitCode(enum.IntEnum):
    """Exit status values used by this script.

    The regular codes are multiples of ten starting at 0 (``ok``);
    ``unknown`` is set apart at 200.
    """
    ok = 0
    syntax = 10
    entities = 20
    xinclude = 30
    multiple = 40
    file_not_found = 50
    unknown = 200


def extract_log(stream):
    """Yield one record dict per non-blank line of the captured log.

    Each line is matched against ERROR_PATTERN. The yielded dict has one
    key per named group of that pattern (file, line, col, level, origin,
    domain, message).

    A line that does not follow the structured format (for example the
    continuation of a multi-line message) is not dropped and does not
    abort the iteration: it is yielded as a record whose ``message`` is
    the raw line, whose ``level`` is ``"ERROR"`` and whose remaining
    fields are empty strings.

    :param stream: object with a ``getvalue()`` method returning the
        collected log text (e.g. :class:`io.StringIO`)
    :return: generator of dicts, one per log line
    """
    for line in stream.getvalue().split("\n"):
        if not line.strip():
            continue
        match = ERROR_PATTERN.search(line)
        if match is not None:
            yield match.groupdict()
            continue
        # search() returned None: keep the text instead of failing with
        # AttributeError, and provide every key the consumers rely on.
        record = dict.fromkeys(ERROR_PATTERN.groupindex, "")
        record.update(level="ERROR", message=line)
        yield record


def print_error_msg(stream) -> int:
    """Forward every entry found in the stream to the module logger.

    Each entry produced by ``extract_log(stream)`` is logged with the
    severity matching its "level" field (ERROR if the level is not known).
    The "message" field becomes the log message; all remaining fields are
    handed over as ``extra`` record attributes.

    :param stream: the stream we want to extract the messages from
    :return: the number of entries that were logged
    """
    severity_by_name = {
        "FATAL": logging.FATAL,  # etree.ErrorLevels.FATAL
        "ERROR": logging.ERROR,  # etree.ErrorLevels.ERROR
        "WARNING": logging.WARNING,  # etree.ErrorLevels.WARNING
    }
    count = 0
    for count, record in enumerate(extract_log(stream), start=1):
        severity = severity_by_name.get(record["level"], logging.ERROR)
        text = record.pop("message")
        logger.log(severity, text, extra=record)
    return count


def check_wellformedness(xmlfile: str, xinclude: bool = True) -> int:
    """Checks a file for well-formedness

    The file is parsed with ``collect_ids=False`` so that non-unique IDs
    do not cause problems. Messages emitted by lxml while parsing are
    captured in a temporary in-memory stream and afterwards reported
    through :func:`print_error_msg`.

    Relies on the ``collect_ids`` option of :class:`lxml.etree.XMLParser`;
    the minimum lxml version is enforced at module import time.

    :param xmlfile: filename to XML file
    :param xinclude: do xinclude processing (default: True) or not
    :return: 0 (everything ok) or != 0 (some problem); the non-zero
        values are those of :class:`ExitCode`
    :rtype: int
    """
    result = ExitCode.ok

    log_stream = StringIO()
    log = logging.getLogger(ROOTLOGGER).getChild("wellformedness")
    log.propagate = False
    handler = logging.StreamHandler(log_stream)
    handler.setLevel(logging.WARNING)
    log.addHandler(handler)

    try:
        # Route lxml's global error log into our capturing logger.
        etree.use_global_python_log(etree.PyErrorLog(xmlfile, log))
        # We don't want to collect all IDs to avoid problems when
        # IDs are non-unique:
        xmlparser = etree.XMLParser(collect_ids=False)

        try:
            tree = etree.parse(xmlfile, parser=xmlparser)

            if xinclude:
                tree.xinclude()

        except OSError:
            # Write artificial error message to our log stream, using the
            # same layout as the lxml entries so it is reported like them:
            log_stream.write(f"{xmlfile}:0:0:FATAL:PARSER:FILE_NOT_FOUND:File does not exist\n")
            result = ExitCode.file_not_found.value

        except etree.XMLSyntaxError:
            result = ExitCode.syntax.value

        except etree.XIncludeError:
            result = ExitCode.xinclude.value
    finally:
        # The child logger is shared between calls. Without detaching the
        # handler, every checked file would leave another handler (and its
        # buffer) behind, and each later message would be written to all of
        # them.
        log.removeHandler(handler)

    idx = print_error_msg(log_stream)

    # HACK:
    # For some unknown reason, when parsing MAIN.SLEDS.xml with an unknown
    # entity, it doesn't raise XIncludeError or XMLSyntaxError exceptions. :-(
    #
    # If the return code hasn't changed but there are error messages,
    # make sure to return some error code:
    if idx and result == ExitCode.ok:
        result = ExitCode.multiple
    return result


def parse_cli(args=None) -> argparse.Namespace:
    """Build the command line parser and parse the given arguments

    The first line of the module docstring is used as the description,
    the rest of it as the epilog of the help output.

    :param list args: arguments to parse; None lets argparse fall back
        to the process arguments
    :return: parsed arguments
    :rtype: :class:`argparse.Namespace`
    """
    doc_parts = __doc__.split("\n", 1)
    parser = argparse.ArgumentParser(
        description=doc_parts[0],
        epilog=doc_parts[-1],
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    parser.add_argument(
        "--xinclude",
        action="store_true",
        default=False,
        help="Do XInclude processing",
    )
    parser.add_argument(
        "-W",
        "--warnings-as-errors",
        action="store_true",
        default=False,
        help=(
            "Flag to set the behavior when referenced files with "
            "XIncludes cannot be found; (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "-S",
        "--stats",
        action="store_true",
        default=False,
        help="Print statistic about successful/failed files",
    )
    parser.add_argument(
        "xmlfiles",
        nargs="+",
        help="XML file(s) to check for well-formedness",
    )
    return parser.parse_args(args)


def main(cliargs=None) -> int:
    """Entry point for the application script

    Checks every file given on the command line. The results of the
    individual checks are added up to form the return value. With
    ``--warnings-as-errors`` processing stops after the first failing
    file; that failure is still counted and reflected in the result.

    :param list cliargs: Arguments to parse or None (=use :class:`sys.argv`)
    :return: error code; 0 => everything was successful, !=0 => error
        (never larger than 255)
    :rtype: int
    """
    # Largest value that survives as a process exit status on POSIX systems.
    max_status = 255

    endresult = int(ExitCode.ok)
    failures = 0
    successes = 0

    args = parse_cli(cliargs)

    for xmlfile in args.xmlfiles:
        result = check_wellformedness(xmlfile, xinclude=args.xinclude)
        if result:
            # Record the failure *before* a possible early exit; otherwise
            # the failing file would be missing from both the statistics
            # and the return value (which could then be 0).
            failures += 1
            endresult += int(result)
            if args.warnings_as_errors:
                break
        else:
            successes += 1

    if args.stats:
        msg = (f"--- Successful Files={successes}, "
               f"Failed files={failures} ---"
               )
        print(msg, file=sys.stderr)

    # Exit statuses are truncated to 8 bits by the operating system; an
    # unbounded sum could wrap around, in the worst case to 0 ("success").
    return min(endresult, max_status)


if __name__ == "__main__":
    sys.exit(main())