#! /usr/bin/env python3
# :Copyright: © 2024 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# Revision: $Revision: 10136 $
# Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $
"""A Docutils-XML parser.
Provisional:
The API is not fixed yet.
Defined objects may be renamed or changed
in any Docutils release without prior notice.
"""
from __future__ import annotations
__docformat__ = 'reStructuredText'
import re
import xml.etree.ElementTree as ET
from docutils import frontend, nodes, parsers, utils
class Parser(parsers.Parser):
    """Parser component that reads Docutils XML."""

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)
    settings_default_overrides = {
        'doctitle_xform': False,
        'validate': True,
    }

    def parse(self, inputstring, document) -> None:
        """
        Convert the XML source `inputstring` and store the result in `document`.

        :inputstring: XML source.
        :document: the "document tree" to populate.

        Provisional.
        """
        self.setup_parse(inputstring, document)
        root_node = parse_element(inputstring, document)
        # For a <document> root element, `parse_element()` populates and
        # returns `document` itself; any other root node is added as a child.
        if not isinstance(root_node, nodes.document):
            document.append(root_node)
        self.finish_parse()
class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """Placeholder node for an element type the XML parser does not know."""

    # No restrictions: any number of element and text children is accepted.
    content_model = (
        ((nodes.Element, nodes.Text), '*'),
    )
def parse_element(inputstring, document=None):
    """
    Convert "Docutils XML" in `inputstring` to a `nodes.Element` instance.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    The source is fed to the XML parser line by line; every element is
    tagged with the number of the line that produced its "start" event
    (stored in the internal attribute "source line").

    An `xml.etree.ElementTree.ParseError` is re-raised if no `document`
    is given. Otherwise it is reported as an error via the document's
    reporter, parsing stops, and the elements seen so far are converted.

    Caution:
      The function does not detect invalid XML.

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<tip><hint>text</hint></tip>')
        node.validate()

    Provisional.
    """
    parser = ET.XMLPullParser(events=('start',))
    root = None
    source_lines = inputstring.splitlines(keepends=True)
    for lineno, source_line in enumerate(source_lines, start=1):
        try:
            parser.feed(source_line)
            for _event, element in parser.read_events():
                # The first element ever reported is the root element.
                if root is None:
                    root = element
                element.attrib['source line'] = str(lineno)
        except ET.ParseError as error:
            if document is None:
                raise
            document.reporter.error(f'XML parse error: {error}.',
                                    source=document.settings._source,
                                    line=error.position[0])
            break
    return element2node(root, document)
def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element (or None, if no element was found).
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.
               The flag is switched off for the content of fixed-text,
               literal, and unknown elements.

               TODO: do we need an "unindent" configuration setting?

    Return a `docutils.nodes.Element` instance:

    * `document`, if the element's tag is "document",
    * an `Unknown` instance (and a reporter warning), if the tag does not
      name a `nodes.Element` subclass in `docutils.nodes`,
    * a paragraph wrapping a "problematic" node, if `element` is None,
    * an instance of the node class named by the tag otherwise.

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        # Was a no-op comparison (``==``) before; assignment is intended.
        document.source = 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)

    # Get the corresponding `nodes.Element` instance:
    try:
        nodeclass = getattr(nodes, element.tag)
        if not issubclass(nodeclass, nodes.Element):
            nodeclass = Unknown
    except (AttributeError, TypeError):
        # AttributeError: `docutils.nodes` has no object with this name.
        # TypeError: the name exists but is not a class (function, module,
        # constant, ...), so `issubclass()` rejects it.
        nodeclass = Unknown
    if nodeclass == nodes.document:
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()
    # "source line" is the internal marker set by `parse_element()`;
    # elements from other sources may lack it.
    source_line = element.get('source line')
    if source_line is not None:
        node.line = int(source_line)
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            # skip duplicate attributes with namespace URL
            # and the internal line number marker
            continue
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            # No validator for `key` or conversion failed: store the raw
            # value (split into a list for list attributes).
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']), document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
    # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    unindent = unindent and not isinstance(
        node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their tailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node
def append_text(node, text, unindent) -> None:
    """
    Wrap `text` in a `nodes.Text` instance and append it to `node`.

    :node:     element that receives the text.
    :text:     character data; nothing is appended if it is empty or None.
    :unindent: if true, drop the spaces that follow each line break.

    For a `nodes.TextElement`, the text is appended as-is (whitespace
    included). For any other node, whitespace-only text is skipped as
    formatting whitespace; other text is appended stripped of surrounding
    whitespace (the node becomes invalid!).
    """
    if not text:
        return
    if unindent:
        text = re.sub('\n +', '\n', text)
    if isinstance(node, nodes.TextElement):
        node.append(nodes.Text(text))
        return
    # no TextElement: ignore formatting whitespace
    stripped = text.strip()
    if stripped:
        node.append(nodes.Text(stripped))
|