File: docutils_xml.py

package info (click to toggle)
python-docutils 0.22%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 11,448 kB
  • sloc: python: 53,302; lisp: 14,475; xml: 1,807; javascript: 1,032; makefile: 102; sh: 96
file content (194 lines) | stat: -rw-r--r-- 6,572 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#! /usr/bin/env python3
# :Copyright: © 2024 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# Revision: $Revision: 10136 $
# Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $

"""A Docutils-XML parser.

   Provisional:
     The API is not fixed yet.
     Defined objects may be renamed or changed
     in any Docutils release without prior notice.
"""

from __future__ import annotations

__docformat__ = 'reStructuredText'

import re
import xml.etree.ElementTree as ET

from docutils import frontend, nodes, parsers, utils


class Parser(parsers.Parser):

    """Parser component converting Docutils XML source to a doctree."""

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    # Settings-related class attributes (evaluated outside this module).
    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)
    settings_default_overrides = {
        'doctitle_xform': False,
        'validate': True,
    }

    def parse(self, inputstring, document) -> None:
        """
        Convert the XML source `inputstring` and store it in `document`.

        The conversion is done by `parse_element()`.  If it returns
        a `nodes.document` instance, nothing else is done with the result;
        any other node is appended to `document`.

        Provisional.
        """
        self.setup_parse(inputstring, document)

        root_node = parse_element(inputstring, document)
        root_is_document = isinstance(root_node, nodes.document)
        if not root_is_document:
            # The source describes a document fragment: attach it.
            document.append(root_node)

        self.finish_parse()


class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """An unknown element found by the XML parser.

    Used by `element2node()` for XML elements whose tag does not name
    a `nodes.Element` subclass.  The converter stores the original tag
    in the instance's `tagname` and issues a warning.
    """

    content_model = (
        ((nodes.Element, nodes.Text), '*'),  # no restrictions
    )


def parse_element(inputstring, document=None):
    """
    Parse `inputstring` as "Docutils XML", return `nodes.Element` instance.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    The source is fed to the XML parser line by line.  Every element is
    tagged with a temporary "source line" attribute holding the number
    of the line after which the parser reported its start tag.
    The attribute is evaluated (and not copied) by `element2node()`.

    XML that is not well-formed is reported as error via
    `document.reporter` and the elements found so far are converted.
    This includes input that ends before the root element is closed.
    Without `document` argument, the `xml.etree.ElementTree.ParseError`
    is raised instead.

    If no element is found at all, the result of `element2node()` for
    ``None`` is returned.

    Caution:
      The function does not check whether well-formed XML is valid
      "Docutils XML".

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<tip><hint>text</hint></tip>')
        node.validate()

    Provisional.
    """
    root = None
    lineno = 0
    parser = ET.XMLPullParser(events=('start',))

    def tag_started_elements():
        # Store the current line number in all elements started since
        # the last call.  The first element ever seen is the root.
        # `read_events()` re-raises a parse error queued by `feed()`
        # *after* yielding the events that precede it.
        nonlocal root
        for _event, element in parser.read_events():
            if root is None:
                root = element
            element.attrib['source line'] = str(lineno)

    def report(error):
        # Pass `error` to the document's reporter (or raise it, if there
        # is no document to report to).
        if document is None:
            raise error
        document.reporter.error(f'XML parse error: {error}.',
                                source=document.settings._source,
                                line=error.position[0])

    for lineno, line in enumerate(inputstring.splitlines(keepends=True),
                                  start=1):
        try:
            parser.feed(line)
            tag_started_elements()
        except ET.ParseError as e:
            report(e)
            break
    else:
        # Signal the end of input.  This processes data the XML parser
        # may still hold back (recent Expat versions can defer parsing of
        # unfinished tokens) and detects truncated input.
        close_error = None
        try:
            parser.close()
        except ET.ParseError as e:
            close_error = e
        # Elements parsed only now still need their "source line".
        tag_started_elements()
        # Input without any element is not reported here: it is handled
        # by `element2node()` ("No XML element found.").
        if close_error is not None and root is not None:
            report(close_error)
    return element2node(root, document)


def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element or ``None``.
               If ``None``, a paragraph with a "problematic" node
               ("No XML element found.") is returned.
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.
               TODO: do we need an "unindent" configuration setting?

    The node class is looked up by the element's tag in `docutils.nodes`.
    Tags that do not name a `nodes.Element` subclass are converted to
    `Unknown` instances and a warning is issued.
    A <document> element is converted by populating `document` itself.

    The bookkeeping attribute "source line" (set by `parse_element()`)
    is stored in the node's `line` attribute, if present.
    Other attributes are converted with the matching function from
    `nodes.ATTRIBUTE_VALIDATORS`; if there is none or the conversion
    fails, the unconverted value is stored.

    Return a `docutils.nodes.Element` instance.

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        # was a comparison without effect (``==``)
        document.source = 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)
    # Get the corresponding `nodes.Element` subclass.
    # `docutils.nodes` also defines functions, constants, etc.: check for
    # a class first, as `issubclass()` raises TypeError for other objects.
    nodeclass = getattr(nodes, element.tag, None)
    if not (isinstance(nodeclass, type)
            and issubclass(nodeclass, nodes.Element)):
        nodeclass = Unknown
    if nodeclass is nodes.document:
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()

    # The line number is missing, if `element` does not come from
    # `parse_element()`: keep the node's current `line` value then.
    source_line = element.get('source line')
    if source_line is not None:
        node.line = int(source_line)
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            continue  # skip duplicate attributes with namespace URL
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']), document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
    # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    unindent = unindent and not isinstance(
                   node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their tailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node


def append_text(node, text, unindent) -> None:
    """
    Wrap `text` in a `nodes.Text` instance and append it to `node`.

    :node: the parent node.
    :text: string or ``None``; nothing is appended if it is empty.
    :unindent: if true, drop the spaces that directly follow a newline.

    `nodes.TextElement` instances get the text as it is (after optional
    unindenting).  For other nodes, surrounding whitespace is stripped
    and whitespace-only text is skipped.
    """
    if text:
        content = re.sub('\n +', '\n', text) if unindent else text
        if not isinstance(node, nodes.TextElement):
            # no TextElement: ignore formatting whitespace
            # but append other text (node becomes invalid!)
            content = content.strip()
            if not content:
                return
        node.append(nodes.Text(content))