File: docutils_xml.py

package info (click to toggle)
python-docutils 0.22%2Bdfsg-1
links: PTS, VCS
area: main
in suites: experimental
size: 11,448 kB
sloc: python: 53,302; lisp: 14,475; xml: 1,807; javascript: 1,032; makefile: 102; sh: 96
file content (194 lines) | stat: -rw-r--r-- 6,572 bytes
#! /usr/bin/env python3
# :Copyright: © 2024 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# Revision: $Revision: 10136 $
# Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $

"""A Docutils-XML parser.

   Provisional:
     The API is not fixed yet.
     Defined objects may be renamed or changed
     in any Docutils release without prior notice.
"""

from __future__ import annotations

__docformat__ = 'reStructuredText'

import re
import xml.etree.ElementTree as ET

from docutils import frontend, nodes, parsers, utils


class Parser(parsers.Parser):

    """A Docutils-XML parser."""

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)
    settings_default_overrides = {'doctitle_xform': False,
                                  'validate': True,
                                  }

    def parse(self, inputstring, document) -> None:
        """
        Parse `inputstring` and populate `document`, a "document tree".

        Provisional.
        """
        self.setup_parse(inputstring, document)

        node = parse_element(inputstring, document)
        if not isinstance(node, nodes.document):
            document.append(node)

        self.finish_parse()


class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """An unknown element found by the XML parser."""
    content_model = (((nodes.Element, nodes.Text), '*'),)  # no restrictions


def parse_element(inputstring, document=None):
    """
    Parse `inputstring` as "Docutils XML", return `nodes.Element` instance.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    Caution:
      The function does not detect invalid XML.

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<tip><hint>text</hint></tip>')
        node.validate()

    Provisional.
    """
    root = None
    parser = ET.XMLPullParser(events=('start',))
    for i, line in enumerate(inputstring.splitlines(keepends=True)):
        try:
            parser.feed(line)
            for event, element in parser.read_events():
                if root is None:
                    root = element
                element.attrib['source line'] = str(i+1)
        except ET.ParseError as e:
            if document is None:
                raise
            document.reporter.error(f'XML parse error: {e}.',
                                    source=document.settings._source,
                                    line=e.position[0])
            break
    return element2node(root, document)


def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.
               TODO: do we need an "unindent" configuration setting?

    Return a `docutils.nodes.Element` instance.

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        document.source == 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)
    # Get the corresponding `nodes.Element` instance:
    try:
        nodeclass = getattr(nodes, element.tag)
        if not issubclass(nodeclass, nodes.Element):
            nodeclass = Unknown
    except AttributeError:
        nodeclass = Unknown
    if nodeclass == nodes.document:
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()

    node.line = int(element.get('source line'))
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            continue  # skip duplicate attributes with namespace URL
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']), document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
    # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    unindent = unindent and not isinstance(
                   node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their tailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node


def append_text(node, text, unindent) -> None:
    # Format `text`, wrap in a TextElement and append to `node`.
    # Skip if `text` is empty or just formatting whitespace.
    if not text:
        return
    if unindent:
        text = re.sub('\n +', '\n', text)
    if isinstance(node, nodes.TextElement):
        node.append(nodes.Text(text))
    elif text.strip():
        # no TextElement: ignore formatting whitespace
        # but append other text (node becomes invalid!)
        node.append(nodes.Text(text.strip()))