File: cxml.py

package info (click to toggle)
python-docx 0.8.11%2Bdfsg1-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 6,640 kB
  • sloc: xml: 25,311; python: 21,911; makefile: 168
file content (285 lines) | stat: -rw-r--r-- 8,334 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# encoding: utf-8

"""Parser for Compact XML Expression Language (CXEL) ('see-ex-ell').

CXEL is a compact XML specification language I made up that's useful for producing XML
element trees suitable for unit testing.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

from pyparsing import (
    alphas,
    alphanums,
    Combine,
    dblQuotedString,
    delimitedList,
    Forward,
    Group,
    Literal,
    Optional,
    removeQuotes,
    stringEnd,
    Suppress,
    Word,
)

from docx.oxml import parse_xml
from docx.oxml.ns import nsmap


# ====================================================================
# api functions
# ====================================================================

def element(cxel_str):
    """
    Return the oxml element described by the CXEL expression *cxel_str*.

    The expression is first expanded to XML text by `xml()`; that text is
    then handed to `parse_xml()` and its result is returned.
    """
    return parse_xml(xml(cxel_str))


def xml(cxel_str):
    """
    Return the XML text produced by expanding the CXEL expression
    *cxel_str*.
    """
    # The module-level parser yields a result whose `element` is the root
    # `Element`; its `xml` property renders the whole tree.
    parse_result = root_node.parseString(cxel_str)
    return parse_result.element.xml


# ====================================================================
# internals
# ====================================================================


def nsdecls(*nspfxs):
    """
    Return the concatenated ``xmlns:pfx="uri"`` declarations for *nspfxs*.

    One declaration, preceded by a single space, is produced per prefix and
    they appear in the order the prefixes were passed. The URI for each
    prefix is looked up in `nsmap`. An empty string is returned when no
    prefixes are given.
    """
    return ''.join(
        ' xmlns:%s="%s"' % (pfx, nsmap[pfx]) for pfx in nspfxs
    )


class Element(object):
    """
    Represents an XML element, having a namespace, tagname, attributes, and
    may contain either text or children (but not both) or may be empty.

    *tagname* is the (normally prefixed) tag name, e.g. ``'w:pPr'``. *attrs*
    is a sequence of ``(name, value)`` pairs rendered in the order given.
    *text* is the text content; a falsy value means "no text". Neither
    attribute values nor text are escaped when rendered.
    """

    def __init__(self, tagname, attrs, text):
        self._tagname = tagname
        self._attrs = attrs
        self._text = text
        self._children = []
        self._is_root = False
        # Initialized here, not only in `_xml()`, so `_start_tag` and
        # `_end_tag` can be read on an element that has not been rendered
        # yet instead of raising AttributeError.
        self._indent_str = ''

    def __repr__(self):
        """
        Provide a more meaningful repr value for an Element object, one that
        displays the tagname as a simple empty element, e.g. ``<w:pPr/>``.
        """
        return "<%s/>" % self._tagname

    def connect_children(self, child_node_list):
        """
        Make each of the elements appearing in *child_node_list* a child of
        this element.

        Each item of *child_node_list* is a node object whose ``element``
        attribute is the child to append. Children are appended in order.
        """
        self._children.extend(node.element for node in child_node_list)

    @classmethod
    def from_token(cls, token):
        """
        Return an ``Element`` object constructed from a parser element token.

        *token* provides ``tagname``, ``attr_list`` (an iterable of
        two-item name/value groups) and ``text``.
        """
        # Unpacking each group normalizes it to a plain 2-tuple.
        attrs = [(name, value) for name, value in token.attr_list]
        return cls(token.tagname, attrs, token.text)

    @property
    def is_root(self):
        """
        |True| if this element is the root of the tree and should include the
        namespace prefixes. |False| otherwise.
        """
        return self._is_root

    @is_root.setter
    def is_root(self, value):
        self._is_root = bool(value)

    @property
    def local_nspfxs(self):
        """
        The namespace prefixes local to this element, both on the tagname and
        all of its attributes. An empty string (``''``) is used to represent
        the default namespace for an element tag having no prefix.

        The tagname prefix comes first, followed by attribute prefixes in
        attribute order without repeats. Un-prefixed attributes and the
        ``xml`` prefix contribute nothing. A new list is returned on each
        access.
        """
        def nspfx(name, is_element=False):
            # Text before the first ':'; when there is no colon an element
            # gets '' and an attribute gets None.
            idx = name.find(':')
            if idx == -1:
                return '' if is_element else None
            return name[:idx]

        nspfxs = [nspfx(self._tagname, True)]
        for name, _ in self._attrs:
            pfx = nspfx(name)
            if pfx is None or pfx in nspfxs or pfx == "xml":
                continue
            nspfxs.append(pfx)
        return nspfxs

    @property
    def nspfxs(self):
        """
        A sequence containing each of the namespace prefixes appearing in
        this tree. Each prefix appears once and only once, and in document
        order.
        """
        # `local_nspfxs` returns a fresh list, so extending it in place does
        # not disturb any stored state.
        nspfxs = self.local_nspfxs
        for child in self._children:
            for pfx in child.nspfxs:
                if pfx not in nspfxs:
                    nspfxs.append(pfx)
        return nspfxs

    @property
    def xml(self):
        """
        The XML corresponding to the tree rooted at this element,
        pretty-printed using 2-spaces indentation at each level and with
        a trailing '\n'.
        """
        return self._xml(indent=0)

    def _xml(self, indent):
        """
        Return a string containing the XML of this element and all its
        children with a starting indent of *indent* spaces.

        As a side effect the indent string for this element (and, through
        recursion, for each descendant) is recorded for use by `_start_tag`
        and `_end_tag`. Each child is rendered two spaces deeper.
        """
        self._indent_str = ' ' * indent
        parts = [self._start_tag]
        parts.extend(child._xml(indent + 2) for child in self._children)
        parts.append(self._end_tag)
        return ''.join(parts)

    @property
    def _start_tag(self):
        """
        The text of the opening tag of this element, including attributes. If
        this is the root element, a namespace declaration for each of the
        namespace prefixes that occur in this tree is added in front of any
        attributes. If this element contains text, that text follows the
        start tag. If not, and this element has no children, an empty tag is
        returned. Otherwise, an opening tag is returned, followed by
        a newline. The tag is indented by this element's indent value in all
        cases.
        """
        _nsdecls = nsdecls(*self.nspfxs) if self.is_root else ''
        attrs = ''.join(
            ' %s="%s"' % (name, value) for name, value in self._attrs
        )
        tag = '%s<%s%s%s' % (
            self._indent_str, self._tagname, _nsdecls, attrs
        )
        if self._text:
            # Text stays on the same line; `_end_tag` closes it.
            return tag + '>%s' % self._text
        if self._children:
            return tag + '>\n'
        return tag + '/>\n'

    @property
    def _end_tag(self):
        """
        The text of the closing tag of this element, if there is one. If the
        element contains text, no leading indentation is included. An
        element with neither text nor children has no closing tag and the
        empty string is returned.
        """
        if self._text:
            return '</%s>\n' % self._tagname
        if self._children:
            return '%s</%s>\n' % (self._indent_str, self._tagname)
        return ''


# ====================================================================
# parser
# ====================================================================

# parse actions ----------------------------------

def connect_node_children(s, loc, tokens):
    """
    Attach the child nodes of a matched node to that node's element.

    Takes the three-argument ``(s, loc, tokens)`` form; only *tokens* is
    used. Its first item is the matched node, which carries the ``element``
    and ``child_node_list`` results. Nothing is returned.
    """
    matched = tokens[0]
    element, children = matched.element, matched.child_node_list
    element.connect_children(children)


def connect_root_node_children(root_node):
    """
    Attach the root node's children to its element and flag that element as
    the root of the tree.

    *root_node* carries the ``element`` and ``child_node_list`` results.
    Nothing is returned.
    """
    root_element = root_node.element
    root_element.connect_children(root_node.child_node_list)
    root_element.is_root = True


def grammar():
    """
    Build and return the pyparsing expression that parses a complete CXEL
    string.

    The returned expression matches a root element, optionally followed by
    ``/`` and its children, and then requires end-of-string. Its parse
    action connects the root's children and marks the root element, so the
    parse result's ``element`` is the root `Element` of the finished tree.
    """
    # terminals ----------------------------------
    colon = Literal(':')  # kept in the output; it is part of the tagname
    equal = Suppress('=')
    slash = Suppress('/')
    open_paren = Suppress('(')
    close_paren = Suppress(')')
    open_brace = Suppress('{')
    close_brace = Suppress('}')

    # np:tagName ---------------------------------
    nspfx = Word(alphas)
    local_name = Word(alphanums)
    tagname = Combine(nspfx + colon + local_name)

    # np:attr_name=attr_val ----------------------
    # Attribute names are letters and ':' only; values may also contain
    # digits, space and the characters % - . / : _
    attr_name = Word(alphas + ':')
    attr_val = Word(alphanums + ' %-./:_')
    attr_def = Group(attr_name + equal + attr_val)
    attr_list = open_brace + delimitedList(attr_def) + close_brace

    # Work on a copy: `setParseAction()` modifies the expression it is
    # called on, and `dblQuotedString` is a module-level object shared by
    # every user of pyparsing in the process.
    text = dblQuotedString.copy().setParseAction(removeQuotes)

    # w:jc{val=right} ----------------------------
    # tagname, then optional {attrs}, then optional "text" (default '');
    # each match is turned into an `Element` by `Element.from_token`.
    element = (
        tagname('tagname')
        + Group(Optional(attr_list))('attr_list')
        + Optional(text, default='')('text')
    ).setParseAction(Element.from_token)

    # Declared ahead of its definition because nodes and child lists refer
    # to each other recursively.
    child_node_list = Forward()

    # element, optionally followed by `/` and its own children
    node = Group(
        element('element')
        + Group(Optional(slash + child_node_list))('child_node_list')
    ).setParseAction(connect_node_children)

    # Either a parenthesized list of sibling nodes or a single node.
    child_node_list << (
        open_paren + delimitedList(node) + close_paren
        | node
    )

    # Same shape as `node`, but not wrapped in a Group, anchored to the
    # end of the string, and using the root-specific parse action.
    root_node = (
        element('element')
        + Group(Optional(slash + child_node_list))('child_node_list')
        + stringEnd
    ).setParseAction(connect_root_node_children)

    return root_node


# The CXEL parser, built once when this module is imported; `xml()` uses it
# to parse each expression.
root_node = grammar()