File: sdt.py

package info (click to toggle)
python-docxcompose 1.4.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,136 kB
  • sloc: python: 2,338; makefile: 14; xml: 5; sh: 3
file content (123 lines) | stat: -rw-r--r-- 4,643 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from docxcompose.utils import xpath
from lxml.etree import Element
from lxml.etree import QName


class StructuredDocumentTags(object):
    """Structured Document Tags (aka Content Controls).

    Helper around a document object that looks up ``w:sdt`` elements in the
    document body by their alias and reads or replaces the text of plain
    text SDTs (those having a ``w:text`` element in their ``w:sdtPr``).
    """

    # Lexical values of an on/off attribute that explicitly switch it off.
    _FALSE_VALUES = ('0', 'false', 'off')

    def __init__(self, doc):
        """Store the document to operate on.

        `doc` must expose its body element as ``doc.element.body``.
        """
        self.doc = doc

    @staticmethod
    def _xpath_literal(value):
        """Return `value` as a quoted XPath 1.0 string literal.

        XPath 1.0 has no escape sequences inside string literals, so the
        quoting style is chosen based on the quotes contained in `value`;
        if it contains both kinds, the literal is assembled with concat().
        """
        if '"' not in value:
            return '"%s"' % value
        if "'" not in value:
            return "'%s'" % value
        pieces = ('"%s"' % part for part in value.split('"'))
        return 'concat(%s)' % ', \'"\', '.join(pieces)

    def tags_by_alias(self, alias):
        """Get Structured Document Tags by alias.

        Selects the ``w:sdt`` elements below the document body whose own
        ``w:sdtPr/w:alias`` carries the given value. SDTs that merely
        contain such a tag further down are not selected.
        """
        return xpath(
            self.doc.element.body,
            './/w:sdt[w:sdtPr/w:alias[@w:val=%s]]'
            % self._xpath_literal(alias))

    def set_text(self, alias, text):
        """Set the text content of all Structured Document Tags identified by
        an alias. Only plain text SDTs are supported.

        If the SDT has the 'multiLine' property, newlines in `text` will be
        respected, and the SDTs content will be updated with lines separated
        by line breaks. Otherwise all line boundaries in `text` are replaced
        by single spaces.

        Leading and trailing whitespace of `text` is stripped. SDTs without
        a ``w:sdtContent`` or without any run inside it are left untouched.
        """
        text = text.strip()
        for tag in self.tags_by_alias(alias):
            # Ignore if it's not a plain text SDT
            plain_text = xpath(tag, './w:sdtPr/w:text')
            if not plain_text:
                continue

            nsmap = tag.nsmap
            is_multiline = self._is_multiline(plain_text[0], nsmap)

            properties = xpath(tag, './w:sdtPr')
            content = xpath(tag, './w:sdtContent')
            if not content:
                continue

            run_elements = xpath(content[0], './/w:r')
            if not run_elements:
                continue

            # First, prepare the SDT for easy updating of its value.
            #
            # We do this by cleaning out the SDT content to only preserve
            # the first of possibly many runs, and remove the contents of
            # that run (except w:rPr formatting properties).
            #
            # That run can then be filled with new text nodes and line breaks
            # as needed. This should allow us to preserve formatting, but
            # otherwise start from a clean slate where we create new nodes
            # instead of having to carefully update an existing structure.
            first_run = run_elements[0]
            self._remove_placeholder(properties, content, first_run)
            self._remove_all_runs_except_first(run_elements)
            self._clean_first_run(first_run)

            # Now update contents by appending new text nodes.
            #
            # The lines are computed per tag and `text` itself is never
            # modified, so a single-line SDT cannot strip the newlines
            # that a later multi-line SDT with the same alias still needs.
            if is_multiline:
                lines = text.splitlines()
            else:
                # Collapse every line boundary recognised by splitlines()
                # (not just '\n') so that no w:br ends up in a single-line
                # SDT. Empty text yields no text node at all.
                lines = [' '.join(text.splitlines())] if text else []

            last = len(lines)
            for i, line in enumerate(lines, start=1):
                txt_node = Element(QName(nsmap['w'], "t"))
                txt_node.text = line
                first_run.append(txt_node)

                # Separate lines by a break, but don't add a trailing one.
                if i != last:
                    first_run.append(Element(QName(nsmap['w'], "br")))

    def _is_multiline(self, text_prop, nsmap):
        """Tell whether the ``w:text`` element allows multiple lines.

        The ``w:multiLine`` attribute counts as set unless it is missing or
        explicitly carries one of the "off" values ('0', 'false', 'off').
        The attribute is read directly instead of via XPath so that a
        default namespace in `nsmap` cannot make the lookup fail.
        """
        value = text_prop.get(QName(nsmap['w'], 'multiLine').text)
        if value is None:
            return False
        return value.strip().lower() not in self._FALSE_VALUES

    def _remove_placeholder(self, properties, content, first_run):
        """Remove placeholder marker and style.

        If the first ``w:sdtPr`` in `properties` has a ``w:showingPlcHdr``
        child, that child is removed, and so is the ``w:rPr`` of
        `first_run` (the placeholder formatting). `content` is unused and
        only kept for interface compatibility.
        """
        showing_placeholder = xpath(properties[0], './w:showingPlcHdr')
        if not showing_placeholder:
            return

        properties[0].remove(showing_placeholder[0])
        run_props = xpath(first_run, './w:rPr')
        if run_props:
            first_run.remove(run_props[0])

    def _remove_all_runs_except_first(self, run_elements):
        """Remove all runs except the first one.

        Every run but the first in `run_elements` is detached from its
        parent element.
        """
        for run in run_elements[1:]:
            run.getparent().remove(run)

    def _clean_first_run(self, first_run):
        """Remove all elements from the first run except run formatting.
        """
        # Iterate over a snapshot of the children: the run is modified
        # while looping (and getchildren() is deprecated in lxml).
        for child in list(first_run):
            # Preserve formatting
            if QName(child).localname == 'rPr':
                continue
            first_run.remove(child)

    def get_text(self, alias):
        """Get the text content of the first Structured Document Tag identified
           by the given alias.

        Only plain text SDTs are considered. Text nodes are concatenated
        and every line break is returned as a newline. Returns None if no
        plain text SDT with that alias exists.
        """
        for tag in self.tags_by_alias(alias):
            # Ignore if it's not a plain text SDT
            if not xpath(tag, './w:sdtPr/w:text'):
                continue

            tokens = []
            text_and_brs = xpath(
                tag, './w:sdtContent//w:r/*[self::w:t or self::w:br]')
            for el in text_and_brs:
                localname = QName(el).localname
                if localname == 't':
                    # An empty <w:t/> has text None, which join() rejects.
                    tokens.append(el.text or '')
                elif localname == 'br':
                    tokens.append('\n')

            return ''.join(tokens)

        return None