# Custom document properties (``/docProps/custom.xml``) and DOCPROPERTY
# field handling for docx documents.
from babel.dates import format_datetime
from copy import deepcopy
from datetime import datetime
from docx.opc.constants import CONTENT_TYPE as CT
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.opc.oxml import serialize_part_xml
from docx.opc.packuri import PackURI
from docx.opc.part import Part
from docx.oxml import parse_xml
from docx.oxml.coreprops import CT_CoreProperties
from docxcompose.utils import NS
from docxcompose.utils import word_to_python_date_format
from docxcompose.utils import xpath
from lxml.etree import FunctionNamespace
from lxml.etree import QName
binary_type = bytes
text_type = str
import pkg_resources
import re
# Value written to the ``fmtid`` attribute of every newly added
# ``cp:property`` element.
CUSTOM_PROPERTY_FMTID = '{D5CDD505-2E9C-101B-9397-08002B2CF9AE}'
# XML templates for the typed ``vt:*`` value element of a property, keyed by
# the kind of Python value they store. ``value2vt`` parses one of these and
# fills in the text.
CUSTOM_PROPERTY_TYPES = {
    'text': '<vt:lpwstr xmlns:vt="{}"/>'.format(NS['vt']),
    'int': '<vt:i4 xmlns:vt="{}"/>'.format(NS['vt']),
    'bool': '<vt:bool xmlns:vt="{}"/>'.format(NS['vt']),
    'datetime': '<vt:filetime xmlns:vt="{}"/>'.format(NS['vt']),
    'float': '<vt:r8 xmlns:vt="{}"/>'.format(NS['vt']),
}
# First pid handed out to a property and the starting point when pids are
# renumbered after a deletion.
MIN_PID = 2 # Property IDs have to start with 2
def value2vt(value):
    """Convert a Python value into the matching ``vt:*`` XML element.

    Supported types are ``bool``, ``int``, ``float``, ``datetime``, text and
    UTF-8 encoded bytes (stored as text).

    Timezone-aware datetimes are converted to UTC before being written,
    because the serialized form always carries the ``Z`` (UTC) suffix. Naive
    datetimes are written unchanged.

    :param value: the value to serialize.
    :returns: a freshly parsed element from ``CUSTOM_PROPERTY_TYPES`` with
        its text set to the serialized value.
    :raises TypeError: if the type of ``value`` is not supported.
    """
    from datetime import timezone

    # bool has to be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        kind, text = 'bool', 'true' if value else 'false'
    elif isinstance(value, int):
        kind, text = 'int', text_type(value)
    elif isinstance(value, float):
        kind, text = 'float', text_type(value)
    elif isinstance(value, datetime):
        if value.utcoffset() is not None:
            # Aware value: normalize to UTC so that the "Z" suffix is true.
            value = value.astimezone(timezone.utc)
        kind, text = 'datetime', value.strftime('%Y-%m-%dT%H:%M:%SZ')
    elif isinstance(value, text_type):
        kind, text = 'text', value
    elif isinstance(value, binary_type):
        kind, text = 'text', value.decode('utf-8')
    else:
        raise TypeError('Unsupported type {}'.format(type(value)))

    el = parse_xml(CUSTOM_PROPERTY_TYPES[kind])
    el.text = text
    return el
def vt2value(element):
    """Convert a ``vt:*`` value element into a Python value.

    The conversion is chosen by the local tag name:

    * ``bool`` -> ``bool``; ``true`` and ``1`` (case-insensitive, surrounding
      whitespace ignored) are true, everything else including an empty
      element is false.
    * integer tags (``i1`` ... ``uint``) -> ``int``
    * ``r4`` / ``r8`` -> ``float``
    * ``filetime`` -> ``datetime`` as parsed by
      ``CT_CoreProperties._parse_W3CDTF_to_datetime``
    * ``lpwstr`` -> text, with an empty element mapped to ``u''``
    * any other tag -> the raw element text (may be ``None``)

    :param element: the value element of a ``cp:property``.
    """
    tag = QName(element).localname
    text = element.text

    if tag == 'bool':
        # An empty element has text None; treat it as false instead of
        # failing on None.lower().
        return (text or u'').strip().lower() in (u'true', u'1')
    if tag in ('i1', 'i2', 'i4', 'int', 'ui1', 'ui2', 'ui4', 'uint'):
        return int(text)
    if tag in ('r4', 'r8'):
        return float(text)
    if tag == 'filetime':
        return CT_CoreProperties._parse_W3CDTF_to_datetime(text)
    if tag == 'lpwstr':
        return text or u''
    return text
def is_text_property(property):
    """Tell whether a value element holds text.

    :param property: the ``vt:*`` value element of a property.
    :returns: ``True`` if the element's local tag name is ``bstr``,
        ``lpstr`` or ``lpwstr``, ``False`` otherwise.
    """
    text_tags = ['bstr', 'lpstr', 'lpwstr']
    local_name = QName(property).localname
    return local_name in text_tags
# lxml doesn't support XPath 2.0 functions.
# Thus lower-case() is implemented below as an extension function and
# registered on this function namespace.
ns = FunctionNamespace(None)


@ns('lower-case')
def lower_case(context, a):
    """XPath extension ``lower-case()``: lower-case every item of ``a``.

    :param context: evaluation context passed in by lxml (unused).
    :param a: iterable of items offering ``.lower()``; the XPath
        expressions in this module pass the result of ``@name``.
    :returns: list with the lower-cased items, in the same order.
    """
    lowered = []
    for item in a:
        lowered.append(item.lower())
    return lowered
class CustomProperties(object):
    """Custom doc properties stored in ``/docProps/custom.xml``.

    Allows updating of doc properties in a document.

    Property names are matched case-insensitively for lookup, assignment,
    deletion and membership tests; ``keys()`` and ``items()`` report the
    names as they are stored.
    """

    def __init__(self, doc):
        """Load the custom properties of ``doc``.

        If the package has no custom-properties part, the packaged
        ``templates/custom.xml`` is parsed instead; the part itself is only
        created once a property is written.
        """
        self.doc = doc
        self.part = None
        self._element = None
        self.language = self.get_doc_language()

        try:
            part = doc.part.package.part_related_by(RT.CUSTOM_PROPERTIES)
        except KeyError:
            self._element = parse_xml(self._part_template())
        else:
            self.part = part
            self._element = parse_xml(part.blob)

    def _part_template(self):
        """Return the content of the packaged ``templates/custom.xml``."""
        return pkg_resources.resource_string(
            'docxcompose', 'templates/custom.xml')

    def _update_part(self):
        """Serialize the in-memory XML into the package part.

        The part is created and related to the package on first use.
        """
        if self.part is None:
            # Create a new part for custom properties
            partname = PackURI('/docProps/custom.xml')
            self.part = Part(
                partname, CT.OFC_CUSTOM_PROPERTIES,
                serialize_part_xml(self._element), self.doc.part.package)
            self.doc.part.package.relate_to(self.part, RT.CUSTOM_PROPERTIES)
            self._element = parse_xml(self.part.blob)
        else:
            self.part._blob = serialize_part_xml(self._element)

    def _find_properties(self, name):
        """Return the ``cp:property`` elements named ``name``.

        The comparison ignores case. The result is empty if there is no
        such property.
        """
        return xpath(
            self._element,
            u'.//cp:property[lower-case(@name)="{}"]'.format(name.lower()))

    def __getitem__(self, key):
        """Get the value of a property.

        :raises KeyError: if there is no property named ``key``.
        """
        props = self._find_properties(key)
        if not props:
            raise KeyError(key)
        return vt2value(props[0][0])

    def __setitem__(self, key, value):
        """Set the value of a property, adding the property if missing."""
        props = self._find_properties(key)
        if not props:
            self.add(key, value)
            return

        # Swap the typed value element, the type may differ from before.
        value_el = props[0][0]
        new_value_el = value2vt(value)
        value_el.getparent().replace(value_el, new_value_el)
        self._update_part()

    def __delitem__(self, key):
        """Delete a property.

        :raises KeyError: if there is no property named ``key``.
        """
        props = self._find_properties(key)
        if not props:
            raise KeyError(key)

        props[0].getparent().remove(props[0])
        # Renumber pids
        for pid, prop in enumerate(self._element, MIN_PID):
            prop.set('pid', text_type(pid))

        self._update_part()

    def get_doc_language(self):
        """We actually should determine the correct language for each field.
        Instead we simply determine the language from the first w:lang tag in
        the document, and if None are found, from the w:lang tag in the default
        style.

        :returns: the language code with dashes replaced by underscores, or
            ``None`` if no suitable ``w:lang`` tag exists.
        """
        lang_tags = xpath(self.doc.element, ".//w:lang")
        lang_tags.extend(xpath(self.doc.styles.element, ".//w:lang"))
        # keep the first tag containing a setting for Latin languages
        latin_lang_key = "{{{}}}val".format(NS["w"])
        lang_tags = [tag for tag in lang_tags if latin_lang_key in tag.keys()]
        if lang_tags:
            language = lang_tags[0].attrib[latin_lang_key]
            # babel does not support dashes in combined language codes
            return language.replace("-", "_")
        return None

    def nullify(self, key):
        """Delete key for non text-properties, set key to empty string for
        text.

        :raises KeyError: if there is no property named ``key``.
        """
        props = self._find_properties(key)
        if not props:
            raise KeyError(key)

        if is_text_property(props[0][0]):
            self[key] = ''
        else:
            del self[key]

    def __contains__(self, item):
        """Tell whether a property named ``item`` exists (ignoring case)."""
        return bool(self._find_properties(item))

    def get(self, key, default=None):
        """Return the value of property ``key`` or ``default`` if missing."""
        try:
            return self[key]
        except KeyError:
            return default

    def add(self, name, value):
        """Add a property.

        The new property gets the next free pid (highest existing pid plus
        one, or ``MIN_PID`` if there are no properties yet).
        """
        pids = [int(pid) for pid in xpath(self._element, u'.//cp:property/@pid')]
        pid = max(pids) + 1 if pids else MIN_PID

        prop = parse_xml('<cp:property xmlns:cp="{}"/>'.format(NS['cp']))
        prop.set('fmtid', CUSTOM_PROPERTY_FMTID)
        prop.set('name', name)
        prop.set('pid', text_type(pid))
        value_el = value2vt(value)
        prop.append(value_el)
        self._element.append(prop)
        self._update_part()

    def keys(self):
        """Return the names of all properties as a list."""
        if self._element is None:
            return []
        props = xpath(self._element, u'.//cp:property')
        return [prop.get('name') for prop in props]

    def values(self):
        """Return the values of all properties as a list."""
        if self._element is None:
            return []
        props = xpath(self._element, u'.//cp:property')
        return [vt2value(prop[0]) for prop in props]

    def items(self):
        """Return ``(name, value)`` pairs of all properties as a list."""
        if self._element is None:
            return []
        props = xpath(self._element, u'.//cp:property')
        return [(prop.get('name'), vt2value(prop[0])) for prop in props]

    def set_properties(self, properties):
        """Set several properties at once.

        :param properties: mapping of property name to value; each entry is
            stored through item assignment, so missing properties are added.
        """
        for name, value in properties.items():
            self[name] = value

    def find_docprops_in_document(self, name=None):
        """This method searches for all doc-properties in the document and
        in section headers and footers.

        :param name: if given, only fields for this property name are
            returned.
        :returns: list of ``SimpleField`` / ``ComplexField`` objects.
        """
        docprops = []
        for section in self.doc.sections:
            all_header_footers = [
                section.first_page_header,
                section.header,
                section.even_page_header,
                section.first_page_footer,
                section.footer,
                section.even_page_footer,
            ]
            # word seems to keep "hidden" header and footer definitions, so
            # even though some may have been deactivated via the
            # "different first page" or "different odd & even pages" checkboxes
            # the definitions will be accessible and also reactivated when the
            # checkboxes are re-enabled.
            # we deliberately bypass the `different_first_page_header_footer`
            # accessor method and check via the underlying `_has_definition`
            # method if the header/footer has a definition in xml.
            for container in all_header_footers:
                if container._has_definition and not container.is_linked_to_previous:
                    docprops.extend(self._find_docprops_in(
                        container.part.element, name=name))

        docprops.extend(self._find_docprops_in(
            self.doc.element.body, name=name))
        return docprops

    def _find_docprops_in(self, element, name=None):
        """Return the DOCPROPERTY fields found below ``element`` as a list.

        Simple fields come first, followed by complex fields. With ``name``
        given, only fields whose parsed name equals ``name`` are kept.
        """
        # First we search for the simple fields:
        sfield_nodes = xpath(
            element,
            u'.//w:fldSimple[contains(@w:instr, \'DOCPROPERTY \')]')
        docprops = [SimpleField(sfield_node) for sfield_node in sfield_nodes]

        # Now for the complex fields
        cfield_nodes = xpath(
            element,
            u'.//w:instrText[contains(.,\'DOCPROPERTY \')]')
        docprops.extend([ComplexField(cfield_node) for cfield_node in cfield_nodes])

        if name is not None:
            # Always hand back a list, not a lazy filter object.
            docprops = [prop for prop in docprops if prop.name == name]
        return docprops

    def update_all(self):
        """Update all the document's doc-properties.

        Fields whose property is unknown or has the value ``None`` are left
        untouched.
        """
        docprops = self.find_docprops_in_document()
        available_docprops = dict(self.items())
        for docprop in docprops:
            value = available_docprops.get(docprop.name)
            if value is None:
                continue
            docprop.update(value, language=self.language)

    def update(self, name, value):
        """Update all instances of a given doc-property in the document."""
        docprops = self.find_docprops_in_document(name)
        for docprop in docprops:
            docprop.update(value, language=self.language)

    def dissolve_fields(self, name):
        """Remove the property fields but keep their value."""
        docprops = self.find_docprops_in_document(name)
        for docprop in docprops:
            docprop.replace_field_with_value()
class FieldBase(object):
    """Base class for a docproperty field found in the document.xml.

    Subclasses supply ``_get_fieldname_string`` (the raw field instruction)
    as well as ``update`` and ``replace_field_with_value``. On construction
    the instruction is parsed into ``name`` and ``date_format``.
    """

    # Captures the property name and, optionally, the date format given
    # with the ``\@`` switch of a ``DOCPROPERTY ... \* MERGEFORMAT``
    # instruction.
    fieldname_and_format_search_expr = re.compile(
        r'DOCPROPERTY +"{0,1}([^\\]*?)"{0,1} +(?:\\\@ +"{0,1}([^\\]*?)"{0,1} +){0,1}\\\* MERGEFORMAT',
        flags=re.UNICODE)

    def __init__(self, field_node):
        self.node = field_node
        parsed_name, word_format = self._parse_fieldname_and_format()
        self.name = parsed_name
        # Without an explicit format in the field the "short" format is used.
        self.date_format = (
            word_to_python_date_format(word_format) if word_format
            else "short")

    def _format_value(self, value, language=None):
        """Return ``value`` as the text shown in the document.

        Booleans become ``Y``/``N``, datetimes are formatted with the
        field's date format (and ``language`` as locale, if given), all
        other values are converted to text.
        """
        if isinstance(value, bool):
            return u'Y' if value else u'N'
        if not isinstance(value, datetime):
            return text_type(value)
        if language is None:
            return format_datetime(value, self.date_format)
        return format_datetime(value, self.date_format, locale=language)

    def update(self, value, language=None):
        """ Sets the value of the docproperty in the document
        """
        raise NotImplementedError()

    def replace_field_with_value(self):
        """ Removes the field from the document, replacing it with
        its value.
        """
        raise NotImplementedError()

    def _get_fieldname_string(self):
        """Return the raw field instruction; implemented by subclasses."""
        raise NotImplementedError()

    def _parse_fieldname_and_format(self):
        """Return ``(name, date format)`` parsed from the instruction.

        Both entries are ``None`` if the instruction does not match; the
        format alone is ``None`` if the field carries none.
        """
        instruction = self._get_fieldname_string()
        found = self.fieldname_and_format_search_expr.search(instruction)
        return (None, None) if found is None else found.groups()
class SimpleField(FieldBase):
    """ Represents a simple field, i.e. <w:fldSimple> node in the
    document.xml, its body containing the value of the field.
    self.node here is the <w:fldSimple> node.
    """

    # Qualified name of the attribute holding the field instruction.
    attr_name = "{{{}}}instr".format(NS["w"])

    def _get_fieldname_string(self):
        """Return the field instruction stored on the ``w:fldSimple`` node."""
        return self.node.attrib[self.attr_name]

    def update(self, value, language=None):
        """Write the formatted ``value`` into the first ``w:t`` of the field.

        Nothing happens if the field contains no ``w:t`` node.
        """
        text = xpath(self.node, './/w:t')
        if text:
            text[0].text = self._format_value(value, language=language)

    def replace_field_with_value(self):
        """Replace the ``w:fldSimple`` node by copies of all its children.

        Every child is kept, in order, at the position of the field, so a
        value spread over several runs is preserved completely. A field
        without children is simply removed.
        """
        parent = self.node.getparent()
        index = parent.index(self.node)
        value_nodes = [deepcopy(child) for child in self.node]
        parent.remove(self.node)
        for offset, value_node in enumerate(value_nodes):
            parent.insert(index + offset, value_node)
class InvalidComplexField(Exception):
    """This exception is raised when a complex field cannot
    be handled correctly.

    ``ComplexField`` raises it when no run with a ``begin`` or ``end``
    ``w:fldChar`` can be found for the field.
    """
class ComplexField(FieldBase):
    """ Represents a complex field, i.e. a several <w:r> nodes delimited by runs
    containing <w:fldChar w:fldCharType="begin"/> and <w:fldChar w:fldCharType="end"/>.
    In these fields, the actual value is stored in <w:r> nodes that come after a
    <w:r><w:fldChar w:fldCharType="separate"/></w:r> node.

    self.node is the <w:instrText> node, self.w_r the run containing it and
    self.w_p the parent of that run.
    """

    # All expressions are evaluated relative to the run holding the
    # instrText node and select sibling runs containing the given fldChar.
    XPATH_PRECEDING_BEGINS = "./preceding-sibling::w:r/w:fldChar[@w:fldCharType=\"begin\"]/.."
    XPATH_FOLLOWING_ENDS = "./following-sibling::w:r/w:fldChar[@w:fldCharType=\"end\"]/.."
    XPATH_FOLLOWING_SEPARATES = "./following-sibling::w:r/w:fldChar[@w:fldCharType=\"separate\"]/.."
    XPATH_TEXTS = "w:instrText"

    def __init__(self, field_node):
        # run and paragraph containing the field; they have to be set before
        # the base class parses the field instruction.
        self.w_r = field_node.getparent()
        self.w_p = self.w_r.getparent()
        super(ComplexField, self).__init__(field_node)

    def _get_fieldname_string(self):
        """The field name can be split up in several instrText runs
        so we look for all the instrText nodes between the begin and either
        separate or end runs
        """
        separate_run = self.get_separate_run()
        last = (self.w_p.index(separate_run) if separate_run is not None
                else self.w_p.index(self.end_run))
        runs = [run for run in self._runs if self.w_p.index(run) < last]
        texts = []
        for run in runs:
            texts.extend(xpath(run, self.XPATH_TEXTS))
        # An empty <w:instrText/> has text None; count it as empty string.
        return "".join([each.text or "" for each in texts])

    @property
    def begin_run(self):
        """The closest preceding sibling run containing the ``begin`` fldChar.

        :raises InvalidComplexField: if there is no such run.
        """
        begins = xpath(self.w_r, self.XPATH_PRECEDING_BEGINS)
        if not begins:
            msg = "Complex field without begin node is not supported"
            raise InvalidComplexField(msg)
        return begins[-1]

    @property
    def end_run(self):
        """The first following sibling run containing the ``end`` fldChar.

        The run is looked up once and then cached on the instance.

        :raises InvalidComplexField: if there is no such run.
        """
        if not hasattr(self, "_end_run"):
            ends = xpath(self.w_r, self.XPATH_FOLLOWING_ENDS)
            if not ends:
                msg = "Complex field without end node is not supported"
                raise InvalidComplexField(msg)
            self._end_run = ends[0]
        return self._end_run

    def get_separate_run(self):
        """The ooxml format standard says that the separate node is optional,
        so we check whether we find one in our complex field, otherwise
        we return None."""
        separates = xpath(self.w_r, self.XPATH_FOLLOWING_SEPARATES)
        if not separates:
            return None
        separate = separates[0]
        # A separate located after our end run belongs to another field.
        if not self.w_p.index(separate) < self.w_p.index(self.end_run):
            return None
        return separate

    @property
    def _runs(self):
        """All sibling runs following the begin run."""
        return xpath(self.begin_run, "./following-sibling::w:r")

    def get_runs_for_update(self):
        """
        Get run fields after <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        and before the end run. Returns an empty list if the field has no
        separate run.
        """
        end_index = self.w_p.index(self.end_run)
        separate_run = self.get_separate_run()
        # if there is no separate, we have no value to update
        if separate_run is None:
            return []
        separate_index = self.w_p.index(separate_run)
        return [run for run in self._runs
                if self.w_p.index(run) > separate_index and
                self.w_p.index(run) < end_index]

    def get_runs_to_replace_field_with_value(self):
        """
        Get all <w:r> nodes between <w:fldChar w:fldCharType="begin"/>
        and <w:fldChar w:fldCharType="separate"/> including boundaries,
        plus the <w:fldChar w:fldCharType="end"/> node
        """
        separate_run = self.get_separate_run()
        # If there is no separate, then the field has no value
        # meaning we can remove the whole field.
        if separate_run is None:
            end_index = self.w_p.index(self.end_run)
            runs = [run for run in self._runs
                    if self.w_p.index(run) < end_index]
        else:
            separate_index = self.w_p.index(separate_run)
            runs = [run for run in self._runs
                    if self.w_p.index(run) <= separate_index]
        runs.insert(0, self.begin_run)
        runs.append(self.end_run)
        return runs

    def update(self, value, language=None):
        """Write the formatted ``value`` as cached value of the field.

        If the field already has value runs, the first text node of the
        first run receives the value and surplus text is removed. Otherwise
        a value run (and a separate run, if missing) is created directly
        before the end run.
        """
        runs_after_separate = self.get_runs_for_update()
        if runs_after_separate:
            first_w_r = runs_after_separate[0]
            text = xpath(first_w_r, u'.//w:t')
            if text:
                text[0].text = self._format_value(value, language=language)
                # remove any additional text-nodes inside the first run. we
                # update the first text-node only with the full cached
                # docproperty value. if for some reason the initial cached
                # value is split into multiple text nodes we remove any
                # additional node after updating the first node.
                # the nodes are found with a descendant search, so each one
                # is removed from its own parent, which is not necessarily
                # the run itself.
                for unnecessary_w_t in text[1:]:
                    unnecessary_w_t.getparent().remove(unnecessary_w_t)

            # if there are multiple runs between "separate" and "end" they
            # all may contain a piece of the cached docproperty value. we
            # can't reliably handle this situation and only update the
            # first node in the first run with the full cached value. it
            # appears any additional runs with text nodes should then be
            # removed to avoid duplicating parts of the cached docproperty
            # value.
            for run in runs_after_separate[1:]:
                text = xpath(run, u'.//w:t')
                if text:
                    self.w_p.remove(run)
        else:
            new_runs = []

            # only create a separate run if the field has none yet, a field
            # may have a separate run that is not followed by value runs.
            if self.get_separate_run() is None:
                # create a <w:fldChar w:fldCharType="separate"/> run using
                # the <w:fldChar w:fldCharType="begin"/> run as a template.
                # the node can contain all kind of formatting information,
                # the easiest way to preserve it seems to base new nodes on
                # an existing node.
                # we just swap out the fldCharType from begin to separate.
                separate_run = deepcopy(self.begin_run)
                w_fld_char = xpath(separate_run, 'w:fldChar')[0]
                w_fld_char.set('{{{}}}fldCharType'.format(NS['w']), 'separate')
                new_runs.append(separate_run)

            # create new run containing the actual docproperty value using
            # the <w:fldChar w:fldCharType="begin"/> run as a template.
            # the node can contain all kind of formatting information, the
            # easiest way to preserve it seems to base new nodes on an existing
            # node.
            # we drop the fldChar node and insert a text node instead.
            value_run = deepcopy(self.begin_run)
            value_run.remove(xpath(value_run, 'w:fldChar')[0])
            text = parse_xml('<w:t xmlns:w="{}"></w:t>'.format(NS['w']))
            text.text = self._format_value(value, language=language)
            value_run.append(text)
            new_runs.append(value_run)

            # insert the newly created nodes directly before the end run.
            # this keeps them behind every instrText run, even if the field
            # code is split up in several runs, and behind an existing
            # separate run.
            end_index = self.w_p.index(self.end_run)
            for offset, new_run in enumerate(new_runs):
                self.w_p.insert(end_index + offset, new_run)

    def replace_field_with_value(self):
        """Remove the field code runs, leaving only the value runs."""
        # Get list of <w:r> nodes for removal
        runs_to_remove = self.get_runs_to_replace_field_with_value()
        for run in runs_to_remove:
            self.w_p.remove(run)
|