ODFPY  1.2.0
 All Classes Namespaces Files Functions Variables
odf2moinmoin.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 # Copyright (C) 2006-2008 Søren Roug, European Environment Agency
3 #
4 # This library is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU Lesser General Public
6 # License as published by the Free Software Foundation; either
7 # version 2.1 of the License, or (at your option) any later version.
8 #
9 # This library is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 # Lesser General Public License for more details.
13 #
14 # You should have received a copy of the GNU Lesser General Public
15 # License along with this library; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 #
18 # See http://trac.edgewall.org/wiki/WikiFormatting
19 #
20 # Contributor(s):
21 #
22 
23 import sys, zipfile, xml.dom.minidom
24 from namespaces import nsdict
25 from elementtypes import *
26 
# Qualified tag names whose content is dropped from the wiki output.
# Each entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously fused 'draw:a' and 'draw:g'
# into the single bogus name 'draw:adraw:g'.
IGNORED_TAGS = [
    'draw:a',
    'draw:g',
    'draw:line',
    'draw:object-ole',
    'office:annotation',
    'presentation:notes',
    'svg:desc',
] + [nsdict[item[0]] + ":" + item[1] for item in empty_elements]

# Qualified tag names that are rendered through the inline-markup handler.
INLINE_TAGS = [nsdict[item[0]] + ":" + item[1] for item in inline_elements]
38 
39 
40 ##
41 # Holds properties for a text style.
class TextProps:
    """Formatting flags collected from an ODF text style.

    All flags start out False. The setters translate raw attribute
    values (strings, possibly empty) into booleans.
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False
        self.underlined = False
        self.strikethrough = False
        self.superscript = False
        self.subscript = False

    def setItalic(self, value):
        """Set the italic flag; only "italic" and "normal" have an effect."""
        if value == "italic":
            self.italic = True
        elif value == "normal":
            self.italic = False

    def setBold(self, value):
        """Set the bold flag; only "bold" and "normal" have an effect."""
        if value == "bold":
            self.bold = True
        elif value == "normal":
            self.bold = False

    def setFixed(self, value):
        """Store value as the fixed-pitch flag."""
        self.fixed = value

    def setUnderlined(self, value):
        """Set the underline flag from an underline-style value.

        An empty or missing value leaves the flag untouched. "none" clears
        it (so an inherited underline can be switched off); any other
        value sets it.
        """
        if value:
            self.underlined = value != "none"

    def setStrikethrough(self, value):
        """Set the strikethrough flag from a line-through-style value.

        An empty or missing value leaves the flag untouched. "none" clears
        it; any other value sets it.
        """
        if value:
            self.strikethrough = value != "none"

    def setPosition(self, value):
        """Set superscript/subscript from a text-position value.

        The first space-separated token is either "sub", "super" or a
        percentage. Following the ODF definition, positive percentages
        raise the text (superscript) and negative ones lower it
        (subscript). Offsets within +/-10% and unparsable values leave
        both flags unchanged.
        """
        if not value:
            return
        textpos = value.split(' ')[0]
        if textpos == "super":
            raised = True
        elif textpos == "sub":
            raised = False
        elif '%' in textpos:
            try:
                # float() rather than int(): percentages may be fractional.
                offset = float(textpos[:textpos.find('%')])
            except ValueError:
                return
            if offset > 10:
                raised = True
            elif offset < -10:
                raised = False
            else:
                return
        else:
            return
        self.superscript = raised
        self.subscript = not raised

    def __str__(self):
        return "[italic=%s, bold=%s, fixed=%s]" % (str(self.italic),
                                                   str(self.bold),
                                                   str(self.fixed))
103 
104 ##
105 # Holds properties of a paragraph style.
107 
class ParagraphProps:
    """Formatting flags collected from an ODF paragraph style."""

    def __init__(self):
        self.blockquote = False
        self.headingLevel = 0
        self.code = False
        self.title = False
        self.indented = 0

    def setIndented(self, value):
        """Store value as the indentation flag."""
        self.indented = value

    def setHeading(self, level):
        """Store the heading level (an integer; 0 means no heading)."""
        self.headingLevel = level

    def setTitle(self, value):
        """Store value as the document-title flag."""
        self.title = value

    def setCode(self, value):
        """Store value as the code-block flag."""
        self.code = value

    def __str__(self):
        return "[bq=%s, h=%d, code=%s]" % (str(self.blockquote),
                                           self.headingLevel,
                                           str(self.code))
134 
135 
136 ##
137 # Holds properties for a list style.
139 
class ListProperties:
    """Formatting flags collected from an ODF list style."""

    def __init__(self):
        self.ordered = False

    def setOrdered(self, value):
        """Store value as the ordered (numbered) list flag."""
        self.ordered = value
145 
146 
147 
149 
150 
def __init__(self, filepath):
    """Set up conversion state and the tag dispatch table, then load filepath."""
    # State filled in while loading styles and rendering the document.
    self.footnotes = []
    self.textStyles = {"Standard": TextProps()}
    self.paragraphStyles = {"Standard": ParagraphProps()}
    self.listStyles = {}
    self.fixedFonts = []
    self.hasTitle = 0
    self.lastsegment = None

    # Qualified tag name -> handler. Later groups override earlier ones,
    # so the update order below is significant.
    handlers = {
        'draw:page': self.textToString,
        'draw:frame': self.textToString,
        'draw:image': self.draw_image,
        'draw:text-box': self.textToString,
        'text:a': self.text_a,
        'text:note': self.text_note,
    }
    handlers.update(dict.fromkeys(IGNORED_TAGS, self.do_nothing))
    handlers.update(dict.fromkeys(INLINE_TAGS, self.inline_markup))
    handlers.update({
        'text:line-break': self.text_line_break,
        'text:s': self.text_s,
        'text:tab': self.text_tab,
    })
    self.elements = handlers

    self.load(filepath)
180 
181  ##
182  # Extracts necessary font information from a font-declaration
183  # element.
184  #
def processFontDeclarations(self, fontDecl):
    """Record the names of all fixed-pitch fonts declared under fontDecl.

    Names are appended to self.fixedFonts in document order.
    """
    self.fixedFonts.extend(
        face.getAttribute("style:name")
        for face in fontDecl.getElementsByTagName("style:font-face")
        if face.getAttribute("style:font-pitch") == "fixed"
    )
189 
190 
191 
192  ##
193  # Extracts text properties from a style element.
def extractTextProperties(self, style, parent=None):
    """Build the text properties described by a style element.

    style  -- DOM element; its first style:text-properties descendant,
              if any, supplies the attribute values.
    parent -- optional name of the parent style. When that style is
              already known in self.textStyles, its properties are the
              starting point and the element's own attributes override
              them.

    Returns a new properties object; the parent's object is never
    modified.
    """
    # Local import so the block does not depend on a file-level import.
    import copy

    parentProps = self.textStyles.get(parent) if parent else None
    if parentProps:
        # Work on a copy: the setters below mutate the object in place.
        textProps = copy.copy(parentProps)
    else:
        textProps = TextProps()

    textPropEls = style.getElementsByTagName("style:text-properties")
    if not textPropEls:
        return textProps

    textPropEl = textPropEls[0]

    textProps.setItalic(textPropEl.getAttribute("fo:font-style"))
    textProps.setBold(textPropEl.getAttribute("fo:font-weight"))
    textProps.setUnderlined(textPropEl.getAttribute("style:text-underline-style"))
    textProps.setStrikethrough(textPropEl.getAttribute("style:text-line-through-style"))
    textProps.setPosition(textPropEl.getAttribute("style:text-position"))

    if textPropEl.getAttribute("style:font-name") in self.fixedFonts:
        textProps.setFixed(True)

    return textProps
218 
219  ##
220  # Extracts paragraph properties from a style element.
def extractParagraphProperties(self, style, parent=None):
    """Build the paragraph properties described by a style element.

    style  -- DOM element carrying style:name and, optionally, a
              style:paragraph-properties descendant.
    parent -- accepted for symmetry with extractTextProperties; not used.

    Returns a new ParagraphProps.
    """
    paraProps = ParagraphProps()

    name = style.getAttribute("style:name")

    # Style names of the form "Heading_20_<n>" carry the heading level.
    if name.startswith("Heading_20_"):
        try:
            paraProps.setHeading(int(name[11:]))
        except ValueError:
            pass  # non-numeric suffix: leave the level at its default

    if name == "Title":
        paraProps.setTitle(True)

    paraPropEl = style.getElementsByTagName("style:paragraph-properties")
    if paraPropEl:
        leftMargin = paraPropEl[0].getAttribute("fo:margin-left")
        if leftMargin:
            try:
                # Drop the trailing two characters (presumably a
                # two-letter unit such as "cm" or "in") before parsing.
                if float(leftMargin[:-2]) > 0.01:
                    paraProps.setIndented(True)
            except ValueError:
                pass  # unparsable margin: treat as not indented

    # A paragraph style that uses a fixed-pitch font is rendered as code.
    textProps = self.extractTextProperties(style)
    if textProps.fixed:
        paraProps.setCode(True)

    return paraProps
255 
256 
257  ##
258  # Runs through "style" elements extracting necessary information.
259  #
def processStyles(self, styleElements):
    """Register text and paragraph styles found in styleElements.

    The style named "Standard" is skipped. Paragraph styles are stored
    in both self.paragraphStyles and self.textStyles; text styles only
    in self.textStyles. Other families are ignored.
    """
    for element in styleElements:
        styleName = element.getAttribute("style:name")
        if styleName == "Standard":
            continue

        family = element.getAttribute("style:family")
        parentName = element.getAttribute("style:parent-style-name")

        if family not in ("text", "paragraph"):
            continue

        if family == "paragraph":
            self.paragraphStyles[styleName] = \
                self.extractParagraphProperties(element, parentName)
        self.textStyles[styleName] = \
            self.extractTextProperties(element, parentName)
278 
def processListStyles(self, listStyleElements):
    """Register list styles, marking those with a numbered level as ordered.

    A style counts as ordered when any direct child element is a
    text:list-level-style-number.
    """
    for listStyle in listStyleElements:
        props = ListProperties()
        numbered = any(
            child.nodeType == xml.dom.Node.ELEMENT_NODE
            and child.tagName == "text:list-level-style-number"
            for child in listStyle.childNodes
        )
        if numbered:
            props.setOrdered(True)

        self.listStyles[listStyle.getAttribute("style:name")] = props
293 
294 
295  ##
296  # Loads an ODT file.
def load(self, filepath):
    """Read styles.xml and content.xml from the ODF package at filepath.

    Font declarations, styles and list styles are collected first from
    styles.xml and then from content.xml. The parsed content document
    is kept in self.content.

    Raises KeyError if either member is missing from the archive, plus
    whatever zipfile and the XML parser raise for malformed input.
    """
    archive = zipfile.ZipFile(filepath)
    try:
        stylesXml = archive.read("styles.xml")
        contentXml = archive.read("content.xml")
    finally:
        # Release the file handle even when a member is missing.
        archive.close()

    styles_doc = xml.dom.minidom.parseString(stylesXml)
    self.content = xml.dom.minidom.parseString(contentXml)

    # styles.xml must be processed before content.xml, so that automatic
    # styles in the content can refer to the common styles.
    for doc in (styles_doc, self.content):
        fontfacedecls = doc.getElementsByTagName("office:font-face-decls")
        if fontfacedecls:
            self.processFontDeclarations(fontfacedecls[0])
        self.processStyles(doc.getElementsByTagName("style:style"))
        self.processListStyles(doc.getElementsByTagName("text:list-style"))
315 
316  ##
317  # Removes extra blank lines from code blocks.
def compressCodeBlocks(self, text):
    """Return text unchanged.

    Blank-line compression for code blocks is disabled: the previous
    implementation returned the input before doing any work, leaving an
    unreachable filtering loop behind. The hook is kept so callers
    continue to work.
    """
    return text
332 
333 #-----------------------------------
def do_nothing(self, node):
    """Handler for ignored elements: contributes no output."""
    return ""
336 
337  ##
338  #
339  #
def draw_image(self, node):
    """Render an image element from its xlink:href attribute.

    References starting with './' (sub-objects, which are not supported)
    are emitted as the bare reference. Anything else becomes an
    [[Image(...)]] macro, with a leading 'Pictures/' stripped.
    """
    href = node.getAttribute("xlink:href")
    if href and href.startswith('./'):
        return "%s\n" % href

    picturePrefix = 'Pictures/'
    if href and href.startswith(picturePrefix):
        href = href[len(picturePrefix):]
    return "[[Image(%s)]]\n" % href
348 
def text_a(self, node):
    """Render a hyperlink as "[target] " or "[target label] ".

    The short form is used when the stripped link text equals the
    stripped target.
    """
    label = self.textToString(node).strip()
    target = node.getAttribute("xlink:href").strip()
    if target == label:
        return "[%s] " % target
    return "[%s %s] " % (target, label)
356 
357 
def text_line_break(self, node):
    """Render a line break as the wiki [[BR]] macro."""
    lineBreakMacro = "[[BR]]"
    return lineBreakMacro
360 
def text_note(self, node):
    """Record a footnote and return its in-text marker.

    The citation label is the value of the first child of the first
    text:note-citation; the note text is rendered from the first child
    of the first text:note-body. The (label, text) pair is appended to
    self.footnotes and "^label^" is returned.
    """
    citationEl = node.getElementsByTagName("text:note-citation")[0]
    cite = citationEl.childNodes[0].nodeValue

    bodyEl = node.getElementsByTagName("text:note-body")[0]
    noteText = self.textToString(bodyEl.childNodes[0])

    self.footnotes.append((cite, noteText))
    return "^%s^" % cite
368 
def text_s(self, node):
    """Render a space element as text:c spaces.

    A missing or non-numeric text:c attribute yields a single space.
    """
    try:
        return " " * int(node.getAttribute("text:c"))
    except (TypeError, ValueError):
        # Only conversion failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return " "
375 
def text_tab(self, node):
    """Render a tab element as literal whitespace."""
    tabText = " "
    return tabText
378 
def inline_markup(self, node):
    """Render node's content wrapped in the markup of its text style.

    Whitespace-only content yields an empty string. Fixed-pitch styles
    are wrapped in backticks only; otherwise every active flag adds its
    opening token, and the same tokens close in reverse order.
    """
    text = self.textToString(node)
    if not text.strip():
        return ''  # don't apply styles to white space

    style = self.textStyles.get(node.getAttribute("text:style-name"),
                                TextProps())
    if style.fixed:
        return "`" + text + "`"

    # (flag attribute, wiki token), in the order the tokens are opened.
    tokenTable = (
        ("italic", "''"),
        ("bold", "'''"),
        ("underlined", "__"),
        ("strikethrough", "~~"),
        ("superscript", "^"),
        ("subscript", ",,"),
    )
    opening = [token for flag, token in tokenTable if getattr(style, flag)]
    closing = list(reversed(opening))
    return ''.join(opening) + text + ''.join(closing)
408 
409 #-----------------------------------
def listToString(self, listElement, indent = 0):
    """Render a list element, one output line per child item.

    Ordered lists use the " 1. " marker for every item, unordered lists
    " * ". Nested lists are rendered recursively with indent + 3.
    """
    self.lastsegment = listElement.tagName

    props = self.listStyles.get(listElement.getAttribute("text:style-name"),
                                ListProperties())
    marker = " 1. " if props.ordered else " * "
    rendered = ("text:p", "text:h", "text:list")

    pieces = []
    for item in listElement.childNodes:
        pieces.append(" " * indent)
        pieces.append(marker)

        children = [child for child in item.childNodes
                    if child.tagName in rendered]
        for child in children:
            if child.tagName == "text:list":
                pieces.append("\n")
                pieces.append(self.listToString(child, indent + 3))
            else:
                pieces.append(self.paragraphToString(child, indent + 3))
            self.lastsegment = child.tagName

        self.lastsegment = item.tagName
        pieces.append("\n")

    return ''.join(pieces)
441 
442  ##
443  # MoinMoin uses || to delimit table cells
444  #
def tableToString(self, tableElement):
    """Render a table; each row becomes a line of ||-delimited cells.

    Header-row containers are rendered recursively. Children other than
    header-row containers and rows produce no output.
    """
    self.lastsegment = tableElement.tagName
    parts = []

    for child in tableElement.childNodes:
        tag = child.tagName
        self.lastsegment = tag
        if tag == "table:table-header-rows":
            parts.append(self.tableToString(child))
        elif tag == "table:table-row":
            parts.append("\n||")
            for cell in child.childNodes:
                parts.extend((self.inline_markup(cell), "||"))
                self.lastsegment = cell.tagName

    return ''.join(parts)
461 
462 
463  ##
464  # Converts the document to a string.
465  # FIXME: Result from second call differs from first call
466  #
def toString(self):
    """Convert the loaded document to wiki text.

    Renders the top-level children of the first child of office:body
    and, if any footnotes were collected, appends them after a "----"
    rule. Per-conversion state is reset first, so repeated calls return
    the same result.
    """
    # Rendering mutates these; without the reset a second call would
    # duplicate footnotes and start with stale title/segment state.
    self.footnotes = []
    self.lastsegment = None
    self.hasTitle = 0

    body = self.content.getElementsByTagName("office:body")[0]
    root = body.childNodes[0]

    buffer = []

    for element in root.childNodes:
        # Text and comment nodes have no tagName; skip them.
        if element.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue

        tag = element.tagName
        if tag == "text:list":
            chunk = self.listToString(element)
        elif tag == "text:section":
            chunk = self.textToString(element)
        elif tag == "table:table":
            chunk = self.tableToString(element)
        elif tag in ("draw:page", "text:p", "text:h"):
            chunk = self.paragraphToString(element)
        else:
            continue

        if chunk:
            buffer.append(chunk)

    if self.footnotes:
        buffer.append("----")
        for cite, note in self.footnotes:
            buffer.append("%s: %s" % (cite, note))

    buffer.append("")
    return self.compressCodeBlocks('\n'.join(buffer))
498 
499 
def textToString(self, element):
    """Render the children of element and concatenate the results.

    Text nodes contribute their value. Frames and text boxes recurse,
    paragraphs, headings and lists go to their dedicated renderers, and
    other elements are dispatched through self.elements. Elements with
    no handler appear as " {tag} ". Other node types are skipped.
    """
    parts = []

    for child in element.childNodes:
        kind = child.nodeType

        if kind == xml.dom.Node.TEXT_NODE:
            parts.append(child.nodeValue)
            continue
        if kind != xml.dom.Node.ELEMENT_NODE:
            continue

        tag = child.tagName
        if tag in ("draw:text-box", "draw:frame"):
            rendered = self.textToString(child)
        elif tag in ("text:p", "text:h"):
            rendered = self.paragraphToString(child)
            if not rendered:
                continue  # empty paragraphs add nothing
        elif tag == "text:list":
            rendered = self.listToString(child)
        else:
            handler = self.elements.get(tag)
            if handler:
                rendered = handler(child)
            else:
                rendered = " {" + tag + "} "
        parts.append(rendered)

    return ''.join(parts)
529 
def paragraphToString(self, paragraph, indent = 0):
    """Render a paragraph or heading element as wiki text.

    In order of precedence: title styles become a level-one heading;
    elements with text:outline-level become headings (one level deeper
    once a title has been seen); code styles are wrapped in {{{ }}};
    everything else goes through wrapParagraph, as a blockquote when
    the style is indented.
    """
    fallbackProps = ParagraphProps()
    props = self.paragraphStyles.get(
        paragraph.getAttribute("text:style-name"), fallbackProps)
    text = self.inline_markup(paragraph)

    # Code paragraphs keep their surrounding whitespace.
    if props and not props.code:
        text = text.strip()

    tag = paragraph.tagName
    # Consecutive plain paragraphs are separated by a blank line.
    if tag == "text:p" and self.lastsegment == "text:p":
        text = "\n" + text
    self.lastsegment = tag

    if props.title:
        self.hasTitle = 1
        return "= " + text + " =\n"

    outlinelevel = paragraph.getAttribute("text:outline-level")
    if outlinelevel:
        level = int(outlinelevel)
        if self.hasTitle:
            level += 1
        if level >= 1:
            bars = "=" * level
            return bars + " " + text + " " + bars + "\n"
    elif props.code:
        return "{{{\n" + text + "\n}}}\n"

    if props.indented:
        return self.wrapParagraph(text, indent = indent, blockquote = True)
    return self.wrapParagraph(text, indent = indent)
567 
568 
def wrapParagraph(self, text, indent = 0, blockquote=False):
    """Return text, prefixed with a space when blockquote is true.

    indent is accepted for interface compatibility but has no effect:
    line wrapping is disabled (the former wrapping loop sat after the
    return statement and never ran).
    """
    if blockquote:
        return " " + text
    return text
Holds properties for a text style.
Definition: odf2moinmoin.py:42
Holds properties of a paragraph style.
def compressCodeBlocks
Removes extra blank lines from code blocks.
def tableToString
MoinMoin uses || to delimit table cells.
def extractParagraphProperties
Extracts paragraph properties from a style element.
def extractTextProperties
Extracts text properties from a style element.
def toString
Converts the document to a string.
def processStyles
Runs through "style" elements extracting necessary information.
def processFontDeclarations
Extracts necessary font information from a font-declaration element.
Holds properties for a list style.
def load
Loads an ODT file.