1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2006-2007 Søren Roug, European Environment Agency
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
# This script is to be embedded in opendocument.py later.
# Its purpose is to read an ODT/ODP/ODS file and build the corresponding
# data structure in memory. The user should then be able to perform
# operations on it and save the structure again.
import zipfile
from xml.sax import make_parser,handler
from xml.sax.xmlreader import InputSource
import xml.sax.saxutils
import sys
from ocrfeeder.odf.opendocument import OpenDocument
from ocrfeeder.odf import element
from ocrfeeder.odf.namespaces import STYLENS, OFFICENS
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
#
# Parse the XML files
#
class ODFParser(handler.ContentHandler):
    """SAX content handler that rebuilds parts of an ODF XML stream as elements.

    Only the subtrees rooted at ``office:body`` and ``office:styles`` are
    loaded.  Each root element is stored on the document passed to the
    constructor (as ``document.body`` / ``document.styles``); every nested
    element is created with ``element.Element`` and attached to the element
    that is open at that point.  Everything outside those two subtrees is
    ignored.

    Attributes:
        doc: the document object being populated.
        data: text chunks received since the last start/end tag.
        level: nesting depth inside the subtree currently being loaded.
        parse: True while inside ``office:body`` or ``office:styles``.
        curr / parent: the element that is currently open (both always refer
            to the same element; two names are kept for compatibility).
    """

    def __init__(self, document):
        """Prepare the handler to populate *document*."""
        handler.ContentHandler.__init__(self)
        self.doc = document
        self.data = []
        self.level = 0
        self.parse = False
        # No element is open until a root element has been seen.
        self.parent = None
        self.curr = None

    def characters(self, data):
        """Buffer character data; text outside the loaded subtrees is dropped."""
        if not self.parse:
            return
        self.data.append(data)

    def _flush_text(self):
        """Attach the buffered text (whitespace-stripped) to the open element."""
        content = ''.join(self.data).strip()
        self.data = []
        if content:
            self.parent.addText(content)

    def startElementNS(self, tag, qname, attrs):
        """Create an element for *tag* and make it the open element.

        *tag* is the ``(namespace, localname)`` pair supplied by the
        namespace-aware SAX parser; *attrs* supplies the attributes, which
        are copied into a plain dict keyed the same way ``attrs.items()``
        reports them.

        Raises:
            AttributeError: re-raised (after printing the message) when
                ``element.Element`` rejects the tag or its attributes.
        """
        if tag in ((OFFICENS, 'body'), (OFFICENS, 'styles')):
            self.parse = True
        if not self.parse:
            return
        self.level += 1

        # Text seen before this child belongs to the element that is
        # currently open.
        self._flush_text()

        attrdict = dict(attrs.items())
        try:
            e = element.Element(qname=tag, qattributes=attrdict)
        except AttributeError as v:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("Error: %s" % v)
            # Previously execution continued with ``e`` unbound and died with
            # an unrelated UnboundLocalError; surface the real error instead.
            raise

        if tag == (OFFICENS, 'styles'):
            self.doc.styles = e
        elif tag == (OFFICENS, 'body'):
            self.doc.body = e
        else:
            self.parent.addElement(e)
        self.curr = e
        self.parent = e

    def endElementNS(self, tag, qname):
        """Close the open element and step back to its parent.

        Text buffered since the last tag is attached to the element being
        closed, so the content of leaf elements is kept.
        """
        if not self.parse:
            return
        self.level -= 1
        self._flush_text()

        if tag in ((OFFICENS, 'body'), (OFFICENS, 'styles')):
            # Leaving a loaded subtree: stop collecting until the next root.
            self.parse = False
            self.curr = None
            self.parent = None
            return

        # Move *both* references up one level.  Relies on elements exposing
        # ``parentNode``, exactly as the previous implementation did.
        self.curr = self.curr.parentNode
        self.parent = self.curr
def _getxmlpart(odffile, xmlfile):
    """Return the raw content of member *xmlfile* inside the ODF package.

    *odffile* is whatever ``zipfile.ZipFile`` accepts (a path or a file-like
    object); *xmlfile* is the name of the archive member to read.

    Raises:
        KeyError: if the archive has no member called *xmlfile*
            (propagated from ``ZipFile.read``).
    """
    z = zipfile.ZipFile(odffile)
    try:
        return z.read(xmlfile)
    finally:
        # Close the archive even when the member is missing or unreadable.
        z.close()
def load(odtfile):
    """Load an OpenDocument package and return it as an ``OpenDocument``.

    The package's ``mimetype`` member is read and handed to the
    ``OpenDocument`` constructor; ``content.xml`` is then run through a
    namespace-aware SAX parser with an ``ODFParser`` bound to that document
    as its content handler.

    *odtfile* is passed straight to ``_getxmlpart`` for every member read.
    """
    document = OpenDocument(_getxmlpart(odtfile, 'mimetype'))

    # Only content.xml is parsed for now; the tuple leaves room for more parts.
    for member in ('content.xml',):
        raw_xml = _getxmlpart(odtfile, member)

        source = InputSource()
        source.setByteStream(StringIO(raw_xml))

        reader = make_parser()
        # Namespace processing makes the parser call the *NS handler methods.
        reader.setFeature(handler.feature_namespaces, 1)
        reader.setContentHandler(ODFParser(document))
        reader.setErrorHandler(handler.ErrorHandler())
        reader.parse(source)

    return document
|