File: xmlreader.py

package info (click to toggle)
termsaver 0.1.1-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 284 kB
  • sloc: python: 1,446; makefile: 4
file content (209 lines) | stat: -rw-r--r-- 6,975 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
###############################################################################
#
# file:     xmlreader.py
#
# Purpose:  refer to module documentation for details
#
# Note:     This file is part of Termsaver application, and should not be used
#           or executed separately.
#
###############################################################################
#
# Copyright 2012 Termsaver
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
###############################################################################
"""
A helper class used for screens that require XML handling. See
additional information in the class itself.

The helper class available here is:

    * `XMLReaderHelperBase`

"""

#
# Python built-in modules
#
import os
from xml.dom.minidom import parse, Node, parseString

#
# Internal modules
#
from termsaverlib.screen.helper import ScreenHelperBase
from termsaverlib import exception


class XMLReaderHelperBase(ScreenHelperBase):
    """
    This helper class will handle basic XML parsing, not trying to solve all
    mysteries of the universe here. What we are looking for are main nodes that
    contain repetitive data, commonly found on a dataset, or RSS feed. More
    complex handling is not treated at this point, but it might be implemented
    if the need surfaces.

    The basic instantiation of this class requires you to inform 2 arguments:

        * `base_node`: Defines the primary node where the data must be
           retrieved from.

        * `tags`: Defines which tags within the XML must be parsed to build a
           list of dictionaries.

    Those two arguments will give the hint about where the data is and which
    piece of it you are looking for.

    For actually getting the data, you will need to:

        * prepare a raw data (file content, Internet data, etc) into a
          XML object, named `__doc`; and
        * parse the XML object into a more convenient list of dictionaries
          that will be populated in `data` property.

    To prepare the data, you have 2 options:

        * `doc_xml_string`: a method that will create a dom xml document from
          a text string (obviously it must be a XML)

        * `doc_xml_file`: a method that will create a dom xml document from
          a file content (obviously it must be a XML)

    Once you have the XML properly prepared, and stored in `__doc`, you can
    call the parsing method:

        * `parse_data`: this will actually execute the action to extract the
           information you are looking for based on the arguments passed
           in the instantiation.

    """

    __doc = None
    """
    Holds the xml.dom.minidom document object
    """

    clean_dirt = []
    """
    Holds a list of strings that will be cleaned up from each result in the
    XML data, when placing them into the `data` property. This can be pretty
    handy to remove trailing spaces, new lines, or unwanted HTML tags from the
    data. Empty entries are ignored. Each instance receives its own copy of
    this list when `__init__` runs.
    """

    base_node = None
    """
    Defines the primary node where the data must be retrieved from.
    """

    tags = []
    """
    Defines which tags within the XML must be parsed to build a
    list of dictionaries.
    """

    data = None
    """
    Holds a list, created from properly parsing the dom document object in
    `__doc`, as specified with `base_node` and `tags` filtering.
    """

    def __init__(self, base_node, tags):
        """
        Creates a new instance of this class.

        Arguments:

        * `base_node`: Defines the primary node where the data must be
           retrieved from.

        * `tags`: Defines which tags within the XML must be parsed to build a
           list of dictionaries.
        """
        self.base_node = base_node
        self.tags = tags
        # The class-level `clean_dirt` list is shared by every instance (and
        # subclass). Copy it so that in-place changes made through one
        # instance (e.g. `self.clean_dirt.append(...)`) never leak to others.
        self.clean_dirt = list(self.clean_dirt)

    def parse_data(self):
        """
        Only call this once you have already created the dom document object,
        by calling either `doc_xml_file` or `doc_xml_string` methods.

        This will parse the document into a list, much simpler to deal with.
        Once the logic here is done, the list is available in the property
        `data`: one dictionary per `base_node` element found, keyed by each
        name in `tags`. A tag's value is the concatenated text of all its
        matching elements (falling back to their CDATA sections when no text
        is found), with every `clean_dirt` string removed.

        Raises:

            * Exception: if no dom document object has been created yet.
        """
        def get_node_value(nodes, node_type):
            # Concatenate the data of every direct child of the informed
            # type, for all elements given.
            parts = []
            for element in nodes:
                for child in element.childNodes:
                    if child.nodeType == node_type:
                        parts.append(child.data)
            result = ''.join(parts)

            # clean possible dirt
            for dirt in self.clean_dirt:
                if not dirt:
                    # an empty pattern is "found" in any string and removing
                    # it changes nothing, so the loop below would never end
                    continue
                # execute a loop here for dealing with multiple occurrences
                # (such as multiple spaces), as removing one occurrence may
                # bring together the pieces of a new one
                while dirt in result:
                    result = result.replace(dirt, "")
            return result

        if self.__doc is None:
            raise Exception("""You must parse the raw data, by calling a \
doc_xml_* method to populate the dom document object.""")

        result = []
        for node in self.__doc.getElementsByTagName(self.base_node):
            item = {}
            for tag in self.tags:
                # search the tree only once, and reuse it for the fallback
                elements = node.getElementsByTagName(tag)
                value = get_node_value(elements, Node.TEXT_NODE)
                if not value:
                    # no plain text found, so try CDATA sections instead
                    value = get_node_value(elements, Node.CDATA_SECTION_NODE)
                item[tag] = value
            result.append(item)
        self.data = result

    def doc_xml_file(self, path):
        """
        Parses a specified file into a xml.dom.minidom document object, to be
        used by `parse_data` method later on. This method here will store the
        result in the private `__doc` property.

        Arguments:

            * path: the XML file path that will be parsed into a dom
              document object.

        Raises:

            * exception.PathNotFoundException: if `path` does not exist.
            * exception.XmlException: if the file can not be parsed as XML.
        """
        if not os.path.exists(path):
            raise exception.PathNotFoundException(path)

        try:
            self.__doc = parse(path)
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must keep propagating untouched.
            raise exception.XmlException(path)

    def doc_xml_string(self, text):
        """
        Parses a specified string into a xml.dom.minidom document object, to be
        used by `parse_data` method later on. This method here will store the
        result in the private `__doc` property.

        Arguments:

            * text: the XML string value that will be parsed into a dom
              document object.

        Raises:

            * exception.XmlException: if the text can not be parsed as XML.
        """
        try:
            self.__doc = parseString(text)
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must keep propagating untouched.
            raise exception.XmlException(text)