File: etree_xml.py

package info (click to toggle)
w3af 1.0-rc3svn3489-1
  • links: PTS
  • area: main
  • in suites: jessie, jessie-kfreebsd, squeeze, wheezy
  • size: 59,908 kB
  • ctags: 16,916
  • sloc: python: 136,990; xml: 63,472; sh: 153; ruby: 94; makefile: 40; asm: 35; jsp: 32; perl: 18; php: 5
file content (98 lines) | stat: -rw-r--r-- 3,174 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
# Copyright © 2007-2008 Stockholm TreeAligner Project
# Author: Torsten Marek <shlomme@gmx.net>
# Licensed under the GNU GPLv2
import logging

# Public API: the names exported by a star-import of this module.
__all__ = ("element_handler", "IterParseHandler", "ET")

# Capability flags describing the XML backend selected below:
#   has_lxml   -- True when `ET` is lxml.etree.
#   has_schema -- True when the installed lxml is at least 2.0.3, the minimum
#                 version this module accepts for XML schema validation.
has_lxml = False
has_schema = False

try:
    import sys
    import lxml.etree as ET
    # lxml 1.3.4 is broken on win32, crashes on large inputs.
    if sys.platform == "win32" and ET.__version__ == "1.3.4":
        del ET
        raise ImportError
    has_lxml = True
    has_schema = ET.LXML_VERSION[:3] >= (2, 0, 3)
    import lxml._elementpath as DONTUSE # py2exe workaround
    del DONTUSE
except ImportError:
    # This branch is also reached when lxml.etree imported fine but a later
    # import inside the `try` failed. Reset the flags so they never claim
    # lxml features while `ET` is the standard-library implementation.
    has_lxml = False
    has_schema = False
    import xml.etree.ElementTree # py2exe workaround
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        # The cElementTree alias was removed in Python 3.9; the plain
        # ElementTree module offers the same API.
        import xml.etree.ElementTree as ET

# Name of the attribute under which `element_handler` stores the
# (event, tag) pair on a decorated method.
HANDLER_ATTRIBUTE_NAME = "_handled"

def element_handler(tag, event = "end"):
    """Return a decorator that marks a method as the handler for `tag`.

    The decorator stores the pair ``(event, tag)`` on the method, under the
    attribute named by `HANDLER_ATTRIBUTE_NAME`, and hands the method back
    unchanged otherwise.

    `event` has to be either "start" or "end"; any other value fails the
    assertion below.
    """
    allowed_events = ("start", "end")
    assert event in allowed_events
    handler_key = (event, tag)

    def _mark(method):
        setattr(method, HANDLER_ATTRIBUTE_NAME, handler_key)
        return method

    return _mark


class IterParseType(type):
    """Metaclass that builds a class's handler table at definition time.

    Every callable in the class body that carries the attribute named by
    `HANDLER_ATTRIBUTE_NAME` is entered into a fresh dictionary, keyed by the
    value of that attribute. The dictionary is stored on the new class as
    `__x_handlers__`.

    Only the body of the class being created is scanned; entries from the
    `__x_handlers__` of base classes are not copied into the new table.
    """
    def __new__(mcs, classname, bases, class_dict):
        class_dict["__x_handlers__"] = handlers = {}
        # values() rather than the Python 2-only itervalues(), so that the
        # class statement also works on Python 3.
        for attr in class_dict.values():
            if callable(attr) and hasattr(attr, HANDLER_ATTRIBUTE_NAME):
                handlers[getattr(attr, HANDLER_ATTRIBUTE_NAME)] = attr

        return type.__new__(mcs, classname, bases, class_dict)


class IterParseHandler(IterParseType("_IterParseHandlerBase", (object,), {})):
    """Base class for event-driven XML processing built on `ET.iterparse`.

    Methods marked with `element_handler` are collected by the metaclass
    into `__x_handlers__`, keyed by ``(event, tag)``. `_parse` looks up every
    "start"/"end" event in that table and calls the matching handler as
    ``handler(self, elem)``.

    When a handler for an "end" event returns a value equal to
    `DELETE_BRANCH`, the element is cleared after the handler has run.
    """
    DELETE_BRANCH = True

    # The metaclass is attached through the generated base class in the class
    # header, a spelling that works on Python 2 and Python 3 alike. Python 3
    # ignores `__metaclass__`, so relying on it alone would leave the handler
    # table empty there. The attribute is kept for code that inspects it.
    __metaclass__ = IterParseType

    # Replaced by the metaclass with the table built from the class body.
    __x_handlers__ = {}

    def __init__(self, schema = None):
        """Set up the handler, optionally with XML schema validation.

        `schema` is handed to `ET.parse` and the result to `ET.XMLSchema`.
        Validation is only set up when lxml is in use and recent enough;
        otherwise a warning is logged and `_schema` stays None.
        """
        self._schema = None

        if schema:
            if has_lxml:
                if has_schema:
                    self._schema = ET.XMLSchema(ET.parse(schema))
                else:
                    logging.warning(
                        "XML schema validation is only supported with lxml 2.0.3 and higher.")
            else:
                logging.warning("Validation requested, but lxml is not installed, deactivating!")

    def _parse(self, filename):
        """Parses the XML file `filename`.

        The first event is consumed to obtain the root element, which is
        passed to `_handle_root` and not dispatched to a handler. Every
        later event is dispatched through `__x_handlers__`; events without a
        registered handler are skipped.
        """
        events = ("start", "end")

        # The `schema` keyword is only passed when the backend supports it.
        if has_schema:
            event_source = ET.iterparse(filename, events,  schema=self._schema)
        else:
            event_source = ET.iterparse(filename, events)

        context = iter(event_source)

        # Builtin next() instead of the Python 2-only .next() method.
        event, root = next(context)
        self._handle_root(root)

        for event, elem in context:
            handler = self.__x_handlers__.get((event, elem.tag))
            if handler is None:
                continue
            result = handler(self, elem)
            # Only a finished ("end") element may be cleared.
            if result == self.DELETE_BRANCH and event == "end":
                elem.clear()

    def _handle_root(self, elem):
        """Called with the root element before any other processing is done.

        Only the attributes of the `elem` may be accessed at this time, children
        might be present as a side-effect, but may not be used.

        The default implementation does nothing, should be overridden by subclasses.
        """
        pass