File: package_sorter.py

package info (click to toggle)
advene 1.0-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 10,340 kB
  • sloc: xml: 56,204; python: 49,885; perl: 741; sh: 186; makefile: 83; ansic: 18
file content (146 lines) | stat: -rwxr-xr-x 4,175 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#! /usr/bin/python
#
# This file is part of Advene.
#
# Advene is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Advene is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Foobar; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
"""Process an Advene XML package in order to get annotations ordered
by timestamp, and all other elements ordered by id.  
"""
import sys

import xml.etree.ElementTree as ET
from xml.etree.ElementTree import parse, Element, ElementTree, QName
import string

def tag(name):
    """Return the namespaced tag.
    """
    return '{%s}%s' % (ns, name)

def cmp_id(a, b):
    """Compare id
    """
    return cmp(a.attrib['id'], b.attrib['id'])

def sort_id(source):
    """Sort the source Element elements along their id.

    Returns a new Element
    """
    dest=Element(source.tag)
    dest.attrib.update(source.attrib)
    
    res=[ e for e in source ]
    res.sort(cmp_id)
    
    for e in res:
        dest.append(e)
    return dest

def cmp_time(a, b):
    """Compare time
    """
    return cmp(a._begin, b._begin)

def sort_time(source):
    """Sort the source Element elements along their time (for annotations) and id (for relations).

    Returns a new Element
    """
    dest=Element(source.tag)
    dest.attrib.update(source.attrib)
    
    antag=tag('annotation')
    reltag=tag('relation')

    rel=[ e for e in source if e.tag == reltag ]
    rel.sort(cmp_id)

    an=[ e for e in source if e.tag == antag ]
    # Pre-parse begin times
    for a in an:
        f=a.find(tag('millisecond-fragment'))
        if f is not None:
            a._begin = long(f.attrib['begin'])
        else:
            print "Error: cannot find begin time for ", a.attrib['id']
            a._begin = 0
    an.sort(cmp_time)
    
    for e in an:
        dest.append(e)
    for e in rel:
        dest.append(e)

    return dest

# Namespace handling
ns='http://experience.univ-lyon1.fr/advene/ns'
ET._namespace_map[ns]=''
ET._namespace_map['http://purl.org/dc/elements/1.1/']='dc'
ET._namespace_map['http://experience.univ-lyon1.fr/advene/ns/advenetool']='advenetool'

# Hack into elementtree to generate a readable (namespace-prefix-wise)
# Advene package
def my_fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        prefix = ET._namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            xmlns = None
        elif prefix == '':
            # Empty prefix from _namespace_map, assume it is the
            # default
            xmlns = ('xmlns', namespace_uri)
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        xmlns = None
    if prefix == '':
        return tag, xmlns
    else:
        return "%s:%s" % (prefix, tag), xmlns

# Hook into elementtree
ET.fixtag = my_fixtag

tree = parse(sys.argv[1])
source = tree.getroot()
dest=Element(source.tag)
dest.attrib.update(source.attrib)


for e in source:
    if e.tag == tag('meta') or e.tag == tag('imports'):
        dest.append(e)
    elif e.tag in [ tag(n) for n in ('queries', 'schemas', 'views') ]:
        # Sort along id
        dest.append(sort_id(e))
    elif e.tag == tag('annotations'):
        dest.append(sort_time(e))
    else:
        print "Unknown tag", e.tag

tree=ElementTree(dest)
tree.write(open(sys.argv[2], 'w'), encoding='utf-8')