File: tidy_xml.py

package info (click to toggle)
ros-catkin 0.8.12-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,224 kB
  • sloc: python: 4,117; sh: 508; xml: 256; cpp: 136; makefile: 125
file content (91 lines) | stat: -rw-r--r-- 3,402 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Software License Agreement (BSD License)
#
# Copyright (c) 2008, Willow Garage, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials provided
#    with the distribution.
#  * Neither the name of Willow Garage, Inc. nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import unicode_literals

import codecs
import os
import re

# Unit test suites are not good about screening out characters that are
# illegal in XML (#603), so they are filtered out here.
# Recipe from http://boodebr.org/main/python/all-about-python-and-unicode#UNI_XML
# Code copied from rosunit/src/junitxml.py
try:
    char = unichr  # Python 2 spelling
except NameError:
    char = chr  # Python 3: unichr does not exist

# The pattern is five alternatives, one capture group each:
#   1. control characters other than tab, LF and CR, plus U+FFFE and U+FFFF
#   2. a high surrogate followed by a character that is not a low surrogate
#   3. a low surrogate preceded by a character that is not a high surrogate
#   4. a high surrogate at the end of the text
#   5. a low surrogate at the start of the text
RE_XML_ILLEGAL = '|'.join([
    '([{nul}-{bs}{vt}-{ff}{so}-{us}{fffe}-{ffff}])',
    '([{hi_first}-{hi_last}][^{lo_first}-{lo_last}])',
    '([^{hi_first}-{hi_last}][{lo_first}-{lo_last}])',
    '([{hi_first}-{hi_last}]$)',
    '(^[{lo_first}-{lo_last}])',
]).format(
    nul=char(0x0000), bs=char(0x0008),
    vt=char(0x000b), ff=char(0x000c),
    so=char(0x000e), us=char(0x001f),
    fffe=char(0xfffe), ffff=char(0xffff),
    hi_first=char(0xd800), hi_last=char(0xdbff),
    lo_first=char(0xdc00), lo_last=char(0xdfff),
)
_SAFE_XML_REGEX = re.compile(RE_XML_ILLEGAL)


def tidy_xml(filename):
    """
    Read in file, screen out unsafe unicode characters, write back file in utf-8.

    The file is decoded as utf-8 first and, if that fails, as iso8859-1.
    Every match of the module-level ``_SAFE_XML_REGEX`` is replaced by a
    single ``?`` and the result is written back to the same path.

    :param filename: path of the file to rewrite in place, ``str``
    :returns: True once the file has been rewritten, False if unable to
      read from file
    :raises ValueError: if ``filename`` is not an existing regular file
    """
    if not os.path.isfile(filename):
        raise ValueError('file does not exist')

    # try first utf-8 then iso. This is ugly, but the files in
    # question that are problematic do not declare unicode type
    data = None
    for encoding in ('utf-8', 'iso8859-1'):
        try:
            with codecs.open(filename, 'r', encoding) as fhand:
                data = fhand.read()
        except ValueError:
            # UnicodeDecodeError is a ValueError: wrong guess, try the next
            continue
        break

    if data is None:
        return False

    # Replace all matches in one pass. Patching the string match by match
    # while iterating over offsets of the original text goes wrong as soon
    # as a two-character match shortens the string.
    data = _SAFE_XML_REGEX.sub('?', data)

    # Encode explicitly: a plain open() would use the platform default
    # encoding instead of the utf-8 promised above.
    with codecs.open(filename, 'w', 'utf-8') as fhand:
        fhand.write(data)
    return True