File: utils.py

package info (click to toggle)
ocrodjvu 0.7.18-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 3,672 kB
  • ctags: 502
  • sloc: python: 3,535; xml: 941; makefile: 14; sh: 13
file content (156 lines) | stat: -rw-r--r-- 4,737 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# encoding=UTF-8

# Copyright © 2008, 2009, 2010, 2011, 2012 Jakub Wilk <jwilk@jwilk.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

import functools
import locale
import os
import re
import warnings

debian = os.path.exists('/etc/debian_version')

def enhance_import_error(exception, package, debian_package, homepage):
    message = str(exception)
    format = '%(message)s; please install the %(package)s package'
    if debian:
        package = debian_package
    else:
        format += ' <%(homepage)s>'
    exception.args = [format % locals()]

def parse_page_numbers(pages):
    '''
    parse_page_numbers(None) -> None
    parse_page_numbers('17') -> [17]
    parse_page_numbers('37-42') -> [37, 38, ..., 42]
    parse_page_numbers('17,37-42') -> [17, 37, 38, ..., 42]
    parse_page_numbers('42-37') -> []
    parse_page_numbers('17-17') -> [17]
    '''
    if pages is None:
        return
    result = []
    for page_range in pages.split(','):
        if '-' in page_range:
            x, y = map(int, page_range.split('-', 1))
            result += xrange(x, y + 1)
        else:
            result += int(page_range, 10),
    return result

_special_chars_replace = re.compile(ur'''[\x00-\x1f'"\x5c\x7f-\x9f]''', re.UNICODE).sub

def _special_chars_escape(m):
    ch = m.group(0)
    if ch in ('"', "'"):
        return '\\' + ch
    else:
        return repr(ch)[2:-1]

def smart_repr(s, encoding=None):
    if encoding is None:
        return repr(s)
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        return repr(s)
    return "'%s'" % _special_chars_replace(_special_chars_escape, u)

class EncodingWarning(UserWarning):
    pass

_control_characters_regex = re.compile('[%s]' % ''.join(
    ch for ch in map(chr, xrange(32))
    if ch not in u'\n\r\t'
))

def sanitize_utf8(text):
    '''
    Replace invalid UTF-8 sequences and control characters (except CR, LF, TAB
    and space) with Unicode replacement characters.
    '''
    try:
        text = text.decode('UTF-8')
    except UnicodeDecodeError, exc:
        text = text.decode('UTF-8', 'replace')
        message = str(exc)
        message = re.sub("^'utf8' codec can't decode ", '', message)
        warnings.warn(
            message,
            category=EncodingWarning,
            stacklevel=2,
        )
    text = text.encode('UTF-8')
    match = _control_characters_regex.search(text)
    if match:
        byte = ord(match.group())
        message = 'byte 0x%02x in position %d: control character' % (byte, match.start())
        warnings.warn(
            message,
            category=EncodingWarning,
            stacklevel=2,
        )
        text = _control_characters_regex.sub(u'\N{REPLACEMENT CHARACTER}'.encode('UTF-8'), text)
    # There are other code points that are not allowed in XML (or even: not
    # allowed in UTF-8), but which Python happily accept. However, they haven't
    # seemed to occur in real-world documents.
    # http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
    return text

class NotOverriddenWarning(UserWarning):
    pass

def not_overridden(f):
    '''
    Issue NotOverriddenWarning if the decorated method was not overridden in a
    subclass, or called directly.
    '''
    @functools.wraps(f)
    def new_f(self, *args, **kwargs):
        cls = type(self)
        warnings.warn(
            '%s.%s.%s() is not overridden' % (cls.__module__, cls.__name__, f.__name__),
            category=NotOverriddenWarning,
            stacklevel=2
        )
        return f(self, *args, **kwargs)
    return new_f

def str_as_unicode(s, encoding=locale.getpreferredencoding()):
    if isinstance(s, unicode):
        return s
    return s.decode(encoding, 'replace')

def identity(x):
    '''
    identity(x) -> x
    '''
    return x

class property(object):

    def __init__(self, default_value=None, filter=identity):
        self._private_name = '__%s__%d' % (self.__module__, id(self))
        self._filter = filter
        self._default_value = default_value

    def __get__(self, instance, cls):
        if instance is None:
            return self
        return getattr(instance, self._private_name, self._default_value)

    def __set__(self, instance, value):
        setattr(instance, self._private_name, self._filter(value))
        return

# vim:ts=4 sw=4 et