File: unicode_support.py

package info (click to toggle)
ocrodjvu 0.7.18-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 3,672 kB
  • ctags: 502
  • sloc: python: 3,535; xml: 941; makefile: 14; sh: 13
file content (58 lines) | stat: -rw-r--r-- 1,631 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# encoding=UTF-8

# Copyright © 2008, 2009, 2010, 2011 Jakub Wilk <jwilk@jwilk.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

from . import utils

def get_icu():
    try:
        # For PyICU ≥ 1.0
        import icu
        return icu
    except ImportError:
        pass
    try:
        # For PyICU < 1.0
        import PyICU as icu
        return icu
    except ImportError, ex:
        utils.enhance_import_error(ex, 'PyICU', 'python-pyicu', 'http://pyicu.osafoundation.org/')
        raise

def simple_word_break_iterator(text):
    '''
    Create an instance of simple space-to-space word break interator.
    '''
    if not text:
        return
    space = text[0].isspace()
    for n, ch in enumerate(text):
        if space != ch.isspace():
            yield n
            space = not space
    yield len(text)

def word_break_iterator(text, locale=None):
    '''
    Create an instance of word break interator.

    text: unicode string
    locale: ICU locale or None
    '''
    if locale is None:
        return simple_word_break_iterator(text)
    icu = get_icu()
    break_iterator = icu.BreakIterator.createWordInstance(locale)
    break_iterator.setText(text)
    return break_iterator

# vim:ts=4 sw=4 et