File: wchartype.py

package info (click to toggle)
wchartype 0.1-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 56 kB
  • sloc: python: 56; makefile: 3
file content (206 lines) | stat: -rw-r--r-- 4,341 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#coding: UTF8
"""
wchartype
Retrieves character types of double-byte (full-width) characters.
"""

# Package metadata exposed as module-level attributes.
__license__ = "MIT"
__version__ = "0.1"
__author__ = "Ryan Ginstrom"
__description__ = "Retrieves character types of double-byte characters."

# 0x3000 is the ideographic space (i.e. the double-byte space).
# is_asian() accepts only code points strictly above this value, while
# is_full_width() also accepts the space itself.
IDEOGRAPHIC_SPACE = 0x3000

def is_asian(char):
    """Is the character Asian?

    A character counts as Asian when its code point is strictly greater
    than IDEOGRAPHIC_SPACE, so the ideographic space itself is excluded.

    >>> is_asian('a')
    False
    >>> is_asian(u'\u65e5')
    True
    >>> is_asian(u'\u3000')
    False

    """

    code_point = ord(char)
    return IDEOGRAPHIC_SPACE < code_point

def is_full_width(char):
    """
    Is the character full width?

    It will be full width if it's Asian or an ideographic space, unless it
    is one of the halfwidth forms (U+FF61..U+FFDC: halfwidth punctuation,
    katakana and hangul; U+FFE8..U+FFEE: halfwidth symbols).  Those code
    points lie above the ideographic space but are not full width.

    >>> is_full_width('a')
    False
    >>> is_full_width(u'\u65e5')
    True
    >>> is_full_width(u'\u3000')
    True
    >>> is_full_width(u'\uFF91') # halfwidth katakana
    False
    >>> is_full_width(u'\uFFB8') # halfwidth hangul
    False
    """
    code = ord(char)
    # Reject the halfwidth forms first: is_asian() alone would accept them
    # because it only checks that the code point is above the ideographic
    # space.
    if 0xFF61 <= code <= 0xFFDC or 0xFFE8 <= code <= 0xFFEE:
        return False
    return is_asian(char) or code == IDEOGRAPHIC_SPACE

def is_kanji(char):
    """
    Tells whether char is a kanji (or Chinese) character, i.e. whether its
    code point lies in U+4E00..U+9FFF (both ends included).

    >>> is_kanji(u'\u4E40')
    True
    >>> is_kanji(u"a")
    False
    """

    first, last = 0x4E00, 0x9FFF
    return first <= ord(char) <= last

# Alias for callers working with Chinese text: is_hanzi and is_kanji are the
# same function object.
is_hanzi = is_kanji

def is_hiragana(char):
    """
    Tells whether char is hiragana, i.e. whether its code point lies in
    U+3041..U+309F (both ends included).

    >>> is_hiragana(u'a')
    False
    >>> is_hiragana(u'\u308F') # わ
    True
    >>> is_hiragana(u'\u30EA') # リ
    False
    """

    code_point = ord(char)
    # Guard clause for everything below the range, then check the upper end.
    if code_point < 0x3041:
        return False
    return code_point <= 0x309F

def is_katakana(char):
    """
    Tells whether char is (full-width) katakana, i.e. whether its code point
    lies in U+30A0..U+30FF (both ends included).

    >>> is_katakana(u'$')
    False
    >>> is_katakana(u'\u30EA') # リ
    True
    >>> is_katakana(u'\u308F') # わ
    False
    """

    code_point = ord(char)
    out_of_range = code_point < 0x30A0 or code_point > 0x30FF
    return not out_of_range

def is_half_katakana(char):
    """
    Tells whether char is half-width katakana, i.e. whether its code point
    lies in U+FF65..U+FF9F (both ends included).

    >>> is_half_katakana(u'$')
    False
    >>> is_half_katakana(u'\uFF91') # ム
    True
    >>> is_half_katakana(u'\u30EA') # リ
    False
    """

    lower, upper = 0xFF65, 0xFF9F
    return lower <= ord(char) <= upper

def is_hangul(char):
    """
    Tells whether char is hangul, either halfwidth (U+FFA0..U+FFDC) or
    fullwidth (U+AC00..U+D7A3); both ranges include their end points.

    >>> is_hangul(u'1')
    False

    # halfwidth hangul
    >>> is_hangul(u'\uFFB8') # HALFWIDTH HANGUL LETTER CIEUC
    True

    # fullwidth hangul
    >>> is_hangul(u'\uB973') # 륳
    True

    >>> is_hangul(u'\u30EA') # リ
    False
    """

    code_point = ord(char)
    is_halfwidth = 0xFFA0 <= code_point <= 0xFFDC
    is_fullwidth = 0xAC00 <= code_point <= 0xD7A3
    return is_halfwidth or is_fullwidth

def is_full_punct(char):
    """
    Returns whether char is full-width punctuation

    Accepted code points (end points included): U+FF01..U+FF0F,
    U+FF1A..U+FF20, U+FF3B..U+FF40 and U+FF5B..U+FF60.  The last range
    stops at U+FF60 because U+FF61..U+FF64 are halfwidth CJK punctuation,
    not full-width characters.

    >>> is_full_punct(u'$')
    False
    >>> is_full_punct(u'\uFF05') # %
    True
    >>> is_full_punct(u'\uFF1E') # >
    True
    >>> is_full_punct(u'\uFF3D') # ]
    True
    >>> is_full_punct(u'\uFF5B') # {
    True
    >>> is_full_punct(u'\uFF60') # fullwidth right white parenthesis
    True
    >>> is_full_punct(u'\uFF61') # halfwidth ideographic full stop
    False
    >>> is_full_punct(u'\u30EA') # リ
    False
    """

    code = ord(char)
    # The gaps between these ranges hold the full-width digits and letters.
    return any(low <= code <= high
               for low, high in ((0xFF01, 0xFF0F),
                                 (0xFF1A, 0xFF20),
                                 (0xFF3B, 0xFF40),
                                 (0xFF5B, 0xFF60)))

def is_full_digit(char):
    """
    Tells whether char is a full-width digit, i.e. whether its code point
    lies in U+FF10..U+FF19 (both ends included).

    >>> is_full_digit(u'1')
    False
    >>> is_full_digit(u'\uFF15') # 5
    True
    >>> is_full_digit(u'\uFF05') # %
    False
    """

    # Distance from the first full-width digit; 0..9 covers the ten digits.
    offset = ord(char) - 0xFF10
    return 0 <= offset <= 9

def is_full_letter(char):
    """
    Tells whether char is a full-width letter, i.e. whether its code point
    lies in U+FF21..U+FF3A or U+FF41..U+FF5A (end points included).
    This is not the same as the built-in isalpha method for strings,
    because isalpha also returns True for CJK characters.

    >>> is_full_letter(u'\u308F') # hiragana wa (わ)
    False
    >>> u'\u308F'.isalpha() # hiragana wa (わ)
    True

    >>> is_full_letter(u'A')
    False
    >>> is_full_letter(u'\uFF31') # Q
    True
    >>> is_full_letter(u'\uFF4A') # j
    True
    >>> is_full_letter(u'\u30EA') # リ
    False
    >>> is_full_letter(u'\uFF15') # 5
    False
    """

    code_point = ord(char)
    in_first_range = 0xFF21 <= code_point <= 0xFF3A
    in_second_range = 0xFF41 <= code_point <= 0xFF5A
    return in_first_range or in_second_range

def _test():
    """Run the doctests of the __main__ module (this file when it is
    executed as a script)."""
    from doctest import testmod
    testmod()

# Run the doctests only when this file is executed directly, not on import.
if __name__ == "__main__":
    _test()