File: opentype_data.py

package info (click to toggle)
nototools 0~20170925-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 9,452 kB
  • sloc: python: 23,246; xml: 597; sh: 204; makefile: 83
file content (330 lines) | stat: -rwxr-xr-x 10,931 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/env python
#
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""OpenType-related data."""

__author__ = 'roozbeh@google.com (Roozbeh Pournader)'


from nototools import unicode_data

OMPL = {}
def _set_ompl():
    """Populate the module-level OMPL mapping.

    OMPL maps each character that has a bidi mirroring glyph to that glyph,
    restricted to characters whose Unicode age is 5.1 or earlier.  This is
    the list of mirrored pairs in Unicode 5.1 referred to by:
    http://www.microsoft.com/typography/otspec/ttochap1.htm#ltrrtl
    """

    global OMPL
    unicode_data.load_data()
    mirrored_pairs = {}
    for codepoint, mirror in unicode_data._bidi_mirroring_glyph_data.items():
        # age() yields a version string such as '5.1'; compare it numerically.
        if float(unicode_data.age(codepoint)) <= 5.1:
            mirrored_pairs[codepoint] = mirror
    OMPL = mirrored_pairs


# Code points of special characters, kept as small lists so that they can be
# concatenated per script in SPECIAL_CHARACTERS_NEEDED below.
ZWSP = [0x200B]  # U+200B ZERO WIDTH SPACE
JOINERS = [0x200C, 0x200D]  # U+200C ZERO WIDTH NON-JOINER, U+200D ZERO WIDTH JOINER
BIDI_MARKS = [0x200E, 0x200F]  # U+200E LEFT-TO-RIGHT MARK, U+200F RIGHT-TO-LEFT MARK
DOTTED_CIRCLE = [0x25CC]  # U+25CC DOTTED CIRCLE

# Maps a four-letter script code (e.g. 'Arab') to the list of special code
# points needed for that script.  Each value is a freshly built list, so the
# entries do not share list objects.
# From the various script-specific specs at
# http://www.microsoft.com/typography/SpecificationsOverview.mspx
SPECIAL_CHARACTERS_NEEDED = {
    'Arab': JOINERS + BIDI_MARKS + DOTTED_CIRCLE,
    'Beng': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Bugi': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Deva': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Gujr': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Guru': ZWSP + JOINERS + DOTTED_CIRCLE,
    # Hangul may not need the special characters:
    # https://code.google.com/p/noto/issues/detail?id=147#c2
    # 'Hang': ZWSP + JOINERS,
    'Hebr': BIDI_MARKS + DOTTED_CIRCLE,
    'Java': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Khmr': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Knda': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Laoo': ZWSP + DOTTED_CIRCLE,
    'Mlym': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Mymr': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Orya': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Sinh': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Syrc': JOINERS + BIDI_MARKS + DOTTED_CIRCLE,
    'Taml': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Telu': ZWSP + JOINERS + DOTTED_CIRCLE,
    'Thaa': BIDI_MARKS + DOTTED_CIRCLE,
    'Thai': ZWSP + DOTTED_CIRCLE,
    'Tibt': ZWSP + JOINERS + DOTTED_CIRCLE,
}

# OS/2 table unicode range bit assignments, from
# www.microsoft.com/typography/otspec/os2.html#ur
#
# One range per line, with three tab-separated fields:
#   bit, block name, block range
# The block range is 'START-END' in hexadecimal, both ends inclusive.  A line
# whose bit field is empty belongs to the same bit as the closest preceding
# line that has one.
#
# Note: the plane 15 private use range starts at F0000 (the first code point
# of plane 15), not FF000.
_unicoderange_data = """0\tBasic Latin\t0000-007F
1\tLatin-1 Supplement\t0080-00FF
2\tLatin Extended-A\t0100-017F
3\tLatin Extended-B\t0180-024F
4\tIPA Extensions\t0250-02AF
\tPhonetic Extensions\t1D00-1D7F
\tPhonetic Extensions Supplement\t1D80-1DBF
5\tSpacing Modifier Letters\t02B0-02FF
\tModifier Tone Letters\tA700-A71F
6\tCombining Diacritical Marks\t0300-036F
\tCombining Diacritical Marks Supplement\t1DC0-1DFF
7\tGreek and Coptic\t0370-03FF
8\tCoptic\t2C80-2CFF
9\tCyrillic\t0400-04FF
\tCyrillic Supplement\t0500-052F
\tCyrillic Extended-A\t2DE0-2DFF
\tCyrillic Extended-B\tA640-A69F
10\tArmenian\t0530-058F
11\tHebrew\t0590-05FF
12\tVai\tA500-A63F
13\tArabic\t0600-06FF
\tArabic Supplement\t0750-077F
14\tNKo\t07C0-07FF
15\tDevanagari\t0900-097F
16\tBengali\t0980-09FF
17\tGurmukhi\t0A00-0A7F
18\tGujarati\t0A80-0AFF
19\tOriya\t0B00-0B7F
20\tTamil\t0B80-0BFF
21\tTelugu\t0C00-0C7F
22\tKannada\t0C80-0CFF
23\tMalayalam\t0D00-0D7F
24\tThai\t0E00-0E7F
25\tLao\t0E80-0EFF
26\tGeorgian\t10A0-10FF
\tGeorgian Supplement\t2D00-2D2F
27\tBalinese\t1B00-1B7F
28\tHangul Jamo\t1100-11FF
29\tLatin Extended Additional\t1E00-1EFF
\tLatin Extended-C\t2C60-2C7F
\tLatin Extended-D\tA720-A7FF
30\tGreek Extended\t1F00-1FFF
31\tGeneral Punctuation\t2000-206F
\tSupplemental Punctuation\t2E00-2E7F
32\tSuperscripts And Subscripts\t2070-209F
33\tCurrency Symbols\t20A0-20CF
34\tCombining Diacritical Marks For Symbols\t20D0-20FF
35\tLetterlike Symbols\t2100-214F
36\tNumber Forms\t2150-218F
37\tArrows\t2190-21FF
\tSupplemental Arrows-A\t27F0-27FF
\tSupplemental Arrows-B\t2900-297F
\tMiscellaneous Symbols and Arrows\t2B00-2BFF
38\tMathematical Operators\t2200-22FF
\tSupplemental Mathematical Operators\t2A00-2AFF
\tMiscellaneous Mathematical Symbols-A\t27C0-27EF
\tMiscellaneous Mathematical Symbols-B\t2980-29FF
39\tMiscellaneous Technical\t2300-23FF
40\tControl Pictures\t2400-243F
41\tOptical Character Recognition\t2440-245F
42\tEnclosed Alphanumerics\t2460-24FF
43\tBox Drawing\t2500-257F
44\tBlock Elements\t2580-259F
45\tGeometric Shapes\t25A0-25FF
46\tMiscellaneous Symbols\t2600-26FF
47\tDingbats\t2700-27BF
48\tCJK Symbols And Punctuation\t3000-303F
49\tHiragana\t3040-309F
50\tKatakana\t30A0-30FF
\tKatakana Phonetic Extensions\t31F0-31FF
51\tBopomofo\t3100-312F
\tBopomofo Extended\t31A0-31BF
52\tHangul Compatibility Jamo\t3130-318F
53\tPhags-pa\tA840-A87F
54\tEnclosed CJK Letters And Months\t3200-32FF
55\tCJK Compatibility\t3300-33FF
56\tHangul Syllables\tAC00-D7AF
57\tNon-Plane 0 *\tD800-DFFF
58\tPhoenician\t10900-1091F
59\tCJK Unified Ideographs\t4E00-9FFF
\tCJK Radicals Supplement\t2E80-2EFF
\tKangxi Radicals\t2F00-2FDF
\tIdeographic Description Characters\t2FF0-2FFF
\tCJK Unified Ideographs Extension A\t3400-4DBF
\tCJK Unified Ideographs Extension B\t20000-2A6DF
\tKanbun\t3190-319F
60\tPrivate Use Area (plane 0)\tE000-F8FF
61\tCJK Strokes\t31C0-31EF
\tCJK Compatibility Ideographs\tF900-FAFF
\tCJK Compatibility Ideographs Supplement\t2F800-2FA1F
62\tAlphabetic Presentation Forms\tFB00-FB4F
63\tArabic Presentation Forms-A\tFB50-FDFF
64\tCombining Half Marks\tFE20-FE2F
65\tVertical Forms\tFE10-FE1F
\tCJK Compatibility Forms\tFE30-FE4F
66\tSmall Form Variants\tFE50-FE6F
67\tArabic Presentation Forms-B\tFE70-FEFF
68\tHalfwidth And Fullwidth Forms\tFF00-FFEF
69\tSpecials\tFFF0-FFFF
70\tTibetan\t0F00-0FFF
71\tSyriac\t0700-074F
72\tThaana\t0780-07BF
73\tSinhala\t0D80-0DFF
74\tMyanmar\t1000-109F
75\tEthiopic\t1200-137F
\tEthiopic Supplement\t1380-139F
\tEthiopic Extended\t2D80-2DDF
76\tCherokee\t13A0-13FF
77\tUnified Canadian Aboriginal Syllabics\t1400-167F
78\tOgham\t1680-169F
79\tRunic\t16A0-16FF
80\tKhmer\t1780-17FF
\tKhmer Symbols\t19E0-19FF
81\tMongolian\t1800-18AF
82\tBraille Patterns\t2800-28FF
83\tYi Syllables\tA000-A48F
\tYi Radicals\tA490-A4CF
84\tTagalog\t1700-171F
\tHanunoo\t1720-173F
\tBuhid\t1740-175F
\tTagbanwa\t1760-177F
85\tOld Italic\t10300-1032F
86\tGothic\t10330-1034F
87\tDeseret\t10400-1044F
88\tByzantine Musical Symbols\t1D000-1D0FF
\tMusical Symbols\t1D100-1D1FF
\tAncient Greek Musical Notation\t1D200-1D24F
89\tMathematical Alphanumeric Symbols\t1D400-1D7FF
90\tPrivate Use (plane 15)\tF0000-FFFFD
\tPrivate Use (plane 16)\t100000-10FFFD
91\tVariation Selectors\tFE00-FE0F
\tVariation Selectors Supplement\tE0100-E01EF
92\tTags\tE0000-E007F
93\tLimbu\t1900-194F
94\tTai Le\t1950-197F
95\tNew Tai Lue\t1980-19DF
96\tBuginese\t1A00-1A1F
97\tGlagolitic\t2C00-2C5F
98\tTifinagh\t2D30-2D7F
99\tYijing Hexagram Symbols\t4DC0-4DFF
100\tSyloti Nagri\tA800-A82F
101\tLinear B Syllabary\t10000-1007F
\tLinear B Ideograms\t10080-100FF
\tAegean Numbers\t10100-1013F
102\tAncient Greek Numbers\t10140-1018F
103\tUgaritic\t10380-1039F
104\tOld Persian\t103A0-103DF
105\tShavian\t10450-1047F
106\tOsmanya\t10480-104AF
107\tCypriot Syllabary\t10800-1083F
108\tKharoshthi\t10A00-10A5F
109\tTai Xuan Jing Symbols\t1D300-1D35F
110\tCuneiform\t12000-123FF
\tCuneiform Numbers and Punctuation\t12400-1247F
111\tCounting Rod Numerals\t1D360-1D37F
112\tSundanese\t1B80-1BBF
113\tLepcha\t1C00-1C4F
114\tOl Chiki\t1C50-1C7F
115\tSaurashtra\tA880-A8DF
116\tKayah Li\tA900-A92F
117\tRejang\tA930-A95F
118\tCham\tAA00-AA5F
119\tAncient Symbols\t10190-101CF
120\tPhaistos Disc\t101D0-101FF
121\tCarian\t102A0-102DF
\tLycian\t10280-1029F
\tLydian\t10920-1093F
122\tDomino Tiles\t1F030-1F09F
\tMahjong Tiles\t1F000-1F02F
"""

ur_data = []
ur_bucket_info = [[] for i in range(128)]

def _setup_unicoderange_data():
    """Build the ur_data and ur_bucket_info tables on first use.

    The unicoderange data used in the os/2 table consists of slightly under
    128 'buckets', each holding one or more named 'ranges' of codepoints.
    Bucket 57 is special: it stands for all non-BMP codepoints, although in
    the data it is listed as the UTF-16 surrogate code units.

    Two tables are filled in from _unicoderange_data:
      ur_data: a list of (start, end, bucket index, name) tuples, one per
        range, sorted by range start.
      ur_bucket_info: a list indexed by bucket index; each entry is the list
        of ur_data tuples belonging to that bucket.

    Functions that need these tables call this first.  If ur_data is already
    populated this returns immediately, leaving both tables unchanged."""

    if ur_data:
        return
    bucket = 0
    for row in _unicoderange_data.splitlines():
        bucket_field, block_name, span = row.split('\t')
        first_hex, last_hex = span.split('-')
        first = int(first_hex, 16)
        last = int(last_hex, 16)
        # An empty bucket field means the row continues the previous bucket.
        if bucket_field:
            bucket = int(bucket_field)
        entry = (first, last, bucket, block_name)
        ur_data.append(entry)
        ur_bucket_info[bucket].append(entry)
    ur_data.sort()


def collect_unicoderange_info(cmap):
    """Count how many codepoints of cmap fall into each unicode range.

    cmap is an iterable of integer codepoints (iterating a dict yields its
    keys).  Returns a list of 2-tuples (count, range_info), where range_info
    is the (start, end, bucket index, name) tuple from ur_data.  The list is
    in order of range start, and ranges containing none of the codepoints
    are omitted."""

    _setup_unicoderange_data()
    counted = []
    hits = 0
    pos = 0
    num_ranges = len(ur_data)
    for codepoint in sorted(cmap):
        # Step past every range that ends before this codepoint, emitting the
        # tally of a range as soon as we leave it.
        while pos < num_ranges and codepoint > ur_data[pos][1]:
            if hits:
                counted.append((hits, ur_data[pos]))
                hits = 0
            pos += 1
        # The ranges are disjoint and some characters fall into no range
        # (e.g. Javanese), so the codepoint may lie in the gap before the
        # current range.
        if pos < num_ranges and codepoint >= ur_data[pos][0]:
            hits += 1
    if hits:
        # Emit the tally of the last range that received any codepoints.
        counted.append((hits, ur_data[pos]))
    return counted


def unicoderange_bucket_info_name(bucket_info):
    """Return the names of the ranges in bucket_info, joined by ', '.

    bucket_info is an iterable of range tuples whose fourth element (index 3)
    is the range name, such as an entry of ur_bucket_info."""
    names = [entry[3] for entry in bucket_info]
    return ', '.join(names)


def unicoderange_bucket_info_size(bucket_info):
    """Return the total number of codepoints covered by bucket_info.

    bucket_info is an iterable of range tuples whose first two elements are
    the inclusive start and end of the range, such as an entry of
    ur_bucket_info."""
    total = 0
    for entry in bucket_info:
        # Both ends are inclusive, hence the + 1.
        total += entry[1] - entry[0] + 1
    return total


def unicoderange_bucket_index_to_info(bucket_index):
    """Return the list of range tuples for the bucket at bucket_index.

    Raises ValueError if bucket_index is negative or is 128 or greater.  The
    unicoderange tables are built on first use before the lookup."""
    if bucket_index >= 128 or bucket_index < 0:
        raise ValueError('bucket_index %s out of range' % bucket_index)
    _setup_unicoderange_data()
    bucket = ur_bucket_info[bucket_index]
    return bucket


def unicoderange_bucket_index_to_name(bucket_index):
    """Return the ', '-joined names of the ranges in the given bucket.

    Raises ValueError if bucket_index is negative or is 128 or greater."""
    bucket_info = unicoderange_bucket_index_to_info(bucket_index)
    return unicoderange_bucket_info_name(bucket_info)


# Populate OMPL when the module is imported.  _set_ompl() calls
# unicode_data.load_data(), so importing this module loads the Unicode data.
if not OMPL:
    _set_ompl()