1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
|
#!/usr/bin/env python
#
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""OpenType-related data."""
__author__ = 'roozbeh@google.com (Roozbeh Pournader)'
from nototools import unicode_data
OMPL = {}


def _set_ompl():
    """Populate the module-level OMPL mapping.

    OMPL is defined to be the list of mirrored pairs in Unicode 5.1:
    http://www.microsoft.com/typography/otspec/ttochap1.htm#ltrrtl

    The Unicode data is loaded first, then every bidi-mirroring-glyph entry
    whose character has an age of at most 5.1 is kept. OMPL is rebound to the
    freshly built dict rather than updated in place.
    """
    global OMPL

    unicode_data.load_data()
    mirrored_pairs = {}
    for char, mirror in unicode_data._bidi_mirroring_glyph_data.items():
        # The age is converted with float() so it can be compared
        # numerically against the 5.1 cutoff.
        if float(unicode_data.age(char)) <= 5.1:
            mirrored_pairs[char] = mirror
    OMPL = mirrored_pairs
# Building blocks for the per-script lists of special characters.
ZWSP = [0x200B]  # ZERO WIDTH SPACE
JOINERS = [0x200C, 0x200D]  # ZERO WIDTH NON-JOINER, ZERO WIDTH JOINER
BIDI_MARKS = [0x200E, 0x200F]  # LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
DOTTED_CIRCLE = [0x25CC]  # DOTTED CIRCLE

# From the various script-specific specs at
# http://www.microsoft.com/typography/SpecificationsOverview.mspx
#
# Each entry pairs a list of special characters with the script codes that
# need exactly that list.
_SPECIAL_CHARACTER_GROUPS = (
    (JOINERS + BIDI_MARKS + DOTTED_CIRCLE, ('Arab', 'Syrc')),
    (BIDI_MARKS + DOTTED_CIRCLE, ('Hebr', 'Thaa')),
    (ZWSP + DOTTED_CIRCLE, ('Laoo', 'Thai')),
    (ZWSP + JOINERS + DOTTED_CIRCLE, (
        'Beng', 'Bugi', 'Deva', 'Gujr', 'Guru', 'Java', 'Khmr', 'Knda',
        'Mlym', 'Mymr', 'Orya', 'Sinh', 'Taml', 'Telu', 'Tibt',
    )),
    # Hangul may not need the special characters:
    # https://code.google.com/p/noto/issues/detail?id=147#c2
    # (ZWSP + JOINERS, ('Hang',)),
)

# Maps a script code to the list of special characters it needs. Every script
# gets its own list object, and the keys are inserted in alphabetical order.
SPECIAL_CHARACTERS_NEEDED = dict(sorted(
    (script, list(chars))
    for chars, scripts in _SPECIAL_CHARACTER_GROUPS
    for script in scripts
))
# Unicode range bit assignments used in the OS/2 table:
# www.microsoft.com/typography/otspec/os2.html#ur
#
# One row per Unicode block, with three tab-separated fields:
#   bit, block name, block range
# The block range is an inclusive pair of hexadecimal code points joined by
# '-'. A row whose bit field is empty belongs to the same bit as the nearest
# preceding row that names one.
_unicoderange_data = """0\tBasic Latin\t0000-007F
1\tLatin-1 Supplement\t0080-00FF
2\tLatin Extended-A\t0100-017F
3\tLatin Extended-B\t0180-024F
4\tIPA Extensions\t0250-02AF
\tPhonetic Extensions\t1D00-1D7F
\tPhonetic Extensions Supplement\t1D80-1DBF
5\tSpacing Modifier Letters\t02B0-02FF
\tModifier Tone Letters\tA700-A71F
6\tCombining Diacritical Marks\t0300-036F
\tCombining Diacritical Marks Supplement\t1DC0-1DFF
7\tGreek and Coptic\t0370-03FF
8\tCoptic\t2C80-2CFF
9\tCyrillic\t0400-04FF
\tCyrillic Supplement\t0500-052F
\tCyrillic Extended-A\t2DE0-2DFF
\tCyrillic Extended-B\tA640-A69F
10\tArmenian\t0530-058F
11\tHebrew\t0590-05FF
12\tVai\tA500-A63F
13\tArabic\t0600-06FF
\tArabic Supplement\t0750-077F
14\tNKo\t07C0-07FF
15\tDevanagari\t0900-097F
16\tBengali\t0980-09FF
17\tGurmukhi\t0A00-0A7F
18\tGujarati\t0A80-0AFF
19\tOriya\t0B00-0B7F
20\tTamil\t0B80-0BFF
21\tTelugu\t0C00-0C7F
22\tKannada\t0C80-0CFF
23\tMalayalam\t0D00-0D7F
24\tThai\t0E00-0E7F
25\tLao\t0E80-0EFF
26\tGeorgian\t10A0-10FF
\tGeorgian Supplement\t2D00-2D2F
27\tBalinese\t1B00-1B7F
28\tHangul Jamo\t1100-11FF
29\tLatin Extended Additional\t1E00-1EFF
\tLatin Extended-C\t2C60-2C7F
\tLatin Extended-D\tA720-A7FF
30\tGreek Extended\t1F00-1FFF
31\tGeneral Punctuation\t2000-206F
\tSupplemental Punctuation\t2E00-2E7F
32\tSuperscripts And Subscripts\t2070-209F
33\tCurrency Symbols\t20A0-20CF
34\tCombining Diacritical Marks For Symbols\t20D0-20FF
35\tLetterlike Symbols\t2100-214F
36\tNumber Forms\t2150-218F
37\tArrows\t2190-21FF
\tSupplemental Arrows-A\t27F0-27FF
\tSupplemental Arrows-B\t2900-297F
\tMiscellaneous Symbols and Arrows\t2B00-2BFF
38\tMathematical Operators\t2200-22FF
\tSupplemental Mathematical Operators\t2A00-2AFF
\tMiscellaneous Mathematical Symbols-A\t27C0-27EF
\tMiscellaneous Mathematical Symbols-B\t2980-29FF
39\tMiscellaneous Technical\t2300-23FF
40\tControl Pictures\t2400-243F
41\tOptical Character Recognition\t2440-245F
42\tEnclosed Alphanumerics\t2460-24FF
43\tBox Drawing\t2500-257F
44\tBlock Elements\t2580-259F
45\tGeometric Shapes\t25A0-25FF
46\tMiscellaneous Symbols\t2600-26FF
47\tDingbats\t2700-27BF
48\tCJK Symbols And Punctuation\t3000-303F
49\tHiragana\t3040-309F
50\tKatakana\t30A0-30FF
\tKatakana Phonetic Extensions\t31F0-31FF
51\tBopomofo\t3100-312F
\tBopomofo Extended\t31A0-31BF
52\tHangul Compatibility Jamo\t3130-318F
53\tPhags-pa\tA840-A87F
54\tEnclosed CJK Letters And Months\t3200-32FF
55\tCJK Compatibility\t3300-33FF
56\tHangul Syllables\tAC00-D7AF
57\tNon-Plane 0 *\tD800-DFFF
58\tPhoenician\t10900-1091F
59\tCJK Unified Ideographs\t4E00-9FFF
\tCJK Radicals Supplement\t2E80-2EFF
\tKangxi Radicals\t2F00-2FDF
\tIdeographic Description Characters\t2FF0-2FFF
\tCJK Unified Ideographs Extension A\t3400-4DBF
\tCJK Unified Ideographs Extension B\t20000-2A6DF
\tKanbun\t3190-319F
60\tPrivate Use Area (plane 0)\tE000-F8FF
61\tCJK Strokes\t31C0-31EF
\tCJK Compatibility Ideographs\tF900-FAFF
\tCJK Compatibility Ideographs Supplement\t2F800-2FA1F
62\tAlphabetic Presentation Forms\tFB00-FB4F
63\tArabic Presentation Forms-A\tFB50-FDFF
64\tCombining Half Marks\tFE20-FE2F
65\tVertical Forms\tFE10-FE1F
\tCJK Compatibility Forms\tFE30-FE4F
66\tSmall Form Variants\tFE50-FE6F
67\tArabic Presentation Forms-B\tFE70-FEFF
68\tHalfwidth And Fullwidth Forms\tFF00-FFEF
69\tSpecials\tFFF0-FFFF
70\tTibetan\t0F00-0FFF
71\tSyriac\t0700-074F
72\tThaana\t0780-07BF
73\tSinhala\t0D80-0DFF
74\tMyanmar\t1000-109F
75\tEthiopic\t1200-137F
\tEthiopic Supplement\t1380-139F
\tEthiopic Extended\t2D80-2DDF
76\tCherokee\t13A0-13FF
77\tUnified Canadian Aboriginal Syllabics\t1400-167F
78\tOgham\t1680-169F
79\tRunic\t16A0-16FF
80\tKhmer\t1780-17FF
\tKhmer Symbols\t19E0-19FF
81\tMongolian\t1800-18AF
82\tBraille Patterns\t2800-28FF
83\tYi Syllables\tA000-A48F
\tYi Radicals\tA490-A4CF
84\tTagalog\t1700-171F
\tHanunoo\t1720-173F
\tBuhid\t1740-175F
\tTagbanwa\t1760-177F
85\tOld Italic\t10300-1032F
86\tGothic\t10330-1034F
87\tDeseret\t10400-1044F
88\tByzantine Musical Symbols\t1D000-1D0FF
\tMusical Symbols\t1D100-1D1FF
\tAncient Greek Musical Notation\t1D200-1D24F
89\tMathematical Alphanumeric Symbols\t1D400-1D7FF
90\tPrivate Use (plane 15)\tF0000-FFFFD
\tPrivate Use (plane 16)\t100000-10FFFD
91\tVariation Selectors\tFE00-FE0F
\tVariation Selectors Supplement\tE0100-E01EF
92\tTags\tE0000-E007F
93\tLimbu\t1900-194F
94\tTai Le\t1950-197F
95\tNew Tai Lue\t1980-19DF
96\tBuginese\t1A00-1A1F
97\tGlagolitic\t2C00-2C5F
98\tTifinagh\t2D30-2D7F
99\tYijing Hexagram Symbols\t4DC0-4DFF
100\tSyloti Nagri\tA800-A82F
101\tLinear B Syllabary\t10000-1007F
\tLinear B Ideograms\t10080-100FF
\tAegean Numbers\t10100-1013F
102\tAncient Greek Numbers\t10140-1018F
103\tUgaritic\t10380-1039F
104\tOld Persian\t103A0-103DF
105\tShavian\t10450-1047F
106\tOsmanya\t10480-104AF
107\tCypriot Syllabary\t10800-1083F
108\tKharoshthi\t10A00-10A5F
109\tTai Xuan Jing Symbols\t1D300-1D35F
110\tCuneiform\t12000-123FF
\tCuneiform Numbers and Punctuation\t12400-1247F
111\tCounting Rod Numerals\t1D360-1D37F
112\tSundanese\t1B80-1BBF
113\tLepcha\t1C00-1C4F
114\tOl Chiki\t1C50-1C7F
115\tSaurashtra\tA880-A8DF
116\tKayah Li\tA900-A92F
117\tRejang\tA930-A95F
118\tCham\tAA00-AA5F
119\tAncient Symbols\t10190-101CF
120\tPhaistos Disc\t101D0-101FF
121\tCarian\t102A0-102DF
\tLycian\t10280-1029F
\tLydian\t10920-1093F
122\tDomino Tiles\t1F030-1F09F
\tMahjong Tiles\t1F000-1F02F
"""
ur_data = []
ur_bucket_info = [[] for _ in range(128)]


def _setup_unicoderange_data():
    """Build the unicoderange lookup tables the first time they are needed.

    The unicoderange data used in the os/2 table consists of slightly under
    128 'buckets', each of which consists of one or more 'ranges' of
    codepoints. Each range has a name, start, and end. Bucket 57 is special,
    it consists of all non-BMP codepoints and overlaps the other ranges,
    though in the data it corresponds to the high and low UTF-16 surrogate
    code units. The other ranges are all disjoint.

    Two tables are filled in:
      ur_data: a list of (start, end, bucket index, name) tuples, one per
        range, sorted by range start.
      ur_bucket_info: a list indexed by bucket index; each entry is the list
        of ur_data tuples that belong to that bucket.

    Functions that require these tables call this first. Once ur_data is
    non-empty the call returns immediately, so the tables should remain
    unchanged after the first use.
    """
    if ur_data:
        return

    bucket = 0
    for row in _unicoderange_data.splitlines():
        bucket_field, block_name, span = row.split('\t')
        first_hex, last_hex = span.split('-')
        first = int(first_hex, 16)
        last = int(last_hex, 16)
        # An empty bucket field means the row continues the bucket named by
        # an earlier row.
        if bucket_field:
            bucket = int(bucket_field)
        entry = (first, last, bucket, block_name)
        ur_data.append(entry)
        ur_bucket_info[bucket].append(entry)

    ur_data.sort()
def collect_unicoderange_info(cmap):
    """Count how many characters of cmap fall into each unicoderange range.

    Args:
      cmap: an iterable of codepoints (iterating a dict yields its keys).

    Returns:
      A list of 2-tuples ordered by range start. The first element is the
      number of characters from cmap in the range, the second is the 4-tuple
      describing that range: start, end, bucket number, and name. Only ranges
      for which the cmap has a character are included.
    """
    _setup_unicoderange_data()

    total_ranges = len(ur_data)
    hits_by_position = {}
    position = 0
    for codepoint in sorted(cmap):
        # Codepoints are visited in ascending order, so a range that ends
        # before this one can never match a later codepoint either.
        while position < total_ranges and codepoint > ur_data[position][1]:
            position += 1
        if position == total_ranges:
            # Past the last range: nothing further can be counted.
            break
        # The ranges are disjoint and some characters fall into no range,
        # e.g. Javanese, so the start of the range must be checked as well.
        if codepoint >= ur_data[position][0]:
            hits_by_position[position] = hits_by_position.get(position, 0) + 1

    return [(hits_by_position[found], ur_data[found])
            for found in sorted(hits_by_position)]
def unicoderange_bucket_info_name(bucket_info):
    """Return the names of the ranges in bucket_info, joined by ', '.

    Args:
      bucket_info: an iterable of range tuples whose element at index 3 is
        the range name.
    """
    names = [range_info[3] for range_info in bucket_info]
    return ', '.join(names)
def unicoderange_bucket_info_size(bucket_info):
    """Return the total number of codepoints covered by bucket_info.

    Args:
      bucket_info: an iterable of range tuples whose first two elements are
        the inclusive start and end of the range.
    """
    total = 0
    for range_info in bucket_info:
        first, last = range_info[0], range_info[1]
        # Both ends are inclusive, hence the + 1.
        total += last - first + 1
    return total
def unicoderange_bucket_index_to_info(bucket_index):
    """Return the list of range tuples that belong to the given bucket.

    Args:
      bucket_index: the bucket number, from 0 to 127 inclusive.

    Returns:
      The ur_bucket_info entry for that bucket.

    Raises:
      ValueError: if bucket_index is negative or is 128 or greater.
    """
    if bucket_index >= 128 or bucket_index < 0:
        raise ValueError('bucket_index %s out of range' % bucket_index)

    # The tables are built lazily, so make sure they exist before indexing.
    _setup_unicoderange_data()
    return ur_bucket_info[bucket_index]
def unicoderange_bucket_index_to_name(bucket_index):
    """Return the ', '-joined names of the ranges in the given bucket.

    Args:
      bucket_index: the bucket number; it is validated by
        unicoderange_bucket_index_to_info, which raises ValueError when it
        is out of range.
    """
    bucket_info = unicoderange_bucket_index_to_info(bucket_index)
    return unicoderange_bucket_info_name(bucket_info)
# Build OMPL when the module is loaded; the emptiness check keeps an
# already-populated table from being rebuilt.
if not OMPL:
    _set_ompl()
|