1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
|
/*
* Copyright (C) 2007 Apple Computer, Inc.
*
* Portions are Copyright (C) 1998 Netscape Communications Corporation.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* Alternatively, the contents of this file may be used under the terms
* of either the Mozilla Public License Version 1.1, found at
* http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
* License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
* (the "GPL"), in which case the provisions of the MPL or the GPL are
* applicable instead of those above. If you wish to allow use of your
* version of this file only under the terms of one of those two
* licenses (the MPL or the GPL) and not to allow others to use your
* version of this file under the LGPL, indicate your decision by
* deleting the provisions above and replace them with the notice and
* other provisions required by the MPL or the GPL, as the case may be.
* If you do not delete the provisions above, a recipient may use your
* version of this file under any of the LGPL, the MPL or the GPL.
*/
#include "third_party/blink/renderer/platform/text/unicode_range.h"
#include "base/compiler_specific.h"
namespace blink {
/**********************************************************************
* Unicode subranges as defined in unicode 3.0
* x-western, x-central-euro, tr, x-baltic -> latin
* 0000 - 036f
* 1e00 - 1eff
* 2000 - 206f (general punctuation)
* 20a0 - 20cf (currency symbols)
* 2100 - 214f (letterlike symbols)
* 2150 - 218f (Number Forms)
* el -> greek
* 0370 - 03ff
* 1f00 - 1fff
* x-cyrillic -> cyrillic
* 0400 - 04ff
* he -> hebrew
* 0590 - 05ff
* ar -> arabic
* 0600 - 06ff
* fb50 - fdff (arabic presentation forms)
* fe70 - feff (arabic presentation forms b)
* th - thai
* 0e00 - 0e7f
* ko -> korean
* ac00 - d7af (hangul Syllables)
* 1100 - 11ff (jamo)
* 3130 - 318f (hangul compatibility jamo)
* ja
* 3040 - 309f (hiragana)
* 30a0 - 30ff (katakana)
* zh-CN
* zh-TW
*
* CJK
* 3100 - 312f (bopomofo)
* 31a0 - 31bf (bopomofo extended)
* 3000 - 303f (CJK Symbols and Punctuation)
* 2e80 - 2eff (CJK radicals supplement)
* 2f00 - 2fdf (Kangxi Radicals)
* 2ff0 - 2fff (Ideographic Description Characters)
* 3190 - 319f (kanbun)
* 3200 - 32ff (Enclosed CJK letters and Months)
* 3300 - 33ff (CJK compatibility)
* 3400 - 4dbf (CJK Unified Ideographs Extension A)
* 4e00 - 9faf (CJK Unified Ideographs)
* f900 - fa5f (CJK Compatibility Ideographs)
* fe30 - fe4f (CJK compatibility Forms)
* ff00 - ffef (halfwidth and fullwidth forms)
*
* Armenian
* 0530 - 058f
* Syriac
* 0700 - 074f
* Thaana
* 0780 - 07bf
* Devanagari
* 0900 - 097f
* Bengali
* 0980 - 09ff
* Gurmukhi
* 0a00 - 0a7f
* Gujarati
* 0a80 - 0aff
* Oriya
* 0b00 - 0b7f
* Tamil
* 0b80 - 0bff
* Telugu
* 0c00 - 0c7f
* Kannada
* 0c80 - 0cff
* Malayalam
* 0d00 - 0d7f
* Sinhala
* 0d80 - 0def
* Lao
* 0e80 - 0eff
* Tibetan
* 0f00 - 0fbf
* Myanmar
* 1000 - 109f
* Georgian
* 10a0 - 10ff
* Ethiopic
* 1200 - 137f
* Cherokee
* 13a0 - 13ff
* Canadian Aboriginal Syllabics
* 1400 - 167f
* Ogham
* 1680 - 169f
* Runic
* 16a0 - 16ff
* Khmer
* 1780 - 17ff
* Mongolian
* 1800 - 18af
* Misc - superscripts and subscripts
* 2070 - 209f
* Misc - Combining Diacritical Marks for Symbols
* 20d0 - 20ff
* Misc - Arrows
* 2190 - 21ff
* Misc - Mathematical Operators
* 2200 - 22ff
* Misc - Miscellaneous Technical
* 2300 - 23ff
* Misc - Control picture
* 2400 - 243f
* Misc - Optical character recognition
* 2440 - 245f
* Misc - Enclosed Alphanumerics
* 2460 - 24ff
* Misc - Box Drawing
* 2500 - 257f
* Misc - Block Elements
* 2580 - 259f
* Misc - Geometric Shapes
* 25a0 - 25ff
* Misc - Miscellaneous Symbols
* 2600 - 267f
* Misc - Dingbats
* 2700 - 27bf
* Misc - Braille Patterns
* 2800 - 28ff
* Yi Syllables
* a000 - a48f
* Yi radicals
* a490 - a4cf
* Alphabetic Presentation Forms
* fb00 - fb4f
* Misc - Combining half Marks
* fe20 - fe2f
* Misc - small form variants
* fe50 - fe6f
* Misc - Specials
* fff0 - ffff
*********************************************************************/
// Number of 16-entry sub-tables stored in kGUnicodeSubrangeTable.
static const unsigned kCNumSubTables = 9;
// Every sub-table is indexed by one hexadecimal digit (4 bits) of the code
// point, hence 16 entries each.
static const unsigned kCSubTableSize = 16;
// Nibble-indexed lookup tables used by FindCharUnicodeRange().
//
// Layout:
//   [0]      indexed by the first hex digit of the code point (ch >> 12).
//   [1]-[6]  indexed by the second hex digit ((ch & 0x0f00) >> 8).
//   [7]-[8]  indexed by the third hex digit ((ch & 0x00f0) >> 4).
//
// Entry encoding, as interpreted by FindCharUnicodeRange():
//   * a value below kCRangeTableBase is the final range and is returned as-is;
//   * a value of kCRangeTableBase + N redirects the lookup to sub-table N,
//     indexed by the next hex digit of the code point;
//   * kCRangeTertiaryTable (second level only) redirects the lookup to
//     kGUnicodeTertiaryRangeTable.
//
// The position of every entry is significant: do not reorder entries.
static const unsigned char
kGUnicodeSubrangeTable[kCNumSubTables][kCSubTableSize] = {
{
// table for X---
// Sub-table 0: first-level dispatch on the leading hex digit.
kCRangeTableBase + 1, // u0xxx
kCRangeTableBase + 2, // u1xxx
kCRangeTableBase + 3, // u2xxx
kCRangeSetCJK, // u3xxx
kCRangeSetCJK, // u4xxx
kCRangeSetCJK, // u5xxx
kCRangeSetCJK, // u6xxx
kCRangeSetCJK, // u7xxx
kCRangeSetCJK, // u8xxx
kCRangeSetCJK, // u9xxx
kCRangeTableBase + 4, // uaxxx
kCRangeKorean, // ubxxx
kCRangeKorean, // ucxxx
kCRangeTableBase + 5, // udxxx
kCRangePrivate, // uexxx
kCRangeTableBase + 6 // ufxxx
},
{
// table for 0X--
// Sub-table 1: reached from u0xxx.
kCRangeSetLatin, // u00xx
kCRangeSetLatin, // u01xx
kCRangeSetLatin, // u02xx
kCRangeGreek, // u03xx XXX 0300-036f is in fact
// cRangeCombiningDiacriticalMarks
kCRangeCyrillic, // u04xx
kCRangeTableBase +
7, // u05xx, includes Cyrillic supplement, Hebrew, and Armenian
kCRangeArabic, // u06xx
kCRangeTertiaryTable, // u07xx
kCRangeUnassigned, // u08xx
kCRangeTertiaryTable, // u09xx
kCRangeTertiaryTable, // u0axx
kCRangeTertiaryTable, // u0bxx
kCRangeTertiaryTable, // u0cxx
kCRangeTertiaryTable, // u0dxx
kCRangeTertiaryTable, // u0exx
kCRangeTibetan, // u0fxx
},
{
// table for 1x--
// Sub-table 2: reached from u1xxx.
kCRangeTertiaryTable, // u10xx
kCRangeKorean, // u11xx
kCRangeEthiopic, // u12xx
kCRangeTertiaryTable, // u13xx
kCRangeCanadian, // u14xx
kCRangeCanadian, // u15xx
kCRangeTertiaryTable, // u16xx
kCRangeKhmer, // u17xx
kCRangeMongolian, // u18xx
kCRangeUnassigned, // u19xx
kCRangeUnassigned, // u1axx
kCRangeUnassigned, // u1bxx
kCRangeUnassigned, // u1cxx
kCRangeUnassigned, // u1dxx
kCRangeSetLatin, // u1exx
kCRangeGreek, // u1fxx
},
{
// table for 2x--
// Sub-table 3: reached from u2xxx.
kCRangeSetLatin, // u20xx
kCRangeSetLatin, // u21xx
kCRangeMathOperators, // u22xx
kCRangeMiscTechnical, // u23xx
kCRangeControlOpticalEnclose, // u24xx
kCRangeBoxBlockGeometrics, // u25xx
kCRangeMiscSymbols, // u26xx
kCRangeDingbats, // u27xx
kCRangeBraillePattern, // u28xx
kCRangeUnassigned, // u29xx
kCRangeUnassigned, // u2axx
kCRangeUnassigned, // u2bxx
kCRangeUnassigned, // u2cxx
kCRangeUnassigned, // u2dxx
kCRangeSetCJK, // u2exx
kCRangeSetCJK, // u2fxx
},
{
// table for ax--
// Sub-table 4: reached from uaxxx.
kCRangeYi, // ua0xx
kCRangeYi, // ua1xx
kCRangeYi, // ua2xx
kCRangeYi, // ua3xx
kCRangeYi, // ua4xx
kCRangeUnassigned, // ua5xx
kCRangeUnassigned, // ua6xx
kCRangeUnassigned, // ua7xx
kCRangeUnassigned, // ua8xx
kCRangeUnassigned, // ua9xx
kCRangeUnassigned, // uaaxx
kCRangeUnassigned, // uabxx
kCRangeKorean, // uacxx
kCRangeKorean, // uadxx
kCRangeKorean, // uaexx
kCRangeKorean, // uafxx
},
{
// table for dx--
// Sub-table 5: reached from udxxx.
kCRangeKorean, // ud0xx
kCRangeKorean, // ud1xx
kCRangeKorean, // ud2xx
kCRangeKorean, // ud3xx
kCRangeKorean, // ud4xx
kCRangeKorean, // ud5xx
kCRangeKorean, // ud6xx
kCRangeKorean, // ud7xx
kCRangeSurrogate, // ud8xx
kCRangeSurrogate, // ud9xx
kCRangeSurrogate, // udaxx
kCRangeSurrogate, // udbxx
kCRangeSurrogate, // udcxx
kCRangeSurrogate, // uddxx
kCRangeSurrogate, // udexx
kCRangeSurrogate, // udfxx
},
{
// table for fx--
// Sub-table 6: reached from ufxxx.
kCRangePrivate, // uf0xx
kCRangePrivate, // uf1xx
kCRangePrivate, // uf2xx
kCRangePrivate, // uf3xx
kCRangePrivate, // uf4xx
kCRangePrivate, // uf5xx
kCRangePrivate, // uf6xx
kCRangePrivate, // uf7xx
kCRangePrivate, // uf8xx
kCRangeSetCJK, // uf9xx
kCRangeSetCJK, // ufaxx
kCRangeArabic, // ufbxx, includes alphabetic presentation forms
kCRangeArabic, // ufcxx
kCRangeArabic, // ufdxx
kCRangeArabic, // ufexx, includes Combining half marks,
// CJK compatibility forms,
// small form variants
kCRangeTableBase +
8, // uffxx, halfwidth and fullwidth forms, includes Specials
},
{
// table for 0x0500 - 0x05ff
// Sub-table 7: third-level table, one entry per 16 code points.
kCRangeCyrillic, // u050x
kCRangeCyrillic, // u051x
kCRangeCyrillic, // u052x
kCRangeArmenian, // u053x
kCRangeArmenian, // u054x
kCRangeArmenian, // u055x
kCRangeArmenian, // u056x
kCRangeArmenian, // u057x
kCRangeArmenian, // u058x
kCRangeHebrew, // u059x
kCRangeHebrew, // u05ax
kCRangeHebrew, // u05bx
kCRangeHebrew, // u05cx
kCRangeHebrew, // u05dx
kCRangeHebrew, // u05ex
kCRangeHebrew, // u05fx
},
{
// table for 0xff00 - 0xffff
// Sub-table 8: third-level table, one entry per 16 code points.
kCRangeSetCJK, // uff0x, fullwidth latin
kCRangeSetCJK, // uff1x, fullwidth latin
kCRangeSetCJK, // uff2x, fullwidth latin
kCRangeSetCJK, // uff3x, fullwidth latin
kCRangeSetCJK, // uff4x, fullwidth latin
kCRangeSetCJK, // uff5x, fullwidth latin
kCRangeSetCJK, // uff6x, halfwidth katakana
kCRangeSetCJK, // uff7x, halfwidth katakana
kCRangeSetCJK, // uff8x, halfwidth katakana
kCRangeSetCJK, // uff9x, halfwidth katakana
kCRangeSetCJK, // uffax, halfwidth hangul jamo
kCRangeSetCJK, // uffbx, halfwidth hangul jamo
kCRangeSetCJK, // uffcx, halfwidth hangul jamo
kCRangeSetCJK, // uffdx, halfwidth hangul jamo
kCRangeSetCJK, // uffex, fullwidth symbols
kCRangeSpecials, // ufffx, Specials
},
};
// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
// code points, so the number of entries in the tertiary range table for that
// span is obtained by dividing (0x1700 - 0x0700) by 128, i.e. 32 entries.
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
// syllabaries take multiple chunks, and Ogham and Runic share a single chunk.
static const unsigned kCTertiaryTableSize = ((0x1700 - 0x0700) / 0x80);
// Indexed by (ch - 0x0700) >> 7 in FindCharUnicodeRange(); each entry covers
// the 0x80 code points starting at the value named in its trailing comment.
// Entries marked "place holder" belong to 256-code-point rows for which
// kGUnicodeSubrangeTable already holds a final range rather than
// kCRangeTertiaryTable; they only keep the indices of the other entries
// aligned. The position of every entry is significant: do not reorder.
static const unsigned char kGUnicodeTertiaryRangeTable[kCTertiaryTableSize] = {
// table for 0x0700 - 0x16ff, one entry per 0x80 code points
kCRangeSyriac, // u070x
kCRangeThaana, // u078x
kCRangeUnassigned, // u080x place holder(resolved in the 2ndary tab.)
kCRangeUnassigned, // u088x place holder(resolved in the 2ndary tab.)
kCRangeDevanagari, // u090x
kCRangeBengali, // u098x
kCRangeGurmukhi, // u0a0x
kCRangeGujarati, // u0a8x
kCRangeOriya, // u0b0x
kCRangeTamil, // u0b8x
kCRangeTelugu, // u0c0x
kCRangeKannada, // u0c8x
kCRangeMalayalam, // u0d0x
kCRangeSinhala, // u0d8x
kCRangeThai, // u0e0x
kCRangeLao, // u0e8x
kCRangeTibetan, // u0f0x place holder(resolved in the 2ndary tab.)
kCRangeTibetan, // u0f8x place holder(resolved in the 2ndary tab.)
kCRangeMyanmar, // u100x
kCRangeGeorgian, // u108x
kCRangeKorean, // u110x place holder(resolved in the 2ndary tab.)
kCRangeKorean, // u118x place holder(resolved in the 2ndary tab.)
kCRangeEthiopic, // u120x place holder(resolved in the 2ndary tab.)
kCRangeEthiopic, // u128x place holder(resolved in the 2ndary tab.)
kCRangeEthiopic, // u130x
kCRangeCherokee, // u138x
kCRangeCanadian, // u140x place holder(resolved in the 2ndary tab.)
kCRangeCanadian, // u148x place holder(resolved in the 2ndary tab.)
kCRangeCanadian, // u150x place holder(resolved in the 2ndary tab.)
kCRangeCanadian, // u158x place holder(resolved in the 2ndary tab.)
kCRangeCanadian, // u160x
kCRangeOghamRunic, // u168x this contains two scripts, Ogham & Runic
};
// Maps a code point to one of the kCRange* identifiers by walking the
// nibble-indexed lookup tables defined earlier in this file.
//
// A two level index is almost enough for locating a range, with the
// exception of u03xx and u05xx. Since we don't really care about the range of
// combining diacritical marks in our font application, u03xx is not
// discriminated further. Future adoption of this method for other uses
// should be aware of this limitation. The implementation can be extended if
// there is such a need.
// For Indic, Southeast Asian scripts and some other scripts between
// U+0700 and U+16FF, the lookup is extended to a third level.
//
// |ch| is the code point to classify. Returns 0 when |ch| is negative or is
// greater than or equal to 0xFFFF; otherwise returns the table entry for |ch|.
//
// NOTE(review): the bound check also rejects U+FFFF itself even though the
// uffxx sub-table has an entry for ufffx. Kept as-is to preserve behaviour;
// confirm whether that is intended.
unsigned FindCharUnicodeRange(UChar32 ch) {
  // Work on an unsigned copy: a negative |ch| becomes a very large value and
  // is rejected by the bound check below instead of being shifted into a
  // negative (out-of-bounds) table index.
  const unsigned code_point = static_cast<unsigned>(ch);
  if (code_point >= 0xFFFF)
    return 0;

  // First level: dispatch on the leading hex digit (bits 12-15).
  unsigned range = UNSAFE_TODO(kGUnicodeSubrangeTable[0][code_point >> 12]);
  if (range < kCRangeTableBase) {
    // The first digit alone identifies the range.
    return range;
  }

  // Second level: the entry names a sub-table, indexed by bits 8-11.
  range = UNSAFE_TODO(kGUnicodeSubrangeTable[range - kCRangeTableBase]
                                            [(code_point & 0x0f00) >> 8]);
  if (range < kCRangeTableBase)
    return range;

  // Third level inside kGUnicodeSubrangeTable, indexed by bits 4-7.
  if (range < kCRangeTertiaryTable) {
    return UNSAFE_TODO(kGUnicodeSubrangeTable[range - kCRangeTableBase]
                                             [(code_point & 0x00f0) >> 4]);
  }

  // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks.
  return UNSAFE_TODO(kGUnicodeTertiaryRangeTable[(code_point - 0x0700) >> 7]);
}
} // namespace blink
|