1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
|
"""Implements the UnicodeData class."""
import re
import unicodedata
from precis_i18n.codepointset import CodepointSet
# pylint: disable=no-self-use
def _version_to_float(version):
    """Convert a `major.minor.patch` version string to a `major.minor` float.

    Args:
        version (str): Version string such as "12.1.0".

    Returns:
        float: The leading `major.minor` part, e.g. 12.1. The patch component
            is required to be present but is discarded.

    Raises:
        ValueError: If `version` is not three dot-separated decimal numbers.
    """
    parsed = re.match(r"^([0-9]+\.[0-9]+)\.[0-9]+$", version)
    if parsed is None:
        raise ValueError("Unexpected unicode version format: %s" % version)
    major_minor = parsed.group(1)
    return float(major_minor)
class UnicodeData:
    """Wrap a `unicodedata`-compatible object with PRECIS-specific helpers.

    Besides forwarding the basic property lookups, this class answers the
    extra code point questions (scripts, joining types, ignorables, ...)
    that PRECIS profiles ask.

    Args:
        ucd (Union[module,object]): Implements `unicodedata` interface.
            The standard library `unicodedata` module is used when this
            argument is omitted (or otherwise falsy).
    """

    # Half-width / full-width forms that `width_map` considers.
    _halfwidth_chars = re.compile(r"[\uff01-\uffef]")
    # Space characters replaced by `map_nonascii_space_to_ascii`.
    _space_chars = re.compile(r"[\u00a0\u1680\u2000-\u200A\u202F\u205F\u3000]")

    def __init__(self, ucd=None):
        if not ucd:
            ucd = unicodedata
        self._ucd = ucd
        self._version = _version_to_float(self._ucd.unidata_version)

    @property
    def version(self):
        """float: `major.minor` part of the wrapped object's `unidata_version`."""
        return self._version

    # Thin pass-through wrappers around the wrapped unicodedata object.

    def category(self, char):
        """Return the wrapped object's `category(char)`."""
        return self._ucd.category(char)

    def combining(self, char):
        """Return the wrapped object's `combining(char)`."""
        return self._ucd.combining(char)

    def bidirectional(self, char):
        """Return the wrapped object's `bidirectional(char)`."""
        return self._ucd.bidirectional(char)

    def normalize(self, form, value):
        """Return the wrapped object's `normalize(form, value)`."""
        return self._ucd.normalize(form, value)

    def width_map(self, value):
        """Replace half-width/full-width characters by their NFKC equivalent.

        A character is only replaced when its NFKC form is exactly one
        character long; otherwise it is left as is.

        Args:
            value (str): Value to transform.

        Returns:
            str: Result.
        """

        def _to_compat(match):
            original = match.group(0)
            assert len(original) == 1
            mapped = self._ucd.normalize("NFKC", original)
            if len(mapped) != 1:
                return original
            return mapped

        return self._halfwidth_chars.sub(_to_compat, value)

    def map_nonascii_space_to_ascii(self, value):
        """Replace every non-ASCII white space {Zs} with an ASCII space.

        Args:
            value (str): Value to transform.

        Returns:
            str: Result.
        """
        ascii_space = " "
        return self._space_chars.sub(ascii_space, value)

    def default_ignorable(self, cp):
        """Return True if code point `cp` is in the default-ignorable table."""
        return cp in _DEFAULT_IGNORABLE

    def has_compat(self, cp):
        """Return True if NFKC normalization changes the character `chr(cp)`."""
        original = chr(cp)
        normalized = self.normalize("NFKC", original)
        assert normalized
        return normalized != original

    def control(self, cp):
        """Return True if `cp` is in U+0000..U+001F or U+007F..U+009F."""
        if 0x00 <= cp <= 0x1F:
            return True
        return 0x7F <= cp <= 0x9F

    def noncharacter(self, cp):
        """Return True if `cp` is a noncharacter.

        That is: the low 16 bits are FFFE or FFFF, or `cp` lies in
        U+FDD0..U+FDEF.
        """
        low16 = cp & 0x0000FFFF
        if 0xFFFE <= low16 <= 0xFFFF:
            return True
        return 0xFDD0 <= cp <= 0xFDEF

    def old_hangul_jamo(self, cp):
        """Return True if code point `cp` is in the Hangul jamo table."""
        return cp in _OLD_HANGUL_JAMO

    def greek_script(self, cp):
        """Return True if code point `cp` is in the Greek script table."""
        return cp in _GREEK_SCRIPT

    def hebrew_script(self, cp):
        """Return True if code point `cp` is in the Hebrew script table."""
        return cp in _HEBREW_SCRIPT

    def hiragana_katakana_han_script(self, cp):
        """Return True if `cp` is in the Hiragana/Katakana/Han table."""
        return cp in _HIRAGANA_KATAKANA_HAN

    def combining_virama(self, cp):
        """Return True if `chr(cp)` has canonical combining class 9."""
        virama_class = 9
        return self.combining(chr(cp)) == virama_class

    def arabic_indic(self, cp):
        """Return True if `cp` is in U+0660..U+0669."""
        return 0x0660 <= cp <= 0x0669

    def extended_arabic_indic(self, cp):
        """Return True if `cp` is in U+06F0..U+06F9."""
        return 0x06F0 <= cp <= 0x06F9

    def valid_jointype(self, value, offset):
        """Check the joining context around the joiner at `value[offset]`.

        The character at `offset` must be U+200C or U+200D. Looking
        backwards from it, the first non-transparent character must be
        left- or dual-joining; looking forwards, the first non-transparent
        character must be right- or dual-joining.

        Args:
            value (str): String containing the joiner.
            offset (int): Index of the joiner in `value`.

        Returns:
            bool: True if both sides satisfy the rule.
        """
        assert 0x200C <= ord(value[offset]) <= 0x200D
        preceding = reversed(value[:offset])
        if not self._scan_join(preceding, "L"):
            return False
        following = value[offset + 1 :]
        return self._scan_join(following, "R")

    def _scan_join(self, iterable, term):
        """Skip transparent characters and test the first other one.

        Returns True if that character has joining type `term` or "D".
        Returns False if it has any other type or if `iterable` runs out.
        """
        for char in iterable:
            join_type = self._join_type(ord(char))
            if join_type == term or join_type == "D":
                return True
            if join_type != "T":
                return False
        return False

    def _join_type(self, cp):
        """Return "D", "R", "L" or "T" for `cp`, or None if in no table."""
        # Checked in this order; the first table containing `cp` wins.
        tables = (
            (_JOINTYPE_DUAL_JOINING, "D"),
            (_JOINTYPE_RIGHT_JOINING, "R"),
            (_JOINTYPE_LEFT_JOINING, "L"),
            (_JOINTYPE_TRANSPARENT, "T"),
        )
        for table, label in tables:
            if cp in table:
                return label
        return None
# https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
# Derived Property: Default_Ignorable_Code_Point
# Lookup table behind UnicodeData.default_ignorable().
# Each line is a hex code point or a `first..last` range; the size assertion
# below is consistent with ranges being inclusive at both ends.
_DEFAULT_IGNORABLE = CodepointSet(
"""
00AD
034F
061C
115F..1160
17B4..17B5
180B..180D
180E
180F
200B..200F
202A..202E
2060..2064
2065
2066..206F
3164
FE00..FE0F
FEFF
FFA0
FFF0..FFF8
1BCA0..1BCA3
1D173..1D17A
E0000
E0001
E0002..E001F
E0020..E007F
E0080..E00FF
E0100..E01EF
E01F0..E0FFF
"""
)
# Import-time sanity check: pins the table size so an accidental edit to the
# data above is noticed (not enforced when Python runs with -O).
assert len(_DEFAULT_IGNORABLE) == 4174
# https://www.unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt
# Joining_Type=Dual_Joining
# Code points that UnicodeData._join_type() reports as "D".
# This table is consulted first, before the right/left/transparent tables.
_JOINTYPE_DUAL_JOINING = CodepointSet(
"""
0620
0626
0628
062A..062E
0633..063F
0641..0647
0649..064A
066E..066F
0678..0687
069A..06BF
06C1..06C2
06CC
06CE
06D0..06D1
06FA..06FC
06FF
0712..0714
071A..071D
071F..0727
0729
072B
072D..072E
074E..0758
075C..076A
076D..0770
0772
0775..0777
077A..077F
07CA..07EA
0841..0845
0848
084A..0853
0855
0860
0862..0865
0868
0886
0889..088D
088F
08A0..08A9
08AF..08B0
08B3..08B8
08BA..08C8
1807
1820..1842
1843
1844..1878
1887..18A8
18AA
A840..A871
10AC0..10AC4
10AD3..10AD6
10AD8..10ADC
10ADE..10AE0
10AEB..10AEE
10B80
10B82
10B86..10B88
10B8A..10B8B
10B8D
10B90
10BAD..10BAE
10D01..10D21
10D23
10EC3..10EC4
10EC6..10EC7
10F30..10F32
10F34..10F44
10F51..10F53
10F70..10F73
10F76..10F81
10FB0
10FB2..10FB3
10FB8
10FBB..10FBC
10FBE..10FBF
10FC1
10FC4
10FCA
1E900..1E943
"""
)
# Import-time sanity check on the table size (skipped under -O).
assert len(_JOINTYPE_DUAL_JOINING) == 615
# https://www.unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt
# Joining_Type=Right_Joining
# Code points that UnicodeData._join_type() reports as "R"
# (checked after the dual-joining table).
_JOINTYPE_RIGHT_JOINING = CodepointSet(
"""
0622..0625
0627
0629
062F..0632
0648
0671..0673
0675..0677
0688..0699
06C0
06C3..06CB
06CD
06CF
06D2..06D3
06D5
06EE..06EF
0710
0715..0719
071E
0728
072A
072C
072F
074D
0759..075B
076B..076C
0771
0773..0774
0778..0779
0840
0846..0847
0849
0854
0856..0858
0867
0869..086A
0870..0882
088E
08AA..08AC
08AE
08B1..08B2
08B9
10AC5
10AC7
10AC9..10ACA
10ACE..10AD2
10ADD
10AE1
10AE4
10AEF
10B81
10B83..10B85
10B89
10B8C
10B8E..10B8F
10B91
10BA9..10BAC
10D22
10EC2
10F33
10F54
10F74..10F75
10FB4..10FB6
10FB9..10FBA
10FBD
10FC2..10FC3
10FC9
"""
)
# Import-time sanity check on the table size (skipped under -O).
assert len(_JOINTYPE_RIGHT_JOINING) == 153
# https://www.unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt
# Joining_Type=Left_Joining
# Code points that UnicodeData._join_type() reports as "L"
# (checked after the dual- and right-joining tables).
_JOINTYPE_LEFT_JOINING = CodepointSet(
"""
A872
10ACD
10AD7
10D00
10FCB
"""
)
# Import-time sanity check: one entry per line above (skipped under -O).
assert len(_JOINTYPE_LEFT_JOINING) == 5
# https://www.unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt
# Joining_Type=Transparent
# Code points that UnicodeData._join_type() reports as "T". These are the
# characters UnicodeData._scan_join() skips over while looking for a joining
# neighbour. This table is consulted last, after the D/R/L tables.
_JOINTYPE_TRANSPARENT = CodepointSet(
"""
00AD
0300..036F
0483..0487
0488..0489
0591..05BD
05BF
05C1..05C2
05C4..05C5
05C7
0610..061A
061C
064B..065F
0670
06D6..06DC
06DF..06E4
06E7..06E8
06EA..06ED
070F
0711
0730..074A
07A6..07B0
07EB..07F3
07FD
0816..0819
081B..0823
0825..0827
0829..082D
0859..085B
0897..089F
08CA..08E1
08E3..0902
093A
093C
0941..0948
094D
0951..0957
0962..0963
0981
09BC
09C1..09C4
09CD
09E2..09E3
09FE
0A01..0A02
0A3C
0A41..0A42
0A47..0A48
0A4B..0A4D
0A51
0A70..0A71
0A75
0A81..0A82
0ABC
0AC1..0AC5
0AC7..0AC8
0ACD
0AE2..0AE3
0AFA..0AFF
0B01
0B3C
0B3F
0B41..0B44
0B4D
0B55..0B56
0B62..0B63
0B82
0BC0
0BCD
0C00
0C04
0C3C
0C3E..0C40
0C46..0C48
0C4A..0C4D
0C55..0C56
0C62..0C63
0C81
0CBC
0CBF
0CC6
0CCC..0CCD
0CE2..0CE3
0D00..0D01
0D3B..0D3C
0D41..0D44
0D4D
0D62..0D63
0D81
0DCA
0DD2..0DD4
0DD6
0E31
0E34..0E3A
0E47..0E4E
0EB1
0EB4..0EBC
0EC8..0ECE
0F18..0F19
0F35
0F37
0F39
0F71..0F7E
0F80..0F84
0F86..0F87
0F8D..0F97
0F99..0FBC
0FC6
102D..1030
1032..1037
1039..103A
103D..103E
1058..1059
105E..1060
1071..1074
1082
1085..1086
108D
109D
135D..135F
1712..1714
1732..1733
1752..1753
1772..1773
17B4..17B5
17B7..17BD
17C6
17C9..17D3
17DD
180B..180D
180F
1885..1886
18A9
1920..1922
1927..1928
1932
1939..193B
1A17..1A18
1A1B
1A56
1A58..1A5E
1A60
1A62
1A65..1A6C
1A73..1A7C
1A7F
1AB0..1ABD
1ABE
1ABF..1ADD
1AE0..1AEB
1B00..1B03
1B34
1B36..1B3A
1B3C
1B42
1B6B..1B73
1B80..1B81
1BA2..1BA5
1BA8..1BA9
1BAB..1BAD
1BE6
1BE8..1BE9
1BED
1BEF..1BF1
1C2C..1C33
1C36..1C37
1CD0..1CD2
1CD4..1CE0
1CE2..1CE8
1CED
1CF4
1CF8..1CF9
1DC0..1DFF
200B
200E..200F
202A..202E
2060..2064
206A..206F
20D0..20DC
20DD..20E0
20E1
20E2..20E4
20E5..20F0
2CEF..2CF1
2D7F
2DE0..2DFF
302A..302D
3099..309A
A66F
A670..A672
A674..A67D
A69E..A69F
A6F0..A6F1
A802
A806
A80B
A825..A826
A82C
A8C4..A8C5
A8E0..A8F1
A8FF
A926..A92D
A947..A951
A980..A982
A9B3
A9B6..A9B9
A9BC..A9BD
A9E5
AA29..AA2E
AA31..AA32
AA35..AA36
AA43
AA4C
AA7C
AAB0
AAB2..AAB4
AAB7..AAB8
AABE..AABF
AAC1
AAEC..AAED
AAF6
ABE5
ABE8
ABED
FB1E
FE00..FE0F
FE20..FE2F
FEFF
FFF9..FFFB
101FD
102E0
10376..1037A
10A01..10A03
10A05..10A06
10A0C..10A0F
10A38..10A3A
10A3F
10AE5..10AE6
10D24..10D27
10D69..10D6D
10EAB..10EAC
10EFA..10EFF
10F46..10F50
10F82..10F85
11001
11038..11046
11070
11073..11074
1107F..11081
110B3..110B6
110B9..110BA
110C2
11100..11102
11127..1112B
1112D..11134
11173
11180..11181
111B6..111BE
111C9..111CC
111CF
1122F..11231
11234
11236..11237
1123E
11241
112DF
112E3..112EA
11300..11301
1133B..1133C
11340
11366..1136C
11370..11374
113BB..113C0
113CE
113D0
113D2
113E1..113E2
11438..1143F
11442..11444
11446
1145E
114B3..114B8
114BA
114BF..114C0
114C2..114C3
115B2..115B5
115BC..115BD
115BF..115C0
115DC..115DD
11633..1163A
1163D
1163F..11640
116AB
116AD
116B0..116B5
116B7
1171D
1171F
11722..11725
11727..1172B
1182F..11837
11839..1183A
1193B..1193C
1193E
11943
119D4..119D7
119DA..119DB
119E0
11A01..11A0A
11A33..11A38
11A3B..11A3E
11A47
11A51..11A56
11A59..11A5B
11A8A..11A96
11A98..11A99
11B60
11B62..11B64
11B66
11C30..11C36
11C38..11C3D
11C3F
11C92..11CA7
11CAA..11CB0
11CB2..11CB3
11CB5..11CB6
11D31..11D36
11D3A
11D3C..11D3D
11D3F..11D45
11D47
11D90..11D91
11D95
11D97
11EF3..11EF4
11F00..11F01
11F36..11F3A
11F40
11F42
11F5A
13430..1343F
13440
13447..13455
1611E..16129
1612D..1612F
16AF0..16AF4
16B30..16B36
16F4F
16F8F..16F92
16FE4
1BC9D..1BC9E
1BCA0..1BCA3
1CF00..1CF2D
1CF30..1CF46
1D167..1D169
1D173..1D17A
1D17B..1D182
1D185..1D18B
1D1AA..1D1AD
1D242..1D244
1DA00..1DA36
1DA3B..1DA6C
1DA75
1DA84
1DA9B..1DA9F
1DAA1..1DAAF
1E000..1E006
1E008..1E018
1E01B..1E021
1E023..1E024
1E026..1E02A
1E08F
1E130..1E136
1E2AE
1E2EC..1E2EF
1E4EC..1E4EF
1E5EE..1E5EF
1E6E3
1E6E6
1E6EE..1E6EF
1E6F5
1E8D0..1E8D6
1E944..1E94A
1E94B
E0001
E0020..E007F
E0100..E01EF
"""
)
# Import-time sanity check on the table size (skipped under -O).
assert len(_JOINTYPE_TRANSPARENT) == 2224
# https://www.unicode.org/Public/UNIDATA/Scripts.txt
# Greek
# Lookup table behind UnicodeData.greek_script().
_GREEK_SCRIPT = CodepointSet(
"""
0370..0373
0375
0376..0377
037A
037B..037D
037F
0384
0386
0388..038A
038C
038E..03A1
03A3..03E1
03F0..03F5
03F6
03F7..03FF
1D26..1D2A
1D5D..1D61
1D66..1D6A
1DBF
1F00..1F15
1F18..1F1D
1F20..1F45
1F48..1F4D
1F50..1F57
1F59
1F5B
1F5D
1F5F..1F7D
1F80..1FB4
1FB6..1FBC
1FBD
1FBE
1FBF..1FC1
1FC2..1FC4
1FC6..1FCC
1FCD..1FCF
1FD0..1FD3
1FD6..1FDB
1FDD..1FDF
1FE0..1FEC
1FED..1FEF
1FF2..1FF4
1FF6..1FFC
1FFD..1FFE
2126
AB65
10140..10174
10175..10178
10179..10189
1018A..1018B
1018C..1018E
101A0
1D200..1D241
1D242..1D244
1D245
"""
)
# Import-time sanity check on the table size (skipped under -O).
assert len(_GREEK_SCRIPT) == 518
# https://www.unicode.org/Public/UNIDATA/Scripts.txt
# Hebrew
# Lookup table behind UnicodeData.hebrew_script().
_HEBREW_SCRIPT = CodepointSet(
"""
0591..05BD
05BE
05BF
05C0
05C1..05C2
05C3
05C4..05C5
05C6
05C7
05D0..05EA
05EF..05F2
05F3..05F4
FB1D
FB1E
FB1F..FB28
FB29
FB2A..FB36
FB38..FB3C
FB3E
FB40..FB41
FB43..FB44
FB46..FB4F
"""
)
# Import-time sanity check on the table size (skipped under -O).
assert len(_HEBREW_SCRIPT) == 134
# https://www.unicode.org/Public/UNIDATA/Scripts.txt
# Hiragana, Katakana, Han
# Lookup table behind UnicodeData.hiragana_katakana_han_script(); the three
# scripts are merged into one set.
# NOTE(review): the `# ...` lines inside the string are part of the literal
# passed to CodepointSet; presumably its parser skips them as comments.
# Confirm in CodepointSet.
_HIRAGANA_KATAKANA_HAN = CodepointSet(
"""
# Hiragana (381)
3041..3096
309D..309E
309F
1B001..1B11F
1B132
1B150..1B152
1F200
# Katakana (321)
30A1..30FA
30FD..30FE
30FF
31F0..31FF
32D0..32FE
3300..3357
FF66..FF6F
FF71..FF9D
1AFF0..1AFF3
1AFF5..1AFFB
1AFFD..1AFFE
1B000
1B120..1B122
1B155
1B164..1B167
# Han (103351)
2E80..2E99
2E9B..2EF3
2F00..2FD5
3005
3007
3021..3029
3038..303A
303B
3400..4DBF
4E00..9FFF
F900..FA6D
FA70..FAD9
16FE2
16FE3
16FF0..16FF1
16FF2..16FF3
16FF4..16FF6
20000..2A6DF
2A700..2B81D
2B820..2CEAD
2CEB0..2EBE0
2EBF0..2EE5D
2F800..2FA1D
30000..3134A
31350..33479
"""
)
# Import-time sanity check (skipped under -O). The three addends are the
# per-script counts noted in the section headings inside the table.
assert len(_HIRAGANA_KATAKANA_HAN) == (381 + 321 + 103351)
# https://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
# Leading_Jamo, Vowel_Jamo, Trailing_Jamo
# Lookup table behind UnicodeData.old_hangul_jamo(); leading, vowel and
# trailing jamo are merged into one set.
# NOTE(review): the `# ...` lines inside the string are part of the literal
# passed to CodepointSet; presumably its parser skips them as comments.
# Confirm in CodepointSet.
_OLD_HANGUL_JAMO = CodepointSet(
"""
# Leading_Jamo (125)
1100..115F
A960..A97C
# Vowel_Jamo (95)
1160..11A7
D7B0..D7C6
# Trailing_Jamo (137)
11A8..11FF
D7CB..D7FB
"""
)
# Import-time sanity check (skipped under -O). The three addends are the
# per-category counts noted in the section headings inside the table.
assert len(_OLD_HANGUL_JAMO) == (125 + 95 + 137)
|