File: unicode_property.rb

package info (click to toggle)
ruby-regexp-parser 2.11.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,092 kB
  • sloc: ruby: 6,891; makefile: 6; sh: 3
file content (766 lines) | stat: -rw-r--r-- 18,395 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
# frozen_string_literal: true

module Regexp::Syntax
  module Token
    module UnicodeProperty
      # Local helper used while this module body is evaluated: looks up every
      # constant of the current module whose name matches +name+ (a Symbol or
      # Regexp, interpolated into a Regexp) and concatenates their values into
      # one new Array. Only constants that already exist at call time are seen.
      all = proc do |name|
        name_pattern = /#{name}/
        constants.grep(name_pattern).flat_map { |const_name| const_get(const_name) }
      end

      # Character-type property names. The V<x>_<y>_<z> suffix is the version
      # tag that the per-version lists defined later in this module match on.
      CharType_V1_9_0 = %i[
        alnum alpha ascii blank cntrl digit graph
        lower print punct space upper word xdigit
      ].freeze

      CharType_V2_5_0 = [:xposixpunct].freeze

      # Names without a version tag; V1_9_0 adds this list explicitly.
      POSIX = %i[
        any
        assigned
        newline
      ].freeze

      # Category property names, one frozen list per kind of category.
      module Category
        Letter = %i[
          letter uppercase_letter lowercase_letter
          titlecase_letter modifier_letter other_letter
        ].freeze

        Mark = %i[
          mark nonspacing_mark spacing_mark enclosing_mark
        ].freeze

        Number = %i[
          number decimal_number letter_number other_number
        ].freeze

        Punctuation = %i[
          punctuation connector_punctuation dash_punctuation
          open_punctuation close_punctuation initial_punctuation
          final_punctuation other_punctuation
        ].freeze

        Symbol = %i[
          symbol math_symbol currency_symbol
          modifier_symbol other_symbol
        ].freeze

        Separator = %i[
          separator space_separator line_separator paragraph_separator
        ].freeze

        Codepoint = %i[
          other control format
          surrogate private_use unassigned
        ].freeze

        # All category names, concatenated in the order the lists are declared
        # above. The result is a new Array and is not frozen.
        All = [
          Letter, Mark, Number, Punctuation, Symbol, Separator, Codepoint
        ].reduce(:+)
      end

      # Age property names ("age=<version>"), split into lists by the version
      # tag in the constant name (presumably the Ruby version that first
      # accepted them — confirm against the version syntax files).
      # Every list is frozen so the shared constants cannot be mutated in place.
      Age_V1_9_3 = %i[age=1.1 age=2.0 age=2.1 age=3.0 age=3.1
                      age=3.2 age=4.0 age=4.1 age=5.0 age=5.1
                      age=5.2 age=6.0].freeze

      Age_V2_0_0 = %i[age=6.1].freeze

      Age_V2_2_0 = %i[age=6.2 age=6.3 age=7.0].freeze

      Age_V2_3_0 = %i[age=8.0].freeze

      Age_V2_4_0 = %i[age=9.0].freeze

      Age_V2_5_0 = %i[age=10.0].freeze

      Age_V2_6_0 = %i[age=11.0].freeze

      Age_V2_6_2 = %i[age=12.0].freeze

      Age_V2_6_3 = %i[age=12.1].freeze

      Age_V3_1_0 = %i[age=13.0].freeze

      Age_V3_2_0 = %i[age=14.0 age=15.0].freeze

      Age_V3_5_0 = %i[age=15.1].freeze

      # Every age property name: the values of all constants whose name contains
      # "Age_V" and that exist at this point, concatenated into a new Array.
      Age = all[:Age_V]

      # Derived property names, split into lists by the version tag in the
      # constant name (presumably the Ruby version that first accepted them —
      # confirm against the version syntax files). The tag is also what the
      # per-version lists (V1_9_0, V2_0_0, ...) later in this module match on.
      Derived_V1_9_0 = %i[
        ascii_hex_digit
        alphabetic
        cased
        changes_when_casefolded
        changes_when_casemapped
        changes_when_lowercased
        changes_when_titlecased
        changes_when_uppercased
        case_ignorable
        bidi_control
        dash
        deprecated
        default_ignorable_code_point
        diacritic
        extender
        grapheme_base
        grapheme_extend
        grapheme_link
        hex_digit
        hyphen
        id_continue
        ideographic
        id_start
        ids_binary_operator
        ids_trinary_operator
        join_control
        logical_order_exception
        lowercase
        math
        noncharacter_code_point
        other_alphabetic
        other_default_ignorable_code_point
        other_grapheme_extend
        other_id_continue
        other_id_start
        other_lowercase
        other_math
        other_uppercase
        pattern_syntax
        pattern_white_space
        quotation_mark
        radical
        sentence_terminal
        soft_dotted
        terminal_punctuation
        unified_ideograph
        uppercase
        variation_selector
        white_space
        xid_start
        xid_continue
      ].freeze

      Derived_V2_0_0 = %i[
        cased_letter
        combining_mark
      ].freeze

      Derived_V2_4_0 = %i[
        prepended_concatenation_mark
      ].freeze

      Derived_V2_5_0 = %i[
        regional_indicator
      ].freeze

      # Derived property names carrying the V3_5_0 version tag. Frozen like the
      # other Derived_V* lists so the constant cannot be mutated in place.
      Derived_V3_5_0 = %i[
        id_compat_math_continue
        id_compat_math_start
        ids_unary_operator
      ].freeze

      # Every derived property name: the values of all constants whose name
      # contains "Derived_V" and that exist at this point, as one new Array.
      Derived = all[:Derived_V]

      # Script property names, split into lists by the version tag in the
      # constant name (presumably the Ruby version that first accepted them —
      # confirm against the version syntax files). The tag is what the
      # per-version lists (V1_9_0, V1_9_3, ...) later in this module match on,
      # so renaming a constant changes which version list its entries land in.
      Script_V1_9_0 = %i[
        arabic
        imperial_aramaic
        armenian
        avestan
        balinese
        bamum
        bengali
        bopomofo
        braille
        buginese
        buhid
        canadian_aboriginal
        carian
        cham
        cherokee
        coptic
        cypriot
        cyrillic
        devanagari
        deseret
        egyptian_hieroglyphs
        ethiopic
        georgian
        glagolitic
        gothic
        greek
        gujarati
        gurmukhi
        hangul
        han
        hanunoo
        hebrew
        hiragana
        old_italic
        javanese
        kayah_li
        katakana
        kharoshthi
        khmer
        kannada
        kaithi
        tai_tham
        lao
        latin
        lepcha
        limbu
        linear_b
        lisu
        lycian
        lydian
        malayalam
        mongolian
        meetei_mayek
        myanmar
        nko
        ogham
        ol_chiki
        old_turkic
        oriya
        osmanya
        phags_pa
        inscriptional_pahlavi
        phoenician
        inscriptional_parthian
        rejang
        runic
        samaritan
        old_south_arabian
        saurashtra
        shavian
        sinhala
        sundanese
        syloti_nagri
        syriac
        tagbanwa
        tai_le
        new_tai_lue
        tamil
        tai_viet
        telugu
        tifinagh
        tagalog
        thaana
        thai
        tibetan
        ugaritic
        vai
        old_persian
        cuneiform
        yi
        inherited
        common
        unknown
      ].freeze

      Script_V1_9_3 = %i[
        brahmi
        batak
        mandaic
      ].freeze

      Script_V2_0_0 = %i[
        chakma
        meroitic_cursive
        meroitic_hieroglyphs
        miao
        sharada
        sora_sompeng
        takri
      ].freeze

      Script_V2_2_0 = %i[
        caucasian_albanian
        bassa_vah
        duployan
        elbasan
        grantha
        pahawh_hmong
        khojki
        linear_a
        mahajani
        manichaean
        mende_kikakui
        modi
        mro
        old_north_arabian
        nabataean
        palmyrene
        pau_cin_hau
        old_permic
        psalter_pahlavi
        siddham
        khudawadi
        tirhuta
        warang_citi
      ].freeze

      Script_V2_3_0 = %i[
        ahom
        anatolian_hieroglyphs
        hatran
        multani
        old_hungarian
        signwriting
      ].freeze

      Script_V2_4_0 = %i[
        adlam
        bhaiksuki
        marchen
        newa
        osage
        tangut
      ].freeze

      Script_V2_5_0 = %i[
        masaram_gondi
        nushu
        soyombo
        zanabazar_square
      ].freeze

      Script_V2_6_0 = %i[
        dogra
        gunjala_gondi
        hanifi_rohingya
        makasar
        medefaidrin
        old_sogdian
        sogdian
      ].freeze

      Script_V2_6_2 = %i[
        elymaic
        nandinagari
        nyiakeng_puachue_hmong
        wancho
      ].freeze

      Script_V3_1_0 = %i[
        chorasmian
        dives_akuru
        khitan_small_script
        yezidi
      ].freeze

      Script_V3_2_0 = %i[
        cypro_minoan
        kawi
        nag_mundari
        old_uyghur
        tangsa
        toto
        vithkuqi
      ].freeze

      # Every script name: the values of all constants whose name contains
      # "Script_V" and that exist at this point, concatenated into a new Array.
      Script = all[:Script_V]

      # Block property names (all spelled with an "in_" prefix), split into
      # lists by the version tag in the constant name (presumably the Ruby
      # version that first accepted them — confirm against the version syntax
      # files). The tag is what the per-version lists (V1_9_0, V2_0_0, ...)
      # later in this module match on.
      UnicodeBlock_V1_9_0 = %i[
        in_alphabetic_presentation_forms
        in_arabic
        in_armenian
        in_arrows
        in_basic_latin
        in_bengali
        in_block_elements
        in_bopomofo_extended
        in_bopomofo
        in_box_drawing
        in_braille_patterns
        in_buhid
        in_cjk_compatibility_forms
        in_cjk_compatibility_ideographs
        in_cjk_compatibility
        in_cjk_radicals_supplement
        in_cjk_symbols_and_punctuation
        in_cjk_unified_ideographs_extension_a
        in_cjk_unified_ideographs
        in_cherokee
        in_combining_diacritical_marks_for_symbols
        in_combining_diacritical_marks
        in_combining_half_marks
        in_control_pictures
        in_currency_symbols
        in_cyrillic_supplement
        in_cyrillic
        in_devanagari
        in_dingbats
        in_enclosed_alphanumerics
        in_enclosed_cjk_letters_and_months
        in_ethiopic
        in_general_punctuation
        in_geometric_shapes
        in_georgian
        in_greek_extended
        in_greek_and_coptic
        in_gujarati
        in_gurmukhi
        in_halfwidth_and_fullwidth_forms
        in_hangul_compatibility_jamo
        in_hangul_jamo
        in_hangul_syllables
        in_hanunoo
        in_hebrew
        in_high_private_use_surrogates
        in_high_surrogates
        in_hiragana
        in_ipa_extensions
        in_ideographic_description_characters
        in_kanbun
        in_kangxi_radicals
        in_kannada
        in_katakana_phonetic_extensions
        in_katakana
        in_khmer_symbols
        in_khmer
        in_lao
        in_latin_extended_additional
        in_letterlike_symbols
        in_limbu
        in_low_surrogates
        in_malayalam
        in_mathematical_operators
        in_miscellaneous_symbols_and_arrows
        in_miscellaneous_symbols
        in_miscellaneous_technical
        in_mongolian
        in_myanmar
        in_number_forms
        in_ogham
        in_optical_character_recognition
        in_oriya
        in_phonetic_extensions
        in_private_use_area
        in_runic
        in_sinhala
        in_small_form_variants
        in_spacing_modifier_letters
        in_specials
        in_superscripts_and_subscripts
        in_supplemental_mathematical_operators
        in_syriac
        in_tagalog
        in_tagbanwa
        in_tai_le
        in_tamil
        in_telugu
        in_thaana
        in_thai
        in_tibetan
        in_unified_canadian_aboriginal_syllabics
        in_variation_selectors
        in_yi_radicals
        in_yi_syllables
        in_yijing_hexagram_symbols
      ].freeze

      UnicodeBlock_V2_0_0 = %i[
        in_aegean_numbers
        in_alchemical_symbols
        in_ancient_greek_musical_notation
        in_ancient_greek_numbers
        in_ancient_symbols
        in_arabic_extended_a
        in_arabic_mathematical_alphabetic_symbols
        in_arabic_presentation_forms_a
        in_arabic_presentation_forms_b
        in_arabic_supplement
        in_avestan
        in_balinese
        in_bamum
        in_bamum_supplement
        in_batak
        in_brahmi
        in_buginese
        in_byzantine_musical_symbols
        in_cjk_compatibility_ideographs_supplement
        in_cjk_strokes
        in_cjk_unified_ideographs_extension_b
        in_cjk_unified_ideographs_extension_c
        in_cjk_unified_ideographs_extension_d
        in_carian
        in_chakma
        in_cham
        in_combining_diacritical_marks_supplement
        in_common_indic_number_forms
        in_coptic
        in_counting_rod_numerals
        in_cuneiform
        in_cuneiform_numbers_and_punctuation
        in_cypriot_syllabary
        in_cyrillic_extended_a
        in_cyrillic_extended_b
        in_deseret
        in_devanagari_extended
        in_domino_tiles
        in_egyptian_hieroglyphs
        in_emoticons
        in_enclosed_alphanumeric_supplement
        in_enclosed_ideographic_supplement
        in_ethiopic_extended
        in_ethiopic_extended_a
        in_ethiopic_supplement
        in_georgian_supplement
        in_glagolitic
        in_gothic
        in_hangul_jamo_extended_a
        in_hangul_jamo_extended_b
        in_imperial_aramaic
        in_inscriptional_pahlavi
        in_inscriptional_parthian
        in_javanese
        in_kaithi
        in_kana_supplement
        in_kayah_li
        in_kharoshthi
        in_latin_1_supplement
        in_latin_extended_a
        in_latin_extended_b
        in_latin_extended_c
        in_latin_extended_d
        in_lepcha
        in_linear_b_ideograms
        in_linear_b_syllabary
        in_lisu
        in_lycian
        in_lydian
        in_mahjong_tiles
        in_mandaic
        in_mathematical_alphanumeric_symbols
        in_meetei_mayek
        in_meetei_mayek_extensions
        in_meroitic_cursive
        in_meroitic_hieroglyphs
        in_miao
        in_miscellaneous_mathematical_symbols_a
        in_miscellaneous_mathematical_symbols_b
        in_miscellaneous_symbols_and_pictographs
        in_modifier_tone_letters
        in_musical_symbols
        in_myanmar_extended_a
        in_nko
        in_new_tai_lue
        in_no_block
        in_ol_chiki
        in_old_italic
        in_old_persian
        in_old_south_arabian
        in_old_turkic
        in_osmanya
        in_phags_pa
        in_phaistos_disc
        in_phoenician
        in_phonetic_extensions_supplement
        in_playing_cards
        in_rejang
        in_rumi_numeral_symbols
        in_samaritan
        in_saurashtra
        in_sharada
        in_shavian
        in_sora_sompeng
        in_sundanese
        in_sundanese_supplement
        in_supplemental_arrows_a
        in_supplemental_arrows_b
        in_supplemental_punctuation
        in_supplementary_private_use_area_a
        in_supplementary_private_use_area_b
        in_syloti_nagri
        in_tags
        in_tai_tham
        in_tai_viet
        in_tai_xuan_jing_symbols
        in_takri
        in_tifinagh
        in_transport_and_map_symbols
        in_ugaritic
        in_unified_canadian_aboriginal_syllabics_extended
        in_vai
        in_variation_selectors_supplement
        in_vedic_extensions
        in_vertical_forms
      ].freeze

      UnicodeBlock_V2_2_0 = %i[
        in_bassa_vah
        in_caucasian_albanian
        in_combining_diacritical_marks_extended
        in_coptic_epact_numbers
        in_duployan
        in_elbasan
        in_geometric_shapes_extended
        in_grantha
        in_khojki
        in_khudawadi
        in_latin_extended_e
        in_linear_a
        in_mahajani
        in_manichaean
        in_mende_kikakui
        in_modi
        in_mro
        in_myanmar_extended_b
        in_nabataean
        in_old_north_arabian
        in_old_permic
        in_ornamental_dingbats
        in_pahawh_hmong
        in_palmyrene
        in_pau_cin_hau
        in_psalter_pahlavi
        in_shorthand_format_controls
        in_siddham
        in_sinhala_archaic_numbers
        in_supplemental_arrows_c
        in_tirhuta
        in_warang_citi
      ].freeze

      UnicodeBlock_V2_3_0 = %i[
        in_ahom
        in_anatolian_hieroglyphs
        in_cjk_unified_ideographs_extension_e
        in_cherokee_supplement
        in_early_dynastic_cuneiform
        in_hatran
        in_multani
        in_old_hungarian
        in_supplemental_symbols_and_pictographs
        in_sutton_signwriting
      ].freeze

      UnicodeBlock_V2_4_0 = %i[
        in_adlam
        in_bhaiksuki
        in_cyrillic_extended_c
        in_glagolitic_supplement
        in_ideographic_symbols_and_punctuation
        in_marchen
        in_mongolian_supplement
        in_newa
        in_osage
        in_tangut
        in_tangut_components
      ].freeze

      UnicodeBlock_V2_5_0 = %i[
        in_cjk_unified_ideographs_extension_f
        in_kana_extended_a
        in_masaram_gondi
        in_nushu
        in_soyombo
        in_syriac_supplement
        in_zanabazar_square
      ].freeze

      UnicodeBlock_V2_6_0 = %i[
        in_chess_symbols
        in_dogra
        in_georgian_extended
        in_gunjala_gondi
        in_hanifi_rohingya
        in_indic_siyaq_numbers
        in_makasar
        in_mayan_numerals
        in_medefaidrin
        in_old_sogdian
        in_sogdian
      ].freeze

      UnicodeBlock_V2_6_2 = %i[
        in_egyptian_hieroglyph_format_controls
        in_elymaic
        in_nandinagari
        in_nyiakeng_puachue_hmong
        in_ottoman_siyaq_numbers
        in_small_kana_extension
        in_symbols_and_pictographs_extended_a
        in_tamil_supplement
        in_wancho
      ].freeze

      UnicodeBlock_V3_1_0 = %i[
        in_chorasmian
        in_cjk_unified_ideographs_extension_g
        in_dives_akuru
        in_khitan_small_script
        in_lisu_supplement
        in_symbols_for_legacy_computing
        in_tangut_supplement
        in_yezidi
      ].freeze

      UnicodeBlock_V3_2_0 = %i[
        in_arabic_extended_b
        in_arabic_extended_c
        in_cjk_unified_ideographs_extension_h
        in_cypro_minoan
        in_cyrillic_extended_d
        in_devanagari_extended_a
        in_ethiopic_extended_b
        in_kaktovik_numerals
        in_kana_extended_b
        in_kawi
        in_latin_extended_f
        in_latin_extended_g
        in_nag_mundari
        in_old_uyghur
        in_tangsa
        in_toto
        in_unified_canadian_aboriginal_syllabics_extended_a
        in_vithkuqi
        in_znamenny_musical_notation
      ].freeze

      # Block property names carrying the V3_5_0 version tag. Frozen like the
      # other UnicodeBlock_V* lists so the constant cannot be mutated in place.
      UnicodeBlock_V3_5_0 = %i[
        in_cjk_unified_ideographs_extension_i
      ].freeze

      # Every block property name: the values of all constants whose name
      # contains "UnicodeBlock_V" and that exist at this point, as one new Array.
      UnicodeBlock = all[:UnicodeBlock_V]

      # Emoji property names, split by the version tag in the constant name.
      Emoji_V2_5_0 = %i[emoji emoji_component emoji_modifier
                        emoji_modifier_base emoji_presentation].freeze

      Emoji_V2_6_0 = %i[extended_pictographic].freeze

      # Enumerated property names in "grapheme_cluster_break=<value>" form,
      # built from the list of values; the result is a frozen Array of Symbols.
      Enumerated_V2_4_0 = %w[
        control cr extend l lf lv lvt prepend
        regional_indicator spacingmark t v zwj
      ].map { |value| :"grapheme_cluster_break=#{value}" }.freeze

      # Family-wide lists, gathered by matching constant names defined so far.
      Enumerated = all[:Enumerated_V]

      Emoji = all[:Emoji_V]

      # Per-version lists: each gathers the values of every constant whose name
      # contains the version tag (e.g. Age_V1_9_3 and Script_V1_9_3 for V1_9_3).
      # Statement order matters here: `all` only sees constants that already
      # exist when it is called. V1_9_0 additionally includes Category::All and
      # POSIX, whose names carry no version tag.
      V1_9_0 = Category::All + POSIX + all[:V1_9_0]
      V1_9_3 = all[:V1_9_3]
      V2_0_0 = all[:V2_0_0]
      V2_2_0 = all[:V2_2_0]
      V2_3_0 = all[:V2_3_0]
      V2_4_0 = all[:V2_4_0]
      V2_5_0 = all[:V2_5_0]
      V2_6_0 = all[:V2_6_0]
      V2_6_2 = all[:V2_6_2]
      V2_6_3 = all[:V2_6_3]
      V3_1_0 = all[:V3_1_0]
      V3_2_0 = all[:V3_2_0]
      V3_5_0 = all[:V3_5_0]

      # Concatenation of the per-version lists above. The anchored pattern only
      # matches names that are exactly a version tag, so family-prefixed
      # constants (e.g. Script_V1_9_0) are not matched directly.
      All = all[/^V\d+_\d+_\d+$/]

      # Token type symbols; used as keys into Map right after this module.
      Type = :property
      NonType = :nonproperty
    end

    # Register the complete property list under both token type keys; the two
    # entries reference the same Array object. Map is defined outside this
    # file (presumably a Hash keyed by token type — confirm in token.rb).
    Map[UnicodeProperty::Type] = UnicodeProperty::All
    Map[UnicodeProperty::NonType] = UnicodeProperty::All

    # alias for symmetry between token symbol and Token module name
    Property = UnicodeProperty
  end
end