1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
#--------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
# { 224 a } (replace codepoint 224 to "a")
# { 769 "" } (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
global tl_lookup_table
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
if { [llength $character_decomposition_mapping]!=2
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
} {
continue
}
set iCode [expr "0x$code"]
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
if {[info exists tl_lookup_table($iCode)]} continue
if { ($iAscii >= 97 && $iAscii <= 122)
|| ($iAscii >= 65 && $iAscii <= 90)
} {
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
set dia($iDia) 1
}
}
foreach d [array names dia] {
lappend lRet [list $d ""]
}
set lRet [lsort -integer -index 0 $lRet]
close $fd
set lRet
}
#-------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
set iCode [expr "0x$code"]
set bAlnum [expr {
[lsearch {L N} [string range $general_category 0 0]] >= 0
|| $general_category=="Co"
}]
if { !$bAlnum } { lappend lRet $iCode }
}
close $fd
set lRet
}
proc tl_load_casefolding_txt {zName} {
global tl_lookup_table
set fd [open $zName]
while { ![eof $fd] } {
set line [gets $fd]
if {[string range $line 0 0] == "#"} continue
if {$line == ""} continue
foreach x {a b c d} {unset -nocomplain $x}
foreach {a b c d} [split $line ";"] {}
set a2 [list]
set c2 [list]
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
set b [string trim $b]
set d [string trim $d]
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
}
}
|