1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
#!/usr/bin/env tclsh
# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
set map(combining) {}
set map(wide) {}
set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
set do_width 0
if {[lindex $argv 0] eq "-width"} {
set do_width 1
set argv [lrange $argv 1 end]
}
if {[llength $argv] ni {1 2}} {
puts stderr $USAGE
exit 1
}
lassign $argv unicodefile widthfile
set f [open $unicodefile]
while {[gets $f buf] >= 0} {
# Remove any trailing whitespace, especially errant CR
set buf [string trim $buf]
set title ""
set lower ""
set upper ""
lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
set codex [string tolower 0x$code]
if {[string match M* $class]} {
if {![info exists combining]} {
set combining $codex
}
continue
} elseif {[info exists combining]} {
lappend map(combining) $combining $codex
unset combining
}
if {$codex <= 0x7f} {
continue
}
if {$codex > 0xffff} {
break
}
if {![string match L* $class]} {
continue
}
if {$upper ne ""} {
lappend map(upper) $codex [string tolower 0x$upper]
}
if {$lower ne ""} {
lappend map(lower) $codex [string tolower 0x$lower]
}
if {$title ne "" && $title ne $upper} {
if {$title eq $code} {
set title 0
}
lappend map(title) $codex [string tolower 0x$title]
}
}
close $f
proc output-int-pairs {list} {
set n 0
foreach {v1 v2} $list {
puts -nonewline "\t{ $v1, $v2 },"
if {[incr n] % 4 == 0} {
puts ""
}
}
if {$n % 4} {
puts ""
}
}
# Merges adjacent ranges in a list of ranges (lower upper lower upper ...)
proc combine-adjacent-ranges {list} {
set newlist {}
foreach {lower upper} $list {
if {[info exists prev_upper]} {
if {$lower == $prev_upper + 1} {
# combine these
set prev_upper $upper
continue
} else {
# can't combine
lappend newlist $prev_lower $prev_upper
}
}
set prev_lower $lower
set prev_upper $upper
}
# Now add the last range
lappend newlist $prev_lower $prev_upper
return $newlist
}
foreach type {upper lower title} {
puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
output-int-pairs $map($type)
puts "\};\n"
}
if {$do_width} {
set f [open $widthfile]
while {[gets $f buf] >= 0} {
# Remove any trailing whitespace, especially errant CR
set buf [string trim $buf]
if {[regexp {^([0-9A-Fa-f.]+);W} $buf -> range]} {
set range [string tolower $range]
lassign [split $range .] lower - upper
if {$upper eq ""} {
set upper $lower
}
lappend map(wide) 0x$lower 0x$upper
}
}
close $f
}
foreach type {combining wide} {
puts "static const struct utf8range unicode_range_$type\[\] = \{"
if {$do_width} {
output-int-pairs [combine-adjacent-ranges $map($type)]
} else {
# Just produce empty width tables in this case
output-int-pairs {}
}
puts "\};\n"
}
|