1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
|
# ucm2tests.tcl
#
# Parses given ucm files (from ICU) to generate test data
# for encodings.
#
# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH?
#
namespace eval ucm {
# No means to change these currently but ...
variable outputPath
variable outputChan
variable errorChan stderr
variable verbose 0
# Map Tcl encoding name to ICU UCM file name
variable encNameMap
array set encNameMap {
cp1250 glibc-CP1250-2.1.2
cp1251 glibc-CP1251-2.1.2
cp1252 glibc-CP1252-2.1.2
cp1253 glibc-CP1253-2.1.2
cp1254 glibc-CP1254-2.1.2
cp1255 glibc-CP1255-2.1.2
cp1256 glibc-CP1256-2.1.2
cp1257 glibc-CP1257-2.1.2
cp1258 glibc-CP1258-2.1.2
gb1988 glibc-GB_1988_80-2.3.3
iso8859-1 glibc-ISO_8859_1-2.1.2
iso8859-2 glibc-ISO_8859_2-2.1.2
iso8859-3 glibc-ISO_8859_3-2.1.2
iso8859-4 glibc-ISO_8859_4-2.1.2
iso8859-5 glibc-ISO_8859_5-2.1.2
iso8859-6 glibc-ISO_8859_6-2.1.2
iso8859-7 glibc-ISO_8859_7-2.3.3
iso8859-8 glibc-ISO_8859_8-2.3.3
iso8859-9 glibc-ISO_8859_9-2.1.2
iso8859-10 glibc-ISO_8859_10-2.1.2
iso8859-11 glibc-ISO_8859_11-2.1.2
iso8859-13 glibc-ISO_8859_13-2.3.3
iso8859-14 glibc-ISO_8859_14-2.1.2
iso8859-15 glibc-ISO_8859_15-2.1.2
iso8859-16 glibc-ISO_8859_16-2.3.3
}
# Array keyed by Tcl encoding name. Each element contains mapping of
# Unicode code point -> byte sequence for that encoding as a flat list
# (or dictionary). Both are stored as hex strings
variable charMap
# Array keyed by Tcl encoding name. List of invalid code sequences
# each being a hex string.
variable invalidCodeSequences
# Array keyed by Tcl encoding name. List of unicode code points that are
# not mapped, each being a hex string.
variable unmappedCodePoints
# The fallback character per encoding
variable encSubchar
}
proc ucm::abort {msg} {
variable errorChan
puts $errorChan $msg
exit 1
}
proc ucm::warn {msg} {
variable errorChan
puts $errorChan $msg
}
proc ucm::log {msg} {
variable verbose
if {$verbose} {
variable errorChan
puts $errorChan $msg
}
}
proc ucm::print {s} {
variable outputChan
puts $outputChan $s
}
proc ucm::parse_SBCS {encName fd} {
variable charMap
variable invalidCodeSequences
variable unmappedCodePoints
set result {}
while {[gets $fd line] >= 0} {
if {[string match #* $line]} {
continue
}
if {[string equal "END CHARMAP" [string trim $line]]} {
break
}
if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} {
error "Unexpected line parsing SBCS: $line"
}
set bytes [string map {\\x {}} $bytes]; # \xNN -> NN
if {$precision eq "" || $precision eq "0"} {
lappend result $unichar $bytes
} else {
# It is a fallback mapping - ignore
}
}
set charMap($encName) $result
# Find out invalid code sequences and unicode code points that are not mapped
set valid {}
set mapped {}
foreach {unich bytes} $result {
lappend mapped $unich
lappend valid $bytes
}
set invalidCodeSequences($encName) {}
for {set i 0} {$i <= 255} {incr i} {
set hex [format %.2X $i]
if {[lsearch -exact $valid $hex] < 0} {
lappend invalidCodeSequences($encName) $hex
}
}
set unmappedCodePoints($encName) {}
for {set i 0} {$i <= 65535} {incr i} {
set hex [format %.4X $i]
if {[lsearch -exact $mapped $hex] < 0} {
lappend unmappedCodePoints($encName) $hex
# Only look for (at most) one below 256 and one above 1024
if {$i < 255} {
# Found one so jump past 8 bits
set i 255
} else {
break
}
}
if {$i == 255} {
set i 1023
}
}
lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF
}
proc ucm::generate_boilerplate {} {
# Common procedures
print {
# This file is automatically generated by ucm2tests.tcl.
# Edits will be overwritten on next generation.
#
# Generates tests comparing Tcl encodings to ICU.
# The generated file is NOT standalone. It should be sourced into a test script.
proc ucmConvertfromMismatches {enc map} {
set mismatches {}
foreach {unihex hex} $map {
set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
set unich [subst "\\U$unihex"]
if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} {
lappend mismatches "<[printable $unich],$hex>"
}
}
return $mismatches
}
proc ucmConverttoMismatches {enc map} {
set mismatches {}
foreach {unihex hex} $map {
set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
set unich [subst "\\U$unihex"]
if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} {
lappend mismatches "<[printable $unich],$hex>"
}
}
return $mismatches
}
if {[info commands printable] eq ""} {
proc printable {s} {
set print ""
foreach c [split $s ""] {
set i [scan $c %c]
if {[string is print $c] && ($i <= 127)} {
append print $c
} elseif {$i <= 0xff} {
append print \\x[format %02X $i]
} elseif {$i <= 0xffff} {
append print \\u[format %04X $i]
} else {
append print \\U[format %08X $i]
}
}
return $print
}
}
}
} ; # generate_boilerplate
proc ucm::generate_tests {} {
variable encNameMap
variable charMap
variable invalidCodeSequences
variable unmappedCodePoints
variable outputPath
variable outputChan
variable encSubchar
if {[info exists outputPath]} {
set outputChan [open $outputPath w]
fconfigure $outputChan -translation lf
} else {
set outputChan stdout
}
array set tclNames {}
foreach encName [encoding names] {
set tclNames($encName) ""
}
generate_boilerplate
foreach encName [lsort -dictionary [array names encNameMap]] {
if {![info exists charMap($encName)]} {
warn "No character map read for $encName"
continue
}
unset tclNames($encName)
# Print the valid tests
print "\n#\n# $encName (generated from $encNameMap($encName))"
print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{"
print " ucmConvertfromMismatches $encName {$charMap($encName)}"
print "\} -result {}"
print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{"
print " ucmConverttoMismatches $encName {$charMap($encName)}"
print "\} -result {}"
if {0} {
# This will generate individual tests for every char
# and test in lead, tail, middle, solo configurations
# but takes considerable time
print "lappend encValidStrings \{*\}\{"
foreach {unich hex} $charMap($encName) {
print " $encName \\u$unich $hex {} {}"
}
print "\}; # $encName"
}
# Generate the invalidity checks
print "\n# $encName - invalid byte sequences"
print "lappend encInvalidBytes \{*\}\{"
foreach hex $invalidCodeSequences($encName) {
# Map XXXX... to \xXX\xXX...
set uhex [regsub -all .. $hex {\\x\0}]
set uhex \\U[string range 00000000$hex end-7 end]
print " $encName $hex tcl8 $uhex -1 {} {}"
print " $encName $hex replace \\uFFFD -1 {} {}"
print " $encName $hex strict {} 0 {} {}"
}
print "\}; # $encName"
print "\n# $encName - invalid byte sequences"
print "lappend encUnencodableStrings \{*\}\{"
if {[info exists encSubchar($encName)]} {
set subchar $encSubchar($encName)
} else {
set subchar "3F"; # Tcl uses ? by default
}
foreach hex $unmappedCodePoints($encName) {
set uhex \\U[string range 00000000$hex end-7 end]
print " $encName $uhex tcl8 $subchar -1 {} {}"
print " $encName $uhex replace $subchar -1 {} {}"
print " $encName $uhex strict {} 0 {} {}"
}
print "\}; # $encName"
}
if {[array size tclNames]} {
warn "Missing encoding: [lsort [array names tclNames]]"
}
if {[info exists outputPath]} {
close $outputChan
unset outputChan
}
}
proc ucm::parse_file {encName ucmPath} {
variable charMap
variable encSubchar
set fd [open $ucmPath]
try {
# Parse the metadata
unset -nocomplain state
while {[gets $fd line] >= 0} {
if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} {
set state($key) $val
} elseif {[regexp {^\s*CHARMAP\s*$} $line]} {
set state(charmap) ""
break
} else {
# Skip all else
}
}
if {![info exists state(charmap)]} {
abort "Error: $ucmPath has No CHARMAP line."
}
foreach key {code_set_name uconv_class} {
if {[info exists state($key)]} {
set state($key) [string trim $state($key) {"}]
}
}
if {[info exists charMap($encName)]} {
abort "Duplicate file for $encName ($path)"
}
if {![info exists state(uconv_class)]} {
abort "Error: $ucmPath has no uconv_class definition."
}
if {[info exists state(subchar)]} {
# \xNN\xNN.. -> NNNN..
set encSubchar($encName) [string map {\\x {}} $state(subchar)]
}
switch -exact -- $state(uconv_class) {
SBCS {
if {[catch {
parse_SBCS $encName $fd
} result]} {
abort "Could not process $ucmPath. $result"
}
}
default {
log "Skipping $ucmPath -- not SBCS encoding."
return
}
}
} finally {
close $fd
}
}
proc ucm::run {} {
variable encNameMap
variable outputPath
switch [llength $::argv] {
2 {set outputPath [lindex $::argv 1]}
1 {}
default {
abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?"
}
}
foreach {encName fname} [array get encNameMap] {
ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm]
}
generate_tests
}
ucm::run
|