File: ucm2tests.tcl

package info (click to toggle)
tcl9.0 9.0.1%2Bdfsg-2
links: PTS, VCS
area: main
in suites: trixie
size: 35,828 kB
sloc: ansic: 216,761; tcl: 24,169; makefile: 3,524; sh: 2,553; ada: 1,681; pascal: 1,139; cpp: 1,001; cs: 879; yacc: 833; asm: 468; perl: 420; xml: 95
file content (352 lines) | stat: -rw-r--r-- 9,708 bytes
parent folder | download | duplicates (2)
# ucm2tests.tcl
#
# Parses given ucm files (from ICU) to generate test data
# for encodings.
#
#  tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH?
#

namespace eval ucm {
    # No means to change these currently but ...
    variable outputPath
    variable outputChan
    variable errorChan stderr
    variable verbose 0

    # Map Tcl encoding name to ICU UCM file name
    variable encNameMap
    array set encNameMap {
	cp1250    glibc-CP1250-2.1.2
	cp1251    glibc-CP1251-2.1.2
	cp1252    glibc-CP1252-2.1.2
	cp1253    glibc-CP1253-2.1.2
	cp1254    glibc-CP1254-2.1.2
	cp1255    glibc-CP1255-2.1.2
	cp1256    glibc-CP1256-2.1.2
	cp1257    glibc-CP1257-2.1.2
	cp1258    glibc-CP1258-2.1.2
	gb1988    glibc-GB_1988_80-2.3.3
	iso8859-1 glibc-ISO_8859_1-2.1.2
	iso8859-2 glibc-ISO_8859_2-2.1.2
	iso8859-3 glibc-ISO_8859_3-2.1.2
	iso8859-4 glibc-ISO_8859_4-2.1.2
	iso8859-5 glibc-ISO_8859_5-2.1.2
	iso8859-6 glibc-ISO_8859_6-2.1.2
	iso8859-7 glibc-ISO_8859_7-2.3.3
	iso8859-8 glibc-ISO_8859_8-2.3.3
	iso8859-9 glibc-ISO_8859_9-2.1.2
	iso8859-10 glibc-ISO_8859_10-2.1.2
	iso8859-11 glibc-ISO_8859_11-2.1.2
	iso8859-13 glibc-ISO_8859_13-2.3.3
	iso8859-14 glibc-ISO_8859_14-2.1.2
	iso8859-15 glibc-ISO_8859_15-2.1.2
	iso8859-16 glibc-ISO_8859_16-2.3.3
    }

    # Array keyed by Tcl encoding name. Each element contains mapping of
    # Unicode code point -> byte sequence for that encoding as a flat list
    # (or dictionary). Both are stored as hex strings
    variable charMap

    # Array keyed by Tcl encoding name. List of invalid code sequences
    # each being a hex string.
    variable invalidCodeSequences

    # Array keyed by Tcl encoding name. List of unicode code points that are
    # not mapped, each being a hex string.
    variable unmappedCodePoints

    # The fallback character per encoding
    variable encSubchar
}

proc ucm::abort {msg} {
    variable errorChan
    puts $errorChan $msg
    exit 1
}
proc ucm::warn {msg} {
    variable errorChan
    puts $errorChan $msg
}
proc ucm::log {msg} {
    variable verbose
    if {$verbose} {
	variable errorChan
	puts $errorChan $msg
    }
}
proc ucm::print {s} {
    variable outputChan
    puts $outputChan $s
}

proc ucm::parse_SBCS {encName fd} {
    variable charMap
    variable invalidCodeSequences
    variable unmappedCodePoints

    set result {}
    while {[gets $fd line] >= 0} {
	if {[string match #* $line]} {
	    continue
	}
	if {[string equal "END CHARMAP" [string trim $line]]} {
	    break
	}
	if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} {
	    error "Unexpected line parsing SBCS: $line"
	}
	set bytes [string map {\\x {}} $bytes]; # \xNN -> NN
	if {$precision eq "" || $precision eq "0"} {
	    lappend result $unichar $bytes
	} else {
	    # It is a fallback mapping - ignore
	}
    }
    set charMap($encName) $result

    # Find out invalid code sequences and unicode code points that are not mapped
    set valid {}
    set mapped {}
    foreach {unich bytes} $result {
	lappend mapped $unich
	lappend valid $bytes
    }
    set invalidCodeSequences($encName) {}
    for {set i 0} {$i <= 255} {incr i} {
	set hex [format %.2X $i]
	if {[lsearch -exact $valid $hex] < 0} {
	    lappend invalidCodeSequences($encName) $hex
	}
    }

    set unmappedCodePoints($encName) {}
    for {set i 0} {$i <= 65535} {incr i} {
	set hex [format %.4X $i]
	if {[lsearch -exact $mapped $hex] < 0} {
	    lappend unmappedCodePoints($encName) $hex
	    # Only look for (at most) one below 256 and one above 1024
	    if {$i < 255} {
		# Found one so jump past 8 bits
		set i 255
	    } else {
		break
	    }
	}
	if {$i == 255} {
	    set i 1023
	}
    }
    lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF
}

proc ucm::generate_boilerplate {} {
    # Common procedures
    print {
# This file is automatically generated by ucm2tests.tcl.
# Edits will be overwritten on next generation.
#
# Generates tests comparing Tcl encodings to ICU.
# The generated file is NOT standalone. It should be sourced into a test script.

proc ucmConvertfromMismatches {enc map} {
    set mismatches {}
    foreach {unihex hex} $map {
	set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
	set unich [subst "\\U$unihex"]
	if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} {
	    lappend mismatches "<[printable $unich],$hex>"
	}
    }
    return $mismatches
}
proc ucmConverttoMismatches {enc map} {
    set mismatches {}
    foreach {unihex hex} $map {
	set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
	set unich [subst "\\U$unihex"]
	if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} {
	    lappend mismatches "<[printable $unich],$hex>"
	}
    }
    return $mismatches
}
if {[info commands printable] eq ""} {
    proc printable {s} {
	set print ""
	foreach c [split $s ""] {
	    set i [scan $c %c]
	    if {[string is print $c] && ($i <= 127)} {
		append print $c
	    } elseif {$i <= 0xff} {
		append print \\x[format %02X $i]
	    } elseif {$i <= 0xffff} {
		append print \\u[format %04X $i]
	    } else {
		append print \\U[format %08X $i]
	    }
	}
	return $print
    }
}
    }
} ; # generate_boilerplate

proc ucm::generate_tests {} {
    variable encNameMap
    variable charMap
    variable invalidCodeSequences
    variable unmappedCodePoints
    variable outputPath
    variable outputChan
    variable encSubchar

    if {[info exists outputPath]} {
	set outputChan [open $outputPath w]
	fconfigure $outputChan -translation lf
    } else {
	set outputChan stdout
    }

    array set tclNames {}
    foreach encName [encoding names] {
	set tclNames($encName) ""
    }

    generate_boilerplate
    foreach encName [lsort -dictionary [array names encNameMap]] {
	if {![info exists charMap($encName)]} {
	    warn "No character map read for $encName"
	    continue
	}
	unset tclNames($encName)

	# Print the valid tests
	print "\n#\n# $encName (generated from $encNameMap($encName))"
	print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{"
	print "    ucmConvertfromMismatches $encName {$charMap($encName)}"
	print "\} -result {}"
	print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{"
	print "    ucmConverttoMismatches $encName {$charMap($encName)}"
	print "\} -result {}"
	if {0} {
	    # This will generate individual tests for every char
	    # and test in lead, tail, middle, solo configurations
	    # but takes considerable time
	    print "lappend encValidStrings \{*\}\{"
	    foreach {unich hex} $charMap($encName) {
		print "    $encName \\u$unich $hex {} {}"
	    }
	    print "\}; # $encName"
	}

	# Generate the invalidity checks
	print "\n# $encName - invalid byte sequences"
	print "lappend encInvalidBytes \{*\}\{"
	foreach hex $invalidCodeSequences($encName) {
	    # Map XXXX... to \xXX\xXX...
	    set uhex [regsub -all .. $hex {\\x\0}]
	    set uhex \\U[string range 00000000$hex end-7 end]
	    print "    $encName $hex tcl8    $uhex -1 {} {}"
	    print "    $encName $hex replace \\uFFFD -1 {} {}"
	    print "    $encName $hex strict  {}       0 {} {}"
	}
	print "\}; # $encName"

	print "\n# $encName - invalid byte sequences"
	print "lappend encUnencodableStrings \{*\}\{"
	if {[info exists encSubchar($encName)]} {
	    set subchar $encSubchar($encName)
	} else {
	    set subchar "3F"; # Tcl uses ? by default
	}
	foreach hex $unmappedCodePoints($encName) {
	    set uhex \\U[string range 00000000$hex end-7 end]
	    print "    $encName $uhex tcl8    $subchar -1 {} {}"
	    print "    $encName $uhex replace $subchar -1 {} {}"
	    print "    $encName $uhex strict  {}                      0 {} {}"
	}
	print "\}; # $encName"
    }

    if {[array size tclNames]} {
	warn "Missing encoding: [lsort [array names tclNames]]"
    }
    if {[info exists outputPath]} {
	close $outputChan
	unset outputChan
    }
}

proc ucm::parse_file {encName ucmPath} {
    variable charMap
    variable encSubchar

    set fd [open $ucmPath]
    try {
	# Parse the metadata
	unset -nocomplain state
	while {[gets $fd line] >= 0} {
	    if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} {
		set state($key) $val
	    } elseif {[regexp {^\s*CHARMAP\s*$} $line]} {
		set state(charmap) ""
		break
	    } else {
		# Skip all else
	    }
	}
	if {![info exists state(charmap)]} {
	    abort "Error: $ucmPath has No CHARMAP line."
	}
	foreach key {code_set_name uconv_class} {
	    if {[info exists state($key)]} {
		set state($key) [string trim $state($key) {"}]
	    }
	}
	if {[info exists charMap($encName)]} {
	    abort "Duplicate file for $encName ($path)"
	}
	if {![info exists state(uconv_class)]} {
	    abort "Error: $ucmPath has no uconv_class definition."
	}
	if {[info exists state(subchar)]} {
	    # \xNN\xNN.. -> NNNN..
	    set encSubchar($encName) [string map {\\x {}} $state(subchar)]
	}
	switch -exact -- $state(uconv_class) {
	    SBCS {
		if {[catch {
		    parse_SBCS $encName $fd
		} result]} {
		    abort "Could not process $ucmPath. $result"
		}
	    }
	    default {
		log "Skipping $ucmPath -- not SBCS encoding."
		return
	    }
	}
    } finally {
	close $fd
    }
}

proc ucm::run {} {
    variable encNameMap
    variable outputPath
    switch [llength $::argv] {
	2 {set outputPath [lindex $::argv 1]}
	1 {}
	default {
	    abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?"
	}
    }
    foreach {encName fname} [array get encNameMap] {
	ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm]
    }
    generate_tests
}

ucm::run