1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
|
# This file contains a collection of tests for Tcl_UtfToExternal and
# Tcl_UtfToExternal that exercise various combinations of flags,
# buffer lengths and fragmentation that cannot be tested by
# normal script level commands. There tests are NOT intended to check
# correct encodings; those are elsewhere.
#
# Copyright (c) 2023 Ashok P. Nadkarni
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
if {"::tcltest" ni [namespace children]} {
package require tcltest 2.5
namespace import -force ::tcltest::*
}
::tcltest::loadTestedCommands
catch [list package require -exact tcl::test [info patchlevel]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint testencoding [llength [info commands testencoding]]
namespace eval utftest {
# Format of table, indexed by encoding. The encodings are not exhaustive
# but one of each kind of encoding transform (algorithmic, table-driven,
# stateful, DBCS, MBCS).
# Each element is list of lists. Nested lists have following fields
# 0 comment (no spaces, might be used to generate id's as well)
# The combination of comment and internal hex (2) should be unique.
# 1 hex representation of internal *modified* utf-8 encoding. This is the
# source string for Tcl_UtfToExternal and expected result for
# Tcl_ExternalToUtf.
# 2 hex representation in specified encoding. This is the source string for
# Tcl_ExternalToUtf and expected result for Tcl_UtfToExternal.
# 3 internal fragmentation index - where to split field 1 for fragmentation
# tests. -1 to skip
# 4 external fragmentation index - where to split field 2 for fragmentation
# tests. -1 to skip
#
# THE HEX DEFINITIONS SHOULD SEPARATE EACH CHARACTER BY WHITESPACE
# (assumed by the charlimit tests)
lappend utfExtMap {*}{
ascii {
{basic {41 42 43} {41 42 43} -1 -1}
}
utf-8 {
{bmp {41 c3a9 42} {41 c3a9 42} 2 2}
{nonbmp-frag-1 {41 f09f9880 42} {41 f09f9880 42} 2 2}
{nonbmp-frag-2 {41 f09f9880 42} {41 f09f9880 42} 3 3}
{nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4}
{null {41 c080 42} {41 00 42} 2 -1}
}
cesu-8 {
{bmp {41 c3a9 42} {41 c3a9 42} 2 2}
{nonbmp-frag-surr-low {41 f09f9880 42} {41 eda0bd edb880 42} 2 2}
{nonbmp-split-surr {41 f09f9880 42} {41 eda0bd edb880 42} 3 -1}
{nonbmp-frag-surr-high {41 f09f9880 42} {41 eda0bd edb880 42} 4 6}
{null {41 c080 42} {41 00 42} 2 -1}
}
utf-16le {
{bmp {41 c3a9 42} {4100 e900 4200} 2 3}
{nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3}
{split-surrogate {41 f09f9080 42} {4100 3dd8 00dc 4200} 3 4}
{null {41 c080 42} {4100 0000 4200} 2 3}
}
utf-16be {
{bmp {41 c3a9 42} {0041 00e9 0042} 2 3}
{nonbmp {41 f09f9880 42} {0041 d83d de00 0042} 4 3}
{split-surrogate {41 f09f9080 42} {0041 d83d dc00 0042} 3 4}
{null {41 c080 42} {0041 0000 0042} 2 3}
}
utf-32le {
{bmp {41 c3a9 42} {41000000 e9000000 42000000} 2 3}
{nonbmp {41 f09f9880 42} {41000000 00f60100 42000000} 4 6}
{null {41 c080 42} {41000000 00000000 42000000} 2 3}
}
utf-32be {
{bmp {41 c3a9 42} {00000041 000000e9 00000042} 2 3}
{nonbmp {41 f09f9880 42} {00000041 0001f600 00000042} 4 3}
{null {41 c080 42} {00000041 00000000 00000042} 2 3}
}
iso8859-1 {
{basic {41 c3a9 42} {41 e9 42} 2 -1}
{null {41 c080 42} {41 00 42} 2 -1}
}
iso8859-3 {
{basic {41 c4a0 42} {41 d5 42} 2 -1}
{null {41 c080 42} {41 00 42} 2 -1}
}
shiftjis {
{basic {41 e4b98e 42} {41 8cc1 42} 3 2}
}
jis0208 {
{basic {e4b98e e590be} {3843 3863} 1 1}
}
iso2022-jp {
{frag-in-leadescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 2}
{frag-in-char {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 5}
{frag-in-trailescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 8}
}
}
# Return a binary string containing nul terminator for encoding
proc hexnuls {enc} {
return [binary encode hex [encoding convertto $enc \x00]]
}
# The C wrapper fills entire destination buffer with FF.
# Anything beyond expected output should have FF's
proc fill {bin buflen} {
return [string range "$bin[string repeat \xFF $buflen]" 0 $buflen-1]
}
proc testutf {direction enc comment hexin hexout args} {
set id $comment-[join $hexin ""]
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set in [binary decode hex $hexin]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
set status ok
set flags [list start end]
set constraints [list testencoding]
set profiles [encoding profiles]
while {[llength $args] > 1} {
set opt [lpop args 0]
switch $opt {
-flags { set flags [lpop args 0] }
-constraints { lappend constraints {*}[lpop args 0] }
-profiles { set profiles [lpop args 0] }
-status { set status [lpop args 0]}
default {
error "Unknown option \"$opt\""
}
}
}
if {[llength $args]} {
error "No value supplied for option [lindex $args 0]."
}
set result [list $status {} [fill $out $dstlen]]
test $cmd-$enc-$id-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \
[list testencoding $cmd $enc $in $flags {} $dstlen] \
-result $result -constraints $constraints
foreach profile $profiles {
set flags2 [linsert $flags end $profile]
test $cmd-$enc-$id-[join $flags2 -] "$cmd - $enc - $hexin - $flags2" -body \
[list testencoding $cmd $enc $in $flags2 {} $dstlen] \
-result $result -constraints $constraints
}
}
proc testfragment {direction enc comment hexin hexout fragindex args} {
if {$fragindex < 0} {
# Single byte encodings so no question of fragmentation
return
}
set id $comment-[join $hexin ""]-fragment
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set status1 multibyte; # Return status to expect after first call
while {[llength $args] > 1} {
set opt [lpop args 0]
switch $opt {
-status1 { set status1 [lpop args 0]}
default {
error "Unknown option \"$opt\""
}
}
}
set in [binary decode hex $hexin]
set infrag [string range $in 0 $fragindex-1]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body {
set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written]
lassign $frag1Result frag1Status frag1State frag1Decoded
set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written]
lassign $frag2Result frag2Status frag2State frag2Decoded
set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]]
list $frag1Status [expr {$frag1Read <= $fragindex}] \
$frag2Status [expr {$frag1Read+$frag2Read}] \
[expr {$frag1Written+$frag2Written}] $decoded
} -result [list $status1 1 ok [string length $in] [string length $out] $out]
}
proc testcharlimit {direction enc comment hexin hexout} {
set id $comment-[join $hexin ""]-charlimit
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
} else {
set cmd Tcl_UtfToExternal
}
set maxchars [llength $hexout]
set in [binary decode hex $hexin]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
for {set nchars 0} {$nchars <= $maxchars} {incr nchars} {
set expected_bytes [binary decode hex [lrange $hexout 0 $nchars-1]]
set expected_nwritten [string length $expected_bytes]
test $cmd-$enc-$id-$nchars "$cmd - $enc - $hexin - nchars $nchars" -constraints testencoding -body {
set charlimit $nchars
lassign [testencoding $cmd $enc $in \
{start end charlimit} 0 $dstlen nread nwritten charlimit] \
status state buf
list $status $nwritten [string range $buf 0 $nwritten-1]
} -result [list [expr {$nchars == $maxchars ? "ok" : "nospace"}] $expected_nwritten $expected_bytes]
}
}
proc testspacelimit {direction enc comment hexin hexout} {
set id $comment-[join $hexin ""]-spacelimit
# Triple the input to avoid pathological short input case where
# whereby nothing is written to output. The test below
# requires $nchars > 0
set hexin $hexin$hexin$hexin
set hexout $hexout$hexout$hexout
set flags [list start end]
set constraints [list testencoding]
set maxchars [llength $hexout]
set in [binary decode hex $hexin]
set out [binary decode hex $hexout]
set dstlen [expr {[string length $out] - 1}]; # Smaller buffer than needed
if {$direction eq "toutf"} {
set cmd Tcl_ExternalToUtf
set str [encoding convertfrom $enc $in]
} else {
set cmd Tcl_UtfToExternal
set str [encoding convertfrom $enc $out]
}
# Note the tests are loose because the some encoding operations will
# stop even there is actually still room in the destination. For example,
# below only one char is written though there is room in the output.
# % testencoding Tcl_ExternalToUtf ascii abc {start end} {} 5 nread nwritten nchars
# nospace {} aÿÿÿ#
# % puts $nread,$nwritten,$nchars
# 1,1,1
#
test $cmd-$enc-$id-[join $flags -] "$cmd - $enc - $hexin - $flags" \
-constraints $constraints \
-body {
lassign [testencoding $cmd $enc $in $flags {} $dstlen nread nwritten nchars] status state buf
list \
$status \
[expr {$nread < [string length $in]}] \
[expr {$nwritten <= $dstlen}] \
[expr {$nchars > 0 && $nchars < [string length $str]}] \
[expr {[string range $out 0 $nwritten-1] eq [string range $buf 0 $nwritten-1]}]
} -result {nospace 1 1 1 1}
}
#
# Basic tests
foreach {enc testcases} $utfExtMap {
foreach testcase $testcases {
lassign $testcase {*}{comment utfhex hex internalfragindex externalfragindex}
# Basic test - TCL_ENCODING_START|TCL_ENCODING_END
# Note by default output should be terminated with \0
set encnuls [hexnuls $enc]
testutf toutf $enc $comment $hex ${utfhex}00
testutf fromutf $enc $comment $utfhex $hex$encnuls
# Test TCL_ENCODING_NO_TERMINATE
testutf toutf $enc $comment $hex $utfhex -flags {start end noterminate}
# noterminate is specific to ExternalToUtf,
# should have no effect in other direction
testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate}
# Fragments
testfragment toutf $enc $comment $hex $utfhex $externalfragindex
testfragment fromutf $enc $comment $utfhex $hex $internalfragindex
# Char limits - note no fromutf as Tcl_UtfToExternal does not support it
testcharlimit toutf $enc $comment $hex $utfhex
# Space limits
testspacelimit toutf $enc $comment $hex $utfhex
testspacelimit fromutf $enc $comment $utfhex $hex
}
}
# Special cases - cesu2 high and low surrogates in separate fragments
# This will (correctly) return "ok", not "multibyte" after first frag
testfragment toutf cesu-8 nonbmp-split-surr \
{41 eda0bd edb880 42} {41 f09f9880 42} 4 -status1 ok
# Bug regression tests
test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body {
testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1
} -result [list nospace {} \xFF] -constraints testencoding
test Tcl_ExternalToUtf-bug-5be203d6ca {
truncated prefix in table encoding
} -body {
set src \x82\x4F\x82\x50\x82
set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start tcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end tcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
} -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding
}
namespace delete utftest
::tcltest::cleanupTests
return
# Local Variables:
# mode: tcl
# End:
|