File: hz_encoding.phpt

package info (click to toggle)
php8.4 8.4.11-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 208,108 kB
  • sloc: ansic: 1,060,628; php: 35,345; sh: 11,866; cpp: 7,201; pascal: 4,913; javascript: 3,091; asm: 2,810; yacc: 2,411; makefile: 689; xml: 446; python: 301; awk: 148
file content (150 lines) | stat: -rw-r--r-- 6,221 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
--TEST--
Exhaustive test of verification and conversion of HZ text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
include('encoding_tests.inc');
srand(1000); // Make results consistent
mb_substitute_character(0x25); // '%'

for ($i = 0; $i < 0x80; $i++) {
    if ($i != 0x7E) // ~ is special and will be tested separately
        testValidString(chr($i), chr($i), 'ASCII', 'HZ');
}
echo "Tested ASCII -> HZ\n";

for ($i = 0; $i < 0x80; $i++) {
    if ($i != 0x7E)
        testValidString(chr($i), chr($i), 'HZ', 'ASCII');
}
echo "Tested HZ -> ASCII\n";

for ($i = 0x80; $i < 0xFF; $i++) {
    testInvalidString(chr($i), '%', 'HZ', 'ASCII');
}
echo "Tested non-ASCII bytes in ASCII mode\n";

testValidString('~~', '~', 'HZ', 'ASCII');
testValidString("~\n", '', 'HZ', 'ASCII', false);
testValidString('~{~}', '', 'HZ', 'ASCII', false);
testValidString("~{~\n~}", '', 'HZ', 'ASCII', false);
testValidString('~~?', '~?', 'HZ', 'ASCII');
echo "Tested valid ~ escapes\n";

for ($i = 0; $i < 0xFF; $i++) {
    if ($i != 0x0A) {
        // Try invalid ~ escapes both in ASCII and GB modes
        if ($i != 0x7E && $i != 0x7B) // not {
            testInvalidString("~" . chr($i), '%', 'HZ', 'ASCII');
        if ($i != 0x7D) // not }
            testInvalidString("~{~" . chr($i) . "~}", '%', 'HZ', 'ASCII');
    }
}
echo "Tested all invalid ~ escapes\n";

readConversionTable(__DIR__ . '/data/GB2312.txt', $toUnicode, $fromUnicode);

findInvalidChars($toUnicode, $invalid, $truncated);

// Two characters in ISO-2022-CN convert to Unicode 0x2225
$irreversible = ["\x21\x2C" => true];

// Test all good GB2312 characters within ~{ ~} escapes
$goodChars = array_keys($toUnicode);
shuffle($goodChars);
while (!empty($goodChars)) {
    $reversible = true;
    $length = 1; //min(rand(5,10), count($goodChars));
    $fromString = $toString = '';
    while ($length--) {
        $goodChar = array_pop($goodChars);
        $fromString .= $goodChar;
        $toString .= $toUnicode[$goodChar];
        if (isset($irreversible[$goodChar]))
          $reversible = false;
    }

    testValidString('~{' . $fromString . '~}', $toString, 'HZ', 'UTF-16BE', $reversible);
}

// Test all invalid GB2312 characters within ~{ ~} escapes
// However, don't test escape sequences; we will do those separately below
unset($invalid["~"]);
$badChars = array_keys($invalid);
$goodChars = array();
while (!empty($badChars)) {
    if (empty($goodChars)) {
        $goodChars = array_keys($toUnicode);
        shuffle($goodChars);
    }
    $goodChar   = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString   = "\x00%" . $toUnicode[$goodChar];

    testInvalidString('~{' . $fromString . '~}', $toString, 'HZ', 'UTF-16BE');
}

$truncatedChars = array_keys($truncated);
foreach ($truncatedChars as $truncatedChar) {
    testInvalidString('~{' . $truncatedChar, "\x00%", 'HZ', 'UTF-16BE');
}

echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";

findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));

// Although they do not appear in the Unicode -> GB2312 map, ASCII characters *are*
// valid to convert to HZ
for ($i = 0; $i <= 0x7F; $i++)
    unset($invalid["\x00" . chr($i)]);

$badChars = array_keys($invalid);
$goodChars = array();
while (!empty($badChars)) {
    if (empty($goodChars)) {
        $goodChars = array_keys($fromUnicode);
        shuffle($goodChars);
    }
    $goodChar   = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString   = "%~{" . $fromUnicode[$goodChar] . "~}";

    convertInvalidString($fromString, $toString, 'UTF-16BE', 'HZ');
}

echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";

// Regression tests
if (mb_convert_encoding("\x7A\xFA\x00\x00", 'HZ', 'UTF-16BE') !== "~{\x73\x43~}\x00")
    die("Bad");
if (mb_convert_encoding("~", 'UTF-16BE', 'HZ') !== "")
    die("Bad");
// There had once been a bug whereby the output buffer would be overrun by one byte.
// It was found by fuzzing. Reproducing it required a string which was long enough
// and had a GB2312 character at the end.
$str = "\xD9\x96C\xA7\x1B\xF6\xD8\x86\x94\xB0\xA0\xE1\x9D\x8C\xF8G\xBBMk\xD2Y\tt\xF1\x96d\x17JA\xF9\xF8\xCF\xDC\xFE\x8E\x0E\xC1\x84\xDA\xDBM\xC1\x87\x1AZ\xD5\xA6)\xFF%2\\\xCC\x02\x16]Y\xF0\x00\xEA\xE8{)\x81\xD5VQZ\x12\xB5\xBC\x9A\x91\xA0x\x02\xBA\xF6c\xACo\x9BH\xB7qx\xF5\x0F\t\x15\xDByx\xBA[\xC9\xE8r\xCD*:\xBF\x10P\xF1>Q\x07\xEE\xE5\x80\xAD\xB9\xA2\x9B\xF6\xE1,\x82\xC6q\x94E\xD4\x0B\xC6\xBCQe=\xC3\xE0\xC8\xE0R\x97\x14q\x0C\x1A\x7F\xE1\xC4\xB8U\x8A\x86\x93\xB6/\x84\x95\x06\x91W\xB2\xB6\x1F!\t X\x1A\xD5\xD6\xDA<\x81ib\x9A\x1B3\xD3\xB7:\xE2QS\xD0\x91\x99[K\xF2E\xBBjoh_5\x15 \xA4\xCC\xB0\x7F\x06\xB3,\xB3\xA7u\xB9\x82\x00\xE2f$\x1C\x84NsP\xFAiPB{\x8D\xBA\xB3[\x88\xA9\xB1\xA2r\x86\xFF<\xFD\xFB\xF8\xD6\xABq\x00z\xFA\x87\x8C_\xD9N\xF2\xFA\xEA\xEA\xAA\xD7\xFA\xA2\xD4\x85/\xFC\xE1}\xF7\x9C\x86\xDD\x12@\xC3\xDA\nC\x1Di\xA9\xB0\xC3\xB3\x04\xB2\x1A\x07BA\x02\xED\x11\xA4\xDAz\x96\xB5\xD0!p\xE2\xAD\xEDI\xEF\xF7\\\x05d.p\x07\xC4\x8B\x952\xCDz\x90\x8C\xA6U\xDB\xC7\xF4\x94\xE9\x16X\xF1\xCC\xB13\x07a9\x86]\xF9k\xA9\x87E\xCB\x89\x9Fd\x0E\x81m\xC6c\xDA\x9C\xE9\xAF\x80.\xFAq\xD9\xAAd\x1DB\x1F\x854\xE8\x82v)A\xF3\xB4\x1D\xE5\xF0\xFFu\x0E\x0C\xC4q\xF0\xE7\xB4p\x86\xE6]9\xD9\xA5O\xBAw\x1B\x8D&]\x9D\xE2\x0F\xD2\xD5\x13AY#\x81\x90\xB2\xE8\xDA\xD2\xFC>\xA0\x9A\xBD\x0B\xCC\x08>\x1E\xD1\xFEgr1'$\xEE\xA2!\x8A\xBB>\x11j#Pz_!?\xA8\x15\xCF\xCB\x84\x86\xC1\xF78:\xDA\xBCE\xA7\x02SO\x8B\x81>\x96\xBD\xFD2\x84\xC5\xFC\x19\xE5\xF4\xEFp\xF08K\xBB\xAE-[}\xE1\xDB\x8A%6\xC7\xC9";
if (substr(mb_convert_encoding($str, 'HZ', 'CP51932'), -4) !== "\x45\x49~}")
    die("Bad");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("~A", "%", "HZ", "UTF-8");
convertInvalidString("\x80", "%", "HZ", "UTF-8");
convertInvalidString("~{\x22\x21", "%", "HZ", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested ASCII -> HZ
Tested HZ -> ASCII
Tested non-ASCII bytes in ASCII mode
Tested valid ~ escapes
Tested all invalid ~ escapes
Tested HZ -> UTF-16BE (for all GB2312 characters)
Tested UTF-16BE -> HZ (for all GB2312 characters)
Done!