File: iso2022kr_encoding.phpt

package info (click to toggle)
php8.4 8.4.11-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 208,108 kB
  • sloc: ansic: 1,060,628; php: 35,345; sh: 11,866; cpp: 7,201; pascal: 4,913; javascript: 3,091; asm: 2,810; yacc: 2,411; makefile: 689; xml: 446; python: 301; awk: 148
file content (134 lines) | stat: -rw-r--r-- 4,545 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
--TEST--
Test of ASCII and KS X 1001-1992 support in ISO-2022-KR encoding
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// When the SKIP_SLOW_TESTS environment variable holds a truthy value, stop here
// and print a message beginning with "skip" instead of running the test
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
/* The helper functions called below (readConversionTable, convertValidString,
 * testInvalidString, ...) are not defined in this file; presumably they come
 * from this include */
include('encoding_tests.inc');
/* Use '%' as the substitute character, so the expected output for bad input
 * can be written as a plain '%' in the checks below */
mb_substitute_character(0x25); // '%'

/* Load the KS X 1001 table into $ksxChars; the last (by-reference) argument is
 * not used again in this file.
 * NOTE(review): judging by the loops below, $ksxChars is keyed by KS X 1001
 * byte sequence with the UTF-16BE equivalent as value -- confirm in
 * encoding_tests.inc */
readConversionTable(__DIR__ . '/data/KSX1001.txt', $ksxChars, $unused);

/* Run the valid-string helpers on $from (ISO-2022-KR) against $to (UTF-16BE).
 * Unless $bothWays is false, also run the reverse conversion of $to, comparing
 * against a normalized form of $from (see the steps below). */
function testValid($from, $to, $bothWays = true) {
  identifyValidString($from, 'ISO-2022-KR');
  convertValidString($from, $to, 'ISO-2022-KR', 'UTF-16BE', false);

  if (!$bothWays)
    return;

  /* The ISO-2022-KR string which the reverse conversion is compared against */
  $expected = $from;

  /* A leading 0xF only selects ASCII mode, which is already the default mode
   * of an ISO-2022 string, so it is dropped */
  if (str_starts_with($expected, "\x0F"))
    $expected = substr($expected, 1);

  /* Once another charset has been selected with 0xE, the string should be
   * back in ASCII mode by the time it ends */
  if (str_contains($expected, "\x0E") && !str_ends_with($expected, "\x0F"))
    $expected .= "\x0F";

  /* Any non-empty string gets the ESC $ ) C sequence in front, unless it
   * already contains one */
  if ($expected !== '' && !str_contains($expected, "\x1B\$)C"))
    $expected = "\x1B\$)C" . $expected;

  convertValidString($to, $expected, 'UTF-16BE', 'ISO-2022-KR', false);
}

/* Shorthand for testInvalidString() with the encodings fixed to
 * ISO-2022-KR (source) and UTF-16BE (target). $from is the ISO-2022-KR input
 * under test and $to the UTF-16BE string passed along with it.
 * NOTE(review): the actual checks are done by testInvalidString(), which is
 * defined outside this file */
function testInvalid($from, $to) {
  testInvalidString($from, $to, 'ISO-2022-KR', 'UTF-16BE');
}

testValid("", "");
echo "Empty string OK\n";

/* Every 7-bit byte except 0xE, 0xF and 0x1B is checked as a one-byte string,
 * both on its own and preceded by 0xF (the 'Shift In' code) */
foreach (range(0x00, 0x7F) as $byte) {
  if (in_array($byte, [0xE, 0xF, 0x1B], true))
    continue;
  $char  = chr($byte);
  $utf16 = "\x00" . $char;
  testValid($char,          $utf16);
  testValid("\x0F" . $char, $utf16);
}

/* Bytes with the high bit set are checked as invalid input, again with and
 * without a leading 0xF */
foreach (range(0x80, 0xFF) as $byte) {
  $char = chr($byte);
  testInvalid($char,          "\x00%");
  testInvalid("\x0F" . $char, "\x00%");
}

echo "ASCII support OK\n";

/* Each table entry is checked after 0xE, with and without the ESC $ ) C
 * sequence in front; only the variant which also ends with 0xF is checked in
 * both directions */
foreach ($ksxChars as $ksx => $utf16BE) {
  $shifted = "\x0E" . $ksx;
  testValid($shifted, $utf16BE, false);
  testValid("\x1B\$)C" . $shifted, $utf16BE, false);
  testValid("\x1B\$)C" . $shifted . "\x0F", $utf16BE);
}

findInvalidChars($ksxChars, $invalidKsx, $truncatedKsx);

foreach (array_keys($invalidKsx) as $badChar) {
  /* Sequences whose first byte is 0xE, 0xF or 0x1B are left out here */
  if (in_array($badChar[0], ["\x0E", "\x0F", "\x1B"], true))
    continue;
  testInvalid("\x1B\$)C\x0E" . $badChar, "\x00%");
}

foreach (array_keys($truncatedKsx) as $badChar) {
  testInvalid("\x1B\$)C\x0E" . $badChar, "\x00%");
}

echo "KS X 1001 support OK\n";

/* A valid ESC sequence alone does not leave ASCII mode; KS X 1001 only starts
 * after 'Shift Out' */
testValid("\x1B\$)Cabc", "\x00a\x00b\x00c", false);

/* Each proper prefix of the ESC $ ) C sequence, mapped to the one byte which
 * would correctly continue it */
$escPrefixes = [
  "\x1B"    => '$',
  "\x1B\$"  => ')',
  "\x1B\$)" => 'C',
];

/* Truncated ESC sequences: the input ends right after the prefix */
foreach (array_keys($escPrefixes) as $partialEsc) {
  testInvalid($partialEsc, "\x00%");
}

/* Invalid ESC sequences: the prefix is followed by any byte other than the
 * correct continuation */
foreach (range(0, 255) as $byte) {
  $char = chr($byte);
  foreach ($escPrefixes as $prefix => $nextByte) {
    if ($char !== $nextByte)
      testInvalid($prefix . $char, "\x00%");
  }
}

/* Switching back and forth between ASCII and KS X 1001 is allowed */
testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false);

echo "Escapes behave as expected\n";

// Test switching between KS X 1001 and ASCII when converting Unicode -> ISO-2022-KR
// (the expected output shifts out with 0x0E for the first character and back in
// with 0x0F before the trailing "ab")
convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE", "ISO-2022-KR", false);

// Regression test: Our conversion table for KS X 1001 only goes up to 0x7D7E, but
// we previously accepted and tried to convert two-byte sequences starting with
// 0x7E, resulting in a failed assertion
convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8");

// Regression test: The old implementation would wrongly convert some codepoints
// which are not in KS X 1001 at all to 'random' characters in KS X 1001
convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE", "ISO-2022-KR");

// Regression test: The old implementation would sometimes emit an extra 0x0F ('shift in')
// character at the end of a string, although the string was already ending in ASCII mode
convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false);

// Regression test: Don't shift from KS X 1001 to ASCII mode on invalid escape sequence
// (the bytes after the bad escape are still expected to convert as one KS X 1001 character)
convertInvalidString("\x0E\x1BX\x74\x30", "\x00%\x76\x20", "ISO-2022-KR", "UTF-16BE", false);

// Test "long" illegal character markers
// NOTE(review): the expected strings below are still written as "%" although "long"
// mode is selected here; presumably convertInvalidString() accounts for that --
// confirm in encoding_tests.inc
mb_substitute_character("long");
convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$)", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$)C\x0E\x7C\x84", "%", "ISO-2022-KR", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Empty string OK
ASCII support OK
KS X 1001 support OK
Escapes behave as expected
Done!