1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
|
--TEST--
mb_str_split() tests for more text encodings
--EXTENSIONS--
mbstring
--FILE--
<?php
// UCS-2BE: ask for chunks of two characters each from an 8-byte input and print the
// first two chunks as hex.
$ucs2Chunks = mb_str_split("\x00\x01\x02\x03\x04\x05\x06\x07", 2, "UCS-2BE");
echo "[" . bin2hex($ucs2Chunks[0]) . ", " . bin2hex($ucs2Chunks[1]) . "]\n";
// HZ round trip: split the HZ-encoded text at every chunk length from 1 up to the number
// of characters in the text, print each chunk as hex, then check that concatenating the
// chunks and decoding them gives back the original UTF-8 string.
$utf8Text = "test カタカナ 汉字";
echo "== HZ ==\n";
$hzText = mb_convert_encoding($utf8Text, 'HZ', 'UTF-8');
$charCount = mb_strlen($utf8Text, 'UTF-8');
$chunkLen = 1;
while ($chunkLen <= $charCount) {
    $chunks = mb_str_split($hzText, $chunkLen, 'HZ');
    $hexChunks = array_map('bin2hex', $chunks);
    echo "[" . implode(', ', $hexChunks) . "]\n";
    // Each chunk has to be valid HZ on its own, so the rejoined chunks must still decode
    $roundTrip = mb_convert_encoding(implode($chunks), 'UTF-8', 'HZ');
    if ($roundTrip !== $utf8Text) {
        die("Expected " . $utf8Text . "; got " . $roundTrip);
    }
    $chunkLen++;
}
// BIG-5 round trip: same procedure as for the other encodings in this test; try every
// chunk length from 1 to the character count, dump the chunks as hex, and make sure the
// chunks still decode to the original text when joined back together.
$sourceText = "test カタカナ 漢字";
echo "== BIG-5 ==\n";
$big5Text = mb_convert_encoding($sourceText, 'BIG-5', 'UTF-8');
$maxChunkLen = mb_strlen($sourceText, 'UTF-8');
for ($perChunk = 1; $perChunk <= $maxChunkLen; $perChunk++) {
    $pieces = mb_str_split($big5Text, $perChunk, 'BIG-5');
    printf("[%s]\n", implode(', ', array_map('bin2hex', $pieces)));
    $rejoined = implode($pieces);
    $decoded = mb_convert_encoding($rejoined, 'UTF-8', 'BIG-5');
    if ($decoded !== $sourceText) {
        die("Expected " . $sourceText . "; got " . $decoded);
    }
}
// ISO-8859-1 round trip: split at every chunk length from 1 to the character count,
// print the chunks as hex, and abort if the rejoined chunks do not decode back to the
// original UTF-8 text.
$latinText = "test ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ";
echo "== ISO-8859-1 ==\n";
$latin1 = mb_convert_encoding($latinText, 'ISO-8859-1', 'UTF-8');
for ($size = 1, $limit = mb_strlen($latinText, 'UTF-8'); $size <= $limit; ++$size) {
    $segments = mb_str_split($latin1, $size, 'ISO-8859-1');
    echo "[" . implode(', ', array_map('bin2hex', $segments)) . "]\n";
    $restored = mb_convert_encoding(implode('', $segments), 'UTF-8', 'ISO-8859-1');
    if ($restored === $latinText) {
        continue;
    }
    die("Expected " . $latinText . "; got " . $restored);
}
echo "== Regression tests ==\n";
// Regression: the previous mb_str_split implementation read input bytes through char*
// rather than unsigned char*, so a byte with its MSB set was sign-extended into a
// negative int (0xCA turned into 0xFFFFFFCA, for instance)
$highBitChunks = mb_str_split("\xCA\xCA\xCA\xCA", 2, "JIS");
$highBitHex = array_map('bin2hex', $highBitChunks);
echo "[" . implode(', ', $highBitHex) . "]\n";
// Regression: when emitting the final (incomplete) chunk, the previous implementation
// only returned it if the very last input byte decoded to a codepoint
// That assumption does not always hold; the last byte may belong to a control sequence
// which changes the decoder state without producing any codepoint
$trailingChunks = mb_str_split("Z~", 2, "HZ");
printf("[%s]\n", implode(', ', array_map('bin2hex', $trailingChunks)));
// Regression: a very large chunk_len argument used to make the previous implementation
// run the PHP interpreter out of memory; the ValueError message is printed here if one
// is thrown
try {
    mb_str_split("abc", 1234567890, "UTF-8");
} catch (ValueError $tooLarge) {
    $message = $tooLarge->getMessage();
    echo $message . \PHP_EOL;
}
?>
--EXPECT--
[00010203, 04050607]
== HZ ==
[74, 65, 73, 74, 20, 7e7b252b7e7d, 7e7b253f7e7d, 7e7b252b7e7d, 7e7b254a7e7d, 20, 7e7b3a3a7e7d, 7e7b57567e7d]
[7465, 7374, 207e7b252b7e7d, 7e7b253f252b7e7d, 7e7b254a7e7d20, 7e7b3a3a57567e7d]
[746573, 74207e7b252b7e7d, 7e7b253f252b254a7e7d, 207e7b3a3a57567e7d]
[74657374, 207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
[7465737420, 7e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
[74657374207e7b252b7e7d, 7e7b253f252b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f7e7d, 7e7b252b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d, 207e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d207e7b3a3a7e7d, 7e7b57567e7d]
[74657374207e7b252b253f252b254a7e7d207e7b3a3a57567e7d]
== BIG-5 ==
[74, 65, 73, 74, 20, c743, c757, c743, c762, 20, ba7e, a672]
[7465, 7374, 20c743, c757c743, c76220, ba7ea672]
[746573, 7420c743, c757c743c762, 20ba7ea672]
[74657374, 20c743c757c743, c76220ba7ea672]
[7465737420, c743c757c743c76220, ba7ea672]
[7465737420c743, c757c743c76220ba7ea672]
[7465737420c743c757, c743c76220ba7ea672]
[7465737420c743c757c743, c76220ba7ea672]
[7465737420c743c757c743c762, 20ba7ea672]
[7465737420c743c757c743c76220, ba7ea672]
[7465737420c743c757c743c76220ba7e, a672]
[7465737420c743c757c743c76220ba7ea672]
== ISO-8859-1 ==
[74, 65, 73, 74, 20, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf]
[7465, 7374, 20c0, c1c2, c3c4, c5c6, c7c8, c9ca, cbcc, cdce, cf]
[746573, 7420c0, c1c2c3, c4c5c6, c7c8c9, cacbcc, cdcecf]
[74657374, 20c0c1c2, c3c4c5c6, c7c8c9ca, cbcccdce, cf]
[7465737420, c0c1c2c3c4, c5c6c7c8c9, cacbcccdce, cf]
[7465737420c0, c1c2c3c4c5c6, c7c8c9cacbcc, cdcecf]
[7465737420c0c1, c2c3c4c5c6c7c8, c9cacbcccdcecf]
[7465737420c0c1c2, c3c4c5c6c7c8c9ca, cbcccdcecf]
[7465737420c0c1c2c3, c4c5c6c7c8c9cacbcc, cdcecf]
[7465737420c0c1c2c3c4, c5c6c7c8c9cacbcccdce, cf]
[7465737420c0c1c2c3c4c5, c6c7c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6, c7c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7, c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8, c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9, cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9ca, cbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacb, cccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcc, cdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccd, cecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdce, cf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdcecf]
== Regression tests ==
[1b28494a4a1b2842, 1b28494a4a1b2842]
[5a]
mb_str_split(): Argument #2 ($length) is too large
|