File: utf8_error_handling.phpt

package info (click to toggle)
php8.4 8.4.11-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 208,108 kB
  • sloc: ansic: 1,060,628; php: 35,345; sh: 11,866; cpp: 7,201; pascal: 4,913; javascript: 3,091; asm: 2,810; yacc: 2,411; makefile: 689; xml: 446; python: 301; awk: 148
file content (56 lines) | stat: -rw-r--r-- 1,731 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
--TEST--
Confirm error handling for UTF-8 complies with WHATWG spec
--EXTENSIONS--
mbstring
--FILE--
<?php
/* The WHATWG specifies not just how web browsers should handle _valid_
 * UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
 * as how many error markers each invalid byte sequence should decode
 * to).
 * That specification is followed by the JavaScript Encoding API.
 *
 * The API documentation for mb_convert_encoding does not specify how
 * many error markers we will emit for each possible invalid byte
 * sequence, so we might as well comply with the WHATWG specification.
 *
 * Thanks to Martin Auswöger for pointing this out... and another big
 * thanks for providing test cases!
 *
 * Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
 */
// Emit '%' (U+0025) for every rejected byte sequence so that the expected
// outputs in the table below are easy to read.
mb_substitute_character(0x25);

// Input byte string => expected output of a UTF-8 -> UTF-8 conversion.
// NOTE: keys are raw byte strings; none of them is a decimal-integer string,
// so PHP keeps them as string keys. Keep it that way when adding cases.
$expectedByInput = [
  // Lone bytes that are rejected on their own: one marker each
  "\x80" => "%",
  "\xFF" => "%",

  // 2-byte forms: bad trailing byte, lowest valid, highest valid, bad trailing byte
  "\xC2\x7F" => "%\x7F",
  "\xC2\x80" => "\xC2\x80",
  "\xDF\xBF" => "\xDF\xBF",
  "\xDF\xC0" => "%%",

  // 3-byte forms: same pattern as above
  "\xE0\xA0\x7F" => "%\x7F",
  "\xE0\xA0\x80" => "\xE0\xA0\x80",
  "\xEF\xBF\xBF" => "\xEF\xBF\xBF",
  "\xEF\xBF\xC0" => "%%",

  // 4-byte forms: same pattern as above
  "\xF0\x90\x80\x7F" => "%\x7F",
  "\xF0\x90\x80\x80" => "\xF0\x90\x80\x80",
  "\xF4\x8F\xBF\xBF" => "\xF4\x8F\xBF\xBF",
  "\xF4\x8F\xBF\xC0" => "%%",

  // 5- and 6-byte forms: every single byte is expected to yield its own marker
  "\xFA\x80\x80\x80\x80" => "%%%%%",
  "\xFB\xBF\xBF\xBF\xBF" => "%%%%%",
  "\xFD\x80\x80\x80\x80\x80" => "%%%%%%",
  "\xFD\xBF\xBF\xBF\xBF\xBF" => "%%%%%%",
];

foreach ($expectedByInput as $input => $expected) {
  $actual = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
  if ($actual === $expected) {
    continue;
  }

  // Stop at the first mismatch; hex dumps keep the raw bytes legible.
  die(
    "Expected UTF-8 string " . bin2hex($input)
    . " to convert to UTF-8 string " . bin2hex($expected)
    . "; got " . bin2hex($actual)
  );
}

echo "All done!\n";

?>
--EXPECT--
All done!