File: utf7imap_encoding.phpt

package info (click to toggle)
php8.4 8.4.11-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 208,108 kB
  • sloc: ansic: 1,060,628; php: 35,345; sh: 11,866; cpp: 7,201; pascal: 4,913; javascript: 3,091; asm: 2,810; yacc: 2,411; makefile: 689; xml: 446; python: 301; awk: 148
file content (254 lines) | stat: -rw-r--r-- 11,957 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
--TEST--
Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion
--EXTENSIONS--
mbstring
--FILE--
<?php
// Helpers such as testValidString(), identifyInvalidString() and convertInvalidString() are
// not defined in this file; presumably they come from this include.
include 'encoding_tests.inc';
// Use '%' (0x25) as the marker which is substituted for unconvertible input
mb_substitute_character(ord('%'));

/**
 * Re-encode a UTF-8 string as UTF-16BE.
 *
 * The tests below feed the result to mBase64() to build the body of a '&...-' section.
 */
function utf16BE($utf8) {
	$sourceEncoding = 'UTF-8';
	$targetEncoding = 'UTF-16BE';
	return mb_convert_encoding($utf8, $targetEncoding, $sourceEncoding);
}

/**
 * Encode bytes as "modified Base64": ordinary Base64 in which every '/' becomes ','
 * and all '=' padding is dropped.
 */
function mBase64($str) {
	$plainBase64 = base64_encode($str);
	// Replacements are applied in array order: first '/' => ',', then '=' => ''
	return str_replace(['/', '='], [',', ''], $plainBase64);
}

/**
 * Shorthand for testValidString() with the two encoding arguments fixed to
 * 'UTF7-IMAP' and 'UTF-8', in that order.
 *
 * $bothWays is forwarded unchanged. Judging by its name it asks the helper to check
 * the reverse conversion as well -- confirm against encoding_tests.inc.
 */
function testValid($from, $to, $bothWays = true) {
	$fromEncoding = 'UTF7-IMAP';
	$toEncoding = 'UTF-8';
	testValidString($from, $to, $fromEncoding, $toEncoding, $bothWays);
}
/**
 * Shorthand for testInvalidString() with the two encoding arguments fixed to
 * 'UTF7-IMAP' and 'UTF-8', in that order.
 */
function testInvalid($from, $to) {
	$fromEncoding = 'UTF7-IMAP';
	$toEncoding = 'UTF-8';
	testInvalidString($from, $to, $fromEncoding, $toEncoding);
}

/* The empty string is legal mUTF-7 and is expected to convert to the empty string */
$emptyString = '';
testValid($emptyString, $emptyString);
print "Identification passes on empty string... good start!\n";

/* According to the RFC a 0x00 byte has to go into a Base64 section, so UTF-8 "\x00"
 * is expected to come out as "&AAA-" */
$nulByte = chr(0);
testValidString($nulByte, "&AAA-", 'UTF-8', 'UTF7-IMAP');
print "Null byte converted correctly\n";

/* Every printable ASCII character (0x20..0x7E) other than '&' stands for itself */
foreach (range(0x20, 0x7E) as $codepoint) {
	$char = chr($codepoint);
	if ($char === '&')
		continue;
	testValid($char, $char);
}
echo "Testing all valid single-character ASCII strings... check!\n";

/* Single bytes outside the printable ASCII range: control characters 0x00..0x1F first,
 * then 0x7F..0xFF. For each one the expected conversion result is the '%' marker */
$illegalBytes = array_merge(range(0x00, 0x1F), range(0x7F, 0xFF));
foreach ($illegalBytes as $byte)
	testInvalid(chr($byte), "%");
echo "Non-ASCII characters convert to illegal char marker... yes!\n";

/* '&' is allowed to appear Base64-encoded. false is passed for $bothWays here */
$encodedAmpersand = "&" . mBase64(utf16BE("&")) . "-";
testValid($encodedAmpersand, "&", false);
echo "& can be Base64-encoded... yes!\n";

/* A '&' section which is never closed by '-' must be identified as invalid: a bare '&',
 * a '&' at the end of ASCII text, and a Base64 body with no terminator */
$unterminatedStrings = [
	"&",
	"abc&",
	"&" . mBase64(utf16BE("ハムサンドイッチ")),
];
foreach ($unterminatedStrings as $unterminated)
	identifyInvalidString($unterminated, 'UTF7-IMAP');
echo "Testing unterminated & sections... yep!\n";

/* Identification of null shifts (& immediately after -)
 *
 * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting
 * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP
 * strings naively without the concatenated strings being treated as 'invalid'
 *
 * If ever we want to enforce this part of the spec, uncomment the following test */
/*
identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP');
echo "Testing consecutive & sections which should have been merged... yep!\n";
*/

/* Printable ASCII characters (all except '&') which have been Base64-encoded.
 * These are errors, so the expected conversion result is the substitution marker */
$printableExceptAmpersand = array_diff(range(0x20, 0x7E), [0x26]); // 0x26 is '&'
foreach ($printableExceptAmpersand as $codepoint)
	testInvalid("&" . mBase64(utf16BE(chr($codepoint))) . "-", "%");
echo "Testing ASCII characters which are Base64-encoded... great!\n";

/* '&-' is the escape for a literal '&'; mUTF-7 input => expected UTF-8 */
$ampersandCases = [
	"&-"    => "&",
	"abc&-" => "abc&",
	"&-.&-" => "&.&",
];
foreach ($ampersandCases as $encoded => $decoded)
	testValid($encoded, $decoded);
echo "Testing valid strings which use '&-' for '&'... good!\n";

/* '&' sections which contain a byte that is not part of the Base64 alphabet */

/* The test data is 6 characters long: 6 UTF-16 code units are 12 bytes, which fit
 * into exactly 16 Base64 characters with no padding */
$testString = mBase64(utf16BE("我是打酱油的"));
if (strlen($testString) != 16)
	die("Erk!!");

/* Bytes which are NOT probed: the modified Base64 alphabet (digits, letters, '+', ','),
 * plus '-', which would be read as the end of the Base64 section */
$skippedBytes = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+,-';

/* Each probe is the first 11 Base64 characters followed by the byte under test.
 * NOTE(review): 11 + 1 Base64 characters are 72 bits, i.e. 9 bytes, which is an odd
 * byte count even if the probed byte were accepted -- confirm that this is intended */
$sectionStart = "&" . substr($testString, 0, 11);
foreach (range(0, 255) as $byte) {
	$probe = chr($byte);
	if (strpos($skippedBytes, $probe) !== false)
		continue;
	identifyInvalidString($sectionStart . $probe . "-", 'UTF7-IMAP');
}
echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";

/* The ways in which the UTF-16BE text inside a Base64 section can be broken,
 * one numbered case after another */

/* 1. The second half of a surrogate pair comes first... */
$surrogatePair = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
if (strlen($surrogatePair) != 4)
	die("Ouch!");
[$firstHalf, $secondHalf] = str_split($surrogatePair, 2);
$testString = $secondHalf . $firstHalf;
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
/* ...or shows up all by itself */
$testString2 = $secondHalf;
identifyInvalidString("&" . mBase64($testString2) . "-", 'UTF7-IMAP');

/* The same two broken sequences must be caught when valid text comes before them.
 * $singleChar is one UTF-16 code unit (U+0001), $doubleChar is a full surrogate pair */
$singleChar = "\x00\x01";
$doubleChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
if (strlen($doubleChar) != 4)
	die("That was supposed to be a surrogate pair");

$prefixGroups = [
	[$singleChar, $singleChar . $singleChar, $singleChar . $singleChar . $singleChar],
	[$doubleChar, $singleChar . $doubleChar, $singleChar . $singleChar . $doubleChar],
];
foreach ($prefixGroups as $prefixes) {
	foreach ([$testString, $testString2] as $brokenTail) {
		foreach ($prefixes as $validPrefix)
			identifyInvalidString("&" . mBase64($validPrefix . $brokenTail) . "-", 'UTF7-IMAP');
	}
}

/* 2. The first half of a surrogate pair is followed by something which is not a valid
 *    second half (here: the code unit for 'a') */
$firstHalf = substr(mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE'), 0, 2);
$testString = $firstHalf . "\x00a";

/* This must be detected at the very start of the section and also after valid text */
$validPrefixes = ['', $singleChar, $singleChar . $singleChar, $doubleChar];
foreach ($validPrefixes as $validPrefix)
	identifyInvalidString("&" . mBase64($validPrefix . $testString) . "-", 'UTF7-IMAP');

/* 3. The first half of a surrogate pair could come at the end of the string,
 *
 * $testString is a surrogate pair in UTF-16BE, so substr($testString, 0, 2) is its first
 * half. $singleChar ("\x00\x01", assigned in case 1 above) is U+0001, which is why the
 * expected UTF-8 output contains "\x01" once per $singleChar, followed by the '%' marker */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
testInvalid("&" . mBase64(substr($testString, 0, 2)) . "-", "%");
testInvalid("&" . mBase64($singleChar . substr($testString, 0, 2)) . "-", "\x01%");
testInvalid("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)) . "-", "\x01\x01%");
/* ...and the string could even be improperly terminated...
 * (no closing '-'; two markers are expected in the first two cases) */
testInvalid("&" . mBase64(substr($testString, 0, 2)), "%%");
testInvalid("&" . mBase64($singleChar . substr($testString, 0, 2)), "\x01%%");
/* NOTE: We currently don't check for trailing first half of surrogate pair when the string
 * abruptly ends after a group of 3 Base64-encoded codepoints... that's why we only emit one
 * error marker here for the incorrect termination of Base64 section and no error marker
 * for the trailing first half of surrogate pair
 * (3 UTF-16 code units are 6 bytes, which is exactly 8 Base64 characters) */
testInvalid("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)), "\x01\x01%");

/* 4. The section could decode to an odd number of bytes: drop the last byte of
 *    otherwise valid UTF-16BE text */
$testString = substr(utf16BE("ドーナツ"), 0, -1);
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* Bonus case: the Base64 itself might not decode to a whole number of bytes.
 * Put differently, the unused trailing bits might not be zero, although the RFC
 * requires zero padding */
$testString = utf16BE("☺⛑");
strlen($testString) === 4 or die("No good");
$encoded = mBase64($testString);
strlen($encoded) === 6 or die("Don't like that");
/* Swap the final Base64 character for ',', which stands for 63: a value whose lowest
 * bit is 1, so the padding bits are no longer all zero */
$badPadding = substr($encoded, 0, -1) . ",";
identifyInvalidString("&" . $badPadding . "-", 'UTF7-IMAP');

echo "Identification fails when UTF-16 text is invalid... no sweat!\n";

/* OK, let's try valid Base64-encoded text now
 *
 * The "2" / "4" labels give the UTF-16BE byte width of each character which is placed in
 * the Base64 section: 2 for a single code unit, 4 for a surrogate pair */

/* 2-byte char */
testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
/* 2 + 2 */
testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
/* 2 + 2 + 2
 *
 * Fullwidth digits U+FF11..U+FF13, written as escapes so that they cannot be confused
 * with (or normalized into) ASCII "123". Plain ASCII digits must not be used here:
 * the loop over 0x20..0x7E earlier in this file asserts that Base64-encoded printable
 * ASCII is an error, and with $bothWays left at its default of true the reverse
 * conversion of ASCII text would presumably not reproduce a Base64 section */
$fullwidthDigits = "\u{FF11}\u{FF12}\u{FF13}";
testValid("&" . mBase64(utf16BE($fullwidthDigits)) . "-", $fullwidthDigits);
/* 2 + 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
/* 4
 * $longChar1 and $longChar2 are the same codepoint (U+10401), as UTF-16BE and as UTF-8 */
$longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
$longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');
testValid("&" . mBase64($longChar1) . "-", $longChar2);
/* 2 + 4 */
testValid("&" . mBase64(utf16BE("饼") . $longChar1) . "-", "饼" . $longChar2);
/* 4 + 2 */
testValid("&" . mBase64($longChar1 . utf16BE("饼")) . "-", $longChar2 . "饼");
/* 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("饼")) . "-", "☺" . $longChar2 . "饼");
/* 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
/* 2 + 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
/* 2 + 2 + 4 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
/* 2 + 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);

/* Multiple sections of valid ASCII _and_ Base64-encoded text
 * (literal ASCII "123" outside the section, fullwidth digits inside it) */
testValid("123&" . mBase64(utf16BE($fullwidthDigits)) . "-abc&" . mBase64(utf16BE("☺")) . "-.", "123" . $fullwidthDigits . "abc☺.");

/* If a & character appears right after a non-ASCII character, we must first close the Base64
 * section and then emit &-
 * These go from UTF-8 to UTF7-IMAP, with false passed for $bothWays */
testValidString("☺&", "&Jjo-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西瓜&", "&iX903A-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西红柿&", "&iX9+omf,-&-", "UTF-8", "UTF7-IMAP", false);

echo "Identification and conversion of valid text is working... perfect!\n";

// A codepoint above 0x10FFFF (here 0x200000, given as UCS-4BE) is illegal
$tooLargeCodepoint = "\x00\x20\x00\x00";
convertInvalidString($tooLargeCodepoint, "%", "UCS-4BE", "UTF7-IMAP");

// Switch to "long" illegal character markers and convert invalid mUTF-7 to UTF-8
mb_substitute_character("long");
$longMarkerCases = [
	["\x10", "%"],
	["\x80", "%"],
	// The & starts a Base-64 coded section, which is OK... but there's no data in it
	["abc&", "abc%"],
	// The first bad byte in a Base-64 coded section drops us back into the default mode,
	// so the characters after it are literal
	["&**-", "%*-"],
	// Base64 with an extra trailing byte which is not needed
	['&RR8I', "\xE4\x94\x9F%"],
	['&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%"],
];
foreach ($longMarkerCases as [$input, $expected])
	convertInvalidString($input, $expected, 'UTF7-IMAP', 'UTF-8');

// A Base64 section which consists of a single 'A' (nothing but zero bits) is useless;
// the expected output is "%" encoded as UTF-32BE
convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');

print "Done!\n";
?>
--EXPECT--
Identification passes on empty string... good start!
Null byte converted correctly
Testing all valid single-character ASCII strings... check!
Non-ASCII characters convert to illegal char marker... yes!
& can be Base64-encoded... yes!
Testing unterminated & sections... yep!
Testing ASCII characters which are Base64-encoded... great!
Testing valid strings which use '&-' for '&'... good!
Identification fails when Base64 sections contain non-Base64 bytes... right!
Identification fails when UTF-16 text is invalid... no sweat!
Identification and conversion of valid text is working... perfect!
Done!