1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
|
--TEST--
mb_substr_count()
--EXTENSIONS--
mbstring
--INI--
output_handler=
--FILE--
<?php
// Test script for mb_substr_count(). Everything printed here is compared
// against the --EXPECT-- section of this file, so the text of every
// print/echo/var_dump must stay exactly as it is.

// Calls that do not pass an explicit encoding argument use this one.
mb_internal_encoding("EUC-JP");

print "== Empty needle should raise an error ==\n";

// Empty haystack, empty needle
try {
    var_dump(mb_substr_count("", ""));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}

// Non-empty multibyte haystack, empty needle.
// The haystack is written with byte escapes (the same two-byte sequence that
// is used as a needle further down) rather than as raw non-ASCII bytes, so the
// test does not depend on how the source file itself is stored or transcoded.
try {
    var_dump(mb_substr_count("\xA4\xA2", ""));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}

try {
    // Although the needle below contains 3 bytes, it decodes to zero Unicode codepoints
    // So the needle is actually 'empty', although it doesn't appear so
    var_dump(mb_substr_count("abcdef", "\x1B(B", "ISO-2022-JP"));
} catch (\ValueError $e) {
    echo $e->getMessage() . \PHP_EOL;
}

print "== Return value for empty haystack should always be zero ==\n";
var_dump(mb_substr_count("", "\xA4\xA2"));
var_dump(mb_substr_count("", chr(0)));

print "== Try searching using various encodings ==\n";

// ASCII-only haystack: "bca" occurs once in each of the 100 repetitions
$a = str_repeat("abcacba", 100);
var_dump(mb_substr_count($a, "bca"));

// Same pattern as above, but built from two-byte sequences:
// \xA4\xA2, \xA4\xA4 and \xA4\xA6 take the places of 'a', 'b' and 'c'
$a = str_repeat("\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA2\xA4\xA6\xA4\xA4\xA4\xA2", 100);
$b = "\xA4\xA4\xA4\xA6\xA4\xA2";
var_dump(mb_substr_count($a, $b));

// Convert both haystack and needle to other encodings and search again,
// this time passing the encoding explicitly
$to_enc = "UTF-8";
var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
    mb_convert_encoding($b, $to_enc), $to_enc));
$to_enc = "Shift_JIS";
var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
    mb_convert_encoding($b, $to_enc), $to_enc));

// Here "bca" occurs twice in every repetition of the 10-character unit
$a = str_repeat("abcacbabca", 100);
var_dump(mb_substr_count($a, "bca"));

print "== Regression tests ==\n";

// The old implementation had a bug; it could only recognize a maximum of one
// match for each byte that it fed into the decoder, even if feeding in that
// byte caused two codepoints to be emitted (because the decoder was holding
// cached data), and both of those codepoints matched a 1-codepoint needle
// (For this example, two error markers are emitted for the final byte 0xFF)
echo mb_substr_count("\xef\xff", "\xf8", "UTF-8"), "\n";

// Another thing about the old implementation: if a final codepoint was emitted
// by a decoder flush function, and that codepoint finished a match with the
// needle, that match would be disregarded and not counted in the returned value
// (In practice, the only thing emitted from decoder flush functions is an error
// marker, if the string ended in an illegal state)
echo mb_substr_count("+", "+", "UTF7-IMAP"), "\n";
?>
--EXPECT--
== Empty needle should raise an error ==
mb_substr_count(): Argument #2 ($needle) must not be empty
mb_substr_count(): Argument #2 ($needle) must not be empty
mb_substr_count(): Argument #2 ($needle) must not be empty
== Return value for empty haystack should always be zero ==
int(0)
int(0)
== Try searching using various encodings ==
int(100)
int(100)
int(100)
int(100)
int(200)
== Regression tests ==
2
1
|