File: mb_substr_count.phpt

package info (click to toggle)

php8.4 8.4.11-1

links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 208,108 kB
sloc: ansic: 1,060,628; php: 35,345; sh: 11,866; cpp: 7,201; pascal: 4,913; javascript: 3,091; asm: 2,810; yacc: 2,411; makefile: 689; xml: 446; python: 301; awk: 148

file content (86 lines) | stat: -rw-r--r-- 3,035 bytes

--TEST--
mb_substr_count()
--EXTENSIONS--
mbstring
--INI--
output_handler=
--FILE--
<?php
    mb_internal_encoding("EUC-JP");

    print "== Empty needle should raise an error ==\n";
    try {
        var_dump(mb_substr_count("", ""));
    } catch (\ValueError $e) {
        echo $e->getMessage() . \PHP_EOL;
    }
    try {
        var_dump(mb_substr_count("��", ""));
    } catch (\ValueError $e) {
        echo $e->getMessage() . \PHP_EOL;
    }
    try {
        // Although the needle below contains 3 bytes, it decodes to zero Unicode codepoints
        // So the needle is actually 'empty', although it doesn't appear so
        var_dump(mb_substr_count("abcdef", "\x1B(B", "ISO-2022-JP"));
    } catch (\ValueError $e) {
        echo $e->getMessage() . \PHP_EOL;
    }

    print "== Return value for empty haystack should always be zero ==\n";
    var_dump(mb_substr_count("", "\xA4\xA2"));
    var_dump(mb_substr_count("", chr(0)));

    print "== Try searching using various encodings ==\n";
    $a = str_repeat("abcacba", 100);
    var_dump(mb_substr_count($a, "bca"));

    $a = str_repeat("\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA2\xA4\xA6\xA4\xA4\xA4\xA2", 100);
    $b = "\xA4\xA4\xA4\xA6\xA4\xA2";
    var_dump(mb_substr_count($a, $b));

    $to_enc = "UTF-8";
    var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
                              mb_convert_encoding($b, $to_enc), $to_enc));

    $to_enc = "Shift_JIS";
    var_dump(mb_substr_count(mb_convert_encoding($a, $to_enc),
                              mb_convert_encoding($b, $to_enc), $to_enc));

    $a = str_repeat("abcacbabca", 100);
    var_dump(mb_substr_count($a, "bca"));

    print "== Regression tests ==\n";

    // The old implementation had a bug; it could only recognize a maximum of one
    // match for each byte that it fed into the decoder, even if feeding in that
    // byte caused two codepoints to be emitted (because the decoder was holding
    // cached data), and both of those codepoints matched a 1-codepoint needle
    // (For this example, two error markers are emitted for the final byte 0xFF)
    echo mb_substr_count("\xef\xff", "\xf8", "UTF-8"), "\n";

    // Another thing about the old implementation: if a final codepoint was emitted
    // by a decoder flush function, and that codepoint finished a match with the
    // needle, that match would be disregarded and not counted in the returned value
    // (In practice, the only thing emitted from decoder flush functions is an error
    // marker, if the string ended in an illegal state)
    echo mb_substr_count("+", "+", "UTF7-IMAP"), "\n";

?>
--EXPECT--
== Empty needle should raise an error ==
mb_substr_count(): Argument #2 ($needle) must not be empty
mb_substr_count(): Argument #2 ($needle) must not be empty
mb_substr_count(): Argument #2 ($needle) must not be empty
== Return value for empty haystack should always be zero ==
int(0)
int(0)
== Try searching using various encodings ==
int(100)
int(100)
int(100)
int(100)
int(200)
== Regression tests ==
2
1