1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
|
use warnings;
use strict;
BEGIN {
    # Run from inside the 't' directory when there is one, so that the
    # relative paths below resolve from there.
    if (-d 't') {
        chdir 't';
    }

    # Search only '../lib' for modules, then load the helper file
    # './test.pl' (presumably the source of plan(), is(), like(), unlike()
    # and display() used throughout this file -- none is defined here).
    @INC = ('../lib');
    require './test.pl';
}

# Number of tests this file runs, broken down by section:
#   13312  uc(), lc(), ucfirst() and lcfirst() checks
#    6144  \w, \W, \s and \S checks
#    1280  \b and \B checks
plan(13312 + 6144 + 1280);
# This section tests the upper/lower/title case mappings of every character
# 0-255.
#
# The expected mappings are built by hand here, without calling any of the
# functions under test.  Each table is indexed by native code point and holds
# the string that code point should map to.  The posix_* tables change only
# ASCII letters; the latin1_* tables cover the whole Latin-1 range.

# Start every table as the identity mapping.
my @posix_to_upper  = map { chr($_) } 0 .. 255;
my @posix_to_lower  = @posix_to_upper;
my @posix_to_title  = @posix_to_upper;
my @latin1_to_upper = @posix_to_upper;
my @latin1_to_lower = @posix_to_upper;
my @latin1_to_title = @posix_to_upper;

# Upper case letters whose lower case form is the standard 32 code points
# higher.  The ranges are given in Unicode and translated to native.
for my $unicode_upper (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
    my $native_upper = utf8::unicode_to_native($unicode_upper);
    my $lower_char   = chr(utf8::unicode_to_native($unicode_upper + 32));

    $latin1_to_lower[$native_upper] = $lower_char;

    # Only ASCII letters change case under posix semantics.
    $posix_to_lower[$native_upper] = $lower_char if $unicode_upper <= 127;
}

# The same in the other direction, for both upper case and title case.
for my $unicode_lower (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
    my $native_lower = utf8::unicode_to_native($unicode_lower);
    my $upper_char   = chr(utf8::unicode_to_native($unicode_lower - 32));

    $latin1_to_upper[$native_lower] = $upper_char;
    $latin1_to_title[$native_lower] = $upper_char;

    if ($unicode_lower <= 127) {
        $posix_to_upper[$native_lower] = $upper_char;
        $posix_to_title[$native_lower] = $upper_char;
    }
}

# The abnormal cases: Latin-1 characters whose upper and title case forms
# are not 32 below them.  Each entry is
# [ Unicode code point, upper case, title case ].
for my $special (
    [ 0xB5, chr(0x39C), chr(0x39C) ],    # MICRO SIGN maps above Latin-1
    [ 0xDF, 'SS',       'Ss'       ],    # Sharp s maps to two characters
    [ 0xFF, chr(0x178), chr(0x178) ],    # y with diaeresis maps above Latin-1
) {
    my ($unicode_cp, $upper, $title) = @$special;
    my $native_cp = utf8::unicode_to_native($unicode_cp);

    $latin1_to_upper[$native_cp] = $upper;
    $latin1_to_title[$native_cp] = $title;
}
my $repeat = 25;    # Length to make strings.

# Context strings to place around the character under test.  Each hash holds
# one character repeated $repeat times, in its upper case ('uc') and lower
# case ('lc') forms, drawn from a different range.
my %posix = (
    'uc' => 'A' x $repeat,
    'lc' => 'a' x $repeat,
);

my %cyrillic = (
    'uc' => chr(0x42F) x $repeat,
    'lc' => chr(0x44F) x $repeat,
);

my %latin1 = (
    'uc' => chr(utf8::unicode_to_native(0xD8)) x $repeat,
    'lc' => chr(utf8::unicode_to_native(0xF8)) x $repeat,
);

# No context at all.
my %empty = (
    'uc' => "",
    'lc' => "",
);
# Surround each character under test with every combination of leading and
# trailing context string (no context included), and check the case-changing
# functions on the result.
for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
    for my $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {

        # Both casings of the surrounding context.
        my ($pre_lc,  $pre_uc)  = @{$prefix}{qw(lc uc)};
        my ($post_lc, $post_uc) = @{$suffix}{qw(lc uc)};

        # ucfirst() and lcfirst() reach the character under test only when
        # nothing precedes it.
        my $at_start = $pre_uc eq "";

        # Cyrillic and Latin-1 context contributes nothing to the checks
        # made without the feature, so those are skipped for it.
        my $skip_legacy = grep { $_ == \%cyrillic || $_ == \%latin1 }
                               ($suffix, $prefix);

        for my $i (0 .. 255) {    # Each possible posix or latin1 character
            my $cp       = sprintf "U+%04X", $i;
            my $char     = chr($i);
            my $to_upper = $pre_lc . $char . $post_lc;
            my $to_lower = $pre_uc . $char . $post_uc;

            # First with latin1 (Unicode) semantics.  The pragma is lexical,
            # so it covers exactly the calls inside this block.
            {
                use feature "unicode_strings";
                my $phrase = 'in uni8bit';

                my $expected_upper
                    = $pre_uc . $latin1_to_upper[$i] . $post_uc;
                my $expected_lower
                    = $pre_lc . $latin1_to_lower[$i] . $post_lc;

                is (uc($to_upper), $expected_upper,
                    display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
                is (lc($to_lower), $expected_lower,
                    display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));

                if ($at_start) {
                    my $expected_title = $latin1_to_title[$i] . $post_lc;
                    is (ucfirst($to_upper), $expected_title,
                        display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));

                    my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
                    is (lcfirst($to_lower), $expected_lcfirst,
                        display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
                }
            }

            next if $skip_legacy;

            # Then with posix semantics, where only ASCII letters change.
            {
                no feature "unicode_strings";
                my $phrase = 'no uni8bit';

                my $expected_upper
                    = $pre_uc . $posix_to_upper[$i] . $post_uc;
                my $expected_lower
                    = $pre_lc . $posix_to_lower[$i] . $post_lc;

                is (uc($to_upper), $expected_upper,
                    display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
                is (lc($to_lower), $expected_lower,
                    display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));

                if ($at_start) {
                    my $expected_title = $posix_to_title[$i] . $post_lc;
                    is (ucfirst($to_upper), $expected_title,
                        display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));

                    my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
                    is (lcfirst($to_lower), $expected_lcfirst,
                        display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
                }
            }
        }
    }
}
# This section tests that \w, \s, and \b (and their complements) work
# correctly.  These are the only character classes affected by this pragma:
# Latin-1 characters above the ASCII range are in \w and \s iff the pragma
# is on.
#
# The expected full Latin-1 membership is built here without using anything
# under test.  All of it was determined manually by looking in the manual.
# Code points are listed in Unicode and stored under their native index.

# Boolean: is $w[$i] a \w character?
my @w = (0) x 256;
$w[ utf8::unicode_to_native($_) ] = 1
    for 0x30 .. 0x39,    # 0-9
        0x41 .. 0x5a,    # A-Z
        0x61 .. 0x7a,    # a-z
        0x5F,            # _
        0xAA,            # FEMININE ORDINAL INDICATOR
        0xB5,            # MICRO SIGN
        0xBA,            # MASCULINE ORDINAL INDICATOR
        0xC0 .. 0xD6,    # various
        0xD8 .. 0xF6,    # various
        0xF8 .. 0xFF;    # various

# Boolean: is $s[$i] a \s character?
my @s = (0) x 256;
$s[ utf8::unicode_to_native($_) ] = 1
    for 0x09,    # Tab
        0x0A,    # LF
        0x0B,    # VT
        0x0C,    # FF
        0x0D,    # CR
        0x20,    # SPACE
        0x85,    # NEL
        0xA0;    # NO BREAK SPACE
# Match every character 0-255 against each class and its complement, both
# with and without the feature in effect.
for my $i (0 .. 255) {
    my $char  = chr($i);
    my $hex_i = sprintf "%02X", $i;

    # With the legacy semantics, nothing above 128 should be in \w or \s.
    my $above_ascii = utf8::native_to_unicode($i) >= 128;

    # Each entry is [ lower case class letter, membership table ].
    for my $class ([ 's', \@s ], [ 'w', \@w ]) {
        my ($basic_name, $members) = @$class;

        my $in_class        = $members->[$i];
        my $in_class_legacy = $above_ascii ? 0 : $in_class;

        # Test \w \W \s \S
        for my $complement (0, 1) {
            my $name = '\\' . ($complement ? uc($basic_name) : $basic_name);

            # Three forms: bare, inside [...], and inside [^...].  Each
            # entry is [ bracketed?, negated? ]; negation needs brackets.
            for my $form ([ 0, 0 ], [ 1, 0 ], [ 1, 1 ]) {
                my ($charclass, $complement_class) = @$form;

                # The expectation starts as plain membership and is
                # inverted once for each complement in play.
                my $flip = ($complement xor $complement_class);

                my $inner = $complement_class ? "^$name" : $name;
                my $test  = $charclass ? "[$inner]" : $inner;
                $test = "^$test\$";

                # The pragma is lexical, so each qr// below is compiled
                # under the setting of the block that contains it.
                {
                    use feature 'unicode_strings';
                    my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
                    if ($in_class xor $flip) {
                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
                    }
                    else {
                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
                    }
                }

                {
                    no feature 'unicode_strings';
                    my $prefix = "no uni8bit; Verify chr(0x$hex_i)";
                    if ($in_class_legacy xor $flip) {
                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
                    }
                    else {
                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
                    }
                }
            }
        }
    }

    # Similarly for \b and \B.  The subject is 'a' followed by the character
    # under test, so there is a boundary between them exactly when that
    # character is not a \w one.
    my $string          = "a$char";
    my $boundary        = ! $w[$i];    # \b is complement of \w
    my $boundary_legacy = $above_ascii ? 1 : $boundary;

    for my $complement (0, 1) {
        my $name = '\\' . ($complement ? 'B' : 'b');
        my $test = "(^a$name\\x{$hex_i}\$)";

        {
            use feature 'unicode_strings';
            my $prefix = "in uni8bit; Verify $string";
            if ($boundary xor $complement) {
                like($string, qr/$test/, display("$prefix =~ qr/$test/"));
            }
            else {
                unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
            }
        }

        {
            no feature 'unicode_strings';
            my $prefix = "no uni8bit; Verify $string";
            if ($boundary_legacy xor $complement) {
                # NOTE(review): this assertion is made twice with identical
                # arguments, which looks like a copy-paste.  The count given
                # to plan() at the top of the file includes both, so
                # dropping one means lowering that count by 256.
                like($string, qr/$test/, display("$prefix =~ qr/$test/"));
                like($string, qr/$test/, display("$prefix =~ qr/$test/"));
            }
            else {
                unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
            }
        }
    }
}
|