1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
|
#!perl
BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
require './test.pl';
skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
}
use strict;
use warnings;
my @tests;
my $file="../lib/unicore/CaseFolding.txt";
my @folds;
use Unicode::UCD;
# Use the Unicode data file if we are on an ASCII platform (which its data is
# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
# available. This avoids being affected by potential bugs introduced by other
# layers of Perl
if (ord('A') == 65
&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
&& open my $fh, "<", $file)
{
@folds = <$fh>;
}
else {
my ($invlist_ref, $invmap_ref, undef, $default)
= Unicode::UCD::prop_invmap('Case_Folding');
for my $i (0 .. @$invlist_ref - 1 - 1) {
next if $invmap_ref->[$i] == $default;
my $adjust = -1;
for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
$adjust++;
# Single-code point maps go to a 'C' type
if (! ref $invmap_ref->[$i]) {
push @folds, sprintf("%04X; C; %04X\n",
$j,
$invmap_ref->[$i] + $adjust);
}
else { # Multi-code point maps go to 'F'. prop_invmap()
# guarantees that no adjustment is needed for these,
# as the range will contain just one element
push @folds, sprintf("%04X; F; %s\n",
$j,
join " ", map { sprintf "%04X", $_ }
@{$invmap_ref->[$i]});
}
}
}
}
for (@folds) {
chomp;
my ($line,$comment)= split/\s+#\s+/, $_;
$comment = "" unless defined $comment;
my ($cp,$type,@folded)=split/[\s;]+/,$line||'';
next unless $type and ($type eq 'F' or $type eq 'C');
my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
$_="\\x{$_}" for @folded;
my $cpv=hex("0x$cp");
my $chr="\\x{$cp}";
my @str;
foreach my $swap (0, 1) { # swap lhs and rhs, or not.
foreach my $charclass (0, 1) { # Put rhs in [...], or not
my $lhs;
my $rhs;
if ($swap) {
$lhs = join "", @folded;
$rhs = $chr;
$rhs = "[$rhs]" if $charclass;
} else {
#next if $charclass && @folded > 1;
$lhs = $chr;
$rhs = "";
foreach my $rhs_char (@folded) {
# The colon is an unrelated character to the rest of the
# class, and makes sure no optimization into an EXACTish
# node occurs.
$rhs .= '[:' if $charclass;
$rhs .= $rhs_char;
$rhs .= ']' if $charclass;
}
}
$lhs = "\"$lhs\"";
$rhs = "/^$rhs\$/iu";
# Try both Latin1 and Unicode for code points below 256
foreach my $upgrade ("", 'utf8::upgrade($c); ') {
if ($upgrade) { # No need to upgrade if already must be in
# utf8
next if $swap && $fold_above_latin1;
next if !$swap && $cpv > 255;
}
my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
#print __LINE__, ": $eval\n";
push @tests, qq[ok(eval '$eval', '$eval - $comment')];
if (! $swap && $charclass && @folded > 1)
{
$tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }"
}
}
}
}
}
# Now verify the case folding tables. First compute the mappings without
# resorting to the functions we're testing.
# Initialize the array so each $i maps to itself.
my @fold_ascii;
for my $i (0 .. 255) {
$fold_ascii[$i] = $i;
}
my @fold_latin1 = @fold_ascii;
# Override the uppercase elements to fold to their lower case equivalents,
# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
# so on. The same paradigm applies for most of the Latin1 range cased
# characters, but in posix anything outside ASCII maps to itself, as we've
# already set up.
for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
my $upper_ord = ord_latin1_to_native($i);
my $lower_ord = ord_latin1_to_native($i + 32);
$fold_latin1[$upper_ord] = $lower_ord;
next if $i > 127;
$fold_ascii[$upper_ord] = $lower_ord;
}
# Same for folding lower to the upper equivalents
for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
my $lower_ord = ord_latin1_to_native($i);
my $upper_ord = ord_latin1_to_native($i - 32);
$fold_latin1[$lower_ord] = $upper_ord;
next if $i > 127;
$fold_ascii[$lower_ord] = $upper_ord;
}
# Test every latin1 character for the correct values in both /u and /d
for my $i (0 .. 255) {
my $chr = sprintf "\\x%02X", $i;
my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
}
push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p'];
use charnames ":full";
my $e_grave = latin1_to_native("\xE8");
push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/'];
push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/'];
eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
or die $@;
__DATA__
|