1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
|
# Tools to aid testing across platforms with different character sets.
$::IS_ASCII = ord 'A' == 65;
$::IS_EBCDIC = ord 'A' == 193;
# The following functions allow tests to work on both EBCDIC and ASCII-ish
# platforms. They convert string scalars between the native character set and
# the set of 256 characters which is usually called Latin1. However, they
# will work properly with any character input, not just Latin1.
*native_to_uni = ($::IS_ASCII)
? sub { return shift }
: sub {
my $string = shift;
my $output = "";
for my $i (0 .. length($string) - 1) {
$output .= chr(utf8::native_to_unicode(ord(substr($string, $i, 1))));
}
# Preserve utf8ness of input onto the output, even if it didn't need to be
# utf8
utf8::upgrade($output) if utf8::is_utf8($string);
return $output;
};
*uni_to_native = ($::IS_ASCII)
? sub { return shift }
: sub {
my $string = shift;
my $output = "";
for my $i (0 .. length($string) - 1) {
$output .= chr(utf8::unicode_to_native(ord(substr($string, $i, 1))));
}
# Preserve utf8ness of input onto the output, even if it didn't need to be
# utf8
utf8::upgrade($output) if utf8::is_utf8($string);
return $output;
};
my @utf8_skip;
if ($::IS_EBCDIC) {
@utf8_skip = (
# This translates a utf-8-encoded byte into how many bytes the full utf8
# character occupies.
# 0 1 2 3 4 5 6 7 8 9 A B C D E F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B
-1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F
);
}
*byte_utf8a_to_utf8n = ($::IS_ASCII)
? sub { return shift }
: sub {
# Convert a UTF-8 byte sequence into the platform's native UTF-8
# equivalent, currently only UTF-8 and UTF-EBCDIC.
my $string = shift;
die "Input to byte_utf8a-to_utf8n() must not be flagged UTF-8"
if utf8::is_utf8($string);
die "Expecting ASCII or EBCDIC" unless $::IS_EBCDIC;
my $length = length($string);
#diag($string);
#diag($length);
my $out = "";
for ($i = 0; $i < $length; $i++) {
my $byte = ord substr($string, $i, 1);
my $byte_count = $utf8_skip[$byte];
#diag($byte);
#diag($byte_count);
die "Illegal start byte" if $byte_count < 0;
if ($i + $byte_count > $length) {
die "Attempt to read " . $i + $byte_count - $length . " beyond end-of-string";
}
# Just translate UTF-8 invariants directly.
if ($byte_count == 1) {
$out .= chr utf8::unicode_to_native($byte);
next;
}
# Otherwise calculate the code point ordinal represented by the
# sequence beginning with this byte, using the algorithm adapted from
# utf8.c. We absorb each byte in the sequence as we go along
my $ord = $byte & (0x1F >> ($byte_count - 2));
my $bytes_remaining = $byte_count - 1;
while ($bytes_remaining > 0) {
$byte = ord substr($string, ++$i, 1);
unless (($byte & 0xC0) == 0x80) {
die sprintf "byte '%X' is not a valid continuation", $byte;
}
$ord = $ord << 6 | ($byte & 0x3f);
$bytes_remaining--;
}
#diag($byte);
#diag($ord);
my $expected_bytes = $ord < 0x80
? 1
: $ord < 0x800
? 2
: $ord < 0x10000
? 3
: $ord < 0x200000
? 4
: $ord < 0x4000000
? 5
: $ord < 0x80000000
? 6
: 7;
#: (uv) < UTF8_QUAD_MAX ? 7 : 13 )
# Make sure is not an overlong sequence
if ($byte_count != $expected_bytes) {
die sprintf "character U+%X should occupy %d bytes, not %d",
$ord, $expected_bytes, $byte_count;
}
# Now that we have found the code point the original UTF-8 meant, we
# use the native chr function to get its native string equivalent.
$out .= chr utf8::unicode_to_native($ord);
}
utf8::encode($out); # Turn off utf8 flag.
#diag($out);
return $out;
};
my @i8_to_native = ( # Only code page 1047 so far.
# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
0x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
0x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
0xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
0x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
0x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
0x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
0x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
0x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
0x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
0xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
);
my @native_to_i8;
for (my $i = 0; $i < 256; $i++) {
$native_to_i8[$i8_to_native[$i]] = $i;
}
# Use these to convert to/from UTF-8 bytes. I8 is the encoding that
# corresponds to UTF-8 with start bytes, continuation bytes, and invariant
# bytes. UTF-EBCDIC is derived from this by a mapping which causes things
# like the start byte C5 to map to something else, as C5 is actually an 'E' in
# EBCDIC so can't be a real start byte, as it must be an invariant; and it
# maps 0x45 (an ASCII 'E') to C5.
*I8_to_native = ($::IS_ASCII)
? sub { return shift }
: sub { return join "", map { chr $i8_to_native[ord $_] }
split "", shift };
*native_to_I8 = ($::IS_ASCII)
? sub { return shift }
: sub { return join "", map { chr $native_to_i8[ord $_] }
split "", shift };
1
|