File: 01_utf8.t

package info (click to toggle)
libcpanel-json-xs-perl 4.35-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 2,864 kB
  • sloc: perl: 1,092; makefile: 8
file content (168 lines) | stat: -rw-r--r-- 7,196 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
use Test::More tests => 162;
use utf8;
use Cpanel::JSON::XS;
use warnings;

is(Cpanel::JSON::XS->new->allow_nonref->utf8->encode("ü"), "\"\xc3\xbc\"");
is(Cpanel::JSON::XS->new->allow_nonref->encode("ü"), "\"ü\"");

is(Cpanel::JSON::XS->new->allow_nonref->ascii->utf8->encode(chr 0x8000), '"\u8000"');
is(Cpanel::JSON::XS->new->allow_nonref->ascii->utf8->pretty->encode(chr 0x10402), "\"\\ud801\\udc02\"\n");

ok not defined eval { Cpanel::JSON::XS->new->allow_nonref->utf8->decode('"ü"') };
like $@, qr/malformed UTF-8/;

is(Cpanel::JSON::XS->new->allow_nonref->decode('"ü"'), "ü");
is(Cpanel::JSON::XS->new->allow_nonref->decode('"\u00fc"'), "ü");

ok not defined eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) };
like $@, qr/Wide character/;

SKIP: {
  skip "5.6", 1 if $] < 5.008;
  is(Cpanel::JSON::XS->new->allow_nonref->decode('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
}

is(Cpanel::JSON::XS->new->allow_nonref->decode('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010");

my $utf8_love = "I \342\235\244 perl";
is(Cpanel::JSON::XS->new->ascii->encode([$utf8_love]), '["I \u00e2\u009d\u00a4 perl"]', 'utf8 enc ascii');
is(Cpanel::JSON::XS->new->latin1->encode([$utf8_love]), "[\"I \342\235\244 perl\"]", 'utf8 enc latin1');
is(Cpanel::JSON::XS->new->utf8->encode([$utf8_love]), "[\"I \303\242\302\235\302\244 perl\"]", 'utf8 enc utf8');
is(Cpanel::JSON::XS->new->binary->encode([$utf8_love]), '["I \xe2\x9d\xa4 perl"]', 'utf8 enc binary');

SKIP: {
  skip "5.6", 4 if $] < 5.008;
  my $unicode_love = "I ❤ perl";
  is(Cpanel::JSON::XS->new->ascii->encode([$unicode_love]), '["I \u2764 perl"]', 'unicode enc ascii');
  is(Cpanel::JSON::XS->new->latin1->encode([$unicode_love]), "[\"I \\u2764 perl\"]", 'unicode enc latin1');
  is(Cpanel::JSON::XS->new->utf8->encode([$unicode_love]), "[\"I \342\235\244 perl\"]", 'unicode enc utf8');
  is(Cpanel::JSON::XS->new->binary->encode([$unicode_love]), '["I \xe2\x9d\xa4 perl"]', 'unicode enc binary');
}

# TODO: test utf8 hash keys,
# test utf8 strings without any char > 0x80.

# warn on the 66 non-characters as in core
{
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  my $w = '';
  $SIG{__WARN__} = sub { $w = shift };
  my $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ufdd0"');
  my $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{fdd0}", substr($warn,0,31)."...");
  }
  like ($warn, qr/^Unicode non-character U\+FDD0 is/);
  $w = '';
  # higher planes
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud83f\udfff"');
  $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{1ffff}", substr($warn,0,31)."...");
  }
  like ($w, qr/^Unicode non-character U\+1FFFF is/);
  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud87f\udffe"');
  $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{2fffe}", substr($warn,0,31)."...");
  }
  like ($w, qr/^Unicode non-character U\+2FFFE is/);

  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud8a4\uddd1"');
  $warn = $w;
  is ($d, "\x{391d1}", substr($warn,0,31)."...");
  is ($w, '');
}
{
  my $w;
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  $SIG{__WARN__} = sub { $w = shift };
  # no warning with relaxed
  my $d = Cpanel::JSON::XS->new->allow_nonref->relaxed->decode('"\ufdd0"');
  my $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{fdd0}", "no warning with relaxed");
  }
  is($w, undef);
}

# security exploits via ill-formed subsequences
# see http://unicode.org/reports/tr36/#UTF-8_Exploit
# testcases from Encode/t/utf8strict.t
# All these sequences are not handled by the unsafe, fast XS decoder,
# rather passed through to the safe Perl decoder, which detects those.
my @ill =
  (# http://smontagu.damowmow.com/utf8test.html
   # The numbers below, like 2.1.2 are test numbers on this web page
   qq/80/          ,             # 3.1.1
   qq/bf/          ,             # 3.1.2
   qq/80 bf/       ,             # 3.1.3
   qq/80 bf 80/    ,             # 3.1.4
   qq/80 bf 80 bf/ ,             # 3.1.5
   qq/80 bf 80 bf 80/ ,          # 3.1.6
   qq/80 bf 80 bf 80 bf/ ,       # 3.1.7
   qq/80 bf 80 bf 80 bf 80/ ,    # 3.1.8
   qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ , # 3.1.9
   qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ , # 3.2.1
   qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ , # 3.2.2
   qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ , # 3.2.3
   qq/f8 20 f9 20 fa 20 fb 20/ , # 3.2.4
   qq/fc 20 fd 20/ ,             # 3.2.5
   qq/c0/ ,                      # 3.3.1
   qq/e0 80/ ,                   # 3.3.2
   qq/f0 80 80/ ,                # 3.3.3
   qq/f8 80 80 80/ ,             # 3.3.4
   qq/fc 80 80 80 80/ ,          # 3.3.5
   qq/df/ ,                      # 3.3.6
   qq/ef bf/ ,                   # 3.3.7
   qq/f7 bf bf/ ,                # 3.3.8
   qq/fb bf bf bf/ ,             # 3.3.9
   qq/fd bf bf bf bf/ ,          # 3.3.10
   qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ , # 3.4.1
   qq/fe/ ,                      # 3.5.1
   qq/ff/ ,                      # 3.5.2
   qq/fe fe ff ff/ ,             # 3.5.3
   qq/f0 8f bf bf/ ,             # 4.2.3
   qq/f8 87 bf bf bf/ ,          # 4.2.4
   qq/fc 83 bf bf bf bf/ ,       # 4.2.5
   qq/c0 af/ ,                   # 4.1.1  # ! overflow not with perl 5.6
   qq/e0 80 af/ ,                # 4.1.2  # ! overflow not with perl 5.6
   qq/f0 80 80 af/ ,             # 4.1.3  # ! overflow not with perl 5.6
   qq/f8 80 80 80 af/ ,          # 4.1.4  # ! overflow not with perl 5.6
   qq/fc 80 80 80 80 af/ ,       # 4.1.5  # ! overflow not with perl 5.6
   qq/c1 bf/ ,                   # 4.2.1  # ! overflow not with perl 5.6
   qq/e0 9f bf/ ,                # 4.2.2  # ! overflow not with perl 5.6
   qq/c0 80/ ,                   # 4.3.1  # xx! overflow not with perl 5.6
   qq/e0 80 80/ ,                # 4.3.2  # xx! overflow not with perl 5.6
   qq/f0 80 80 80/ ,             # 4.3.3  # xx! overflow not with perl 5.6
   qq/f8 80 80 80 80/ ,          # 4.3.4  # xx! overflow not with perl 5.6
   qq/fc 80 80 80 80 80/ ,       # 4.3.5  # xx! overflow not with perl 5.6
   # non-shortest form of 5c i.e. "\\"
   qq/c1 9c/ ,                            # ! not with perl 5.6
  );

{
  # these are no multibyte codepoints, just raw utf8 bytes,
  # so most of them work with 5.6 also.
  BEGIN { $^W = 1 }
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  my $w;
  $SIG{__WARN__} = sub { $w = shift };

  for my $ill (@ill) {
    my $o = pack "C*" => map {hex} split /\s+/, $ill;
    my $d = eval { decode_json("[\"$o\"]"); };
    is ($d, undef, substr($@,0,25))
      or diag $w, ' ', $ill, "\t => ", $d->[0], " $@";
    like($@, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error");
    is($d, undef, "without warning");
    $w = undef;
  }
}