File: 01_utf8.t

package info (click to toggle)
libcpanel-json-xs-perl 3.0225-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,660 kB
  • ctags: 799
  • sloc: perl: 892; makefile: 8
file content (161 lines) | stat: -rw-r--r-- 7,274 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
use Test::More tests => 155;
use utf8;
use Cpanel::JSON::XS;

is(Cpanel::JSON::XS->new->allow_nonref (1)->utf8 (1)->encode ("ü"), "\"\xc3\xbc\"");
is(Cpanel::JSON::XS->new->allow_nonref (1)->encode ("ü"), "\"ü\"");

is(Cpanel::JSON::XS->new->allow_nonref (1)->ascii (1)->utf8 (1)->encode (chr 0x8000), '"\u8000"');
is(Cpanel::JSON::XS->new->allow_nonref (1)->ascii (1)->utf8 (1)->pretty (1)->encode (chr 0x10402), "\"\\ud801\\udc02\"\n");

SKIP: {
  skip "5.6", 1 if $] < 5.008;
  eval { Cpanel::JSON::XS->new->allow_nonref (1)->utf8 (1)->decode ('"ü"') };
  like $@, qr/malformed UTF-8/;
}

is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"ü"'), "ü");
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\u00fc"'), "ü");
if ($] < 5.008) {
  eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) };
  like $@, qr/malformed UTF-8/;
} else {
  is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
}
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010");

my $love = $] < 5.008 ? "I \342\235\244 perl" : "I ❤ perl";
is(Cpanel::JSON::XS->new->ascii->encode ([$love]),
   $] < 5.008 ? '["I \u00e2\u009d\u00a4 perl"]' : '["I \u2764 perl"]', 'utf8 enc ascii');
is(Cpanel::JSON::XS->new->latin1->encode ([$love]),
      $] < 5.008 ? "[\"I \342\235\244 perl\"]" : '["I \u2764 perl"]', 'utf8 enc latin1');

SKIP: {
  skip "5.6", 1 if $] < 5.008;
  require Encode;
  # [RT #84244] wrong complaint: JSON::XS double encodes to ["I ❤ perl"]
  #             and with utf8 triple encodes it to ["I ❤ perl"]
  if ($Encode::VERSION < 2.40 or $Encode::VERSION >= 2.54) { # Encode stricter check: Cannot decode string with wide characters
    # see also http://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character
    $love = "I \342\235\244 perl";
  }
  my $s = Encode::decode_utf8($love); # User tries to double decode wide-char to unicode with Encode
  is(Cpanel::JSON::XS->new->utf8->encode ([$s]), "[\"I \342\235\244 perl\"]", 'utf8 enc utf8 [RT #84244]');
}
is(Cpanel::JSON::XS->new->binary->encode ([$love]), '["I \xe2\x9d\xa4 perl"]', 'utf8 enc binary');

# TODO: test utf8 hash keys,
# test utf8 strings without any char > 0x80.

# warn on the 66 non-characters as in core
{
  my $w;
  require warnings;
  warnings->unimport($] < 5.014 ? 'utf8' : 'nonchar');
  $SIG{__WARN__} = sub { $w = shift };
  my $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ufdd0"');
  my $warn = $w;
  is ($d, "\x{fdd0}", substr($warn,0,31)."...");
  like ($warn, qr/^Unicode non-character U\+FDD0 is/);
  $w = '';
  # higher planes
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud83f\udfff"');
  $warn = $w;
  is ($d, "\x{1ffff}", substr($warn,0,31)."...");
  like ($w, qr/^Unicode non-character U\+1FFFF is/);
  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud87f\udffe"');
  $warn = $w;
  is ($d, "\x{2fffe}", substr($warn,0,31)."...");
  like ($w, qr/^Unicode non-character U\+2FFFE is/);

  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud8a4\uddd1"');
  $warn = $w;
  is ($d, "\x{391d1}", substr($warn,0,31)."...");
  is ($w, '');
}
{
  my $w;
  warnings->unimport($] < 5.014 ? 'utf8' : 'nonchar');
  $SIG{__WARN__} = sub { $w = shift };
  # no warning with relaxed
  my $d = Cpanel::JSON::XS->new->allow_nonref->relaxed->decode('"\ufdd0"');
  my $warn = $w;
  is ($d, "\x{fdd0}", "no warning with relaxed");
  is($w, undef);
}

# security exploits via ill-formed subsequences
# see http://unicode.org/reports/tr36/#UTF-8_Exploit
# testcases from Encode/t/utf8strict.t
# All these sequences are not handled by the unsafe, fast XS decoder,
# rather passed through to the safe Perl decoder, which detects those.
my @ill =
  (# http://smontagu.damowmow.com/utf8test.html
   # The numbers below, like 2.1.2 are test numbers on this web page
   qq/80/          ,             # 3.1.1
   qq/bf/          ,             # 3.1.2
   qq/80 bf/       ,             # 3.1.3
   qq/80 bf 80/    ,             # 3.1.4
   qq/80 bf 80 bf/ ,             # 3.1.5
   qq/80 bf 80 bf 80/ ,          # 3.1.6
   qq/80 bf 80 bf 80 bf/ ,       # 3.1.7
   qq/80 bf 80 bf 80 bf 80/ ,    # 3.1.8
   qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ , # 3.1.9
   qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ , # 3.2.1
   qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ , # 3.2.2
   qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ , # 3.2.3
   qq/f8 20 f9 20 fa 20 fb 20/ , # 3.2.4
   qq/fc 20 fd 20/ ,             # 3.2.5
   qq/c0/ ,                      # 3.3.1
   qq/e0 80/ ,                   # 3.3.2
   qq/f0 80 80/ ,                # 3.3.3
   qq/f8 80 80 80/ ,             # 3.3.4
   qq/fc 80 80 80 80/ ,          # 3.3.5
   qq/df/ ,                      # 3.3.6
   qq/ef bf/ ,                   # 3.3.7
   qq/f7 bf bf/ ,                # 3.3.8
   qq/fb bf bf bf/ ,             # 3.3.9
   qq/fd bf bf bf bf/ ,          # 3.3.10
   qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ , # 3.4.1
   qq/fe/ ,                      # 3.5.1
   qq/ff/ ,                      # 3.5.2
   qq/fe fe ff ff/ ,             # 3.5.3
   qq/f0 8f bf bf/ ,             # 4.2.3
   qq/f8 87 bf bf bf/ ,          # 4.2.4
   qq/fc 83 bf bf bf bf/ ,       # 4.2.5
   qq/c0 af/ ,                   # 4.1.1  # ! overflow not with perl 5.6
   qq/e0 80 af/ ,                # 4.1.2  # ! overflow not with perl 5.6
   qq/f0 80 80 af/ ,             # 4.1.3  # ! overflow not with perl 5.6
   qq/f8 80 80 80 af/ ,          # 4.1.4  # ! overflow not with perl 5.6
   qq/fc 80 80 80 80 af/ ,       # 4.1.5  # ! overflow not with perl 5.6
   qq/c1 bf/ ,                   # 4.2.1  # ! overflow not with perl 5.6
   qq/e0 9f bf/ ,                # 4.2.2  # ! overflow not with perl 5.6
   qq/c0 80/ ,                   # 4.3.1  # xx! overflow not with perl 5.6
   qq/e0 80 80/ ,                # 4.3.2  # xx! overflow not with perl 5.6
   qq/f0 80 80 80/ ,             # 4.3.3  # xx! overflow not with perl 5.6
   qq/f8 80 80 80 80/ ,          # 4.3.4  # xx! overflow not with perl 5.6
   qq/fc 80 80 80 80 80/ ,       # 4.3.5  # xx! overflow not with perl 5.6
   # non-shortest form of 5c i.e. "\\"
   qq/c1 9c/ ,                            # ! not with perl 5.6
  );

{
  # these are no multibyte codepoints, just raw utf8 bytes,
  # so most of them work with 5.6 also.
  $^W = 1;
  my $w;
  warnings->import($] < 5.014 ? 'utf8' : 'nonchar');
  $SIG{__WARN__} = sub { $w = shift };

  for my $ill (@ill) {
    my $o = pack "C*" => map {hex} split /\s+/, $ill;
    my $d = eval { decode_json("[\"$o\"]"); };
    is ($d, undef, substr($@,0,25))
      or diag $w, ' ', $ill, "\t => ", $d->[0], " $@";
    like($@, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error");
    is($d, undef, "without warning");
    $w = undef;
  }
}