File: 50_utf8.t

package info (click to toggle)
libtext-csv-xs-perl 1.61-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,376 kB
  • sloc: perl: 8,771; makefile: 9
file content (139 lines) | stat: -rw-r--r-- 4,599 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/perl

use strict;
use warnings;

use Test::More;
use charnames ":full";

BEGIN {
    # Declare the plan at compile time: perls older than 5.8.1 skip the
    # whole file, every other perl announces the fixed number of tests.
    plan ($] < 5.008001
	? (skip_all => "UTF8 tests useless in this ancient perl version")
	: (tests    => 93));
    }

BEGIN {
    # Load the module under test. require_ok () reports the outcome through
    # its return value; current Test::More isolates the eval it uses, so $@
    # cannot be relied on afterwards. A plan has also been declared by now,
    # so a late "plan skip_all" would only croak. Stop the run explicitly
    # instead: nothing below can work without the module.
    require_ok ("Text::CSV_XS") or BAIL_OUT ("Cannot load Text::CSV_XS");

    # Shared test helpers. NOTE(review): is_binary () used further down is
    # not defined in this file and presumably comes from here - confirm.
    require "./t/util.pl";
    }

# Scratch file for the file based tests; removed again when the script ends.
my $tfn = "_50test.csv";
END { unlink $tfn if -f $tfn; }

# No binary => 1, as UTF8 is supposed to be allowed without it
my $csv = Text::CSV_XS->new ({ keep_meta_info => 1, always_quote => 1 });

# Round-trip a selection of Unicode characters through combine () and
# parse () on an object that does not have the binary attribute set.
#
# Special characters to check (hex values that several of the code points
# below embed):
# 0A = \n  2C = ,  20 = space  22 = "
# 0D = \r  3B = ;
foreach my $test (
  # Space-like characters
  [ "\x{0000A0}", "U+0000A0 NO-BREAK SPACE"				],
  [ "\x{00200B}", "U+00200B ZERO WIDTH SPACE"				],
  # Some characters with possible problems in the code point
  [ "\x{000122}", "U+000122 LATIN CAPITAL LETTER G WITH CEDILLA"	],
  [ "\x{002C22}", "U+002C22 GLAGOLITIC CAPITAL LETTER SPIDERY HA"	],
  [ "\x{000A2C}", "U+000A2C GURMUKHI LETTER BA"				],
  [ "\x{000E2C}", "U+000E2C THAI CHARACTER LO CHULA"			],
  [ "\x{010A2C}", "U+010A2C KHAROSHTHI LETTER VA"			],
  # Characters with possible problems in the encoded representation
  #  Should not be possible. ASCII is coded in 000..127, all other
  #  characters in 128..255
  ) {
    my ($u, $msg) = @$test;
    # Append a wide character and strip it again, so the string is
    # guaranteed to be marked UTF8 even for code points below 0x100
    ($u = "$u\x{0123}") =~ s/.$//;
    my @in  = ("", " ", $u, "");
    # always_quote is on, so every field is expected in double quotes
    my $exp = join ",", map { qq{"$_"} } @in;

    ok ($csv->combine (@in),		"combine $msg");

    my $str = $csv->string;
    is_binary ($str, $exp,		"string  $msg");

    ok ($csv->parse ($str),		"parse   $msg");
    my @out = $csv->fields;
    # Cannot use is_deeply (), because of the binary content
    # Got first, expected second - the same order as the string check
    # above, so failure diagnostics label the two values correctly
    is (scalar @out, scalar @in,	"fields  $msg");
    is_binary ($out[$_], $in[$_],	"field $_ $msg") for 0 .. $#in;
    }

# Test if the UTF8 part is accepted, but the \n is not
# The object was created without binary, so a quoted field holding both
# wide characters and an embedded newline is expected to be rejected:
# parse () must return 0 and must not switch the binary attribute on.
is ($csv->parse (qq{"\x{0123}\n\x{20ac}"}), 0, "\\n still needs binary");
is ($csv->binary, 0, "bin flag still unset");
# "+ 0" takes the numeric value of the diagnostic left by the failed parse
is ($csv->error_diag + 0, 2021, "Error 2021");

# Write a single record with a non-ASCII character to the scratch file,
# then read it back through getline () and inspect the per-field meta
# information and the validity of the returned data.
open my $fh, ">:encoding(utf-8)", $tfn or die "$tfn: $!\n";
print   $fh qq{"\N{LATIN CAPITAL LETTER O WITH STROKE}l/Vin",0\n};
# Buffered write errors only surface on close, so check it
close   $fh or die "$tfn: $!\n";
SKIP: {
    # The skip count has to equal the number of tests in this block (7),
    # otherwise the plan is off whenever the file cannot be opened.
    # This lexical $fh deliberately shadows the file scoped one.
    open my $fh, "<:encoding(utf-8)", $tfn or
	skip "Cannot open UTF-8 test file", 7;

    my $row;
    ok ($row = $csv->getline ($fh), "read/parse");

    is ($csv->is_quoted (0),	1,	"First  field is quoted");
    is ($csv->is_quoted (1),	0,	"Second field is not quoted");
    is ($csv->is_binary (0),	1,	"First  field is binary");
    is ($csv->is_binary (1),	0,	"Second field is not binary");

    ok (utf8::valid ($row->[0]),	"First field is valid utf8");

    # Feed the parsed fields back in to check the generated string as well
    $csv->combine (@$row);
    ok (utf8::valid ($csv->string),	"Combined string is valid utf8");
    }

# Test quote_binary
# The same three fields are combined under each of the four
# quote_space/quote_binary settings; the "String S-B" labels name the
# quote_space (S) and quote_binary (B) values in effect.
{
    my @fld = (" ", 1, "\x{20ac} ");

    $csv->always_quote (0);

    # quote_space off
    $csv->quote_space  (0);
    $csv->quote_binary (0);
    ok ($csv->combine (@fld),			"Combine");
    is ($csv->string, qq{ ,1,\x{20ac} },	"String 0-0");

    $csv->quote_binary (1);
    ok ($csv->combine (@fld),			"Combine");
    is ($csv->string, qq{ ,1,"\x{20ac} "},	"String 0-1");

    # quote_space on
    $csv->quote_space  (1);
    $csv->quote_binary (0);
    ok ($csv->combine (@fld),			"Combine");
    is ($csv->string, qq{" ",1,"\x{20ac} "},	"String 1-0");

    ok ($csv->quote_binary (1),			"quote binary on");
    ok ($csv->combine (@fld),			"Combine");
    is ($csv->string, qq{" ",1,"\x{20ac} "},	"String 1-1");
    }

# Parse one record that mixes empty fields, a doubled quote inside a quoted
# field and a non-ASCII character, then compare all resulting fields.
{
    my $line = qq{,1,"f\x{014d}o, 3""56",,bar,\r\n};
    my @want = ("", 1, qq{f\x{014d}o, 3"56}, "", "bar", "");

    ok ($csv->parse ($line),			"example from XS");
    is_deeply ([ $csv->fields ], \@want,	"content");
    }

# Write three lines - plain ASCII, a single EURO SIGN, plain ASCII - and
# read them back through a bound column, checking for every line whether
# the field is reported as binary and whether the bound scalar carries the
# UTF8 flag.
open  $fh, ">:encoding(utf-8)", $tfn or die "$tfn: $!\n";
print $fh "euro\n\x{20ac}\neuro\n";
# Buffered write errors only surface on close, so check it
close $fh or die "$tfn: $!\n";
open  $fh, "<:encoding(utf-8)", $tfn or die "$tfn: $!\n";

{
    # Receives the only column of every line read by getline ()
    my $out = "";

    ok ($csv->auto_diag (1),			"auto diag");
    ok ($csv->binary (1),   			"set binary");
    ok ($csv->bind_columns (\$out),		"bind");

    # utf8::is_utf8 () is called directly: the plan at the top of this file
    # already skips every perl older than 5.8.1, so no fallback is needed.
    ok ($csv->getline ($fh),			"parse");
    is ($csv->is_binary (0),	0,		"not binary");
    is ($out,			"euro",		"euro");
    ok (!utf8::is_utf8 ($out),			"not utf8");

    ok ($csv->getline ($fh),			"parse");
    is ($csv->is_binary (0),	1,		"is binary");
    is ($out,			"\x{20ac}",	"euro");
    ok (utf8::is_utf8 ($out),			"is utf8");

    ok ($csv->getline ($fh),			"parse");
    is ($csv->is_binary (0),	0,		"not binary");
    is ($out,			"euro",		"euro");
    ok (!utf8::is_utf8 ($out),			"not utf8");

    close $fh;
    }