File: csv-check

package info (click to toggle)
libtext-csv-xs-perl 1.38-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 844 kB
  • sloc: perl: 6,862; makefile: 9
file content (183 lines) | stat: -rwxr-xr-x 5,480 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/pro/bin/perl

# csv-check: Check validity of CSV file and report
#	   (m)'17 [24 Nov 2017] Copyright H.M.Brand 2007-2018

# This code requires the defined-or feature and PerlIO

use 5.12.0;
use warnings;

use Data::Peek;
use Encode qw( decode );

our $VERSION = "2.01";	# 2017-11-24
my $cmd = $0; $cmd =~ s{.*/}{};

sub usage {
    my $err = shift and select STDERR;
    print <<"EOU";
usage: $cmd [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv]
       -s <sep>   use <sep>   as seperator char. Auto-detect, default = ','
                  The string "tab" is allowed.
       -e <esc>   use <sep>   as seperator char. Auto-detect, default = ','
                  The string "undef" is allowed.
       -q <quot>  use <quot>  as quotation char. Default = '"'
                  The string "undef" will disable quotation.
       -u         check if all fields are valid unicode
       -E <enc>   open file with encoding
       -h         check with header (implies BOM)
       -b         check with BOM (no header)
       -f         do not check formula's

       --pp       use Text::CSV_PP instead (cross-check)
EOU
    exit $err;
    } # usage

use Getopt::Long qw(:config bundling);
my $sep;	# Set after reading first line in a flurry attempt to auto-detect
my $quo = '"';
my $esc = '"';
my $opt_u = 0;
my $opt_p = 0;
my $opt_h = 0;
my $opt_b = 0;
my $opt_f = 0;
my $enc;
GetOptions (
    "help|?"		=> sub { usage (0); },
    "V|version"		=> sub { print "$cmd [$VERSION]\n"; exit 0; },

    "c|s|sep=s"		=> \$sep,
    "q|quo|quote=s"	=> \$quo,
    "e|esc|escape=s"	=> \$esc,
    "u|utf|utf8|utf-8!"	=> \$opt_u,
    "E|enc|encoding=s"	=> \$enc,
    "h|hdr|header!"	=> \$opt_h,
    "b|bom!"		=> \$opt_b,
    "f|skip-formula!"	=> \$opt_f,

    "pp!"		=> \$opt_p,
    ) or usage (1);

my  $csvmod = "Text::CSV_XS";
if ($opt_p) {
    require Text::CSV_PP;
    $csvmod = "Text::CSV_PP";
    }
else {
    require Text::CSV_XS;
    }
$csvmod->import ();

my $fn   = $ARGV[0] // "-";
my $data = do { local $/; <> } or die "No data to analyze\n";
my @warn;

my ($bin, $rows, $eol, %cols) = (0, 0, undef);
unless ($sep) { # No sep char passed, try to auto-detect;
    my ($first_line) = ($data =~ m/\A(.*?)(?:\r\n|\n|\r)/);
    $first_line ||= $data; # if no EOL at all, use whole set
    $sep = $first_line =~ m/["\d],["\d,]/ ? ","  :
	   $first_line =~ m/["\d];["\d;]/ ? ";"  :
	   $first_line =~ m/["\d]\t["\d]/ ? "\t" :
	   # If neither, then for unquoted strings
	   $first_line =~ m/\w,[\w,]/     ? ","  :
	   $first_line =~ m/\w;[\w;]/     ? ";"  :
	   $first_line =~ m/\w\t[\w]/     ? "\t" : ",";
    $data =~ m/([\r\n]+)\Z/ and $eol = DDisplay "$1";
    }

my $csv = $csvmod->new ({
    sep_char       => $sep eq "tab"   ? "\t"  : $sep,
    quote_char     => $quo eq "undef" ? undef : $quo,
    escape_char    => $esc eq "undef" ? undef : $esc,
    binary         => 1,
    keep_meta_info => 1,
    auto_diag      => 1,
    formula        => $opt_f ? "none" : "diag",
    });

sub done {
    my $file = $ARGV // "STDIN";
    (my $pv = "$]0000000") =~ s{^([0-9]+)\.([0-9]{3})([0-9]{3})[0-9]*}
			       {sprintf "%d.%d.%d",$1,$2,$3}e;
    my $uv = eval {
	no warnings;
	(my $cv = $]) =~ s/0+$//;
	eval { require Unicode::UCD;     Unicode::UCD::UnicodeVersion () } ||
	eval { require Module::CoreList; $Module::CoreList::version{$cv}{Unicode} };
	} || "unknown";
    print "Checked $file with $cmd $VERSION\nusing $csvmod @{[$csvmod->VERSION]} with perl $pv and Unicode $uv\n";
    my @diag = $csv->error_diag;
    if ($diag[0] == 2012 && $csv->eof) {
	my @coll = sort { $a <=> $b } keys %cols;
	local $" = ", ";
	my $cols = @coll == 1 ? $coll[0] : "(@coll)";
	$eol //= $csv->eol || "--unknown--";
	print "OK: rows: $rows, columns: $cols\n";
	print "    sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$eol>\n";
	print "    encoding = $csv->{ENCODING}\n" if $csv->{ENCODING};
	if (@coll > 1) {
	    print "multiple column lengths:\n";
	    printf " %6d line%s with %4d field%s\n",
		$cols{$_}, $cols{$_} == 1 ? " " : "s",
		$_,        $_        == 1 ? ""  : "s"
		    for @coll;
	    }
	$diag[0] = 0;
	}
    elsif ($diag[2]) {
	print "$ARGV record $diag[3] at line $./$diag[2] - $diag[0] - $diag[1]\n";
	my $ep  = $diag[2] - 1; # diag[2] is 1-based
	my $err = $csv->error_input . "         ";
	substr $err, $ep + 1, 0, "*";
	substr $err, $ep,     0, "*";
	($err = substr $err, $ep - 5, 12) =~ s/ +$//;
	print "    |$err|\n";
	}
    else {
	print "$ARGV line $. - $diag[1]\n";
	}
    print for @warn;
    exit $diag[0];
    } # done

sub stats {
    my $r = shift;
    $cols{scalar @$r}++;
    grep { $_ & 0x0002 } $csv->meta_info and $bin = 1;
    if ($opt_u) {
	my @r = @$r;
	foreach my $x (0 .. $#r) {
	    utf8::is_utf8 ($r[$x]) and next;

	    local $SIG{__WARN__} = sub {
		(my $msg = shift) =~ s{ at /\S+Encode.pm.*}{};
		my @h = $csv->column_names;
		push @warn, sprintf "Field %d%s in record %d - '%s'\t- %s",
		    $x + 1, @h ? " (column: '$h[$x]')" : "", $rows,
		    DPeek ($r[$x]), $msg;
		};
	    my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN);
	    }
	}
    } # stats

my $mode = $enc ? "<:encoding($enc)" : "<";
open my $fh, $mode, \$data or die "$fn: $!\n";
if ($opt_h) {
    $csv->header ($fh);
    }
elsif ($opt_b) {
    my @hdr = $csv->header ($fh, { detect_bom => 1, set_column_names => 0 });
    stats \@hdr;
    }

local $SIG{__WARN__} = sub { push @warn, @_; };
while (my $row = $csv->getline ($fh)) {
    $rows++;
    stats $row;
    }
done;