1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
#!/pro/bin/perl
# csv-check: Check validity of CSV file and report
# (m)'17 [24 Nov 2017] Copyright H.M.Brand 2007-2018
# This code requires the defined-or feature and PerlIO
use 5.12.0;
use warnings;
use Data::Peek;
use Encode qw( decode );
our $VERSION = "2.01"; # 2017-11-24
my $cmd = $0; $cmd =~ s{.*/}{};
sub usage {
my $err = shift and select STDERR;
print <<"EOU";
usage: $cmd [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv]
-s <sep> use <sep> as seperator char. Auto-detect, default = ','
The string "tab" is allowed.
-e <esc> use <sep> as seperator char. Auto-detect, default = ','
The string "undef" is allowed.
-q <quot> use <quot> as quotation char. Default = '"'
The string "undef" will disable quotation.
-u check if all fields are valid unicode
-E <enc> open file with encoding
-h check with header (implies BOM)
-b check with BOM (no header)
-f do not check formula's
--pp use Text::CSV_PP instead (cross-check)
EOU
exit $err;
} # usage
use Getopt::Long qw(:config bundling);
my $sep; # Set after reading first line in a flurry attempt to auto-detect
my $quo = '"';
my $esc = '"';
my $opt_u = 0;
my $opt_p = 0;
my $opt_h = 0;
my $opt_b = 0;
my $opt_f = 0;
my $enc;
GetOptions (
"help|?" => sub { usage (0); },
"V|version" => sub { print "$cmd [$VERSION]\n"; exit 0; },
"c|s|sep=s" => \$sep,
"q|quo|quote=s" => \$quo,
"e|esc|escape=s" => \$esc,
"u|utf|utf8|utf-8!" => \$opt_u,
"E|enc|encoding=s" => \$enc,
"h|hdr|header!" => \$opt_h,
"b|bom!" => \$opt_b,
"f|skip-formula!" => \$opt_f,
"pp!" => \$opt_p,
) or usage (1);
my $csvmod = "Text::CSV_XS";
if ($opt_p) {
require Text::CSV_PP;
$csvmod = "Text::CSV_PP";
}
else {
require Text::CSV_XS;
}
$csvmod->import ();
my $fn = $ARGV[0] // "-";
my $data = do { local $/; <> } or die "No data to analyze\n";
my @warn;
my ($bin, $rows, $eol, %cols) = (0, 0, undef);
unless ($sep) { # No sep char passed, try to auto-detect;
my ($first_line) = ($data =~ m/\A(.*?)(?:\r\n|\n|\r)/);
$first_line ||= $data; # if no EOL at all, use whole set
$sep = $first_line =~ m/["\d],["\d,]/ ? "," :
$first_line =~ m/["\d];["\d;]/ ? ";" :
$first_line =~ m/["\d]\t["\d]/ ? "\t" :
# If neither, then for unquoted strings
$first_line =~ m/\w,[\w,]/ ? "," :
$first_line =~ m/\w;[\w;]/ ? ";" :
$first_line =~ m/\w\t[\w]/ ? "\t" : ",";
$data =~ m/([\r\n]+)\Z/ and $eol = DDisplay "$1";
}
my $csv = $csvmod->new ({
sep_char => $sep eq "tab" ? "\t" : $sep,
quote_char => $quo eq "undef" ? undef : $quo,
escape_char => $esc eq "undef" ? undef : $esc,
binary => 1,
keep_meta_info => 1,
auto_diag => 1,
formula => $opt_f ? "none" : "diag",
});
sub done {
my $file = $ARGV // "STDIN";
(my $pv = "$]0000000") =~ s{^([0-9]+)\.([0-9]{3})([0-9]{3})[0-9]*}
{sprintf "%d.%d.%d",$1,$2,$3}e;
my $uv = eval {
no warnings;
(my $cv = $]) =~ s/0+$//;
eval { require Unicode::UCD; Unicode::UCD::UnicodeVersion () } ||
eval { require Module::CoreList; $Module::CoreList::version{$cv}{Unicode} };
} || "unknown";
print "Checked $file with $cmd $VERSION\nusing $csvmod @{[$csvmod->VERSION]} with perl $pv and Unicode $uv\n";
my @diag = $csv->error_diag;
if ($diag[0] == 2012 && $csv->eof) {
my @coll = sort { $a <=> $b } keys %cols;
local $" = ", ";
my $cols = @coll == 1 ? $coll[0] : "(@coll)";
$eol //= $csv->eol || "--unknown--";
print "OK: rows: $rows, columns: $cols\n";
print " sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$eol>\n";
print " encoding = $csv->{ENCODING}\n" if $csv->{ENCODING};
if (@coll > 1) {
print "multiple column lengths:\n";
printf " %6d line%s with %4d field%s\n",
$cols{$_}, $cols{$_} == 1 ? " " : "s",
$_, $_ == 1 ? "" : "s"
for @coll;
}
$diag[0] = 0;
}
elsif ($diag[2]) {
print "$ARGV record $diag[3] at line $./$diag[2] - $diag[0] - $diag[1]\n";
my $ep = $diag[2] - 1; # diag[2] is 1-based
my $err = $csv->error_input . " ";
substr $err, $ep + 1, 0, "*";
substr $err, $ep, 0, "*";
($err = substr $err, $ep - 5, 12) =~ s/ +$//;
print " |$err|\n";
}
else {
print "$ARGV line $. - $diag[1]\n";
}
print for @warn;
exit $diag[0];
} # done
sub stats {
my $r = shift;
$cols{scalar @$r}++;
grep { $_ & 0x0002 } $csv->meta_info and $bin = 1;
if ($opt_u) {
my @r = @$r;
foreach my $x (0 .. $#r) {
utf8::is_utf8 ($r[$x]) and next;
local $SIG{__WARN__} = sub {
(my $msg = shift) =~ s{ at /\S+Encode.pm.*}{};
my @h = $csv->column_names;
push @warn, sprintf "Field %d%s in record %d - '%s'\t- %s",
$x + 1, @h ? " (column: '$h[$x]')" : "", $rows,
DPeek ($r[$x]), $msg;
};
my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN);
}
}
} # stats
my $mode = $enc ? "<:encoding($enc)" : "<";
open my $fh, $mode, \$data or die "$fn: $!\n";
if ($opt_h) {
$csv->header ($fh);
}
elsif ($opt_b) {
my @hdr = $csv->header ($fh, { detect_bom => 1, set_column_names => 0 });
stats \@hdr;
}
local $SIG{__WARN__} = sub { push @warn, @_; };
while (my $row = $csv->getline ($fh)) {
$rows++;
stats $row;
}
done;
|