1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
#------------------------------------------------------------------------------
# File: Text.pm
#
# Description: Deduce characteristics of TXT and CSV files
#
# Revisions: 2019-11-01 - P. Harvey Created
# 2020-02-13 - PH Added CSV file support
#
# References: 1) https://github.com/file/file
#------------------------------------------------------------------------------
package Image::ExifTool::Text;
use strict;
use vars qw($VERSION);
use Image::ExifTool qw(:DataAccess :Utils);
use Image::ExifTool::XMP;
$VERSION = '1.03';
# Text tags
# (tags are keyed by name only; the values are deduced by ProcessTXT from an
# analysis of the file content rather than read from stored metadata)
%Image::ExifTool::Text::Main = (
    VARS   => { NO_ID => 1 },
    GROUPS => {
        0 => 'File',
        1 => 'File',
        2 => 'Document',
    },
    NOTES => q{
Although basic text files contain no metadata, the following tags are
determined from a simple analysis of the data in TXT and CSV files.
Statistics are generated only for 8-bit encodings, but the L<FastScan|../ExifTool.html#FastScan> (-fast)
option may be used to limit processing to the first 64 kB in which case some
tags are not produced. To avoid long processing delays, ExifTool will issue
a minor warning and process only the first 64 kB of any file larger than 20
MB unless the L<IgnoreMinorErrors|../ExifTool.html#IgnoreMinorErrors> (-m)
option is used.
},
    # --- tags generated for all recognized text files ---
    MIMEEncoding => {
        Groups => { 2 => 'Other' },
    },
    Newlines => {
        # keyed by the raw newline sequence found in the data
        PrintConv => {
            "\r\n" => 'Windows CRLF',
            "\r"   => 'Macintosh CR',
            "\n"   => 'Unix LF',
            ''     => '(none)',
        },
    },
    ByteOrderMark => {
        PrintConv => {
            0 => 'No',
            1 => 'Yes',
        },
    },
    # --- statistics for plain text files ---
    LineCount => {},
    WordCount => {},
    # --- statistics for CSV files ---
    Delimiter => {
        PrintConv => {
            ''   => '(none)',
            ','  => 'Comma',
            ';'  => 'Semicolon',
            "\t" => 'Tab',
        },
    },
    Quoting => {
        PrintConv => {
            ''  => '(none)',
            '"' => 'Double quotes',
            "'" => 'Single quotes',
        },
    },
    RowCount    => {},
    ColumnCount => {},
);
#------------------------------------------------------------------------------
# Extract some stats from a text file
# Inputs: 0) ExifTool ref, 1) dirInfo ref (the TestBuff and RAF entries are used)
# Returns: 1 on success, 0 if this wasn't a text file
# Notes: - the initial encoding guess is made from the test buffer, or from up
#          to 64 kB read from the file when the test buffer is testLen bytes long
#        - UTF-16 and UTF-32 are recognized only when they begin with a BOM
#        - CSV and line/word statistics are generated only for 8-bit encodings,
#          and only when the FastScan option is not set
#        - the input record separator ($/) is localized while counting lines, so
#          the caller's value is restored however this routine exits
sub ProcessTXT($$)
{
    my ($et, $dirInfo) = @_;
    my $dataPt = $$dirInfo{TestBuff};
    my $raf = $$dirInfo{RAF};
    my $fast = $et->Options('FastScan') || 0;
    my ($buff, $enc, $isBOM, $isUTF8);
    my $nl = '';

    return 0 unless length $$dataPt; # can't call it a text file if it has no text

    # read more from the file if necessary
    if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
        $raf->Read($buff, 65536) or return 0;
        $dataPt = \$buff;
    }
#
# make our best guess at the character encoding (EBCDIC is not supported)
#
    if ($$dataPt =~ /([\0-\x06\x0e-\x1a\x1c-\x1f\x7f])/) {
        # file contains weird control characters, could be multi-byte Unicode
        # (UTF-32 must be tested first because its LE BOM begins with the UTF-16LE BOM)
        if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
            if ($1 eq "\xff\xfe\0\0") {
                $enc = 'utf-32le';
                $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
            } else {
                $enc = 'utf-32be';
                $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
            }
        } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
            if ($1 eq "\xff\xfe") {
                $enc = 'utf-16le';
                $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
            } else {
                $enc = 'utf-16be';
                $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
            }
        } else {
            return 0;       # probably not a text file
        }
        $nl =~ tr/\0//d;    # remove nulls from newline sequence
        $isBOM = 1;         # (we don't recognize UTF-16/UTF-32 without one)
        # ($isUTF8 is left undefined here, which disables the statistics below)
    } else {
        # IsUTF8 result as used here: 0 is treated as plain ASCII, a positive
        # value as UTF-8, and a negative value as some other 8-bit encoding
        $isUTF8 = Image::ExifTool::XMP::IsUTF8($dataPt, 1);
        if ($isUTF8 == 0) {
            $enc = 'us-ascii';
        } elsif ($isUTF8 > 0) {
            $enc = 'utf-8';
            $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
        } elsif ($$dataPt !~ /[\x80-\x9f]/) {
            $enc = 'iso-8859-1';
        } else {
            $enc = 'unknown-8bit';
        }
        $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
    }

    my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');

    $et->SetFileType();
    $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);

    # nothing more to do if we can't (or shouldn't) re-read the file from the start
    return 1 if $fast == 3 or not $raf->Seek(0,0);

    $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
    $et->HandleTag($tagTablePtr, Newlines => $nl);

    # statistics are generated only for 8-bit encodings when FastScan is not set
    return 1 if $fast or not defined $isUTF8;
#
# generate stats for CSV files
#
    if ($$et{FileType} eq 'CSV') {
        my ($delim, $quot, $ncols);
        my $nrows = 0;
        while ($raf->ReadLine($buff)) {
            if (not defined $delim) {
                # the delimiter and column count are deduced from the first row only:
                # pick the most frequent of comma, semicolon and tab
                my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
                ++$count{$_} foreach $buff =~ /[,;\t]/g;
                if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
                    $delim = ',';
                } elsif ($count{';'} > $count{"\t"}) {
                    $delim = ';';
                } elsif ($count{"\t"}) {
                    $delim = "\t";
                } else {
                    $delim = '';
                    $ncols = 1;
                }
                unless ($ncols) {
                    # account for delimiters in quotes (simplistically)
                    while ($buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg) {
                        $quot = $2;
                        my $field = $3;
                        # (list assignment in scalar context gives the number of matches)
                        $count{$delim} -= () = $field =~ /$delim/g;
                    }
                    $ncols = $count{$delim} + 1;
                }
            } elsif (not $quot) {
                # keep looking for a quoted field in later rows until one is found
                $quot = $2 if $buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg;
            }
            # stop at 1000 rows if the minor warning is issued, and don't report
            # a RowCount in this case
            # NOTE(review): presumably Warn() returns false, and counting
            # continues, when minor errors are ignored -- confirm against Warn()
            if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
                undef $nrows;
                last;
            }
        }
        $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
        $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
        $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
        $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
        return 1;
    }
    # avoid long processing delays for large files
    return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
        $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
#
# count lines/words and check encoding of the rest of the file
#
    my ($lines, $words) = (0, 0);
    {
        # read lines using the detected newline sequence, or the caller's
        # separator if none was found.  This is localized so the caller's $/
        # is restored even if something in the loop dies.
        local $/ = $nl || $/;
        while ($raf->ReadLine($buff)) {
            ++$lines;
            ++$words while $buff =~ /\S+/g;
            if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
                # (the first line must have been longer than 64 kB)
                $$et{VALUE}{Newlines} = $nl = $1;
            }
            next if $raf->Tell() < 65536;
            # continue to check encoding after the first 64 kB
            if ($isUTF8 >= 0) { # (if ascii or utf8)
                $isUTF8 = Image::ExifTool::XMP::IsUTF8(\$buff);
                if ($isUTF8 > 0) {
                    $enc = 'utf-8';
                } elsif ($isUTF8 < 0) {
                    $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
                }
            } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
                $enc = 'unknown-8bit';
            }
        }
    }
    # update the encoding already stored if the rest of the file changed our guess
    if ($$et{VALUE}{MIMEEncoding} ne $enc) {
        $$et{VALUE}{MIMEEncoding} = $enc;
        $et->VPrint(0,"  MIMEEncoding [override] = $enc\n");
    }
    $et->HandleTag($tagTablePtr, LineCount => $lines);
    $et->HandleTag($tagTablePtr, WordCount => $words);
    return 1;
}
1; # end
__END__

=head1 NAME

Image::ExifTool::Text - Read Text meta information

=head1 SYNOPSIS

This module is used by Image::ExifTool

=head1 DESCRIPTION

This module contains definitions required by Image::ExifTool to deduce some
characteristics of TXT and CSV files.

=head1 AUTHOR

Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 REFERENCES

=over 4

=item L<https://github.com/file/file>

=back

=head1 SEE ALSO

L<Image::ExifTool::TagNames/Text Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>

=cut
|