File: Text.pm

package info (click to toggle)
libimage-exiftool-perl 12.16%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 25,940 kB
  • sloc: perl: 263,492; xml: 120; makefile: 13
file content (245 lines) | stat: -rw-r--r-- 8,319 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#------------------------------------------------------------------------------
# File:         Text.pm
#
# Description:  Deduce characteristics of TXT and CSV files
#
# Revisions:    2019-11-01 - P. Harvey Created
#               2020-02-13 - PH Added CSV file support
#
# References:   1) https://github.com/file/file
#------------------------------------------------------------------------------

package Image::ExifTool::Text;

use strict;
use vars qw($VERSION);
use Image::ExifTool qw(:DataAccess :Utils);
use Image::ExifTool::XMP;

$VERSION = '1.03';

# Tag table for the characteristics deduced from TXT and CSV files
# (NO_ID is set; the entries below are keyed by tag name)
%Image::ExifTool::Text::Main = (
    VARS => {
        NO_ID => 1,
    },
    GROUPS => {
        0 => 'File',
        1 => 'File',
        2 => 'Document',
    },
    NOTES => q{
        Although basic text files contain no metadata, the following tags are
        determined from a simple analysis of the data in TXT and CSV files. 
        Statistics are generated only for 8-bit encodings, but the L<FastScan|../ExifTool.html#FastScan> (-fast)
        option may be used to limit processing to the first 64 kB in which case some
        tags are not produced.  To avoid long processing delays, ExifTool will issue
        a minor warning and process only the first 64 kB of any file larger than 20
        MB unless the L<IgnoreMinorErrors|../ExifTool.html#IgnoreMinorErrors> (-m)
        option is used.
    },
    # character encoding name (the only tag here in the family 2 'Other' group)
    MIMEEncoding => {
        Groups => { 2 => 'Other' },
    },
    # raw newline sequence, converted to a description for printing
    Newlines => {
        PrintConv => {
            "\r\n" => 'Windows CRLF',
            "\r"   => 'Macintosh CR',
            "\n"   => 'Unix LF',
            ''     => '(none)',
        },
    },
    # flag for the presence of a byte order mark
    ByteOrderMark => {
        PrintConv => {
            0 => 'No',
            1 => 'Yes',
        },
    },
    # statistics for plain text files
    LineCount => {},
    WordCount => {},
    # statistics for CSV files
    Delimiter => {
        PrintConv => {
            ''   => '(none)',
            ','  => 'Comma',
            ';'  => 'Semicolon',
            "\t" => 'Tab',
        },
    },
    Quoting => {
        PrintConv => {
            ''  => '(none)',
            '"' => 'Double quotes',
            "'" => 'Single quotes',
        },
    },
    RowCount => {},
    ColumnCount => {},
);

#------------------------------------------------------------------------------
# Extract some stats from a text file
# Inputs: 0) ExifTool ref, 1) dirInfo ref (the TestBuff and RAF entries are used)
# Returns: 1 on success, 0 if this wasn't a text file
# Notes: - MIMEEncoding, ByteOrderMark and Newlines are guessed from the start of
#          the file (up to 64 kB is read when FastScan is less than 3)
#        - Delimiter/Quoting/ColumnCount/RowCount are generated for CSV files and
#          LineCount/WordCount for other text files, but only for 8-bit
#          encodings and only when FastScan is not set
#        - the input record separator ($/) is changed only through "local", so
#          it is always restored when the enclosing block is left
sub ProcessTXT($$)
{
    my ($et, $dirInfo) = @_;
    my $dataPt = $$dirInfo{TestBuff};
    my $raf = $$dirInfo{RAF};
    my $fast = $et->Options('FastScan') || 0;
    my ($buff, $enc, $isBOM, $isUTF8);
    my $nl = '';

    return 0 unless length $$dataPt; # can't call it a text file if it has no text

    # read more from the file if necessary
    # (a test buffer of exactly testLen bytes presumably means more data follows)
    if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
        $raf->Read($buff, 65536) or return 0;
        $dataPt = \$buff;
    }
#
# make our best guess at the character encoding (EBCDIC is not supported)
#
    if ($$dataPt =~ /[\0-\x06\x0e-\x1a\x1c-\x1f\x7f]/) {
        # file contains weird control characters, could be multi-byte Unicode
        # (the UTF-32 BOM must be tested first because the UTF-32LE BOM begins
        # with the UTF-16LE BOM)
        if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
            if ($1 eq "\xff\xfe\0\0") {
                $enc = 'utf-32le';
                $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
            } else {
                $enc = 'utf-32be';
                $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
            }
        } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
            if ($1 eq "\xff\xfe") {
                $enc = 'utf-16le';
                $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
            } else {
                $enc = 'utf-16be';
                $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
            }
        } else {
            return 0;       # probably not a text file
        }
        $nl =~ tr/\0//d;    # remove nulls from newline sequence
        $isBOM = 1;         # (we don't recognize UTF-16/UTF-32 without one)
    } else {
        # 8-bit data ($isUTF8 stays undefined for the multi-byte encodings above)
        $isUTF8 = Image::ExifTool::XMP::IsUTF8($dataPt, 1);
        if ($isUTF8 == 0) {
            $enc = 'us-ascii';
        } elsif ($isUTF8 > 0) {
            $enc = 'utf-8';
            $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
        } elsif ($$dataPt !~ /[\x80-\x9f]/) {
            $enc = 'iso-8859-1';
        } else {
            $enc = 'unknown-8bit';
        }
        $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
    }

    my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');

    $et->SetFileType();
    $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);

    return 1 if $fast == 3 or not $raf->Seek(0,0);

    $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
    $et->HandleTag($tagTablePtr, Newlines => $nl);

    # statistics are generated only for 8-bit encodings without FastScan
    return 1 if $fast or not defined $isUTF8;
#
# generate stats for CSV files
#
    if ($$et{FileType} eq 'CSV') {
        my ($delim, $quot, $ncols, $quotedField);
        my $nrows = 0;
        {
            # LF and CRLF files are split as before, but the records of a
            # CR-only file must be split on "\r" or the whole file is read as
            # a single row (the "local" is undone at the end of this block)
            local $/ = "\r" if $nl eq "\r";
            while ($raf->ReadLine($buff)) {
                if (not defined $delim) {
                    # the delimiter is the most frequent candidate in the first row
                    my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
                    ++$count{$_} foreach $buff =~ /[,;\t]/g;
                    if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
                        $delim = ',';
                    } elsif ($count{';'} > $count{"\t"}) {
                        $delim = ';';
                    } elsif ($count{"\t"}) {
                        $delim = "\t";
                    } else {
                        $delim = '';
                        $ncols = 1;
                    }
                    # pattern for a quoted field: $2 is the quote character and
                    # $3 the field content.  The look-ahead accepts trailing CR/LF
                    # so a quoted field is also recognized at the end of a line
                    # terminated by "\r\n" or "\r"
                    $quotedField = qr/(^|\Q$delim\E)(["'])(.*?)\2(?=\Q$delim\E|[\r\n]*\z)/s;
                    unless ($ncols) {
                        # account for delimiters in quotes (simplistically)
                        while ($buff =~ /$quotedField/g) {
                            $quot = $2;
                            my $field = $3;
                            # subtract the number of delimiters inside this field
                            $count{$delim} -= () = $field =~ /\Q$delim\E/g;
                        }
                        $ncols = $count{$delim} + 1;
                    }
                } elsif (not $quot) {
                    # keep looking for a quoted field until the quoting is known
                    $quot = $2 if $buff =~ $quotedField;
                }
                # (Warn returns false if minor errors are ignored, and in this
                # case all rows are counted)
                if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
                    undef $nrows;
                    last;
                }
            }
        }
        $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
        $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
        $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
        $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
        return 1;
    }
    return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
        $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
#
# count lines/words and check encoding of the rest of the file
#
    my ($lines, $words) = (0, 0);
    {
        # read lines using the detected newline sequence
        # (the "local" is undone at the end of this block)
        local $/ = $nl if $nl;
        while ($raf->ReadLine($buff)) {
            ++$lines;
            ++$words while $buff =~ /\S+/g;
            if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
                # (the first line must have been longer than 64 kB)
                $nl = $1;
                # update the stored value only if a Newlines value was saved
                $$et{VALUE}{Newlines} = $nl if defined $$et{VALUE}{Newlines};
            }
            next if $raf->Tell() < 65536;
            # continue to check encoding after the first 64 kB
            if ($isUTF8 >= 0) { # (if ascii or utf8)
                $isUTF8 = Image::ExifTool::XMP::IsUTF8(\$buff);
                if ($isUTF8 > 0) {
                    $enc = 'utf-8';
                } elsif ($isUTF8 < 0) {
                    $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
                }
            } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
                $enc = 'unknown-8bit';
            }
        }
    }
    # override the encoding found earlier if the rest of the file changed it
    # (only if a value was saved, to avoid comparing with an undefined value
    # and creating an entry for a tag that was never stored)
    my $oldEnc = $$et{VALUE}{MIMEEncoding};
    if (defined $oldEnc and $oldEnc ne $enc) {
        $$et{VALUE}{MIMEEncoding} = $enc;
        $et->VPrint(0,"  MIMEEncoding [override] = $enc\n");
    }
    $et->HandleTag($tagTablePtr, LineCount => $lines);
    $et->HandleTag($tagTablePtr, WordCount => $words);
    return 1;
}


1;  # end

__END__

=head1 NAME

Image::ExifTool::Text - Read Text meta information

=head1 SYNOPSIS

This module is used by Image::ExifTool

=head1 DESCRIPTION

This module contains definitions required by Image::ExifTool to deduce some
characteristics of TXT and CSV files.

=head1 AUTHOR

Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 REFERENCES

=over 4

=item L<https://github.com/file/file>

=back

=head1 SEE ALSO

L<Image::ExifTool::TagNames/Text Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>

=cut