File: datamash-parser.pl

package info (click to toggle)
datamash 1.9-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 13,600 kB
  • sloc: ansic: 65,320; sh: 8,982; perl: 5,127; makefile: 250; sed: 16
file content (243 lines) | stat: -rwxr-xr-x 10,276 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env perl
=pod
  Unit Tests for GNU Datamash - perform simple calculation on input data

   Copyright (C) 2013-2021 Assaf Gordon <assafgordon@gmail.com>
   Copyright (C) 2022-2025 Timothy Rice <trice@posteo.net>

   This file is part of GNU Datamash.

   GNU Datamash is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   GNU Datamash is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GNU Datamash.  If not, see <https://www.gnu.org/licenses/>.

   Written by Assaf Gordon.
=cut
use strict;
use warnings;

# Until a better way comes along to auto-use Coreutils Perl modules
# as in the coreutils' autotools system.
use Coreutils;
use CuSkip;
use CuTmpdir qw(datamash);
use MIME::Base64 ;

# Name of this script with any leading directory components removed.
my $program_name = $0;
$program_name =~ s{.*/}{};

my $prog_bin = 'datamash';

## Cross-compilation workaround:
##  when running under qemu/binfmt, argv[0] (the name used when reporting
##  errors) holds the full path of the binary if the binary was found on the
##  $PATH.  Ask the binary which name it actually reports, and fall back to
##  the plain binary name when nothing usable comes back.
my $prog = `$prog_bin ---print-progname` || $prog_bin;

# TODO: add localization tests with "grouping"
# Pin every locale-related variable to 'C' so the executable's output
# is not localized.
$ENV{$_} = 'C' for qw(LANGUAGE LANG LC_ALL);

# Three tab-separated columns: a key (A/B), a number, and a second key (x/y).
my $in1 = "A\t100\tx\n"
        . "A\t10\tx\n"
        . "B\t10\tx\n"
        . "B\t35\ty\n";

# Two rows of five tab-separated integers.
my $in2 = "1\t2\t3\t4\t5\n"
        . "6\t7\t8\t9\t10\n";

# Expected output for summing each of the five columns of $in2.
my $out2 = "7\t9\t11\t13\t15\n";


## NOTE: these tests check the parser behaviour,
##       while ignoring the exact wording of the error messages.
##       The 'datamash-error-msgs.pl' checks the exact message wording.
##       (ERR_SUBST is used to discard the text of STDERR)
# Table of parser test cases, passed by reference to run_tests at the end of
# this file.  Each entry is an array ref holding:
#   - a short test name ('p1', 'e20', ...),
#   - the command-line arguments given to datamash, as one string,
#   - one or more option hashes (IN_PIPE, OUT, EXIT, ERR_SUBST).
# The option hashes are interpreted by run_tests; presumably IN_PIPE is the
# data piped to stdin, OUT the expected stdout and EXIT the expected exit
# status -- confirm against Coreutils.pm.  ERR_SUBST=>'s/.*//s' blanks the
# whole of stderr so only the success/failure of parsing is compared.
my @Tests =
(
  # no explicit mode - 'sum' implies 'group by' without any columns -
  # operate on entire file.
  ['p1', 'sum 2',               {IN_PIPE=>$in1}, {OUT=>"155\n"}],

  # 'old' syntax - with '-g'
  ['p2', '-g 1 sum 2',          {IN_PIPE=>$in1}, {OUT=>"A\t110\nB\t45\n"}],

  # 'new' syntax - without '-g'
  ['p3', 'groupby 1 sum 2',     {IN_PIPE=>$in1}, {OUT=>"A\t110\nB\t45\n"}],
  ['p4', 'gb 1 sum 2',          {IN_PIPE=>$in1}, {OUT=>"A\t110\nB\t45\n"}],

  # group by multiple columns
  ['p5', 'gb 1,3 sum 2',        {IN_PIPE=>$in1},
    {OUT=>"A\tx\t110\nB\tx\t10\nB\ty\t35\n"}],
  # operate by multiple columns with comma
  ['p6', 'gb 1 last 2,3',        {IN_PIPE=>$in1},
    {OUT=>"A\t10\tx\nB\t35\ty\n"}],

  # NOTE(review): the column lists in p7 and p8 jump from 42 to '44,44'
  # (43 is absent, 44 is repeated) -- possibly a typo; confirm before changing.

  # many groupby columns, force the parser to allocate more array items
  ['p7', 'gb 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23' .
         ',24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,44' .
         ',45,46,47,48,49,50 sum 2',          {IN_PIPE=>""}, {OUT=>""}],
  # many operator columns, force the parser to allocate more array items
  ['p8', 'gb 1 sum 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22' .
         ',23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,44' .
         ',45,46,47,48,49,50',          {IN_PIPE=>""}, {OUT=>""}],

  # Invalid numeric value for column parsing should be treated as named column
  ['p9', 'sum 1x', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Processing mode without operation
  ['p10','groupby 1', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # invalid operation after valid mode
  ['p11','groupby 1 foobar 2', {IN_PIPE=>""}, {EXIT=>1},
    {ERR_SUBST=>'s/.*//s'}],

  # missing field number after processing mode
  ['p12','groupby', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # field range syntax
  ['p20','sum 1-44', {IN_PIPE=>""}, {OUT=>""}],

  # compare results of equivalent syntaxes
  # (every variant below must produce the same $out2 from $in2)
  ['p21','sum 1,2,3,4,5', {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p22','sum 1-2,3-4,5', {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p23','sum 1-2,3-5',   {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p24','sum 1-4,5',     {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p25','sum 1-5',       {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p26','sum 1 sum 2 sum 3 sum 4 sum 5',
    {IN_PIPE=>$in2}, {OUT=>$out2}],
  ['p27','sum 1,2 sum 3-5',
    {IN_PIPE=>$in2}, {OUT=>$out2}],

  # 'check' options
  ['p30','check',      {IN_PIPE=>""}, {OUT=>"0 lines, 0 fields\n"}],
  ['p31','check foo',  {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['p32','check 10',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['p33','check lines lines', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['p34','check 1 line fields', {IN_PIPE=>""}, {EXIT=>1},
    {ERR_SUBST=>'s/.*//s'}],
  ['p35','check 10 foo',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],


  # Field range with invalid syntax
  ['e20','sum 1-',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e21','sum 1-x',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e22','sum 4-2',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  # zero in range
  ['e23','sum 0-2',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e24','sum 1-0',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  # Negative in range
  ['e25','sum 1--5',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Test field pair syntax
  ['p40','pcov 1:2',      {IN_PIPE=>""}, {OUT=>""}],
  ['e41','pcov 1', {IN_PIPE=>""}, {EXIT=>1},   {ERR_SUBST=>'s/.*//s'}],
  ['e42','pcov 1:', {IN_PIPE=>""}, {EXIT=>1},  {ERR_SUBST=>'s/.*//s'}],
  ['e43','pcov :', {IN_PIPE=>""}, {EXIT=>1},   {ERR_SUBST=>'s/.*//s'}],
  ['e44','pcov :1', {IN_PIPE=>""}, {EXIT=>1},  {ERR_SUBST=>'s/.*//s'}],
  ['e46','pcov hello:world', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  # a field pair given to 'sum' is expected to fail
  ['e47','sum 1:3', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # The same field pair checks, using 'dotprod'
  ['p50','dotprod 1:2',      {IN_PIPE=>""}, {OUT=>""}],
  ['e51','dotprod 1', {IN_PIPE=>""}, {EXIT=>1},   {ERR_SUBST=>'s/.*//s'}],
  ['e52','dotprod 1:', {IN_PIPE=>""}, {EXIT=>1},  {ERR_SUBST=>'s/.*//s'}],
  ['e53','dotprod :', {IN_PIPE=>""}, {EXIT=>1},   {ERR_SUBST=>'s/.*//s'}],
  ['e54','dotprod :1', {IN_PIPE=>""}, {EXIT=>1},  {ERR_SUBST=>'s/.*//s'}],
  ['e56','dotprod hello:world',
    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Test scanner edge-cases
  # Floating point value
  ['e60','sum 4.5',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  ['e61','sum 4.',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],


  # invalid numbers
  ['e62','sum 4a',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  ['e63','sum 4_',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Overflow strtol
  ['e64','sum 1234567890123456789012345678901234567', {IN_PIPE=>""}, {EXIT=>1},
    {ERR_SUBST=>'s/.*//s'}],

  # Invalid characters
  ['e65','sum "foo^bar"', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Empty columns
  ['e66','sum 1,,', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Range with names instead of numbers
  ['e67','sum foo-bar', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],


  # Valid identifiers with underscores
  ['s66','--header-in sum foo_bar', {IN_PIPE=>"foo_bar\n1\n"}, {OUT=>"1\n"}],
  ['s67','--header-in sum _bar',    {IN_PIPE=>"_bar\n1\n"},    {OUT=>"1\n"}],

  # Binning, and optional parameters
  # (the 'name:value' form attaches a parameter to the operation name)
  ['b31','bin 1',       {IN_PIPE=>""}, {OUT=>""}],
  ['b32','bin:10 1',    {IN_PIPE=>""}, {OUT=>""}],
  ['b33','bin:1.5 1',   {IN_PIPE=>""}, {OUT=>""}],
  ['b34','bin 1,2',     {IN_PIPE=>""}, {OUT=>""}],
  ['b35','bin:1 1,2',   {IN_PIPE=>""}, {OUT=>""}],
  # malformed or unsupported parameters - all expected to fail
  ['e70','bin:10:30 1', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e71','bin: 1',      {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e72','sum: 1',      {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e73','bin:10: 1',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e74','bin:10:1',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e75','bin:10, 1',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e76','bin:, 1',     {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e77','bin,  1',     {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e78','bin:-  1',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e79','sum:10 1',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e30','bin :10 1',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e31','bin : 10 1',  {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e32','bin : 1',     {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e33','bin:1,2 1',   {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e34','bin:-2 1',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e35','bin 1:2',     {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],


  # Field specifications for primary operations
  # (pair and range syntax after 'groupby' are expected to fail)
  ['e90',  'groupby 1:2', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['e91',  'groupby 1-2', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],

  # Test corner cases to avoid introducing unintended changes in behavior
  # The parser allows some optional whitespace
  # (the inner quotes presumably make the whole specification reach datamash
  #  as a single argument -- confirm against how run_tests invokes the program)
  ['cc0',  '" sum 1 , 2"',      {IN_PIPE=>""}, {OUT=>""}],
  ['cc1',  '" sum 1, 2"',       {IN_PIPE=>""}, {OUT=>""}],
  ['cc2',  '" sum 1 ,2"',       {IN_PIPE=>""}, {OUT=>""}],
  ['cc3',  '" sum 1 - 3"',      {IN_PIPE=>""}, {OUT=>""}],
  ['cc4',  '" sum 1- 3"',       {IN_PIPE=>""}, {OUT=>""}],
  ['cc5',  '" sum 1 -3"',       {IN_PIPE=>""}, {OUT=>""}],
  ['cc6',  '" pcov 1 : 2"',     {IN_PIPE=>""}, {OUT=>""}],
  ['cc7',  '" gb 1 , 2 sum 3"', {IN_PIPE=>""}, {OUT=>""}],
  # Trailing whitespace leads to failures (could be seen as parser bug)
  ['cce0', '"sum 1 "',        {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['cce1', "'sum 1\t'",       {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['cce2', '"pcov 1:2  "',    {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
  ['cce3', '"gb 1,2 sum 3 "', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
);

# Optional settings read from the environment and forwarded to run_tests.
my ($keep_temps, $be_verbose) = @ENV{qw(SAVE_TEMPS VERBOSE)};

# Run the whole table; whatever run_tests returns becomes the exit status.
my $failed = run_tests ($program_name, $prog, \@Tests,
                        $keep_temps, $be_verbose);
exit $failed;