File: match-check-template

package info (click to toggle)
dgit 12.16
  • links: PTS, VCS
  • area: main
  • in suites: trixie-proposed-updates
  • size: 3,368 kB
  • sloc: perl: 13,443; sh: 6,466; python: 334; makefile: 324; tcl: 69
file content (187 lines) | stat: -rwxr-xr-x 5,080 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/perl -w
#
# usage:
#   match-check-template TEMPLATE FILE
# Checks a text file against a template (a pattern).
#
# Template data is indented by 4 characters, with control information
# in the first two and then two separating spaces.
#
# Template line syntax
#   TR PATTERN
# where T can be
#   space   - ??? maps to .*, no way to escape ???
#   E       - PATTERN is a perl regexp fragment
#   L       - PATTERN is a literal string
#   #       - this is a comment line, rest is ignored
# and R can be
#   space ? * +
#
# For regexp PATTERNs, don't use numbered capture groups: the supplied regexp
# will be embedded in a longer regexp including many other PATTERNs.  Use
# named capture groups, which should be distinct across all PATTERNs.
#
# If the template doesn't match, prints (to stdout) a report showing
# the longest prefix of the template which *does* match,
# and which template lines matched which file lines.
#
# TODO put this in chiark-utils and depend on it instead of including it
# (with a transition plan).

use strict;
use Carp;

sub read_file ($) {
    my ($n) = @_;
    local $/ = undef;
    open F, '<', $n or die "$n: $!";
    my $r = <F>;
    F->error and die $!;
    close F;
    return $r;
}

die unless @ARGV==2 && $ARGV[0] !~ m/^-/;
my ($exp_file) = @ARGV;
our ($exp, $got) = map { read_file $_ } @ARGV;

# This template file syntax is line based; files without trailing newline
# can't sensibly be matched.  (If we didn't do this here, you could in
# theory handle a file without a final newline but the missing newline
# would have to be matched against the final line of the template.
# Instead, to support missing final newlines, we should have some
# bespoke other pattern syntax.
sub no_nl_chk ($$) {
    my ($desc, $data) = @_;
    return if $data =~ m/\n$/s;
    die "no newline at end of $desc\n";
}
no_nl_chk 'expected', $exp;
no_nl_chk 'actual', $got;

my $exp_line = 0;
my @exp;
# $exp[]{Re}      regexp for PATTERN (including the newline)
# $exp[]{Line}    line number in template file
# $exp[]{Repeat}  R from the input, but '' instead of ' '

foreach (split m{(?<=\n)}, $exp, -1) {
    $exp_line++;

    next if m{^\#};

    my $e = { Orig => $_ };

    # pad (before the newline) with spaces, in case the line is short
    s{^(.{0,3})(\n?)$}{ sprintf "%-4s%s", $1, $2 }e;

    s{^(.)(.)  }{} or
      die "$exp_file:$exp_line: missing spaces before pattern data\n";

    my ($t, $r) = ($1, $2);

    $e->{Re} =
      $t eq ' ' ? join '.*', map { quotemeta $_ } split m/\Q???/, $_, -1 :
      $t eq 'L' ? quotemeta $_ :
      $t eq 'E' ? $_ :
      die "$exp_file:$exp_line: unknown line type ($t)\n";

    $r =~ m{^[ ?*+]$} or
      die "$exp_file:$exp_line: unknown repeat mode ($r)\n";
    $r =~ s{ }{};

    $e->{Repeat} = $r;
    $e->{Line} = $exp_line;

    #use Data::Dumper;
    #print STDERR Dumper($e);

    push @exp, $e;
}

my $exp_whole = join '', map {
    my $e = $_;
    my $re = $e->{Re};
    qr{$re}.$e->{Repeat}
} @exp;

#print STDERR ">>>>>$exp_whole<<<<<";

# fast success path
exit 0 if $got =~ m{$exp_whole}s;


#---- failure reporting -----

sub p { print @_ or die $!; }

p "# mismatch! prefix that matches (| = actual output):\n";

# use Data::Dumper;

my ($l, $rhs, @cap);

# Find the longest prefix that matches.  We iterate O(n), rather than
# (eg) binary search, for simplicity.  If this ever turns out to be too slow
# it could be improved.

for ($l=@exp; ; $l--) {
    $exp_whole = '^';
    for (my $i=0; $i<$l; $i++) {
	my $e = $exp[$i];
	my $re = $e->{Re};
	# Each template line's regexp is wrapped in a named capture group.
	# That will let us print how the match went.
	# We use capture groups with these long names to avoid any confusion
	# if any PATTERN also uses capture groups.
	$exp_whole .= sprintf "(?<%s>%s)",
	  "template_match_check_$i",
	  qr{$re}.$e->{Repeat};
    }
    # No $ at the end of $exp_whole - we expect to match a prefix of the file.
    if ($got =~ m{$exp_whole}s) {
	# Perl scoping rules for $' and $+ mean we must copy things here.
	$rhs = $';
	foreach (my $i=0; $i<$l; $i++) {
	    push @cap, $+{"template_match_check_$i"};
	}
	last;
    }
    confess "$exp_whole ?" unless $l > 0;
}

my $i;
for ($i=0; $i<$l; $i++) {
    my $e = $exp[$i];
    p $e->{Orig};
    if ($cap[$i] ne '') {
	# Every pattern which had a nonzero repeat will have a nonempty
	# match text, since the pattern always ends in \n.
	my $cap = $cap[$i];
	my @l = split m{^}m, $cap, -1;
	#use Data::Dumper;
	#print STDERR Dumper($i, $cap, \@l);
	p "|   $_" foreach @l;
    } else {
	# But ? and * might repeat 0 times and we should print *something*.
	p "|(none)\n";
    }
}
print "# mismatch:\n" or die $!;
if ($i<@exp) {
    my $e = $exp[$i];
    # use Data::Dumper;
    # p Dumper($exp_whole, $&, $', $e->{Orig}, $+{"template_match_check_$i"});
    p $e->{Orig};
} else {
    p " (eof)\n";
}
if ($rhs) {
    # $rhs is the whole unmatched suffix of the file
    my @l = split "\n", $rhs, 2; # trailing newline was guaranteed earlier
    p "|   $l[0]\n";
} else {
    p "|(eof)\n";
}

exit 1;