File: perlmask.pl

package info (click to toggle)
perltidy 20031021-2
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 1,364 kB
  • ctags: 591
  • sloc: perl: 16,452; makefile: 46
file content (247 lines) | stat: -rw-r--r-- 8,634 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/perl -w
use strict;

# Walk through a perl script and create a masked file which is
# similar but which masks comments, quotes, patterns, and non-code
# lines so that it is easy to parse with regular expressions.
#
# usage:
# perlmask [-cn]  myfile.pl >myfile.new
# perlmask [-cn] <myfile.pl >myfile.new
#
# In the masked file,
#  -comments and pod will be masked (or removed)
#  -here-doc text lines will be masked (or removed)
#  -quotes and patterns, qw quotes, and here doc << operators will be
#   replaced by the letters 'Q', 'q', or 'h'
#
# The result is a file in which all braces, parens, and square brackets
# are balanced, and it can be parsed relatively easily by regular
# expressions.
#
# -cn is an optional 'compression' flag.  By default the masked file will have
# the same number of characters as the input file, with the difference being
# that certain characters will be changed (masked).
#
# If character position correspondence is not required, the size of the masked
# file can be significantly reduced by increasing the 'compression' level as
# follows:
#
# -c0 all mask file line numbers and character positions agree with
#     original file (DEFAULT)
# -c1 line numbers agree and character positions agree within lines of code
# -c2 line numbers agree but character positions do not
# -c3 no correspondence between line numbers or character positions
#
# Try each of these on a file of significant size to see how they work.
# The default, -c0, is required if you are working with character positions
# that span multiple lines.  The other levels may be useful if you
# do not need this level of correspondence.
#
# This file is one of the examples distributed with perltidy and demonstrates
# using a callback object with Perl::Tidy to walk through a perl file and find
# all of its tokens.  It can be useful for simple perl code parsing tasks.  It
# might even be helpful in debugging.  Or you may want to modify it to suit
# your own purposes.
#
use Getopt::Std;
use IO::File;
$| = 1;
use vars qw($opt_c $opt_h);
my $usage = <<EOM;
   usage: perlmask [ -cn ] filename >outfile
EOM
getopts('c:h') or die "$usage";
if ($opt_h) { die $usage }
unless ( defined($opt_c) ) { $opt_c = 0 }
if (@ARGV > 1) { die $usage }

my $source=$ARGV[0];   # an undefined filename will become stdin

# strings to hold the files (arrays could be used to)
my ( $masked_file, $original_file );  

PerlMask::perlmask(
    _source         => $source,
    _rmasked_file   => \$masked_file,
    _roriginal_file => \$original_file,    # optional
    _compression    => $opt_c              # optional, default=0
);

# Now we have the masked and original files in strings of equal length.
# We could search for specific text in the masked file here.  But here
# we'll just print the masked file:
if ($masked_file) { print $masked_file; }

#####################################################################
#
# The PerlMask package is an interface to perltidy which accepts a
# source filehandle and returns a 'masked' version of the source as
# a string or array.  It can also optionally return the original file
# as a string or array.
#
# It works by making a a callback object with a write_line() method to
# receive tokenized lines from perltidy.  This write_line method
# selectively replaces tokens with either their original text or with a
# benign masking character (such as '#' or 'Q').
#
# Usage:
#
#   PerlMask::perlmask(
#       _source         => $fh,             # required source
#       _rmasked_file   => \$masked_file,   # required ref to ARRAY or SCALAR
#       _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR
#       _compression    => $opt_c           # optional
#   );
#
# _source is any source that perltidy will accept, including a
# filehandle or reference to SCALAR or ARRAY
#
# The compression flag may have these values:
#  0 all mask file line numbers and character positions agree with
#    original file (DEFAULT)
#  1 line numbers agree and character positions agree within lines of code
#  2 line numbers agree but character positions do not
#  3 no correspondence between line numbers or character positions
#
#####################################################################

package PerlMask;
use Carp;
use Perl::Tidy;

sub perlmask {

    my %args = ( _compression => 0, @_ );
    my $rfile = $args{_rmasked_file};
    unless ( defined($rfile) ) {
        croak
          "Missing required parameter '_rmasked_file' in call to perlmask\n";
    }
    my $ref=ref($rfile);
    unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) {
            croak <<EOM;
Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref)
EOM
    }

    # run perltidy, which will call $formatter's write_line() for each line
    perltidy(
        'source'    => $args{_source},
        'formatter' => bless( \%args, __PACKAGE__ ),    # callback object
        'argv'        => "-npro -se",    # -npro : ignore .perltidyrc,
                                         # -se   : errors to STDOUT
    );
}

sub print_line {

    # called from write_line to dispatch one line (either masked or original)..
    # here we'll either append it to a string or array, as appropriate
    my ( $rfile, $line ) = @_;
    if ( defined($rfile) ) {
        if ( ref($rfile) eq 'SCALAR' ) {
            $$rfile .= $line . "\n";
        }
        elsif ( ref($rfile) eq 'ARRAY' ) {
            push @{$rfile}, $line . "\n";
        }
    }
}

sub write_line {

    # This is called from perltidy line-by-line
    my ( $self, $line_of_tokens ) = @_;
    my $rmasked_file   = $self->{_rmasked_file};
    my $roriginal_file = $self->{_roriginal_file};
    my $opt_c          = $self->{_compression};

    my $line_type         = $line_of_tokens->{_line_type};
    my $input_line_number = $line_of_tokens->{_line_number};
    my $input_line        = $line_of_tokens->{_line_text};
    my $rtoken_type       = $line_of_tokens->{_rtoken_type};
    my $rtokens           = $line_of_tokens->{_rtokens};
    chomp $input_line;

    # mask non-CODE lines
    if ( $line_type ne 'CODE' ) {
        return if ( $opt_c == 3 );
        my $len = length($input_line);
        if ( $opt_c == 0 && $len > 0 ) {
            print_line( $roriginal_file, $input_line ) if $roriginal_file;
            print_line( $rmasked_file, '#' x $len ); 
        }
        else {
            print_line( $roriginal_file, $input_line ) if $roriginal_file;
            print_line( $rmasked_file, "" );
        }
        return;
    }

    # we'll build the masked line token by token
    my $masked_line = "";

    # add leading spaces if not in a higher compression mode
    if ( $opt_c <= 1 ) {

        # Find leading whitespace.  But be careful..we don't want the
        # whitespace if it is part of quoted text, because it will 
        # already be contained in a token.
        if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} )
        {
            $masked_line = $1;
        }
    }

    # loop over tokens to construct one masked line
    for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) {

        # Mask certain token types by replacing them with their type code:
        # type  definition
        # ----  ----------
        # Q     quote or pattern
        # q     qw quote
        # h     << here doc operator
        # #     comment
        #
        # This choice will produce a mask file that has balanced
        # container tokens and does not cause parsing problems.
        if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) {
            if ( $opt_c <= 1 ) {
                $masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] );
            }
            else {
                $masked_line .= $$rtoken_type[$j];
            }
        }

        # Mask a comment
        elsif ( $$rtoken_type[$j] eq '#' ) {
            if ( $opt_c == 0 ) {
                $masked_line .= '#' x length( $$rtokens[$j] );
            }
        }

        # All other tokens go out verbatim
        else {
            $masked_line .= $$rtokens[$j];
        }
    }
    print_line( $roriginal_file, $input_line ) if $roriginal_file;
    print_line( $rmasked_file, $masked_line );

    # self-check lengths; this error should never happen
    if ( $opt_c == 0 && length($masked_line) != length($input_line) ) {
        my $lmask  = length($masked_line);
        my $linput = length($input_line);
        print STDERR
"$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n";
    }
}

# called once after the last line of a file
sub finish_formatting {
    my $self = shift;
    return;
}