1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
|
#!/usr/bin/perl -w
use strict;
# Walk through a perl script and create a masked file which is
# similar but which masks comments, quotes, patterns, and non-code
# lines so that it is easy to parse with regular expressions.
#
# usage:
# perlmask [-cn] myfile.pl >myfile.new
# perlmask [-cn] <myfile.pl >myfile.new
#
# In the masked file,
# -comments and pod will be masked (or removed)
# -here-doc text lines will be masked (or removed)
# -quotes and patterns, qw quotes, and here doc << operators will be
# replaced by the letters 'Q', 'q', or 'h'
#
# The result is a file in which all braces, parens, and square brackets
# are balanced, and it can be parsed relatively easily by regular
# expressions.
#
# -cn is an optional 'compression' flag. By default the masked file will have
# the same number of characters as the input file, with the difference being
# that certain characters will be changed (masked).
#
# If character position correspondence is not required, the size of the masked
# file can be significantly reduced by increasing the 'compression' level as
# follows:
#
# -c0 all mask file line numbers and character positions agree with
# original file (DEFAULT)
# -c1 line numbers agree and character positions agree within lines of code
# -c2 line numbers agree but character positions do not
# -c3 no correspondence between line numbers or character positions
#
# Try each of these on a file of significant size to see how they work.
# The default, -c0, is required if you are working with character positions
# that span multiple lines. The other levels may be useful if you
# do not need this level of correspondence.
#
# This file is one of the examples distributed with perltidy and demonstrates
# using a callback object with Perl::Tidy to walk through a perl file and find
# all of its tokens. It can be useful for simple perl code parsing tasks. It
# might even be helpful in debugging. Or you may want to modify it to suit
# your own purposes.
#
use Getopt::Std;
use IO::File;
$| = 1;
use vars qw($opt_c $opt_h);
my $usage = <<EOM;
usage: perlmask [ -cn ] filename >outfile
EOM
getopts('c:h') or die "$usage";
if ($opt_h) { die $usage }
unless ( defined($opt_c) ) { $opt_c = 0 }
if (@ARGV > 1) { die $usage }
my $source=$ARGV[0]; # an undefined filename will become stdin
# strings to hold the files (arrays could be used to)
my ( $masked_file, $original_file );
PerlMask::perlmask(
_source => $source,
_rmasked_file => \$masked_file,
_roriginal_file => \$original_file, # optional
_compression => $opt_c # optional, default=0
);
# Now we have the masked and original files in strings of equal length.
# We could search for specific text in the masked file here. But here
# we'll just print the masked file:
if ($masked_file) { print $masked_file; }
#####################################################################
#
# The PerlMask package is an interface to perltidy which accepts a
# source filehandle and returns a 'masked' version of the source as
# a string or array. It can also optionally return the original file
# as a string or array.
#
# It works by making a a callback object with a write_line() method to
# receive tokenized lines from perltidy. This write_line method
# selectively replaces tokens with either their original text or with a
# benign masking character (such as '#' or 'Q').
#
# Usage:
#
# PerlMask::perlmask(
# _source => $fh, # required source
# _rmasked_file => \$masked_file, # required ref to ARRAY or SCALAR
# _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR
# _compression => $opt_c # optional
# );
#
# _source is any source that perltidy will accept, including a
# filehandle or reference to SCALAR or ARRAY
#
# The compression flag may have these values:
# 0 all mask file line numbers and character positions agree with
# original file (DEFAULT)
# 1 line numbers agree and character positions agree within lines of code
# 2 line numbers agree but character positions do not
# 3 no correspondence between line numbers or character positions
#
#####################################################################
package PerlMask;
use Carp;
use Perl::Tidy;
sub perlmask {
my %args = ( _compression => 0, @_ );
my $rfile = $args{_rmasked_file};
unless ( defined($rfile) ) {
croak
"Missing required parameter '_rmasked_file' in call to perlmask\n";
}
my $ref=ref($rfile);
unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) {
croak <<EOM;
Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref)
EOM
}
# run perltidy, which will call $formatter's write_line() for each line
perltidy(
'source' => $args{_source},
'formatter' => bless( \%args, __PACKAGE__ ), # callback object
'argv' => "-npro -se", # -npro : ignore .perltidyrc,
# -se : errors to STDOUT
);
}
sub print_line {
# called from write_line to dispatch one line (either masked or original)..
# here we'll either append it to a string or array, as appropriate
my ( $rfile, $line ) = @_;
if ( defined($rfile) ) {
if ( ref($rfile) eq 'SCALAR' ) {
$$rfile .= $line . "\n";
}
elsif ( ref($rfile) eq 'ARRAY' ) {
push @{$rfile}, $line . "\n";
}
}
}
sub write_line {
# This is called from perltidy line-by-line
my ( $self, $line_of_tokens ) = @_;
my $rmasked_file = $self->{_rmasked_file};
my $roriginal_file = $self->{_roriginal_file};
my $opt_c = $self->{_compression};
my $line_type = $line_of_tokens->{_line_type};
my $input_line_number = $line_of_tokens->{_line_number};
my $input_line = $line_of_tokens->{_line_text};
my $rtoken_type = $line_of_tokens->{_rtoken_type};
my $rtokens = $line_of_tokens->{_rtokens};
chomp $input_line;
# mask non-CODE lines
if ( $line_type ne 'CODE' ) {
return if ( $opt_c == 3 );
my $len = length($input_line);
if ( $opt_c == 0 && $len > 0 ) {
print_line( $roriginal_file, $input_line ) if $roriginal_file;
print_line( $rmasked_file, '#' x $len );
}
else {
print_line( $roriginal_file, $input_line ) if $roriginal_file;
print_line( $rmasked_file, "" );
}
return;
}
# we'll build the masked line token by token
my $masked_line = "";
# add leading spaces if not in a higher compression mode
if ( $opt_c <= 1 ) {
# Find leading whitespace. But be careful..we don't want the
# whitespace if it is part of quoted text, because it will
# already be contained in a token.
if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} )
{
$masked_line = $1;
}
}
# loop over tokens to construct one masked line
for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) {
# Mask certain token types by replacing them with their type code:
# type definition
# ---- ----------
# Q quote or pattern
# q qw quote
# h << here doc operator
# # comment
#
# This choice will produce a mask file that has balanced
# container tokens and does not cause parsing problems.
if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) {
if ( $opt_c <= 1 ) {
$masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] );
}
else {
$masked_line .= $$rtoken_type[$j];
}
}
# Mask a comment
elsif ( $$rtoken_type[$j] eq '#' ) {
if ( $opt_c == 0 ) {
$masked_line .= '#' x length( $$rtokens[$j] );
}
}
# All other tokens go out verbatim
else {
$masked_line .= $$rtokens[$j];
}
}
print_line( $roriginal_file, $input_line ) if $roriginal_file;
print_line( $rmasked_file, $masked_line );
# self-check lengths; this error should never happen
if ( $opt_c == 0 && length($masked_line) != length($input_line) ) {
my $lmask = length($masked_line);
my $linput = length($input_line);
print STDERR
"$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n";
}
}
# called once after the last line of a file
sub finish_formatting {
my $self = shift;
return;
}
|