1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
|
package Uncruft;
my $CLASS = __PACKAGE__;    # not referenced by the code visible here; kept as-is
use strict;
use warnings;
use base qw(Exporter);
our @EXPORT = qw(uncruft);

# Opening or closing <p>, <br> and <ref> tags, optionally carrying attributes.
my $html_xml_tags_re = qr/<\/?(?:p|br|ref)(?:\s[^>]*)?>/i;

# uncruft($text)
#
# Return a cleaned-up copy of $text with comment markers, a few common
# HTML/XML tags and most punctuation stripped, and all whitespace collapsed
# to single spaces (so the result is a single line).
#
# The characters that survive are: space, ASCII letters and digits, and
# . , : @ ; ( ) / -
#
# The text is processed in a lexical copy: neither the caller's argument nor
# the caller's $_ is modified.
sub uncruft {
    my ($text) = @_;

    # Remove generic comments: look for 4 or more lines beginning with a
    # regular comment pattern (1-3 punctuation characters followed by
    # blanks and text) and trim it, using the characters of the first
    # marker found.  The steps below run whether or not this one fired.
    my @matches = $text =~ m/^[ \t]*([^a-zA-Z0-9\s]{1,3})[ \t]+\S/mg;
    if ( @matches >= 4 ) {
        my $comment_re = qr/^[ \t]*[\Q$matches[0]\E]{1,3}[ \t]*/m;
        $text =~ s/$comment_re//g;
    }

    # Same idea for word-style comment leaders.
    my @wordmatches = $text =~ m/^[ \t]*(dnl|REM|COMMENT)[ \t]+\S/mg;
    if ( @wordmatches >= 4 ) {
        my $comment_re = qr/^[ \t]*\Q$wordmatches[0]\E[ \t]*/m;
        $text =~ s/$comment_re//g;
    }

    # Remove other side of "boxed" comments
    $text =~ s/[ \t]*[*#][ \t]*$//gm;

    # Remove Fortran comments
    $text =~ s/^[cC]$//gm;
    $text =~ s/^[cC] //gm;

    # Remove C / C++ comments; the lookbehind spares "//" that follows a
    # colon (as in "http://").
    $text =~ s#(\*/|/\*|(?<!:)//)##g;

    # Strip escaped newline (a literal backslash followed by "n")
    $text =~ s/\s*\\n\s*/ /g;

    # strip trailing dash, assuming it is soft-wrap
    # (example: disclaimers in GNU autotools file "install-sh")
    $text =~ s/-\r?\n//g;

    # strip common html and xml tags
    $text =~ s/$html_xml_tags_re//g;

    # Turn tabs, carriage returns and newlines into plain spaces.
    $text =~ tr/\t\r\n/ /;

    # Delete every character not in the allowed set;
    # this also removes quotes
    $text =~ tr% A-Za-z.,:@;0-9\(\)/-%%cd;

    # Squeeze runs of spaces down to a single space.
    $text =~ tr/ //s;

    return $text;
}
1;
|