File: htmlbeauty

package info (click to toggle)
wpp 2.13.1.35-4
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 1,000 kB
  • ctags: 146
  • sloc: perl: 1,605; makefile: 54
file content (106 lines) | stat: -rwxr-xr-x 2,149 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/perl -w

use File::Basename;
use strict;


select STDERR;
$| = 0;
select STDOUT;

if (@ARGV) {
	foreach (@ARGV) {
		htmlbeauty($_);
	}
} else {
	htmlbeauty('-');
}


sub htmlbeauty {
	my ($ifile) = @_;
	my $OS;
	my $IS;

	print STDERR "[" . basename($0) . "] converting '$ifile'.\n";
	if ($ifile ne '-') {
		if ($ifile !~ /\.html?$/) {
			print STDERR "usage: " . basename($0) .
				" [file1.html] [file2.html] ...\n";
			exit 1;
		}
		open($OS, "> $ifile~") or die "$!";
	} else {
		$OS = *STDOUT;
	}
	my @line;
	my $out;
	my $skip = 0;
	open($IS, $ifile) or die "$!";
	while (<$IS>) {
		chomp;
		@line = split(/(<[^>]*>)/o, $_);
		$out = '';
		foreach (@line) {
			if (m#<(?:script|pre|style|code)#i) {
				$skip = 1;
			}
			if (m#</(?:script|pre|style|code)#i) {
				$skip = 0;
			}

			if (!$skip) {
				# expand tabs
				s/\t/  /g;

				# replace whites with single ones
				#s/([^\s'"])\s+([^\s'"])/$1 $2/g;		# generic
				s/(['"])\s+([^\s'"])/$1 $2/g;				# within html tag

				# remove ending whites
				#s/\s+$/ /g;

				# remove wpp flags
				s/ !wpp=(["'])[0-9]\1//g;

				# rough stressed vowel replacement outside tags
				my $line = $_;
				my $s = '';
				my $v;
				while ($line =~ /(^|>)([^><]+)(<|$)/) {
					$s .= "$`$1";
					$v = "$2";
					$line = "$3$'";
					$v =~ s/([aeiou])['`]([\s!?,:;.]|$)/&$1grave;$2/gi;

					# LANG:IT
					{
						# po'
						$v =~ s/(^|\s)\s*(p)\&(o)grave;\s*(\s|$)/$1$2$3'$4/gi;
						# ne'
						$v =~ s/(^|\s)\s*(n)\&(e)grave;\s*(\s|$)/$1$2$3'$4/gi;
						# perche`, benche`, purche`, poiche`, allorche`, nonche`, se`,
						# finche`, cosicche`, dopodiche`
						$v =~ s/(^|\s)\s*((?:p[eu]r|poi|ben|allor|non|fin|cosic|dopodi)ch|s)\&(e)grave;\s*(\s|\?|\.|$)/$1$2&$3acute;$4/gi;
					}

					$s .= $v;
					#print STDERR "'$line'\n";
				}
				$_ = "$s$line";
			}
				
			# non ASCII-base chars to entites conversion
			s/([\x80-\xff])/'&#'.ord($1).';'/ge;

			$out .= $_;
		}
		print $OS "$out\n" if ($out !~ /^[\r\n\s]*$/ || $skip);
	}
	close($IS);
	if ($ifile ne '-') {
		close($OS);
		rename "$ifile~", $ifile;
	}
	print STDERR "[" . basename($0) . "] converted '$ifile'.\n";
}