1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
#! /usr/bin/perl
# Script to pre-process XML input before processing it for various purposes.
# Options specify which transformations are to be done. Monospaced literal
# layout blocks are never touched.
# Changes:
# -ascii: Replace ’ by '
# Replace © by (c)
# Replace † by *
# Replace ‡ by **
# Replace by a space
# Replace – by -
# Put quotes round <quote> text
#
# -quoteliteral:
# Put quotes round <literal> text
#
# -bookinfo: Remove the <bookinfo> element from the file
#
# -fi: Replace "fi" by fi except when it is in an XML element, or
# inside a <literal>.
#
# -html: Certain things are done only for HTML output:
#
# If <literallayout> is followed by optional space and then a
# newline, the space and newline are removed, because otherwise you
# get a blank line in the HTML output.
#
# -noindex Remove the XML that generates indexes.
# -oneindex Ditto, but add XML to generate a single index.
#
# -optbreak Insert an optional line break (zero-width space, ​) after
# every underscore in text within <option> and <variable> elements,
# except when preceded by <entry> (i.e. not in tables). The same is
# also done within a word of four or more upper-case letters (for
# compile-time options).
# The function that processes non-literal, non-monospaced text
sub process()
{
my($s) = $_[0];
$s =~ s/fi(?![^<>]*>)/fi/g if $ligatures;
if ($optbreak)
{
$s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)%
my($x,$y) = ($1,$2); $y =~ s/_/_​/g; "$x"."$y"%gex;
$s =~ s?\b([A-Z_]{4,})\b?
my($x) = $1; $x =~ s/_/_​/g; "$x"?gex;
}
if ($ascii)
{
$s =~ s/’/'/g;
$s =~ s/©/(c)/g;
$s =~ s/†/*/g;
$s =~ s/‡/**/g;
$s =~ s/&nsbp;/ /g;
$s =~ s/–/-/g;
$s =~ s/<quote>/"/g;
$s =~ s/<\/quote>/"/g;
}
$s;
}
# The main program
$ascii = 0;
$bookinfo = 0;
$html = 0;
$inliteral = 0;
$inliterallayout = 0;
$ligatures = 0;
$madeindex = 0;
$noindex = 0;
$oneindex = 0;
$optbreak = 0;
$quoteliteral = 0;
foreach $arg (@ARGV)
{
if ($arg eq "-fi") { $ligatures = 1; }
elsif ($arg eq "-ascii") { $ascii = 1; }
elsif ($arg eq "-bookinfo") { $bookinfo = 1; }
elsif ($arg eq "-html") { $html = 1; }
elsif ($arg eq "-noindex") { $noindex = 1; }
elsif ($arg eq "-oneindex") { $oneindex = 1; }
elsif ($arg eq "-optbreak") { $optbreak = 1; }
elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; }
else { die "** Pre-xml: Unknown option \"$arg\"\n"; }
}
while (<STDIN>)
{
# Remove <bookinfo> if required
if ($bookinfo && /^<bookinfo/)
{
while (<STDIN>) { last if /^<\/bookinfo/; }
next;
}
# Copy monospaced literallayout blocks
if (/^<literallayout class="monospaced">/)
{
$_ = substr($_, 0, -1) if $html;
print;
while (<STDIN>)
{
print;
last if /^<\/literallayout>/;
}
next;
}
# Adjust index-generation code if required
if (($noindex || $oneindex) && /^<index[\s>]/)
{
while (<STDIN>)
{
last if /^<\/index>/;
}
if ($oneindex && !$madeindex)
{
$madeindex = 1;
print "<index><title>Index</title></index>\n";
}
next;
}
# A line that is not in a monospaced literal block; keep track of which
# parts are in <literal> and which not. The latter get processed by the
# function above. Items in <literal> get quoted unless they are also in
# a <literallayout> block, or are already being quoted.
for (;;)
{
$_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/;
$inliterallayout = 1 if /^<literallayout/;
$inliterallayout = 0 if /^<\/literallayout/;
if ($inliteral)
{
if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)
{
print $1;
print "\"" if $quoteliteral && !$inliterallayout;
print "</literal>";
$inliteral = 0;
$_ = "$2\n";
}
else
{
print;
last;
}
}
# Not in literal state
else
{
if (/^(.*?)(?<!<quote>)<literal>(.*)$/)
{
print &process($1);
print "<literal>";
print "\"" if $quoteliteral && !$inliterallayout;
$inliteral = 1;
$_ = "$2\n";
}
else
{
print &process($_);
last;
}
}
} # Loop for different parts of one line
} # Loop for multiple lines
# End
|