#!/usr/bin/perl -w
# Version 7.0
# Original script written by Michael Larsen, larsen@edu.upenn.math
# Modified by Martin Ward, martin@gkc.org.uk
# Copyright 1994 Michael Larsen and Martin Ward
# Email: martin@gkc.org.uk
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Usage: reflow [-skipto stuff] [-fast|-slow|-veryslow] [-skipindented] src>out
#
# Uses Knuth's paragraphing algorithm (as used in TeX) to "reflow"
# text files by picking "good" places to break the line.
#
# It takes an ascii text file (with paragraphs separated by blank lines)
# and reflows the paragraphs. If two or more lines in a row are "indented"
# then they are assumed to be a quoted poem and are passed through unchanged.
# (The -skipindented option causes ALL indented lines to be passed unchanged.)
#
# The "reflow" operation tries to keep the lines the same length but also
# tries to break at punctuation, and avoid breaking within a proper name
# or after certain "connectives" ("a", "the", etc.).
#
# The "-veryslow" option reflows each paragraph 11 times, once for each
# optimum line width from 60 to 70, and picks the "best" result -- with
# this option the paragraphs are easier to read, but the line width may
# vary from one paragraph to the next.
#
#
# Recent version history:
# 5.1: Fixed a bug with hyphens at end of lines
# (caught by check-punct!)
# 5.2: Avoid turning ". . ." into ". . ."
# 5.3: Added an option "-skipto stuff"
# which skips lines up to and including a line starting with "stuff"
# 5.4: Added an option "-skipindented" which treats ALL indented lines
# as "poetry" (i.e. not reflow-ed), rather than requiring two
# indented lines.
# 5.5: Fixed a bug when a paragraph starts with "--"
# 5.6: Tweaked some parameters, and modified to break before rather
# than after a word which starts with an opening parenthesis
# 5.7: Delete spaces at the end of lines in output.
# 5.8: Added an option "-slow" which tries a list of "optimum" line lengths,
# picking the one which gives the least total penalties.
# 5.9: Tweaked the parameters a bit more!
# 6.0: Added a "-veryslow" option which uses a LONG list of "optimum" lengths,
# -veryslow is about 7 times slower than with no options (or -fast)
# 6.1: Added a penalty for paragraphs with 1 or 2 words on the last line.
# 6.2: Fixed a bug in calculating $bestsofar.
# 6.3: Modified to work with perl5beta1 (gaining a 20% speed improvement).
# 6.4: Added "-w" and fixed warnings and a bug with v long first word in para.
# 6.5: Fix 5.1 could leave @linewords empty
# -- check for hyphen at end of line before checking @linewords
# 6.6: Trim spaces from the ends of lines before printing.
# 6.7: Tweaked "optimum" parameters and reduced maximum to 75
# 6.8: Speed optimisations, including converting all penalties to integers
# -- now 37% faster on -veryslow
# 7.0: Converted to a module with an XSUB for reflow_trial
# -- Vast speed improvement!
use Text::Reflow (reflow_file);
use strict;
# Candidate "optimum" line lengths handed to Text::Reflow for each speed option.
my @slow_optimum     = (65, 70, 60);    # -slow: a short list of lengths to try
my @veryslow_optimum = (60 .. 70);      # -veryslow: every length from 60 to 70
my @fast_optimum     = (65);            # -fast: a single length (described as the default)

# Usage summary, printed when the command line cannot be parsed.
my $Usage = "reflow [-fast|-slow|-veryslow] [-skipto patt] [-skipindented] ...\n";

$| = 1;    # unbuffer output

# Flat list of keyword => value pairs that is passed on to reflow_file().
my @opts = ();

# Consume the leading "-option" arguments.  The options -skipindented, -fast,
# -slow and -veryslow are handled specially for backwards compatibility; any
# other "-key value" pair is passed through as the keyword option "key".
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift @ARGV;
    if ($opt eq "-skipindented") {
        # An optional 0, 1 or 2 may follow; without one the value 1 is used.
        if (@ARGV && ($ARGV[0] =~ /^[012]$/)) {
            push(@opts, "skipindented", shift @ARGV);
        } else {
            push(@opts, "skipindented", 1);
        }
    } elsif ($opt eq "-fast") {
        push(@opts, "optimum", \@fast_optimum);
    } elsif ($opt eq "-slow") {
        push(@opts, "optimum", \@slow_optimum);
    } elsif ($opt eq "-veryslow") {
        push(@opts, "optimum", \@veryslow_optimum);
    } else {
        (my $key = $opt) =~ s/^-//;
        # Every remaining keyword needs a value.  Without this check an
        # option given as the last argument would hand an undefined value
        # to reflow_file() instead of reporting the mistake to the user.
        die "reflow: option $opt requires a value\nUsage: $Usage"
            unless @ARGV;
        push(@opts, $key, shift @ARGV);
    }
}

push(@ARGV, "") unless (@ARGV);    # Read STDIN if no file given

# Reflow each input in turn.  The second argument (the output) is always the
# empty string here -- presumably meaning standard output, as the usage line
# suggests; see Text::Reflow(3) to confirm.
foreach my $file (@ARGV) {
    reflow_file($file, "", @opts);
}
__END__
=head1 NAME
reflow - Perl script for reflowing text files using Knuth's
paragraphing algorithm.
=head1 SYNOPSIS
reflow < input > output
reflow file1 file2 ... > output
=head1 DESCRIPTION
Uses Knuth's paragraphing algorithm (as used in TeX) to "reflow"
text files by picking "good" places to break the line.
It takes an ascii text file (with paragraphs separated by blank lines)
and reflows the paragraphs. If two or more lines in a row are "indented"
then they are assumed to be a quoted poem and are passed through unchanged.
(The -skipindented option causes ALL indented lines to be passed unchanged.)
The algorithm tries to keep the lines the same length but also
tries to break at punctuation, and avoid breaking within a proper name
or after certain "connectives" ("a", "the", etc.).
=head2 OPTIONS
=over 5
=item B<-skipindented>
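Cause ALL indented lines to be passed unchanged. Normally a single indented
line is included in the paragraph, while two or more indented lines
are considered to be a quotation or poem and are passed through unchanged.
The option may be followed by a value of 0, 1 or 2, which is passed to
Text::Reflow as the C<skipindented> setting (see Text::Reflow(3));
if no value is given, 1 is used.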
=item B<-fast>
Sets the optimum line width to 65 characters. This is the default.
=item B<-slow>
Tries line widths of 60, 65 and 70 characters and picks the best one.
=item B<-veryslow>
Tries all line widths between 60 and 70 characters inclusive,
and picks the best one. With the B<-slow> and B<-veryslow> options
the line widths within a paragraph will vary less,
but the line widths may vary from one paragraph to the next.
=item B<-skipto pattern>
Skip to the first line starting with the given pattern before starting
to reflow. This is useful for skipping Project Gutenberg headers
or contents tables.
See Text::Reflow(3) for more options: all the keyword options
defined in Text::Reflow are also available via the reflow script.
=back
=head1 AUTHOR
Original C<reflow> perl script written by Michael Larsen, larsen@edu.upenn.math.
Modified, enhanced and converted to a perl module with XSUB
by Martin Ward, martin@gkc.org.uk
=head1 SEE ALSO
Text::Reflow(3).
perl(1).
See "TeX the Program" by Donald Knuth for a description of the algorithm used.
=cut