1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
#! /usr/bin/env awk -f
# __ _
# |_) /| Copyright (C) 2000 | richard@
# | \/| Richard Atterer | atterer.net
# '`
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License, version 2. See
# the file COPYING for details.
function appendWord(word, spaceAfterWord) {
    # Add one word to the output being assembled in the globals doc (finished
    # lines) and docLine (line under construction).
    #   word           - text to add; may be empty
    #   spaceAfterWord - separator remembered in prevSpaceAfterWord and
    #                    emitted before the NEXT word if that word is joined
    #                    onto the same line
    # Reads indentStr, curMaxLen, nextInd and doPreserve; updates doc, docLine,
    # ind and prevSpaceAfterWord.
    if (prevSpaceAfterWord == "\n") {
        # The previous separator was a newline (only recorded while inside
        # <pre>): flush the line verbatim and start the next one unindented.
        doc = doc substr(indentStr, 1, ind) docLine "\n";
        docLine = word;
        ind = 0;
    } else if (ind + length(docLine) + length(word) >= curMaxLen) {
        # Word does not fit: flush a non-empty line and start a new one at
        # the indentation currently requested by nextInd.
        if (docLine != "")
            doc = doc substr(indentStr, 1, ind) docLine "\n";
        docLine = word;
        ind = nextInd;
    } else if (word != "" || doPreserve > 0) {
        # Word fits: join it on, preceded by the remembered separator.
        # Empty words are only joined while preserving whitespace.
        docLine = docLine prevSpaceAfterWord word;
    }
    # In every case the new separator replaces the remembered one.
    prevSpaceAfterWord = spaceAfterWord;
}
#______________________________________________________________________
BEGIN {
    # Read the HTML file named by the first command-line argument, re-wrap
    # and re-indent it, and print the result on standard output.
    #
    # The text is consumed from the front of "rest": each iteration finds the
    # next run of whitespace or the next tag opener ("<" or "</"), hands the
    # text before it to appendWord(), and drops the consumed part.
    get = ARGV[1];      # Input file name
    put = ARGV[2];      # Second argument; stored but not used in this block
    maxLen = 75;        # Line length limit outside <pre>
    indent = 1;         # Indentation step per nesting level of a known tag
    # NOTE(review): as written this is a single space, so no line is ever
    # indented by more than one column - confirm the intended width.
    indentStr = " ";    # Won't indent by more than this
    killClass = 1;      # If nonzero, remove all " class=...>" attributes

    # Only tags that come with closing tags are allowed!
    # Value 1: both the opening and the closing tag get a line of their own.
    # Unset:   the opening tag starts a new line, the closing tag stays
    #          inline with the preceding text.
    tags["html"]=1; tags["body"]=1; tags["head"]; tags["title"];
    tags["div"]; tags["h1"]; tags["h2"]; tags["h3"]; tags["h4"]; tags["h5"];
    tags["h6"]; tags["p"]; tags["dl"]; tags["dt"]; tags["dd"]; tags["table"];
    tags["tr"]; tags["td"];
    preserve["pre"];    # Tags whose content keeps its whitespace
    maxTagLength = 100; # Upper bound on characters examined for a tag name
    curMaxLen = maxLen;

    # Join lines. Every input line, including the first, is terminated with
    # "\n" so that words on adjacent lines stay separated.
    rest = "";
    while ((status = (getline line < get)) > 0)
        rest = rest line "\n";
    if (status < 0) {
        # getline returns -1 when the file cannot be opened or read.
        print "error: cannot read input file \"" get "\"" > "/dev/stderr";
        exit 1;
    }
    close(get);

    if (killClass)
        gsub(/[ \t\n]+(class|CLASS)=("[^"]*"|'[^']*')[ \t\n]*>/, ">", rest); #"

    # Split lines at whitespace and some tags
    nextInd = ind = 0;  # Nr of characters of indentation
    doc = "";           # Output document
    docLine = "";       # Current line to append words to
    doPreserve = 0;     # Nesting level of <pre>
    while (match(rest, /([ \n\t]+|< *(\/ *)?)/)) {
        if (substr(rest, RSTART, 1) == "<") {
            # Tag found. The name is the run of [a-z0-9] right after the
            # matched "<" / "</" (lower-cased first).
            tagName = tolower(substr(rest, RSTART + RLENGTH, maxTagLength));
            gsub(/[^a-z0-9].*$/, "", tagName);
            closing = index(substr(rest, RSTART + 1, RLENGTH - 1), "/");
            # Position of the first character after the tag name.
            tagEnd = RSTART + RLENGTH + length(tagName);

            # Is tag <pre>?
            if (tagName in preserve) {
                if (closing && doPreserve > 0) {
                    appendWord(substr(rest, 1, tagEnd - 1), "");
                    nextInd -= indent;
                    --doPreserve;
                    if (doPreserve == 0) {
                        # Left the outermost <pre>: wrap and indent again.
                        curMaxLen = maxLen; nextInd = nonpreserveInd;
                    }
                    rest = substr(rest, tagEnd);
                    continue;
                }
                if (!closing) {
                    # Disable indentation while inside <pre>
                    if (doPreserve == 0) { nonpreserveInd = nextInd; nextInd = 0; }
                    ++doPreserve; curMaxLen = 9999999;
                }
            }

            if (!(tagName in tags)) {
                # No known tag name: treat "<name" like ordinary text.
                appendWord(substr(rest, 1, tagEnd - 1), "");
                rest = substr(rest, tagEnd);
                continue;
            } else if (closing) {
                # Closing tag
                if (tags[tagName] == 0) {
                    # Keep the closing tag on the same line as its content.
                    appendWord(substr(rest, 1, tagEnd - 1), "");
                    nextInd -= indent;
                } else {
                    nextInd -= indent;
                    appendWord(substr(rest, 1, RSTART - 1), "");
                    curMaxLen = 0; # Force new line with next appendWord()
                    appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
                    curMaxLen = maxLen;
                }
            } else {
                # Opening tag: starts a new line, content is indented deeper.
                appendWord(substr(rest, 1, RSTART - 1), "");
                curMaxLen = 0; # Force new line with next appendWord()
                appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
                curMaxLen = maxLen;
                nextInd += indent;
            }
            rest = substr(rest, tagEnd);
            continue;
        } # endif tag found

        # Whitespace
        if (doPreserve) {
            # Preserve spaces and newlines in output: consume one whitespace
            # character at a time and pass it on as the separator.
            appendWord(substr(rest, 1, RSTART - 1), substr(rest, RSTART, 1));
            rest = substr(rest, RSTART + 1);
        } else {
            # Wrap words. Whitespace directly before ">" is dropped, any
            # other run collapses to a single space.
            if (substr(rest, RSTART + RLENGTH, 1) == ">")
                appendWord(substr(rest, 1, RSTART - 1), "");
            else
                appendWord(substr(rest, 1, RSTART - 1), " ");
            rest = substr(rest, RSTART + RLENGTH);
        }
    }
    # Flush the line under construction plus any unmatched tail.
    doc = doc substr(indentStr, 1, ind) docLine rest;
    print doc;
}
|