1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
/*
clean.c -- clean up misuse of presentation markup
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Filters from other formats such as Microsoft Word
often make excessive use of presentation markup such
as font tags, B, I, and the align attribute. By applying
a set of production rules, it is straight forward to
transform this to use CSS.
Some rules replace some of the children of an element by
style properties on the element, e.g.
<p><b>...</b></p> -> <p style="font-weight: bold">...</p>
Such rules are applied to the element's content and then
to the element itself until none of the rules more apply.
Having applied all the rules to an element, it will have
a style attribute with one or more properties.
Other rules strip the element they apply to, replacing
it by style properties on the contents, e.g.
<dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
These rules are applied to an element before processing
its content and replace the current element by the first
element in the exposed content.
After applying both sets of rules, you can replace the
style attribute by a class value and style rule in the
document head. To support this, an association of styles
and class names is built.
A naive approach is to rely on string matching to test
when two property lists are the same. A better approach
would be to first sort the properties before matching.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tidy-int.h"
#include "gdoc.h"
#include "lexer.h"
#include "parser.h"
#include "tags.h"
#include "attrs.h"
#include "message.h"
#include "tmbstr.h"
#include "utf8.h"
/*
Extricate "element", replace it by its content and delete it.
*/
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
{
if (element->content)
{
Node *node, *parent = element->parent;
element->last->next = element->next;
if (element->next)
{
element->next->prev = element->last;
}
else
parent->last = element->last;
if (element->prev)
{
element->content->prev = element->prev;
element->prev->next = element->content;
}
else
parent->content = element->content;
for (node = element->content; node; node = node->next)
node->parent = parent;
*pnode = element->content;
element->next = element->content = NULL;
TY_(FreeNode)(doc, element);
}
else
{
*pnode = TY_(DiscardElement)(doc, element);
}
}
static void CleanNode( TidyDocImpl* doc, Node *node )
{
Node *child, *next;
if (node->content)
{
for (child = node->content; child != NULL; child = next)
{
next = child->next;
if (TY_(nodeIsElement)(child))
{
if (nodeIsSTYLE(child))
TY_(DiscardElement)(doc, child);
if (nodeIsP(child) && !child->content)
TY_(DiscardElement)(doc, child);
else if (nodeIsSPAN(child))
DiscardContainer( doc, child, &next);
else if (nodeIsA(child) && !child->content)
{
AttVal *id = TY_(GetAttrByName)( child, "name" );
if (id)
TY_(RepairAttrValue)( doc, child->parent, "id", id->value );
TY_(DiscardElement)(doc, child);
}
else
{
if (child->attributes)
TY_(DropAttrByName)( doc, child, "class" );
CleanNode(doc, child);
}
}
}
}
}
/* insert meta element to force browser to recognize doc as UTF8 */
static void SetUTF8( TidyDocImpl* doc )
{
Node *head = TY_(FindHEAD)( doc );
if (head)
{
Node *node = TY_(InferredTag)(doc, TidyTag_META);
TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
TY_(InsertNodeAtStart)( head, node );
}
}
/* clean html exported by Google Docs
- strip the script element, as the style sheet is a mess
- strip class attributes
- strip span elements, leaving their content in place
- replace <a name=...></a> by id on parent element
- strip empty <p> elements
*/
void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
{
/* placeholder. CleanTree()/CleanNode() will not
** zap root element
*/
CleanNode( doc, &doc->root );
SetUTF8( doc );
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/
|