File: gdoc.c

package info (click to toggle)
virtuoso-opensource 7.2.5.1%2Bdfsg1-0.3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 285,240 kB
  • sloc: ansic: 641,220; sql: 490,413; xml: 269,570; java: 83,893; javascript: 79,900; cpp: 36,927; sh: 31,653; cs: 25,702; php: 12,690; yacc: 10,227; lex: 7,601; makefile: 7,129; jsp: 4,523; awk: 1,697; perl: 1,013; ruby: 1,003; python: 326
file content (174 lines) | stat: -rw-r--r-- 4,872 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/*
  clean.c -- clean up misuse of presentation markup

  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  Filters from other formats such as Microsoft Word
  often make excessive use of presentation markup such
  as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straight forward to
  transform this to use CSS.

  Some rules replace some of the children of an element by
  style properties on the element, e.g.

  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>

  Such rules are applied to the element's content and then
  to the element itself until none of the rules more apply.
  Having applied all the rules to an element, it will have
  a style attribute with one or more properties. 

  Other rules strip the element they apply to, replacing
  it by style properties on the contents, e.g.
  
  <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
      
  These rules are applied to an element before processing
  its content and replace the current element by the first
  element in the exposed content.

  After applying both sets of rules, you can replace the
  style attribute by a class value and style rule in the
  document head. To support this, an association of styles
  and class names is built.

  A naive approach is to rely on string matching to test
  when two property lists are the same. A better approach
  would be to first sort the properties before matching.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "tidy-int.h"
#include "gdoc.h"
#include "lexer.h"
#include "parser.h"
#include "tags.h"
#include "attrs.h"
#include "message.h"
#include "tmbstr.h"
#include "utf8.h"

/*
  Extricate "element", replace it by its content and delete it.
*/
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
{
    if (element->content)
    {
        Node *node, *parent = element->parent;

        element->last->next = element->next;

        if (element->next)
        {
            element->next->prev = element->last;
        }
        else
            parent->last = element->last;

        if (element->prev)
        {
            element->content->prev = element->prev;
            element->prev->next = element->content;
        }
        else
            parent->content = element->content;

        for (node = element->content; node; node = node->next)
            node->parent = parent;

        *pnode = element->content;

        element->next = element->content = NULL;
        TY_(FreeNode)(doc, element);
    }
    else
    {
        *pnode = TY_(DiscardElement)(doc, element);
    }
}

static void CleanNode( TidyDocImpl* doc, Node *node )
{
    Node *child, *next;

    if (node->content)
    {
        for (child = node->content; child != NULL; child = next)
        {
            next = child->next;

            if (TY_(nodeIsElement)(child))
            {
                if (nodeIsSTYLE(child))
                    TY_(DiscardElement)(doc, child);
                if (nodeIsP(child) && !child->content)
                    TY_(DiscardElement)(doc, child);
                else if (nodeIsSPAN(child))
                    DiscardContainer( doc, child, &next);
                else if (nodeIsA(child) && !child->content)
                 {
                    AttVal *id = TY_(GetAttrByName)( child, "name" );

                    if (id)
                        TY_(RepairAttrValue)( doc, child->parent, "id", id->value );

                    TY_(DiscardElement)(doc, child);
                }
                else
                {
                    if (child->attributes)
                        TY_(DropAttrByName)( doc, child, "class" );

                    CleanNode(doc, child);
                }
            }
        }
    }
}

/* insert meta element to force browser to recognize doc as UTF8 */
static void SetUTF8( TidyDocImpl* doc )
{
    Node *head = TY_(FindHEAD)( doc );

    if (head)
    {
        Node *node = TY_(InferredTag)(doc, TidyTag_META);
        TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
        TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
        TY_(InsertNodeAtStart)( head, node );
    }
}

/* clean html exported by Google Docs

    - strip the script element, as the style sheet is a mess
    - strip class attributes
    - strip span elements, leaving their content in place
    - replace <a name=...></a> by id on parent element
    - strip empty <p> elements
*/
void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
{
    /* placeholder.  CleanTree()/CleanNode() will not
    ** zap root element 
    */
    CleanNode( doc, &doc->root );
    SetUTF8( doc );
}

/*
 * local variables:
 * mode: c
 * indent-tabs-mode: nil
 * c-basic-offset: 4
 * eval: (c-set-offset 'substatement-open 0)
 * end:
 */