File: hxuncdata.c

package info (click to toggle)
html-xml-utils 7.7-1.1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 2,488 kB
  • sloc: ansic: 11,213; sh: 7,996; lex: 243; makefile: 193; yacc: 125
file content (144 lines) | stat: -rw-r--r-- 4,611 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
 * uncdata -- remove CDATA sections from an XML file
 *
 * The input is scanned for occurrences of "<![CDATA[" and
 * corresponding "]]>". Those strings are removed and all occurrences
 * of "&", "<" and ">" in between them will be replaced by "&amp;",
 * "&lt;" and "&gt;" resp.
 *
 * The input is processed byte by byte and must be in an ASCII-compatible
 * encoding. If it is not (e.g., UTF-16), convert it to UTF-8 first.
 *
 * Part of HTML-XML-utils, see:
 * http://www.w3.org/Tools/HTML-XML-utils/
 *
 * Author: Bert Bos <bert@w3.org>
 * Created: 20 Feb 2002
 * Version: $Id: hxuncdata.c,v 1.2 2009/01/08 14:35:09 bbos Exp $
 */

#include <stdio.h>
#include <assert.h>
#include <string.h>


/* put_cdata_char -- write one character of CDATA content, escaping markup */
static void put_cdata_char(int c)
{
  switch (c) {
  case '<': fputs("&lt;", stdout); break;
  case '>': fputs("&gt;", stdout); break;
  case '&': fputs("&amp;", stdout); break;
  default: putchar(c);
  }
}

/*
 * process -- process one file
 *
 * Reads f one byte at a time until EOF and copies it to stdout. The
 * delimiters "<![CDATA[" and "]]>" are dropped and every "<", ">" and
 * "&" between them is written as "&lt;", "&gt;" and "&amp;". Text
 * inside tags, quoted attribute values, declarations and comments is
 * copied unchanged, so a "<![CDATA[" occurring there is not touched.
 *
 * Several states hold back characters that may turn out to be part of
 * a delimiter ("<", "<!", "<![C...", "]", "]]"). They are written as
 * soon as the next character shows that they are not, or at the end of
 * the input, so that no input byte is lost.
 *
 * f is not closed and read errors are not reported here; the caller
 * can check ferror(f).
 */
static void process(FILE *f)
{
  int c;
  enum {INITIAL, START, CDATA1, CDATA2, CDATA3, CDATA4, CDATA5,
	CDATA6, CDATA98, CDATA99, CDATA, MARKUP, DECL1, DECL,
	COMMENT1, COMMENT, COMMENT99, DQUOTE, SQUOTE} state = INITIAL;

  /* No attempt at reporting errors for impossible XML,
     and no support for internal DTD subsets
  */
  while ((c = getc(f)) != EOF) {
    switch (state) {
    case INITIAL:
      if (c == '<') state = START;
      else putchar(c);
      break;
    case START:					/* Seen "<" */
      if (c == '!') state = DECL1;
      else if (c == '>') {putchar('<'); putchar('>'); state = INITIAL;}
      else {putchar('<'); putchar(c); state = MARKUP;}
      break;
    case MARKUP:				/* Inside "<...>" */
      if (c == '"') {putchar('"'); state = DQUOTE;}
      else if (c == '\'') {putchar('\''); state = SQUOTE;}
      else if (c == '>') {putchar('>'); state = INITIAL;}
      else putchar(c);
      break;
    case DQUOTE:				/* Inside double quotes */
      if (c == '"') {putchar('"'); state = MARKUP;}
      else putchar(c);
      break;
    case SQUOTE:				/* Inside single quotes */
      if (c == '\'') {putchar('\''); state = MARKUP;}
      else putchar(c);
      break;
    case DECL1:					/* Seen "<!" */
      if (c == '-') {fputs("<!-", stdout); state = COMMENT1;}
      else if (c == '[') state = CDATA1;
      else {putchar('<'); putchar('!'); putchar(c); state = DECL;}
      break;
    case DECL:					/* Inside "<!...>" */
      if (c == '-') {putchar('-'); state = COMMENT1;}
      else if (c == '>') {putchar('>'); state = INITIAL;}
      else putchar(c);
      break;
    case COMMENT1:				/* Seen "-" */
      if (c == '-') {putchar('-'); state = COMMENT;}
      else {putchar(c); state = DECL;}
      break;
    case COMMENT:				/* Seen "--" */
      if (c == '-') {putchar('-'); state = COMMENT99;}
      else putchar(c);
      break;
    case COMMENT99:				/* Seen "-" */
      if (c == '-') {putchar('-'); state = DECL;}
      else {putchar(c); state = COMMENT;}
      break;
    case CDATA1:				/* Seen "<![" */
      if (c == 'C') state = CDATA2;
      else {printf("<![%c", c); state = INITIAL;}
      break;
    case CDATA2:				/* Seen "<![C" */
      if (c == 'D') state = CDATA3;
      else {printf("<![C%c", c); state = INITIAL;}
      break;
    case CDATA3:				/* Seen "<![CD" */
      if (c == 'A') state = CDATA4;
      else {printf("<![CD%c", c); state = INITIAL;}
      break;
    case CDATA4:				/* Seen "<![CDA" */
      if (c == 'T') state = CDATA5;
      else {printf("<![CDA%c", c); state = INITIAL;}
      break;
    case CDATA5:				/* Seen "<![CDAT" */
      if (c == 'A') state = CDATA6;
      else {printf("<![CDAT%c", c); state = INITIAL;}
      break;
    case CDATA6:				/* Seen "<![CDATA" */
      if (c == '[') state = CDATA;
      else {printf("<![CDATA%c", c); state = INITIAL;}
      break;
    case CDATA:					/* Inside "<![CDATA[...]]>" */
      if (c == ']') state = CDATA98;
      else put_cdata_char(c);
      break;
    case CDATA98:				/* Seen "]" */
      if (c == ']') state = CDATA99;
      /* The "]" was content; c is content too and may need escaping */
      else {putchar(']'); put_cdata_char(c); state = CDATA;}
      break;
    case CDATA99:				/* Seen "]]" */
      if (c == '>') state = INITIAL;
      /* "]]]": the oldest "]" is content, the last two may still
	 start "]]>", so stay in this state */
      else if (c == ']') putchar(']');
      else {fputs("]]", stdout); put_cdata_char(c); state = CDATA;}
      break;
    default:
      assert(!"Cannot happen!");
    }
  }

  /* The input ended while characters were being held back: write them */
  switch (state) {
  case START: putchar('<'); break;
  case DECL1: fputs("<!", stdout); break;
  case CDATA1: fputs("<![", stdout); break;
  case CDATA2: fputs("<![C", stdout); break;
  case CDATA3: fputs("<![CD", stdout); break;
  case CDATA4: fputs("<![CDA", stdout); break;
  case CDATA5: fputs("<![CDAT", stdout); break;
  case CDATA6: fputs("<![CDATA", stdout); break;
  case CDATA98: putchar(']'); break;
  case CDATA99: fputs("]]", stdout); break;
  default: break;				/* Nothing pending */
  }
}

/*
 * main -- remove CDATA sections from the named files, or from stdin
 *
 * Without arguments, standard input is processed. If the first
 * argument is "--help" or "-?", a usage line is printed instead.
 * Otherwise every argument is opened as a file and processed in turn;
 * a file that cannot be opened is reported and skipped.
 *
 * Returns the number of errors encountered (0 on success), limited to
 * 255 so that the count cannot wrap around to 0 in an 8-bit exit
 * status.
 */
int main(int argc, char *argv[])
{
  int i, err = 0;
  FILE *f;

  if (argc == 1) {
    process(stdin);
    if (ferror(stdin)) {fputs("stdin: read error\n", stderr); err++;}
  } else if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) {
    printf("Usage: %s [XML-FILE]...\n", argv[0]);
  } else {
    for (i = 1; i < argc; i++) {
      if (!(f = fopen(argv[i], "r"))) {perror(argv[i]); err++; continue;}
      process(f);
      if (ferror(f)) {fprintf(stderr, "%s: read error\n", argv[i]); err++;}
      if (fclose(f) != 0) {perror(argv[i]); err++;}
    }
  }

  /* Output is buffered: a full disk or closed pipe may only show up now */
  if (fflush(stdout) != 0 || ferror(stdout)) {
    fputs("stdout: write error\n", stderr);
    err++;
  }

  return err > 255 ? 255 : err;
}