File: xml_lexer.mll

package info (click to toggle)
lablgtk2 2.10.1-2
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 3,804 kB
  • ctags: 5,871
  • sloc: ml: 32,939; ansic: 8,488; makefile: 679; sh: 85
file content (195 lines) | stat: -rw-r--r-- 5,994 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
(**************************************************************************)
(*     Lablgtk - Camlirc                                                  *)
(*                                                                        *)
(*    * You are free to do anything you want with this code as long       *)
(*      as it is for personal use.                                        *)
(*                                                                        *)
(*    * Redistribution can only be "as is".  Binary distribution          *)
(*      and bug fixes are allowed, but you cannot extensively             *)
(*      modify the code without asking the authors.                       *)
(*                                                                        *)
(*    The authors may choose to remove any of the above                   *)
(*    restrictions on a per request basis.                                *)
(*                                                                        *)
(*    Authors:                                                            *)
(*      Nobuaki Yoshida  <nyoshi@dd.iij4u.or.jp>                          *)
(*      Jacques Garrigue <garrigue@kurims.kyoto-u.ac.jp>                  *)
(*                                                                        *)
(**************************************************************************)

(* $Id: xml_lexer.mll 1354 2007-07-20 04:18:38Z garrigue $ *)

{
open Lexing

type error =
  | Illegal_character of char
  | Bad_entity of string
  | Unterminated of string
  | Tag_expected
  | Other of string

let error_string = function
  | Illegal_character c ->
      "illegal character '" ^ Char.escaped c ^ "'"
  | Bad_entity s ->
      "\"&" ^ s ^ ";\" is not a valid entity"
  | Unterminated s -> "unterminated " ^ s ^ " starts here"
  | Tag_expected -> "a tag was expected"
  | Other s -> s

exception Error of error * int

type token =
  | Tag of string * (string * string) list * bool
  | Chars of string
  | Endtag of string
  | EOF

let string_start_pos = ref 0
and comment_start_pos = ref 0
and token_start_pos = ref 0

let token_start () = !token_start_pos

let string_buffer = Buffer.create 80
let reset_string lexbuf =
  string_start_pos := lexeme_start lexbuf;
  Buffer.reset string_buffer

let reset_comment lexbuf =
  comment_start_pos := lexeme_start lexbuf

let entities = [ "lt", "<"; "gt", ">"; "ampers", "&" ]

}

let break = ['\010' '\013' '\012']
let space = [' ' '\009']
let identchar =  ['A'-'Z' 'a'-'z' '_' '0'-'9']

rule token = parse
  | break +
      { token lexbuf }
  | space +
      { token lexbuf }

  | "<!--"
      { reset_comment lexbuf; comment lexbuf; token lexbuf }
  | "</"
      { token_start_pos := lexeme_start lexbuf;
        let tag = tag_name lexbuf in close_tag lexbuf; Endtag tag }
        
  | "<"
      { token_start_pos := lexeme_start lexbuf;
        let tag = tag_name lexbuf in
        let attribs, closed = attributes lexbuf in
        Tag(tag, attribs, closed) }
  | space * [ ^ ' ' '\009' '\010' '\013' '\012' '<' '>' '&'] +
      { token_start_pos := lexeme_start lexbuf;
        reset_string lexbuf;
        Buffer.add_string string_buffer (lexeme lexbuf);
        Chars(chars lexbuf) }
  | "&"
      { token_start_pos := lexeme_start lexbuf;
        reset_string lexbuf;
        Buffer.add_string string_buffer (entity lexbuf);
        Chars(chars lexbuf)  }
  | eof
      { EOF }
  | _
      { raise (Error(Illegal_character (lexeme_char lexbuf 0),
                     lexeme_start lexbuf)) }

and chars = parse
  | [ ^ '\010' '\013' '\012' '<' '>' '&' ] +
      { Buffer.add_string string_buffer (lexeme lexbuf);
        chars lexbuf }
  | "&"
      { Buffer.add_string string_buffer (entity lexbuf);
        chars lexbuf  }
  | ""
      { Buffer.contents string_buffer }

and entity = parse
  | identchar + ";"
      { let s = lexeme lexbuf in
      let s = String.sub s 0 (String.length s - 1) in
        try List.assoc (String.lowercase s) entities
        with Not_found ->
          "&" ^ String.lowercase s ^ ";" }
  | _
      { raise (Error (Unterminated "entity", lexeme_start lexbuf)) }

and tag_name = parse
  | ('!' ?) (identchar +)
      { String.lowercase (lexeme lexbuf) }
  | _
      { raise (Error(Tag_expected, lexeme_start lexbuf)) }

and close_tag = parse
  | (space|break) +
      { close_tag lexbuf }
  | ">"
      { () }
  | _
      { raise (Error(Illegal_character (lexeme_char lexbuf 0),
                     lexeme_start lexbuf)) }

and attributes = parse
  | (space|break) +
      { attributes lexbuf }
  | ">"
      { [], false }
  | "/>"
      { [], true }
  | ""
      { let key = attribute lexbuf in
        let data = attribute_data lexbuf in
        let others, closed = attributes lexbuf in
        (String.lowercase key, data) :: others, closed }

and attribute = parse
  | '"'
      { reset_string lexbuf; string lexbuf }
  | [ ^ ' ' '\010' '\013' '\009' '\012' '=' '<' '>' '"' ] +
      { lexeme lexbuf }

and attribute_data = parse
  | "=" { attribute lexbuf }
  | ""  { "" }

and string = parse
  | '"'
      { Buffer.contents string_buffer }
  | "\\" [ '"' '\\' ]
      { Buffer.add_char string_buffer (lexeme_char lexbuf 1); string lexbuf }
  | eof
      { raise (Error(Unterminated "string", !string_start_pos)) }
  | _
      {  Buffer.add_char string_buffer (lexeme_char lexbuf 0); string lexbuf }

and comment = parse
  | "-->"
      { () }
  | eof
      { raise (Error(Unterminated "comment", !comment_start_pos)) }
  | _
      { comment lexbuf }

and base64 = parse
  | (space|break) +
      { base64 lexbuf }
  | ['A'-'Z']
      { Char.code (lexeme_char lexbuf 0) - Char.code 'A' }
  | ['a'-'z']
      { Char.code (lexeme_char lexbuf 0) - Char.code 'a' + 26 }
  | ['0'-'9']
      { Char.code (lexeme_char lexbuf 0) - Char.code '0' + 52 }
  | '+'
      { 62 }
  | '/'
      { 63 }
  | _
      { raise (Error(Illegal_character (lexeme_char lexbuf 0),
                     lexeme_start lexbuf)) }