File: camomilelocaledef.ml

package info (click to toggle)
camomile 0.7.2-2
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 18,680 kB
  • ctags: 2,269
  • sloc: ml: 11,806; makefile: 336; xml: 224; sh: 75
file content (335 lines) | stat: -rw-r--r-- 10,434 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
(* $Id: camomilelocaledef.ml,v 1.1 2006/08/13 17:21:24 yori Exp $ *)
(* Copyright 2002, 2003 Yamagata Yoriyuki *)

open Toolslib
open UCharInfo
open AbsCe

let enc, readfile, dir =
  let enc = ref CharEncoding.utf8 in
  let readfile = ref stdin in
  let dir = ref Filename.current_dir_name in
  Arg.parse
    ["--enc", Arg.String (fun encname ->
      enc := CharEncoding.of_name encname), "Encoding name";
     "--file", Arg.String (fun filename ->
       readfile := open_in_bin filename), "Reading file"]
    (fun dirname -> dir := dirname)
    "camomilelocaledef --enc ENCNAME --file INPUTFILE DIRECTORY:\n\
    Read the localedef INPUTFILE using the encoding ENCNAME \
and put the compiled data into DIRECTORY. \
    If ENCNAME is ommited, UTF-8 is used.  \
    If INPUTFILE is ommited, reading from stdin. \
    If DIRECTORY is ommited, the current directory is used.";
    !enc, !readfile, !dir

module Utf8Buffer = UTF8.Buf
module Utf8NF = UNF.Make (UTF8)

let ff = 0x000c				(*form feed*)
let cr = Char.code '\r'
let lf = Char.code '\n'
let nel = 0x0085
let tab = Char.code '\t'

let backslash = Char.code '\\'
let sq = Char.code '\\'
let dq = Char.code '"'

let backslash = Str.regexp "\\\\\\\\"
let literal_1 = Str.regexp 
    "\\\\[u]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"

let literal_2 = Str.regexp
    "\\\\[v]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"

let unescape s =
  let s = 
    Str.global_substitute literal_1 (fun _ ->
      let n = int_of_string (Str.replace_matched "0x\\1" s) in
      UTF8.init 1 (fun _ -> (UChar.chr_of_uint n))) 
      s in
  let s = 
    Str.global_substitute literal_2 (fun _ ->
      let n = int_of_string (Str.replace_matched "0x\\1" s) in
      UTF8.init 1 (fun _ -> (UChar.chr_of_uint n))) 
      s in
  Str.global_replace backslash "\\\\" s

let rec stream_to_list_aux a s = (parser
    [< 'e; rest >] -> stream_to_list_aux (e :: a) rest
  | [< >] -> List.rev a) s

let stream_to_list s = stream_to_list_aux [] s

type token = 
    Text of string
  | Brace_r
  | Brace_l
  | Colon
  | Comma

let rec prep = parser 
    [< 'u; rest >] ->
      let c = try Some (UChar.char_of u) with _ -> None in
      (match general_category u with
	`Cc | `Cf when c <> Some '\n' ->  prep rest
      | ct -> [< '(c, ct, u); prep rest >])
  | [< >] -> [< >]

let rec remove_comment = parser
    [< '( Some '/', _, _) as data; rest >] ->
      (parser
	  [< '(Some '/', _, _); rest >] -> comment rest
	| [< '(Some '*', _, _); rest >] -> comment2 rest
	| [< rest >] -> [< 'data; remove_comment rest >])
	rest
  | [< '( Some '"', _, _) as data; rest >]  -> 
      [< 'data; in_quote rest >]
  | [< 'data; rest >] -> [< 'data; remove_comment rest >]
  | [< >] -> [< >]
and comment = parser
    [< '( Some ('\r' | '\n' | '\133'), _, _) | ( _, (`Zl | `Zp), _); rest >] 
    -> remove_comment rest
  | [< 'data; rest >] -> comment rest
  | [< >] -> [< >]
and comment2 = parser
    [< '( Some '*', _, _) as data; rest >] -> (parser
	[<  '(Some '/', _, _); rest >] -> remove_comment rest
      |	[< rest >] -> comment2 rest) rest
  | [< 'data; rest >] -> comment2 rest
  | [< >] -> [< >]
and in_quote = parser
    [< '( Some '\\', _, _) as data1; 'data2; rest >] ->
      [< 'data1; 'data2; in_quote rest >]
  | [< '( Some '"', _, _) as data; rest >]  -> 
      [<' data; remove_comment rest >]
  | [< 'data; rest >] -> [< 'data; in_quote rest >]
  | [< >] -> [< >]

let rec merge_text = parser
    [< 'Text s; rest >] -> do_merge s rest
  | [< 'e; rest >] -> [< 'e; merge_text rest >]
  | [< >] -> [< >]
and do_merge s = parser
    [< 'Text s'; rest >] -> do_merge (s ^ s') rest
  | [< 'e; rest >] -> [< 'Text s; 'e; merge_text rest >]
  | [< >] -> [< >]

let lexer s =
  let rec parse = parser
      [< '( Some '{', _, _); rest >] -> [< 'Brace_l; parse rest >]
    | [< '( Some '}', _, _); rest >] -> [< 'Brace_r; parse rest >]
    | [< '( Some ':', _, _); rest >] -> [< 'Colon; parse rest >]
    | [< '( Some ',', _, _); rest >] -> [< 'Comma; parse rest >]
    | [< '( Some '"', _, _); rest >] -> quote rest
    | [< '( Some ('\r' | '\n' | '\133' | '\t'), _, _)
    | ( _, (`Zs | `Zl | `Zp), _) ; rest >] ->
	parse rest
    | [< 'e; rest >] -> text [< 'e; rest >]
    | [< >] -> [< >]
  and quote s =
    let buf = Utf8Buffer.create 16 in
    let rec loop = parser
	[< '( Some '\\', _, u1); '(_, _, u2); rest >] ->
	  Utf8Buffer.add_char buf u1;
	  Utf8Buffer.add_char buf u2;
	  loop rest
      |	[< '( Some '"', _, _); rest >]  -> 
	  let s = Utf8Buffer.contents buf in
	  let s' = unescape s in
	  [< 'Text s'; parse rest >]
      |	[< '( _, _, u); rest >] ->
	  Utf8Buffer.add_char buf u;
	  loop rest
      | [< >] -> failwith "A quote is not enclosed."
    in
    loop s
  and text s =
    let buf = Utf8Buffer.create 16 in
    let rec loop = parser
	[<'( Some ('\r' | '\n' | '\133' | '\t'), _, _) | 
	( _, (`Zs | `Zl | `Zp), _) ; rest >] ->
	  let s = Utf8Buffer.contents buf in
	  let s' = unescape s in
	  [< 'Text s'; parse rest >]
      |	[< '( Some ('{' | '}' | ':' | ','| '"'), _, _) as e; rest >] ->
	  let s = Utf8Buffer.contents buf in
	  let s' = unescape s in
	  [< 'Text s'; parse [< 'e; rest >] >]
      |	[< '( _, _, u); rest >] ->
	  Utf8Buffer.add_char buf u;
	  loop rest
      |	[< >] ->
	  let s = Utf8Buffer.contents buf in
	  let s' = unescape s in
	  [< 'Text s' >]
    in
    loop s
  in
  let p = prep s in
  let p1 = remove_comment p in
  let tokens = parse p1 in
  let tokens1 = merge_text tokens in
  let l = stream_to_list tokens1 in l

let string_to_binary s =
  let n = String.length s / 2 in
  let b = String.create n in
  for i = 0 to n - 1 do
    let d = int_of_string ("0x" ^ (String.sub s (i * 2) 2)) in
    b.[i] <- Char.chr d
  done;
  b

let root = ref ""

let load_file filename =
  let file =
    if Filename.is_implicit filename then
      Filename.concat !root filename else
      filename
  in
  let c = open_in_bin file in
  let buf = Buffer.create 16 in
  try begin while true do
    Buffer.add_channel buf c 1
  done; assert false end
  with End_of_file ->
    Buffer.contents buf

type data = 
    Table of (string, data) Hashtbl.t
  | Array_data of data array
  | String_data of string
  | Binary of string
  | Int of int
  | Intvect of int array
  | Tagged of string * data

let rec parse_intvect l a =
  match l with
    Text num :: Comma :: rest ->
      parse_intvect rest ((int_of_string num) :: a)
  | Text num :: rest ->
      Intvect (Array.of_list (List.rev ((int_of_string num) :: a))), rest
  | _ -> 
      Intvect (Array.of_list (List.rev a)), l

let rec parse_table l a =
  match parse l with
    Some d, rest -> parse_table rest (d :: a)
  | None, rest -> 
      let tbl = Hashtbl.create (List.length a) in
      let proc ent =
	match ent with
	  Tagged (name, data) ->
	    Hashtbl.add tbl name data
	| _ -> failwith "A broken table entry."
      in
      List.iter proc a;
      Table tbl, rest

and parse_array l a =
  match l with
    Brace_l :: rest ->
      let data, rest = parse_unknown rest in
      (match rest with
	Brace_r :: Comma :: rest -> 
	  parse_array rest (data :: a)
      |	Brace_r :: rest -> 
	  parse_array rest (data :: a)
      |	_ -> failwith "A brace is not enclosed.")
  | Text text :: Comma :: rest ->
      parse_array rest ((String_data text) :: a)
  | Text text :: rest ->
      Array_data (Array.of_list (List.rev ((String_data text) :: a))), rest
  | _ ->
      Array_data (Array.of_list (List.rev a)), l

and parse_unknown l =
  match l with
    Text text :: Brace_r :: rest ->
      String_data text, Brace_r :: rest
  | Text text :: Comma :: rest -> parse_array l []
  | Text text :: rest -> parse_table l []
  | _ -> parse_array l []

and parse l = match l with
    Text tname :: Colon :: Text "table" :: Brace_l :: rest ->
      let data, rest = parse_table rest [] in
      (match rest with
	Brace_r :: rest ->
	  Some (Tagged (tname, data)), rest
      |	_ -> failwith "A brace is not enclosed.")
  | Text tname :: Colon :: Text "array" :: Brace_l :: rest ->
      let data, rest = parse_array rest [] in
      (match rest with
	Brace_r :: rest ->
	  Some (Tagged (tname, data)), rest
      |	_ -> failwith "A brace is not enclosed.")
  | Text tname :: Colon :: Text "string" :: Brace_l ::
    Text data :: Brace_r :: rest ->  
      Some (Tagged (tname, String_data data)), rest
  | Text tname :: Colon :: Text "bin" :: Brace_l :: 
    Text data :: Brace_r :: rest ->
      let b = string_to_binary data in
      Some (Tagged (tname, Binary b)), rest
  | Text tname :: Colon :: Text "import" :: Brace_l ::
    Text filename :: Brace_r :: rest ->
      prerr_endline "Warning : file loading is not supported.";
      Some (Tagged (tname, Binary "")), rest
  | Text tname :: Colon :: Text "int" :: Brace_l ::
    Text num :: Brace_r :: rest ->
      let n = int_of_string num in
      Some (Tagged (tname, Int n)), rest
  | Text tname :: Colon :: Text "intvector" :: Brace_l :: rest ->
      let data, rest = parse_intvect rest [] in
      (match rest with
	Brace_r :: rest ->
	  Some (Tagged (tname, data)), rest
      |	_ -> failwith "A brace is not enclosed.")
  | Text name :: Brace_l :: rest ->
      let data, rest = parse_unknown rest in
      (match rest with
	Brace_r :: rest ->
	  Some (Tagged (name, data)), rest
      |	_ -> failwith "A brace is not enclosed.")
  | _ -> None, l

let col_parse s =
  let s = Utf8NF.nfd s in
  let lexbuf = Lexing.from_string s in
  let ace_info = ColParser.main ColLexer.token lexbuf in
  cetbl_of ace_info

let localedef = function Table tbl ->
  let col_info = try
    Some (match Hashtbl.find tbl "CollationElements" with 
      Table tbl ->
	(match Hashtbl.find tbl "Sequence" with 
	  String_data s -> col_parse s
	| _ -> assert false)
    | _ -> assert false)
  with Not_found -> None
  in
  {Unidata.col_info = col_info}
  | _ -> assert false

let main () =
  let cs = Stream.of_channel readfile in
  let stream = CharEncoding.ustream_of enc cs in
  let lexed = lexer stream in
  let data, rest = parse_table lexed [] in
  if rest <> [] then failwith "Strange trailing data.";
  let proc key entry =
    let locale_info = localedef entry in
    let file = Filename.concat dir (key ^ ".mar") in
    let c = open_out_bin file in
    output_value c locale_info
  in
  (match data with
    Table tbl -> Hashtbl.iter proc tbl
  | _ -> failwith "Broken data.")
    
let _ = main ()