1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
|
(* $Id: camomilelocaledef.ml,v 1.1 2006/08/13 17:21:24 yori Exp $ *)
(* Copyright 2002, 2003 Yamagata Yoriyuki *)
open Toolslib
open UCharInfo
open AbsCe
let enc, readfile, dir =
let enc = ref CharEncoding.utf8 in
let readfile = ref stdin in
let dir = ref Filename.current_dir_name in
Arg.parse
["--enc", Arg.String (fun encname ->
enc := CharEncoding.of_name encname), "Encoding name";
"--file", Arg.String (fun filename ->
readfile := open_in_bin filename), "Reading file"]
(fun dirname -> dir := dirname)
"camomilelocaledef --enc ENCNAME --file INPUTFILE DIRECTORY:\n\
Read the localedef INPUTFILE using the encoding ENCNAME \
and put the compiled data into DIRECTORY. \
If ENCNAME is ommited, UTF-8 is used. \
If INPUTFILE is ommited, reading from stdin. \
If DIRECTORY is ommited, the current directory is used.";
!enc, !readfile, !dir
module Utf8Buffer = UTF8.Buf
module Utf8NF = UNF.Make (UTF8)
let ff = 0x000c (*form feed*)
let cr = Char.code '\r'
let lf = Char.code '\n'
let nel = 0x0085
let tab = Char.code '\t'
let backslash = Char.code '\\'
let sq = Char.code '\\'
let dq = Char.code '"'
let backslash = Str.regexp "\\\\\\\\"
let literal_1 = Str.regexp
"\\\\[u]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"
let literal_2 = Str.regexp
"\\\\[v]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"
let unescape s =
let s =
Str.global_substitute literal_1 (fun _ ->
let n = int_of_string (Str.replace_matched "0x\\1" s) in
UTF8.init 1 (fun _ -> (UChar.chr_of_uint n)))
s in
let s =
Str.global_substitute literal_2 (fun _ ->
let n = int_of_string (Str.replace_matched "0x\\1" s) in
UTF8.init 1 (fun _ -> (UChar.chr_of_uint n)))
s in
Str.global_replace backslash "\\\\" s
let rec stream_to_list_aux a s = (parser
[< 'e; rest >] -> stream_to_list_aux (e :: a) rest
| [< >] -> List.rev a) s
let stream_to_list s = stream_to_list_aux [] s
type token =
Text of string
| Brace_r
| Brace_l
| Colon
| Comma
let rec prep = parser
[< 'u; rest >] ->
let c = try Some (UChar.char_of u) with _ -> None in
(match general_category u with
`Cc | `Cf when c <> Some '\n' -> prep rest
| ct -> [< '(c, ct, u); prep rest >])
| [< >] -> [< >]
let rec remove_comment = parser
[< '( Some '/', _, _) as data; rest >] ->
(parser
[< '(Some '/', _, _); rest >] -> comment rest
| [< '(Some '*', _, _); rest >] -> comment2 rest
| [< rest >] -> [< 'data; remove_comment rest >])
rest
| [< '( Some '"', _, _) as data; rest >] ->
[< 'data; in_quote rest >]
| [< 'data; rest >] -> [< 'data; remove_comment rest >]
| [< >] -> [< >]
and comment = parser
[< '( Some ('\r' | '\n' | '\133'), _, _) | ( _, (`Zl | `Zp), _); rest >]
-> remove_comment rest
| [< 'data; rest >] -> comment rest
| [< >] -> [< >]
and comment2 = parser
[< '( Some '*', _, _) as data; rest >] -> (parser
[< '(Some '/', _, _); rest >] -> remove_comment rest
| [< rest >] -> comment2 rest) rest
| [< 'data; rest >] -> comment2 rest
| [< >] -> [< >]
and in_quote = parser
[< '( Some '\\', _, _) as data1; 'data2; rest >] ->
[< 'data1; 'data2; in_quote rest >]
| [< '( Some '"', _, _) as data; rest >] ->
[<' data; remove_comment rest >]
| [< 'data; rest >] -> [< 'data; in_quote rest >]
| [< >] -> [< >]
let rec merge_text = parser
[< 'Text s; rest >] -> do_merge s rest
| [< 'e; rest >] -> [< 'e; merge_text rest >]
| [< >] -> [< >]
and do_merge s = parser
[< 'Text s'; rest >] -> do_merge (s ^ s') rest
| [< 'e; rest >] -> [< 'Text s; 'e; merge_text rest >]
| [< >] -> [< >]
let lexer s =
let rec parse = parser
[< '( Some '{', _, _); rest >] -> [< 'Brace_l; parse rest >]
| [< '( Some '}', _, _); rest >] -> [< 'Brace_r; parse rest >]
| [< '( Some ':', _, _); rest >] -> [< 'Colon; parse rest >]
| [< '( Some ',', _, _); rest >] -> [< 'Comma; parse rest >]
| [< '( Some '"', _, _); rest >] -> quote rest
| [< '( Some ('\r' | '\n' | '\133' | '\t'), _, _)
| ( _, (`Zs | `Zl | `Zp), _) ; rest >] ->
parse rest
| [< 'e; rest >] -> text [< 'e; rest >]
| [< >] -> [< >]
and quote s =
let buf = Utf8Buffer.create 16 in
let rec loop = parser
[< '( Some '\\', _, u1); '(_, _, u2); rest >] ->
Utf8Buffer.add_char buf u1;
Utf8Buffer.add_char buf u2;
loop rest
| [< '( Some '"', _, _); rest >] ->
let s = Utf8Buffer.contents buf in
let s' = unescape s in
[< 'Text s'; parse rest >]
| [< '( _, _, u); rest >] ->
Utf8Buffer.add_char buf u;
loop rest
| [< >] -> failwith "A quote is not enclosed."
in
loop s
and text s =
let buf = Utf8Buffer.create 16 in
let rec loop = parser
[<'( Some ('\r' | '\n' | '\133' | '\t'), _, _) |
( _, (`Zs | `Zl | `Zp), _) ; rest >] ->
let s = Utf8Buffer.contents buf in
let s' = unescape s in
[< 'Text s'; parse rest >]
| [< '( Some ('{' | '}' | ':' | ','| '"'), _, _) as e; rest >] ->
let s = Utf8Buffer.contents buf in
let s' = unescape s in
[< 'Text s'; parse [< 'e; rest >] >]
| [< '( _, _, u); rest >] ->
Utf8Buffer.add_char buf u;
loop rest
| [< >] ->
let s = Utf8Buffer.contents buf in
let s' = unescape s in
[< 'Text s' >]
in
loop s
in
let p = prep s in
let p1 = remove_comment p in
let tokens = parse p1 in
let tokens1 = merge_text tokens in
let l = stream_to_list tokens1 in l
let string_to_binary s =
let n = String.length s / 2 in
let b = String.create n in
for i = 0 to n - 1 do
let d = int_of_string ("0x" ^ (String.sub s (i * 2) 2)) in
b.[i] <- Char.chr d
done;
b
let root = ref ""
let load_file filename =
let file =
if Filename.is_implicit filename then
Filename.concat !root filename else
filename
in
let c = open_in_bin file in
let buf = Buffer.create 16 in
try begin while true do
Buffer.add_channel buf c 1
done; assert false end
with End_of_file ->
Buffer.contents buf
type data =
Table of (string, data) Hashtbl.t
| Array_data of data array
| String_data of string
| Binary of string
| Int of int
| Intvect of int array
| Tagged of string * data
let rec parse_intvect l a =
match l with
Text num :: Comma :: rest ->
parse_intvect rest ((int_of_string num) :: a)
| Text num :: rest ->
Intvect (Array.of_list (List.rev ((int_of_string num) :: a))), rest
| _ ->
Intvect (Array.of_list (List.rev a)), l
let rec parse_table l a =
match parse l with
Some d, rest -> parse_table rest (d :: a)
| None, rest ->
let tbl = Hashtbl.create (List.length a) in
let proc ent =
match ent with
Tagged (name, data) ->
Hashtbl.add tbl name data
| _ -> failwith "A broken table entry."
in
List.iter proc a;
Table tbl, rest
and parse_array l a =
match l with
Brace_l :: rest ->
let data, rest = parse_unknown rest in
(match rest with
Brace_r :: Comma :: rest ->
parse_array rest (data :: a)
| Brace_r :: rest ->
parse_array rest (data :: a)
| _ -> failwith "A brace is not enclosed.")
| Text text :: Comma :: rest ->
parse_array rest ((String_data text) :: a)
| Text text :: rest ->
Array_data (Array.of_list (List.rev ((String_data text) :: a))), rest
| _ ->
Array_data (Array.of_list (List.rev a)), l
and parse_unknown l =
match l with
Text text :: Brace_r :: rest ->
String_data text, Brace_r :: rest
| Text text :: Comma :: rest -> parse_array l []
| Text text :: rest -> parse_table l []
| _ -> parse_array l []
and parse l = match l with
Text tname :: Colon :: Text "table" :: Brace_l :: rest ->
let data, rest = parse_table rest [] in
(match rest with
Brace_r :: rest ->
Some (Tagged (tname, data)), rest
| _ -> failwith "A brace is not enclosed.")
| Text tname :: Colon :: Text "array" :: Brace_l :: rest ->
let data, rest = parse_array rest [] in
(match rest with
Brace_r :: rest ->
Some (Tagged (tname, data)), rest
| _ -> failwith "A brace is not enclosed.")
| Text tname :: Colon :: Text "string" :: Brace_l ::
Text data :: Brace_r :: rest ->
Some (Tagged (tname, String_data data)), rest
| Text tname :: Colon :: Text "bin" :: Brace_l ::
Text data :: Brace_r :: rest ->
let b = string_to_binary data in
Some (Tagged (tname, Binary b)), rest
| Text tname :: Colon :: Text "import" :: Brace_l ::
Text filename :: Brace_r :: rest ->
prerr_endline "Warning : file loading is not supported.";
Some (Tagged (tname, Binary "")), rest
| Text tname :: Colon :: Text "int" :: Brace_l ::
Text num :: Brace_r :: rest ->
let n = int_of_string num in
Some (Tagged (tname, Int n)), rest
| Text tname :: Colon :: Text "intvector" :: Brace_l :: rest ->
let data, rest = parse_intvect rest [] in
(match rest with
Brace_r :: rest ->
Some (Tagged (tname, data)), rest
| _ -> failwith "A brace is not enclosed.")
| Text name :: Brace_l :: rest ->
let data, rest = parse_unknown rest in
(match rest with
Brace_r :: rest ->
Some (Tagged (name, data)), rest
| _ -> failwith "A brace is not enclosed.")
| _ -> None, l
let col_parse s =
let s = Utf8NF.nfd s in
let lexbuf = Lexing.from_string s in
let ace_info = ColParser.main ColLexer.token lexbuf in
cetbl_of ace_info
let localedef = function Table tbl ->
let col_info = try
Some (match Hashtbl.find tbl "CollationElements" with
Table tbl ->
(match Hashtbl.find tbl "Sequence" with
String_data s -> col_parse s
| _ -> assert false)
| _ -> assert false)
with Not_found -> None
in
{Unidata.col_info = col_info}
| _ -> assert false
let main () =
let cs = Stream.of_channel readfile in
let stream = CharEncoding.ustream_of enc cs in
let lexed = lexer stream in
let data, rest = parse_table lexed [] in
if rest <> [] then failwith "Strange trailing data.";
let proc key entry =
let locale_info = localedef entry in
let file = Filename.concat dir (key ^ ".mar") in
let c = open_out_bin file in
output_value c locale_info
in
(match data with
Table tbl -> Hashtbl.iter proc tbl
| _ -> failwith "Broken data.")
let _ = main ()
|