1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
|
(* $Id: parse_casefolding.ml,v 1.8 2003/12/19 17:24:34 yori Exp $ *)
(* Copyright 2002 Yamagata Yoriyuki *)
open Unidata
let folds = ref UMap.empty
let int_of_code code = int_of_string ("0x"^code)
let uchar_of_code code = UChar.chr_of_uint (int_of_code code)
let us_of_codes codes = List.map uchar_of_code codes
let scolon_pat = Str.regexp ";"
let blank_pat = Str.regexp "[ \t]+"
let comment_pat = Str.regexp "\\(^#.*\\)\\|\\([ \t]*$\\)"
let entry_pat =
Str.regexp "\\([^;]*\\); \\([^;]*\\); \\([^;]*\\);.*"
let loaddata () =
let count = ref 0 in
try while true do
let line = read_line () in
incr count;
if Str.string_match comment_pat line 0 then () else
if Str.string_match entry_pat line 0 then
let u = uchar_of_code (Str.matched_group 1 line) in
let status = Str.matched_group 2 line in
let mapping = Str.matched_group 3 line in
if status = "C" || status = "F" then
let mapping = us_of_codes (Str.split blank_pat mapping) in
folds := UMap.add u mapping !folds
else ()
else failwith (Printf.sprintf "Malformed entry in the line %d" !count)
done with End_of_file -> ()
module Tbl = UCharTbl.Make (struct
type t = UChar.t list
let equal = (=)
let hash = Hashtbl.hash
end)
let _ =
let dir = ref "" in
Arg.parse [] (fun s -> dir := s) "Parse the CaseFolding file";
loaddata ();
let tbl = Tbl.of_map [] !folds in
let c = open_out_bin (!dir^"/case_folding.mar") in
output_value c tbl; close_out c;
|