File: parse_casefolding.ml

package info (click to toggle)
camomile 0.7.2-2
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 18,680 kB
  • ctags: 2,269
  • sloc: ml: 11,806; makefile: 336; xml: 224; sh: 75
file content (49 lines) | stat: -rw-r--r-- 1,492 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
(* $Id: parse_casefolding.ml,v 1.8 2003/12/19 17:24:34 yori Exp $ *)
(* Copyright 2002 Yamagata Yoriyuki *)

open Unidata

(* Accumulator for the case-folding mappings read from the input.
   [loaddata] adds one binding per accepted entry (a character mapped to a
   list of characters) and the main entry point builds the final table
   from the map stored here. *)
let folds = ref UMap.empty

(* Read [code], a string of hexadecimal digits without prefix, as an
   integer by handing it to [int_of_string] with a 0x prefix.  Raises
   [Failure] when [int_of_string] rejects the resulting literal. *)
let int_of_code code =
  let hex_literal = "0x" ^ code in
  int_of_string hex_literal
(* Convert a hexadecimal code string into a character: parse it with
   [int_of_code], then pass the integer to [UChar.chr_of_uint]. *)
let uchar_of_code code =
  let n = int_of_code code in
  UChar.chr_of_uint n

(* Convert a list of hexadecimal code strings into the list of the
   corresponding characters, in the same order.  Each element goes through
   [uchar_of_code], starting from the head of the list. *)
let us_of_codes codes =
  List.rev (List.rev_map uchar_of_code codes)

(* Matches a single semicolon.  Not referenced by the code visible in this
   file. *)
let scolon_pat = Str.regexp ";"
(* Matches a run of one or more spaces or tabs; [loaddata] uses it to split
   the mapping field into individual code strings. *)
let blank_pat = Str.regexp "[ \t]+"

(* Lines that carry no entry: either a line beginning with # or a line made
   only of optional spaces and tabs.  [loaddata] skips lines that match
   this pattern at position 0. *)
let comment_pat = Str.regexp "\\(^#.*\\)\\|\\([ \t]*$\\)"
(* A data line: three fields, each terminated by a semicolon, the first two
   terminators being followed by exactly one space, then arbitrary trailing
   text.  [loaddata] reads group 1 as the code, group 2 as the status and
   group 3 as the mapping. *)
let entry_pat = 
  Str.regexp "\\([^;]*\\); \\([^;]*\\); \\([^;]*\\);.*"

(* Read case-folding entries from standard input until end of file and
   record them in [folds].

   Lines matching [comment_pat] are skipped.  Every other line must match
   [entry_pat]; its first field is converted with [uchar_of_code] and, when
   the status field is C or F, the mapping field is split on [blank_pat],
   converted with [us_of_codes] and bound to that character in [folds].
   Entries with any other status are ignored.

   Raises [Failure], with the 1-based line number in the message, when a
   line is neither a comment nor an entry, or when a code field is rejected
   by [int_of_string]. *)
let loaddata () =
  let lineno = ref 0 in
  (* [int_of_string] reports bad digits as a bare [Failure] with no
     position; re-raise it with the current line number so the offending
     input line can be found, like the malformed-entry error below. *)
  let at_line convert field =
    try convert field with Failure msg ->
      failwith
        (Printf.sprintf "Malformed code point in the line %d (%s)" !lineno msg)
  in
  try
    while true do
      let line = read_line () in
      incr lineno;
      if Str.string_match comment_pat line 0 then ()
      else if Str.string_match entry_pat line 0 then begin
        (* Fetch every group before any other Str call: Str keeps the
           result of the last match in global state, and [Str.split]
           below performs matches of its own. *)
        let code = Str.matched_group 1 line in
        let status = Str.matched_group 2 line in
        let mapping = Str.matched_group 3 line in
        (* The code field is validated for every entry, whatever its
           status. *)
        let u = at_line uchar_of_code code in
        match status with
        | "C" | "F" ->
            let us = at_line us_of_codes (Str.split blank_pat mapping) in
            folds := UMap.add u us !folds
        | _ -> ()
      end
      else failwith (Printf.sprintf "Malformed entry in the line %d" !lineno)
    done
  with End_of_file -> ()

(* Table module obtained from [UCharTbl.Make] for values of type
   [UChar.t list]; values are compared with structural equality and hashed
   with [Hashtbl.hash]. *)
module Tbl = UCharTbl.Make (struct
  type t = UChar.t list
  let equal (a : t) (b : t) = a = b
  let hash (us : t) = Hashtbl.hash us
end)

(* Entry point: take the output directory from the command line (the last
   anonymous argument wins), load the entries from standard input with
   [loaddata], build a table from [folds] and write it with [output_value]
   to the file case_folding.mar in that directory. *)
let () =
  let dir = ref "" in
  Arg.parse [] (fun s -> dir := s) "Parse the CaseFolding file";
  loaddata ();
  let tbl = Tbl.of_map [] !folds in
  (* [Filename.concat] returns the bare file name for an empty directory, so
     running without an argument writes to the current directory instead of
     the filesystem root, and a directory that already ends with a separator
     does not get a second one. *)
  let path = Filename.concat !dir "case_folding.mar" in
  let c = open_out_bin path in
  (* Do not leak the channel if marshalling fails; the original exception
     is re-raised unchanged. *)
  (try output_value c tbl with e -> close_out_noerr c; raise e);
  close_out c