File: parse_specialcasing.ml

package info (click to toggle)
camomile 0.7.2-2
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 18,680 kB
  • ctags: 2,269
  • sloc: ml: 11,806; makefile: 336; xml: 224; sh: 75
file content (85 lines) | stat: -rw-r--r-- 3,084 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
(* $Id: parse_specialcasing.ml,v 1.10 2006/08/13 17:24:43 yori Exp $ *)
(* Copyright 2002 Yamagata Yoriyuki *)

module Unidata = Unidata.Make(Camomileconfig)

let scases = ref UMap.empty

let int_of_code code = int_of_string ("0x"^code)
let uchar_of_code code = UChar.chr_of_uint (int_of_code code)

let us_of_codes codes = List.map uchar_of_code codes

let not_pat = Str.regexp "not_\\(.*\\)"
let locale_pat = Str.regexp "\\(..\\)\\(_\\(..\\)\\)?\\(_\\(.+\\)\\)?"

let rec parse_condition condition =
  let s = String.lowercase condition in
  match s with
    "final_sigma" -> `FinalSigma
  | "after_soft_dotted" -> `AfterSoftDotted
  | "more_above" -> `MoreAbove
  | "before_dot" -> `BeforeDot
  | _ ->
      if Str.string_match not_pat s 0 then
	`Not (parse_condition (Str.matched_group 1 s))
      else if Str.string_match locale_pat s 0 then `Locale s
      else invalid_arg "Not a condition"

let scolon_pat = Str.regexp ";"
let blank_pat = Str.regexp "[ \t]+"

let put_record code lower title upper conditions =
  let u = uchar_of_code code in
  let record =
    {Toolslib.UCharInfo.lower = us_of_codes (Str.split blank_pat lower);
     Toolslib.UCharInfo.title = us_of_codes (Str.split blank_pat title);
     Toolslib.UCharInfo.upper = us_of_codes (Str.split blank_pat upper);
     Toolslib.UCharInfo.condition = List.map parse_condition conditions}
  in 
  let entry = try UMap.find u !scases with Not_found -> [] in
  scases := UMap.add u (record :: entry) !scases

let comment_pat = Str.regexp "\\(^#.*\\)\\|\\([ \t]*$\\)"
let no_context_pat = 
  Str.regexp "\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);?[ \t]*#.*"

let with_context_pat =
  Str.regexp "\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);?[ \t]*#.*"

let loaddata () = 
  let count = ref 0 in
  try while true do
    let line = read_line () in
    incr count;
    if Str.string_match comment_pat line 0 then () else
    if Str.string_match no_context_pat line 0 then
      let code = Str.matched_group 1 line in
      let lower = Str.matched_group 2 line in
      let title = Str.matched_group 3 line in
      let upper = Str.matched_group 4 line in
      put_record code lower title upper []
    else if Str.string_match with_context_pat line 0 then
      let code = Str.matched_group 1 line in
      let lower = Str.matched_group 2 line in
      let title = Str.matched_group 3 line in
      let upper = Str.matched_group 4 line in
      let conditions = Str.matched_group 5 line in
      put_record code lower title upper (Str.split blank_pat conditions)
    else failwith (Printf.sprintf "Malformed entry in the line %d" !count)
  done with End_of_file -> ()

module CasingTbl = UCharTbl.Make (struct
  type t = Toolslib.UCharInfo.special_casing_property list
  let equal = (=)
  let hash = Hashtbl.hash
end)

let  _ =
  let dir = ref "" in
  Arg.parse [] (fun s -> dir := s) "Parse the SpecialCasing file";
  loaddata ();
  let tbl = CasingTbl.of_map [] !scases in
  let file = Filename.concat !dir "special_casing.mar" in
  let c = open_out_bin file in
  output_value c tbl; close_out c;