File: parse_specialcasing.ml

package info (click to toggle)
camomile 0.8.4-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 22,252 kB
  • sloc: ml: 14,032; makefile: 368; xml: 224; sh: 6
file content (117 lines) | stat: -rw-r--r-- 4,777 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
(** Parse the data used fot case mappings *)
(* Copyright (C) 2002, 2011 Yamagata Yoriyuki *)

(* This library is free software; you can redistribute it and/or *)
(* modify it under the terms of the GNU Lesser General Public License *)
(* as published by the Free Software Foundation; either version 2 of *)
(* the License, or (at your option) any later version. *)

(* As a special exception to the GNU Library General Public License, you *)
(* may link, statically or dynamically, a "work that uses this library" *)
(* with a publicly distributed version of this library to produce an *)
(* executable file containing portions of this library, and distribute *)
(* that executable file under terms of your choice, without any of the *)
(* additional requirements listed in clause 6 of the GNU Library General *)
(* Public License. By "a publicly distributed version of this library", *)
(* we mean either the unmodified Library as distributed by the authors, *)
(* or a modified version of this library that is distributed under the *)
(* conditions defined in clause 3 of the GNU Library General Public *)
(* License. This exception does not however invalidate any other reasons *)
(* why the executable file might be covered by the GNU Library General *)
(* Public License . *)

(* This library is distributed in the hope that it will be useful, *)
(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)
(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU *)
(* Lesser General Public License for more details. *)

(* You should have received a copy of the GNU Lesser General Public *)
(* License along with this library; if not, write to the Free Software *)
(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)
(* USA *)

(* You can contact the authour by sending email to *)
(* yori@sourceforge.net *)


module Unidata = Unidata.Make(Camomileconfig)
module UCharInfo = UCharInfo.Make(Camomileconfig)

let scases = ref UMap.empty

let int_of_code code = int_of_string ("0x"^code)
let uchar_of_code code = UChar.chr_of_uint (int_of_code code)

let us_of_codes codes = List.map uchar_of_code codes

let not_pat = Str.regexp "not_\\(.*\\)"
let locale_pat = Str.regexp "\\(..\\)\\(_\\(..\\)\\)?\\(_\\(.+\\)\\)?"

let rec parse_condition condition =
  let s = String.lowercase condition in
  match s with
    "final_sigma" -> `FinalSigma
  | "after_soft_dotted" -> `AfterSoftDotted
  | "more_above" -> `MoreAbove
  | "before_dot" -> `BeforeDot
  | _ ->
      if Str.string_match not_pat s 0 then
	`Not (parse_condition (Str.matched_group 1 s))
      else if Str.string_match locale_pat s 0 then `Locale s
      else invalid_arg "Not a condition"

let scolon_pat = Str.regexp ";"
let blank_pat = Str.regexp "[ \t]+"

let put_record code lower title upper conditions =
  let u = uchar_of_code code in
  let record =
    {UCharInfo.lower = us_of_codes (Str.split blank_pat lower);
     UCharInfo.title = us_of_codes (Str.split blank_pat title);
     UCharInfo.upper = us_of_codes (Str.split blank_pat upper);
     UCharInfo.condition = List.map parse_condition conditions}
  in 
  let entry = try UMap.find u !scases with Not_found -> [] in
  scases := UMap.add u (record :: entry) !scases

let comment_pat = Str.regexp "\\(^#.*\\)\\|\\([ \t]*$\\)"
let no_context_pat = 
  Str.regexp "\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);?[ \t]*#.*"

let with_context_pat =
  Str.regexp "\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);\\([^;]*\\);?[ \t]*#.*"

let loaddata () = 
  let count = ref 0 in
  try while true do
    let line = read_line () in
    incr count;
    if Str.string_match comment_pat line 0 then () else
    if Str.string_match no_context_pat line 0 then
      let code = Str.matched_group 1 line in
      let lower = Str.matched_group 2 line in
      let title = Str.matched_group 3 line in
      let upper = Str.matched_group 4 line in
      put_record code lower title upper []
    else if Str.string_match with_context_pat line 0 then
      let code = Str.matched_group 1 line in
      let lower = Str.matched_group 2 line in
      let title = Str.matched_group 3 line in
      let upper = Str.matched_group 4 line in
      let conditions = Str.matched_group 5 line in
      put_record code lower title upper (Str.split blank_pat conditions)
    else failwith (Printf.sprintf "Malformed entry in the line %d" !count)
  done with End_of_file -> ()

module CasingTbl = UCharTbl.Make (struct
  type t = UCharInfo.special_casing_property list
  let equal = (=)
  let hash = Hashtbl.hash
end)

let  _ =
  let dir = ref "" in
  Arg.parse [] (fun s -> dir := s) "Parse the SpecialCasing file";
  loaddata ();
  let tbl = CasingTbl.of_map [] !scases in
  Database.write !dir "mar" output_value "special_casing" tbl