File: cf_unicode.ml

package info (click to toggle)
pagodacf 0.10-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,296 kB
  • ctags: 2,481
  • sloc: ml: 8,458; ansic: 3,339; makefile: 173
file content (354 lines) | stat: -rw-r--r-- 12,253 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
(*---------------------------------------------------------------------------*
  IMPLEMENTATION  cf_unicode.ml

  Copyright (c) 2003-2006, James H. Woodyatt
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

    Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

    Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the
    distribution

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  OF THE POSSIBILITY OF SUCH DAMAGE. 
 *---------------------------------------------------------------------------*)

module type Endian_T = sig
    val to_ucs2: char -> char -> int
    val of_ucs2: int -> char * char
end

module Endian_be: Endian_T = struct
    let to_ucs2 c0 c1 =
        let c0 = int_of_char c0 and c1 = int_of_char c1 in
        (c0 lsr 8) lor c1
    
    let of_ucs2 n =
        let c1 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c0 = char_of_int (n land 0xFF) in
        c0, c1
end

module Endian_le: Endian_T = struct
    let to_ucs2 c0 c1 =
        let c0 = int_of_char c0 and c1 = int_of_char c1 in
        (c1 lsr 8) lor c0
    
    let of_ucs2 n =
        let c0 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c1 = char_of_int (n land 0xFF) in
        c0, c1
end

module type Encoding_T = sig
    val to_ucs4: (char Cf_seq.t option, int) Cf_flow.t
    val of_ucs4: (int, char) Cf_flow.t
end

module E_utf8: Encoding_T = struct
    let to_ucs4 =
        let rec state0 sopt =
            match sopt with
            | None ->
                Cf_flow.Z
            | Some seq ->
                match Lazy.force seq with
                | Cf_seq.Z ->
                    Cf_flow.Q state0
                | Cf_seq.P (hd, tl) ->
                    let c = int_of_char hd in
                    let tl = Some tl in
                    if c < 0b11000000 || c >= 0b11111110 then
                        let hd = if c < 0b10000000 then c else 0xFFFD in
                        Cf_flow.P (hd, lazy (state0 tl))
                    else
                        let k, x =
                            if c < 0b11100000 then 0, (c land 0b11111)
                            else if c < 0b11110000 then 1, (c land 0b1111)
                            else if c < 0b11111000 then 2, (c land 0b111)
                            else if c < 0b11111100 then 3, (c land 0b11)
                            else 4, (c land 0b1)
                        in
                        state1 ~k ~x tl
        and state1 ~k ~x sopt =
            match sopt with
            | None ->
                Cf_flow.P (0xFFFD, Lazy.lazy_from_val Cf_flow.Z)
            | Some seq as p ->
                match Lazy.force seq with
                | Cf_seq.Z ->
                    Cf_flow.Q (state1 ~k ~x)
                | Cf_seq.P (hd, tl) ->
                    let c = int_of_char hd in
                    if c < 0b10000000 then
                        Cf_flow.P (0xFFFD, lazy (state0 p))
                    else
                        let z = Some tl in
                        let zz = lazy (state0 z) in
                        if c > 0b10111111 then
                            Cf_flow.P (0xFFFD, zz)
                        else
                            let x = (x lsl 6) lor (c land 0b111111) in
                            if k > 0 then
                                let k = pred k in
                                state1 ~k ~x z
                            else
                                Cf_flow.P (x, zz)
        in
        Lazy.lazy_from_val (Cf_flow.Q state0)

    let rec of_ucs4 =
        lazy begin
            let rec state0 x =
                match x with
                | x when x = x land 0x7f -> state1 0 0 x of_ucs4
                | x when x = x land 0x7ff -> state1 0b11000000 1 x of_ucs4
                | x when x = x land 0xffff -> state1 0b11100000 2 x of_ucs4
                | x when x = x land 0xfffff -> state1 0b11110000 3 x of_ucs4
                | x when x = x land 0x3ffffff -> state1 0b11111000 4 x of_ucs4
                | x -> state1 0b11111100 5 x of_ucs4 (* UCS4 are 31-bit *)
            and state1 pre n x w =
                if n > 0 then begin
                    let c = char_of_int ((x land 0x3f) lor 0x80) in
                    let w = Lazy.lazy_from_val (Cf_flow.P (c, w)) in
                    state1 pre (pred n) (x lsr 6) w
                end
                else begin
                    let c = char_of_int (x lor pre) in
                    Cf_flow.P (c, w)
                end
            in
            Cf_flow.Q state0
        end
end

module E_utf16x_create(N: Endian_T): Encoding_T = struct
    open Cf_flow.Op
    
    let utf16_to_ucs2_ =
        let rec state0 = function
            | None -> Cf_flow.Z
            | Some seq ->
                match Lazy.force seq with
                | Cf_seq.Z -> Cf_flow.Q state0
                | Cf_seq.P (hd, tl) -> state1 ~c0:hd (Some tl)
        and state1 ~c0 = function
            | None ->
                Cf_flow.P (0xFFFD, Lazy.lazy_from_val Cf_flow.Z)
            | Some seq ->
                match Lazy.force seq with
                | Cf_seq.Z ->
                    Cf_flow.Q (state1 ~c0)
                | Cf_seq.P (hd, tl) ->
                    Cf_flow.P (N.to_ucs2 c0 hd, lazy (state0 (Some tl)))
        in
        Lazy.lazy_from_val (Cf_flow.Q state0)
    
    let rec ucs2_to_ucs4_ =
        lazy begin
            let rec state0 u0 =
                if u0 >= 0xd800 && u0 < 0xdc00 then
                    Cf_flow.Q (state1 ~u0)
                else
                    let u0 = if u0 < 0xe000 then 0xfffd else u0 in
                    Cf_flow.P (u0, ucs2_to_ucs4_)
            and state1 ~u0 u1 =
                let u =
                    if u1 < 0xdc00 || u1 >= 0xe000
                        then 0xfffd 
                        else ((u0 land 0x3ff) lsl 10) lor (u1 land 0x3ff)
                in
                Cf_flow.P (u, ucs2_to_ucs4_)
            in
            Cf_flow.Q state0
        end
    
    let to_ucs4 =
        utf16_to_ucs2_ -=- ucs2_to_ucs4_
    
    let rec of_ucs4 =
        lazy begin
            let put x w =
                let c0, c1 = N.of_ucs2 x in
                Cf_flow.P (c0, Lazy.lazy_from_val (Cf_flow.P (c1, w)))
            in
            let rec loop x =
                match x with
                | x when x = (x land 0xffff) ->
                    put x of_ucs4
                | x when x > 0 && x < 0x110000 ->
                    let x = x - 0x10000 in
                    let d800 = 0xd800 lor ((x lsr 10) land 0x3ff)
                    and dc00 = 0xdc00 lor (x land 0x3ff) in
                    put dc00 (Lazy.lazy_from_val (put d800 of_ucs4))
                | _ ->
                    put 0xFFFD of_ucs4
            in
            Cf_flow.Q loop
        end
end

module E_utf16be: Encoding_T = E_utf16x_create(Endian_be)
module E_utf16le: Encoding_T = E_utf16x_create(Endian_le)

let any_utf_to_ucs4 =
    let to_ucs4_f_ x =
        match Lazy.force E_utf8.to_ucs4 with
        | Cf_flow.Q f -> f x
        | _ -> assert false
    in
    let rec state1 = function
        | None ->
            Cf_flow.Z
        | Some seq as p->
            match Lazy.force seq with
            | Cf_seq.Z ->
                Cf_flow.Q state1
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                if c < 0b11111110 then
                    to_ucs4_f_ p
                else
                    state2 ~c0:hd (Some tl)
    and state2 ~c0 = function
        | None ->
            Cf_flow.P (0xFFFD, Lazy.lazy_from_val Cf_flow.Z)
        | Some seq ->
            match Lazy.force seq with
            | Cf_seq.Z ->
                Cf_flow.Q (state2 ~c0)
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                let w = Cf_flow.P (0xFFFD, lazy (to_ucs4_f_ (Some tl))) in
                if c < 0b11111110 then
                    w
                else
                    let u0 = int_of_char c0 and u1 = int_of_char hd in
                    match u0, u1 with
                    | 0xFE, 0xFF -> Lazy.force E_utf16be.to_ucs4
                    | 0xFF, 0xFE -> Lazy.force E_utf16le.to_ucs4
                    | _, _ -> w
    in
    Lazy.lazy_from_val (Cf_flow.Q state1)

module B_utf16_create(N: Endian_T) = struct    
    let prepend_bom w =
        let c0, c1 = N.of_ucs2 0xFFEF in
        lazy (Cf_flow.P (c0, lazy (Cf_flow.P (c1, w))))
end

module B_utf16be = B_utf16_create(Endian_be)
module B_utf16le = B_utf16_create(Endian_le)

let ucs4_to_utf16 = function
    | `BE -> B_utf16be.prepend_bom E_utf16be.of_ucs4
    | `LE -> B_utf16le.prepend_bom E_utf16le.of_ucs4

module type Transcoding_T = sig
    module E: Encoding_T
    
    val transcoder: (char Cf_seq.t option, char) Cf_flow.t
    val transcode: char Cf_seq.t -> char Cf_seq.t
    val atomic: string -> string
end

module C_create(E: Encoding_T): Transcoding_T = struct
    open Cf_flow.Op

    module E = E
    
    let transcoder = E.to_ucs4 -=- E.of_ucs4
    
    let transcode s = Cf_flow.transcode transcoder s
    
    let atomic s = Cf_seq.to_string (transcode (Cf_seq.of_string s))
end

module E_utf8_to_utf16be: Encoding_T = struct
    let to_ucs4 = E_utf8.to_ucs4
    let of_ucs4 = ucs4_to_utf16 `BE
end

module E_utf8_to_utf16le: Encoding_T = struct
    let to_ucs4 = E_utf8.to_ucs4
    let of_ucs4 = ucs4_to_utf16 `LE
end

module E_utf8_to_utf16be_raw: Encoding_T = struct
    let to_ucs4 = E_utf8.to_ucs4
    let of_ucs4 = E_utf16be.of_ucs4
end

module E_utf8_to_utf16le_raw: Encoding_T = struct
    let to_ucs4 = E_utf8.to_ucs4
    let of_ucs4 = E_utf16le.of_ucs4
end

module E_utf16be_to_utf8: Encoding_T = struct
    let to_ucs4 = E_utf16be.to_ucs4
    let of_ucs4 = E_utf8.of_ucs4
end

module E_utf16le_to_utf8: Encoding_T = struct
    let to_ucs4 = E_utf16le.to_ucs4
    let of_ucs4 = E_utf8.of_ucs4
end

module E_any_utf_to_utf8: Encoding_T = struct
    let to_ucs4 = any_utf_to_ucs4
    let of_ucs4 = E_utf8.of_ucs4
end

module E_any_utf_to_utf16be: Encoding_T = struct
    let to_ucs4 = any_utf_to_ucs4
    let of_ucs4 = ucs4_to_utf16 `BE
end

module E_any_utf_to_utf16le: Encoding_T = struct
    let to_ucs4 = any_utf_to_ucs4
    let of_ucs4 = ucs4_to_utf16 `LE
end

module E_any_utf_to_utf16be_raw: Encoding_T = struct
    let to_ucs4 = any_utf_to_ucs4
    let of_ucs4 = E_utf16be.of_ucs4
end

module E_any_utf_to_utf16le_raw: Encoding_T = struct
    let to_ucs4 = any_utf_to_ucs4
    let of_ucs4 = E_utf16le.of_ucs4
end

module C_utf8_to_utf16be = C_create(E_utf8_to_utf16be)
module C_utf8_to_utf16le = C_create(E_utf8_to_utf16le)
module C_utf8_to_utf16be_raw = C_create(E_utf8_to_utf16be_raw)
module C_utf8_to_utf16le_raw = C_create(E_utf8_to_utf16le_raw)

module C_utf16be_to_utf8 = C_create(E_utf16be_to_utf8)
module C_utf16le_to_utf8 = C_create(E_utf16le_to_utf8)
module C_any_utf_to_utf8 = C_create(E_any_utf_to_utf8)
module C_any_utf_to_utf16be = C_create(E_any_utf_to_utf16be)
module C_any_utf_to_utf16le = C_create(E_any_utf_to_utf16le)
module C_any_utf_to_utf16be_raw = C_create(E_any_utf_to_utf16be_raw)
module C_any_utf_to_utf16le_raw = C_create(E_any_utf_to_utf16le_raw)

(*--- End of File [ cf_unicode.ml ] ---*)