File: utf8.ml

package info (click to toggle)
ocaml-sedlex 3.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 792 kB
  • sloc: ml: 7,866; makefile: 24; sh: 9
file content (33 lines) | stat: -rw-r--r-- 1,550 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
open Printf

(** [next_tok buf] matches [buf] against the sedlex rules below and returns a
    string made of a label for the rule that matched followed by the matched
    lexeme, decoded as UTF-8.

    @raise Failure from the fallback rule, with the offending lexeme in the
    message. *)
let next_tok buf =
  (* Every action reports the current lexeme; [describe] plugs it into the
     rule-specific format string. *)
  let describe fmt = sprintf fmt (Sedlexing.Utf8.lexeme buf) in
  match%sedlex buf with
    (* Operator set given as a single UTF-8 string of characters. *)
    | "a", Utf8 (Chars "+-×÷") -> describe "with Chars: %s"
    (* Same operator set, spelled as an alternation of UTF-8 literals. *)
    | "b", Utf8 ("+" | "-" | "×" | "÷") -> describe "with or_pattern: %s"
    (* Three pieces with different (and nested) encoding wrappers.
       NOTE(review): presumably the innermost wrapper decides how each
       literal is decoded -- confirm against the sedlex documentation. *)
    | Latin1 "\xc0", Utf8 "À", Ascii (Utf8 (Latin1 (Utf8 (Chars "À")))) ->
        describe "mixed encoding: %s"
    | Ascii (Star '\x00' .. '\x7f') -> describe "only ascii: %s"
    | Utf8 (Star '\x00' .. '\x7f') ->
        (* A UTF-8 character interval can only cover ASCII because of the
           OCaml lexer, so the Ascii rule just above is expected to match
           instead and this action should never run. *)
        assert false
    | Latin1 (Star '\x00' .. '\xff') -> describe "only latin1: %s"
    | _ -> failwith (describe "Unexpected character: %s")

(* Expect test: lexes one token from several UTF-8 inputs with [next_tok] and
   checks the printed description of each. *)
let%expect_test _ =
  (* Builds a lexbuf from a UTF-8 string and returns [next_tok]'s result. *)
  let first_tok input = next_tok (Sedlexing.Utf8.from_string input) in
  print_string (first_tok "a+");
  [%expect {| with Chars: a+ |}];
  print_string (first_tok "a÷");
  [%expect {| with Chars: a÷ |}];
  print_string (first_tok "b+");
  [%expect {| with or_pattern: b+ |}];
  print_string (first_tok "b÷");
  [%expect {| with or_pattern: b÷ |}];
  print_string (first_tok "ÀÀÀ");
  [%expect {| mixed encoding: ÀÀÀ |}];
  (* The input ends with byte 0x7f; [String.escaped] renders it as "\127" in
     the expected output. *)
  print_string (String.escaped (first_tok "az\x7f"));
  [%expect {| only ascii: az\127 |}];
  print_string (first_tok "az\u{c0}");
  [%expect {| only latin1: azÀ |}]