File: ulexing.ml

package info (click to toggle)
ulex 0.5-3
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 156 kB
  • ctags: 260
  • sloc: ml: 1,070; makefile: 73; sh: 50
file content (199 lines) | stat: -rwxr-xr-x 4,925 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
exception Error
exception InvalidCodepoint of int

let eof = -1

(* Absolute position from the beginning of the stream *)
type apos = int

type lexbuf = {
  refill: (int array -> int -> int -> int);
  mutable buf: int array;
  mutable len: int;    (* Number of meaningful char in buffer *)
  mutable offset: apos; (* Position of the first char in buffer
			    in the input stream *)
  mutable pos : int;
  mutable start : int; (* First char we need to keep visible *)

  mutable marked_pos : int;
  mutable marked_val : int;

  mutable finished: bool;
}

let chunk_size = 512

let empty_lexbuf = {
  refill = (fun _ _ _ -> assert false);
  buf = [| |];
  len = 0;
  offset = 0;
  pos = 0;
  start = 0;
  marked_pos = 0;
  marked_val = 0;
  finished = false;
}

let create f = {
  empty_lexbuf with
    refill = f;
    buf = Array.create chunk_size 0;
}

let from_stream s =
  create (fun buf pos len ->
	    try buf.(pos) <- Stream.next s; 1
	    with Stream.Failure -> 0)

let from_latin1_stream s =
  create (fun buf pos len ->
	    try buf.(pos) <- Char.code (Stream.next s); 1
	    with Stream.Failure -> 0)

let from_utf8_stream s =
  create (fun buf pos len ->
	    try buf.(pos) <- Utf8.from_stream s; 1
	    with Stream.Failure -> 0)

type enc = Ascii | Latin1 | Utf8

exception MalFormed

let from_var_enc_stream enc s =
  create (fun buf pos len ->
	    try 
	      buf.(pos) <- (match !enc with
			      | Ascii ->
				  let c = Char.code (Stream.next s) in
				  if c > 127 then raise (InvalidCodepoint c);
				  c
			      | Latin1 -> Char.code (Stream.next s)
			      | Utf8 -> Utf8.from_stream s);
	      1
	    with Stream.Failure -> 0)


let from_var_enc_string enc s =
  from_var_enc_stream enc (Stream.of_string s)

let from_var_enc_channel enc ic =
  from_var_enc_stream enc (Stream.of_channel ic)

let from_latin1_string s = 
  let len = String.length s in
  {
    empty_lexbuf with
      buf = Array.init len (fun i -> Char.code s.[i]);
      len = len;
      finished = true;
  }

let from_latin1_channel ic =
  from_latin1_stream (Stream.of_channel ic)
let from_utf8_channel ic =
  from_stream (Utf8.stream_from_char_stream (Stream.of_channel ic))

let from_int_array a =
  let len = Array.length a in
  {
    empty_lexbuf with
      buf = Array.init len (fun i -> a.(i));
      len = len;
      finished = true;
  }


let from_utf8_string s =
  from_int_array (Utf8.to_int_array s 0 (String.length s))

let refill lexbuf =
  if lexbuf.len + chunk_size > Array.length lexbuf.buf 
  then begin
    let s = lexbuf.start in
    let ls = lexbuf.len - s in
    if ls + chunk_size <= Array.length lexbuf.buf then
      Array.blit lexbuf.buf s lexbuf.buf 0 ls
    else begin
      let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in
      let newbuf = Array.create newlen 0 in
      Array.blit lexbuf.buf s newbuf 0 ls;
      lexbuf.buf <- newbuf
    end;
    lexbuf.len <- ls;
    lexbuf.offset <- lexbuf.offset + s;
    lexbuf.pos <- lexbuf.pos - s;
    lexbuf.marked_pos <- lexbuf.marked_pos - s;
    lexbuf.start <- 0
  end;
  let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in
  if (n = 0) 
  then begin
    lexbuf.buf.(lexbuf.len) <- eof;
    lexbuf.len <- lexbuf.len + 1;
  end
  else lexbuf.len <- lexbuf.len + n

let next lexbuf =
  let i = 
    if lexbuf.pos = lexbuf.len then 
      if lexbuf.finished then eof
      else (refill lexbuf; lexbuf.buf.(lexbuf.pos))
    else lexbuf.buf.(lexbuf.pos)
  in
  if i = eof then lexbuf.finished <- true else lexbuf.pos <- lexbuf.pos + 1;
  i

let start lexbuf =
  lexbuf.start <- lexbuf.pos;
  lexbuf.marked_pos <- lexbuf.pos;
  lexbuf.marked_val <- (-1)

let mark lexbuf i =
  lexbuf.marked_pos <- lexbuf.pos;
  lexbuf.marked_val <- i

let backtrack lexbuf =
  lexbuf.pos <- lexbuf.marked_pos;
  lexbuf.marked_val

let rollback lexbuf =
  lexbuf.pos <- lexbuf.start

let lexeme_start lexbuf = lexbuf.start + lexbuf.offset
let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset

let loc lexbuf = (lexbuf.start + lexbuf.offset, lexbuf.pos + lexbuf.offset)

let lexeme_length lexbuf = lexbuf.pos - lexbuf.start

let sub_lexeme lexbuf pos len = 
  Array.sub lexbuf.buf (lexbuf.start + pos) len
let lexeme lexbuf = 
  Array.sub lexbuf.buf (lexbuf.start) (lexbuf.pos - lexbuf.start)
let lexeme_char lexbuf pos = 
  lexbuf.buf.(lexbuf.start + pos)

let to_latin1 c =
  if (c >= 0) && (c < 256) 
  then Char.chr c 
  else raise (InvalidCodepoint c)

let latin1_lexeme_char lexbuf pos = 
  to_latin1 (lexeme_char lexbuf pos)

let latin1_sub_lexeme lexbuf pos len =
  let s = String.create len in
  for i = 0 to len - 1 do s.[i] <- to_latin1 lexbuf.buf.(lexbuf.start + pos + i) done;
  s

let latin1_lexeme lexbuf = 
  latin1_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start)

let utf8_sub_lexeme lexbuf pos len =
  Utf8.from_int_array lexbuf.buf (lexbuf.start + pos) len

let utf8_lexeme lexbuf = 
  utf8_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start)