File: scanner.ha

package info (click to toggle)
hare 0.25.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,948 kB
  • sloc: asm: 1,264; makefile: 123; sh: 114; lisp: 101
file content (418 lines) | stat: -rw-r--r-- 11,217 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use bytes;
use encoding::utf8;
use errors;
use io;
use strings;
use types;

// Increment, in bytes, used by scan_readahead when sizing growth of a
// scanner's read buffer.
def BUFSZ: size = 4096;

// I/O vtable installed in every [[scanner]] so that it can be used as an
// [[io::stream]]: reads are served by scan_read and seeks by scan_seek. All
// other operations are left at their defaults.
const scanner_vtable = io::vtable {
	reader = &scan_read,
	seeker = &scan_seek,
	...
};

// State of a buffered scanner which reads ahead from an I/O handle and hands
// out tokens borrowed from its internal buffer. Create one with [[newscanner]]
// or [[newscanner_static]].
export type scanner = struct {
	// Stream header pointing at the scanner vtable. scan_read and scan_seek
	// cast the *io::stream they receive to *scanner, so this presumably has
	// to remain the first field.
	stream: io::stream,
	// Handle that data is read from
	src: io::handle,
	// Backing storage for read-ahead data
	buffer: []u8,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
	// Change some scanning behaviors
	opts: scan_options,
};

// Options which fine-tune the behavior of a [[scanner]].
export type scan_options = enum uint {
	// The options used when none are given; same value as EOF_DISCARD.
	DEFAULT = EOF_DISCARD,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are discarded and EOF is returned
	// immediately.
	//
	// This option is recommended for use-cases where the user is
	// scanning over a file or buffer which may contain partial
	// content, and the user wishes to consume as many tokens as
	// possible and assume that additional data may follow EOF
	// before a new delimiter is written.
	//
	// This is the default behavior. Note that on Unix, text files
	// are always terminated with a new line, and [[scan_line]] will
	// enumerate all well-formed lines in a file with this flag --
	// however, when scanning ill-formed text files which include
	// text following the final line feed, this additional text will
	// be discarded.
	EOF_DISCARD = 0,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are treated as a token and returned to
	// the caller before returning EOF.
	//
	// This is recommended for use-cases where EOF is effectively
	// considered an additional delimiter between tokens, or where
	// the remainder of the file following the final delimiter is
	// meaningful.
	EOF_GREEDY = 1 << 0,
};

// Creates a [[scanner]] which allocates and manages its own read buffer in
// order to read efficiently from a handle. At most maxread bytes are read
// ahead; the default of [[types::SIZE_MAX]] means no practical limit. Once the
// scanner is no longer needed, the caller must release its resources with
// [[finish]].
//
// Reads from the scanner return [[errors::overflow]] when maxread is reached.
export fn newscanner(
	src: io::handle,
	maxread: size = types::SIZE_MAX,
	opts: scan_options = scan_options::DEFAULT,
) scanner = {
	return scanner {
		stream = &scanner_vtable,
		src = src,
		opts = opts,
		maxread = maxread,
		// The buffer is allocated lazily on the first read-ahead
		buffer = [],
		pending = [],
		start = 0,
	};
};

// Creates a [[scanner]] which works within a caller-supplied buffer and never
// allocates. Once the buffer is full, the scanner returns [[errors::overflow]].
// The caller should not call [[finish]] on this scanner unless they want bufio
// to free the supplied buffer.
export fn newscanner_static(
	src: io::handle,
	buffer: []u8,
	opts: scan_options = scan_options::DEFAULT,
) scanner = {
	return scanner {
		stream = &scanner_vtable,
		src = src,
		opts = opts,
		// The read limit is the size of the caller's buffer
		maxread = len(buffer),
		buffer = buffer,
		pending = [],
		start = 0,
	};
};

// Frees resources associated with a [[scanner]]. Does not close the underlying
// I/O handle. Any slice previously borrowed from the scanner is invalid after
// this call.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
	// Drop every reference into the freed storage so that a repeated call
	// to finish, or an accidental later read, does not touch dangling
	// memory.
	scan.buffer = [];
	scan.pending = [];
	scan.start = 0;
};

// io::reader implementation for [[scanner]]. Copies buffered bytes into buf,
// reading ahead from the source first when nothing is pending. Returns the
// number of bytes copied, which may be less than len(buf).
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	// Only touch the underlying handle when the buffer has run dry
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case size => void;
		case io::EOF =>
			return io::EOF;
		};
	};

	// Hand over as much as fits in buf, limited by what is buffered
	let n = len(buf);
	if (n > len(scan.pending)) {
		n = len(scan.pending);
	};
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};

// io::seeker implementation for [[scanner]]. Offsets relative to the current
// position are corrected for bytes which have been read ahead but not yet
// consumed. Any seek which reaches the underlying handle discards the
// read-ahead data.
fn scan_seek(
	s: *io::stream,
	off: io::off,
	w: io::whence,
) (io::off | io::error) = {
	let scan = s: *scanner;
	const buffered = len(scan.pending): io::off;

	// Fast path for io::tell: report the logical position without
	// disturbing the buffer
	if (w == io::whence::CUR && off == 0) {
		return io::tell(scan.src)? - buffered;
	};

	// The source is ahead of the logical position by the buffered bytes
	const target = if (w == io::whence::CUR) off - buffered else off;
	const pos = io::seek(scan.src, target, w)?;

	// Discard read-ahead buffer
	scan.start = 0;
	scan.pending = scan.buffer[..0];
	return pos;
};

// Fills up the scanner buffer with data from the underlying I/O handle. If no
// space remains in the read buffer, pending data is first moved to the front of
// the buffer; if the buffer is still full, it is expanded by BUFSZ, never
// growing beyond maxread. Then, one read from the underlying I/O handle is
// performed and scan.pending is updated accordingly.
//
// Returns the number of bytes which had been pending prior to the call,
// [[io::EOF]] if the handle has no more data, or [[errors::overflow]] if the
// buffer is full and may not grow any further. scan.pending always refers to
// the current buffer storage when this function returns, including on EOF and
// on error.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	let start = scan.start;
	const pending = len(scan.pending);

	if (start + pending == len(scan.buffer)) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
		} else {
			// Buffer is full, expand it. readahead is the desired
			// total buffer length after growing.
			let readahead = pending + BUFSZ;
			if (readahead > scan.maxread) {
				readahead = scan.maxread;
			};
			if (pending >= readahead) {
				return errors::overflow;
			};
			// append adds the given number of items, so only add
			// the difference; otherwise the buffer would grow past
			// maxread.
			append(scan.buffer, [0...], readahead - pending)?;
			// Growing may move the storage. Re-anchor pending now
			// so that it is valid even if the read below returns
			// EOF or fails.
			scan.pending = scan.buffer[..pending];
		};
	};

	match (io::read(scan.src, scan.buffer[start + pending..])?) {
	case let z: size =>
		scan.pending = scan.buffer[start..start + pending + z];
		return pending;
	case io::EOF =>
		return io::EOF;
	};
};

// Removes the first n pending bytes from the scanner and returns them. The
// returned slice is borrowed from the scanner buffer. Aborts if fewer than n
// bytes are pending.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.pending) >= n);
	const taken = scan.pending[..n];
	scan.pending = scan.pending[n..];
	scan.start += n;
	return taken;
};

// Reads a single byte from a [[scanner]], reading ahead from the source handle
// if no bytes are buffered.
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	if (len(scan.pending) == 0) {
		if (scan_readahead(scan)? is io::EOF) {
			return io::EOF;
		};
	};

	const taken = scan_consume(scan, 1);
	return taken[0];
};

// Reads the next token from a [[scanner]], delimited by delim. The delimiter is
// read from the source handle but not included in the returned slice. The
// return value is borrowed from the internal scanner buffer, which is
// invalidated during subsequent operations which use this scanner.
//
// If EOF is reached before a delimiter is found, the result depends on the
// scanner options: with [[scan_options::EOF_DISCARD]], [[io::EOF]] is returned;
// otherwise any remaining bytes are returned as a final token.
export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	const ndelim = match (delim) {
	case u8 =>
		yield 1z;
	case let u: []u8 =>
		yield len(u);
	};

	// Offset into scan.pending from which the search for delim resumes
	let i = 0z;
	for (true) {
		match (bytes::index(scan.pending[i..], delim)) {
		case let ix: size =>
			i += ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.opts == scan_options::EOF_DISCARD) {
				return io::EOF;
			};
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
			return scan_consume(scan, len(scan.pending));
		case let prevpending: size =>
			// The part of the buffer which was already searched
			// does not need to be searched again, except for its
			// final ndelim - 1 bytes: a multi-byte delimiter may
			// begin there and end in the newly read data.
			i = prevpending;
			if (ndelim > 1) {
				const overlap = ndelim - 1;
				i = if (prevpending > overlap)
					prevpending - overlap else 0;
			};
		};
	};

	// Consume the token along with its delimiter, but return only the token
	return scan_consume(scan, i + ndelim)[..i];
};

// Reads one rune from a [[scanner]]. The source handle is only read from when
// the buffered data does not yet hold a complete rune. Returns [[io::EOF]] if
// no data remains, and [[utf8::invalid]] if the data is not valid UTF-8 or if
// the source ends in the middle of a rune; in the latter cases nothing is
// consumed.
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	// At least one byte is needed to learn the length of the rune
	for (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
		case size => void;
		};
	};

	const sz = utf8::utf8sz(scan.pending[0])?;

	// A single read may return only part of a multi-byte rune, so keep
	// reading until the entire rune is buffered or the source is exhausted
	for (len(scan.pending) < sz) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return utf8::invalid;
		case size => void;
		};
	};

	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case done =>
		return io::EOF;
	case utf8::more =>
		return utf8::invalid;
	};
};

// Scans text from a [[scanner]] up to the next occurrence of delim. The
// delimiter is read from the source handle but is not part of the returned
// string. The result is borrowed from the internal scanner buffer and is
// invalidated by subsequent operations which use this scanner. Returns
// [[utf8::invalid]] if the token is not valid UTF-8.
export fn scan_string(
	scan: *scanner,
	delim: str,
) (const str | io::EOF | io::error | utf8::invalid) = {
	const needle = strings::toutf8(delim);
	match (scan_bytes(scan, needle)?) {
	case io::EOF =>
		return io::EOF;
	case let token: []u8 =>
		return strings::fromutf8(token)?;
	};
};

// Scans one line of text from a [[scanner]], using "\n" as the delimiter. The
// result is borrowed from the internal scanner buffer and is invalidated by
// subsequent operations which use this scanner.
export fn scan_line(
	scan: *scanner,
) (const str | io::EOF | io::error | utf8::invalid) =
	scan_string(scan, "\n");

// Returns the bytes which the scanner has read ahead but which have not been
// consumed yet. The slice is borrowed from the internal scanner buffer.
export fn scan_buffer(scan: *scanner) []u8 = scan.pending[..];

// Puts buf back in front of the pending bytes, so that it is returned again by
// the next reads. If there is not enough room before the pending bytes, they
// are first moved towards the end of the buffer. Aborts if the buffer cannot
// hold buf in addition to the pending bytes.
fn scan_unread(scan: *scanner, buf: []u8) void = {
	const n = len(buf);
	if (n == 0) {
		return;
	};
	const npending = len(scan.pending);

	if (n > scan.start) {
		assert(n <= len(scan.buffer) - npending,
			"Attempted to unread more data than buffer has available");
		// Shift the pending bytes to the right to free space at the
		// beginning, then place buf in the gap
		scan.buffer[n..n + npending] =
			scan.buffer[scan.start..scan.start + npending];
		scan.buffer[..n] = buf;
		scan.pending = scan.buffer[..npending + n];
		scan.start = 0;
		return;
	};

	// Enough room directly before the pending bytes: write buf there and
	// extend pending backwards over it
	const newstart = scan.start - n;
	const pending_end = scan.start + npending;
	scan.buffer[newstart..scan.start] = buf;
	scan.start = newstart;
	scan.pending = scan.buffer[newstart..pending_end];
};

// Reads exactly one byte from an [[io::handle]], without any buffering.
export fn read_byte(h: io::handle) (u8 | io::EOF | io::error) = {
	let b: [1]u8 = [0];

	match (io::readall(h, b)?) {
	case io::EOF =>
		return io::EOF;
	case size =>
		return b[0];
	};
};

// Reads bytes from a handle until one of the delimiter bytes is found. The
// delimiter is read from the handle but is not included in the result. If EOF
// is reached, the bytes read so far are returned, or [[io::EOF]] if there are
// none. The return value must be freed by the caller.
export fn read_tok(h: io::handle, delim: u8...) ([]u8 | io::EOF | io::error) = {
	let tok: []u8 = [];
	// On any early return, the partially built token must not leak
	let keep = false;
	defer if (!keep) free(tok);

	for (true) {
		const b = match (read_byte(h)?) {
		case io::EOF =>
			if (len(tok) == 0) {
				return io::EOF;
			};
			break;
		case let b: u8 =>
			yield b;
		};
		if (bytes::contains(delim, b)) {
			break;
		};
		append(tok, b)?;
	};

	keep = true;
	return tok;
};

// Reads bytes from a handle up to the next newline character (\n, 0x0A). The
// newline is read from the handle but is not included in the result. The
// return value must be freed by the caller.
export fn read_line(h: io::handle) ([]u8 | io::EOF | io::error) = {
	return read_tok(h, '\n');
};

// Reads a rune from a UTF-8 stream. Returns [[io::EOF]] only if the stream ends
// before the first byte of a rune. If the stream ends after the first byte of
// a multi-byte rune but before its last byte, or if the bytes are not valid
// UTF-8, [[utf8::invalid]] is returned.
export fn read_rune(
	h: io::handle,
) (rune | utf8::invalid | io::EOF | io::error) = {
	let b: [4]u8 = [0...];
	match (io::readall(h, b[..1])?) {
	case let n: size => void;
	case io::EOF =>
		return io::EOF;
	};

	const sz = utf8::utf8sz(b[0])?;

	if (sz == 1) {
		return b[0]: rune;
	};

	match (io::readall(h, b[1..sz])) {
	case let n: size => void;
	case io::EOF =>
		// The first byte of the rune was already consumed, so the
		// stream is truncated rather than cleanly finished. This is
		// the same condition as an underread with zero continuation
		// bytes, so it is reported the same way.
		return utf8::invalid;
	case let err: io::error =>
		return if (err is io::underread) utf8::invalid else err;
	};

	let dec = utf8::decode(b[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case done =>
		return io::EOF;
	case utf8::more =>
		return utf8::invalid;
	};
};