File: peg.tcl

package info (click to toggle)
tcllib 2.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 83,560 kB
  • sloc: tcl: 306,798; ansic: 14,272; sh: 3,035; xml: 1,766; yacc: 1,157; pascal: 881; makefile: 124; perl: 84; f90: 84; python: 33; ruby: 13; php: 11
file content (541 lines) | stat: -rw-r--r-- 12,981 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
# -*- tcl -*-
#
# Copyright (c) 2005 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
# Grammars / Parsing Expression Grammars / Container

# ### ### ### ######### ######### #########
## Package description

# A class whose instances hold all the information describing a single
# parsing expression grammar (terminal symbols, nonterminal symbols,
# nonterminal rules, start expression, hints), and operations to
# define, manipulate, and query this information.
#
# The container has only one functionality beyond the simple storage
# of the aforementioned information. It keeps track if the provided
# grammar is valid (*). The container provides no higher-level
# operations on the grammar, like removal of unreachable nonterminals,
# rule rewriting, etc.
#
# The set of terminal symbols is the set of characters (i.e.
# implicitly defined). For Tcl this means that all the unicode
# characters are supported.
#
# (*) A grammar is valid if and only if all its rules are valid.  A
# rule is valid if and only if all nonterminals referenced by the RHS
# of the rule are in the set of nonterminals, and if only the allowed
# operators are used in the expression.

# ### ### ### ######### ######### #########
## Requisites

package require snit         ; # Tcllib | OO system used

# ### ### ### ######### ######### #########
## Implementation

snit::type ::grammar::peg {
    # ### ### ### ######### ######### #########
    ## Type API. Helpful methods for PEs.

    proc ValidateSerial {e prefix} {}
    proc Validate   {e} {}
    proc References {e} {}
    proc Rename     {e old new} {}

    # ### ### ### ######### ######### #########
    ## Instance API

    constructor {args} {}

    method clear {} {}

    method =   {src} {}
    method --> {dst} {}
    method serialize {} {}
    method deserialize {value} {}

    method {is valid} {} {}
    method start {args} {}

    method nonterminals {} {}
    method {nonterminal add}    {nts pae} {}
    method {nonterminal delete} {nts pae} {}
    method {nonterminal exists} {nts} {}
    method {nonterminal rename} {ntsold ntsnew} {}
    method {nonterminal mode}   {nts args} {}

    method {unknown nonterminals} {} {}

    method {nonterminal rule}   {nts} {}

    # ### ### ### ######### ######### #########
    ## Internal data structures.

    ## - Set of nonterminal symbols, and
    ## - Mapping from nonterminals to their defining parsing
    ##   expressions, and
    ## - Start parsing expression.
    ## - And usage of nonterminals by others, required for tracking
    ##   of validity.

    ## se: expression               | Start expression
    ## nt: nonterm -> expression    | Known Nt's, their rules
    ## re: nonterm -> list(nonterm) | Known Nt's, what others they use.
    ## ir: nonterm -> list(nonterm) | Nt's, possibly unknown, their users.
    ## uk: nonterm -> use counter   | Nt's which are unknown.
    ##
    ## Both 're' and 'ir' can list a nonterminal A multiple times,
    ## if it uses or is used multiple times.
    ##
    ## Grammar is invalid <=> '[array size uk] > 0'

    variable se        epsilon
    variable nt -array {}
    variable re -array {}
    variable ir -array {}
    variable uk -array {}
    variable mo -array {}

    # ### ### ### ######### ######### #########
    ## Instance API Implementation.

    constructor {args} {
	if {
	    (([llength $args] != 0) && ([llength $args] != 2)) ||
	    (([llength $args] == 2) && ([lsearch {= := <-- as deserialize} [lindex $args 0]]) < 0)
	} {
	    return -code error "wrong#args: $self ?=|:=|<--|as|deserialize a'?"
	}

	# Serialization arguments.
	# [llength args] in {0 2}
	#
	# =           src-obj
	# :=          src-obj
	# <--         src-obj
	# as          src-obj
	# deserialize src-value

	if {[llength $args] == 2} {
	    foreach {op val} $args break
	    switch -exact -- $op {
		= - := - <-- - as {
		    $self deserialize [$val serialize]
		}
		deserialize {
		    $self deserialize $val
		}
	    }
	}
	return
    }

    #destructor {}

    method clear {} {
	array unset nt *
	array unset re *
	array unset ir *
	array unset uk *
	array unset mo *
	set se epsilon
	return
    }

    method = {src} {
	$self dserialize [$src serialize]
    }

    method --> {dst} {
	$dst deserialize [$self serialize]
    }

    method serialize {} {
	return [::list \
		grammar::pegc \
		[array get nt] \
		[array get mo] \
		$se]
    }

    method deserialize {value} {
	# Validate value, then clear and refill.

	$self CheckSerialization $value ntv mov sev
	$self clear

	foreach {s e} $ntv {
	    $self NtAdd $s $e
	}
	array set mo $mov
	$self start $sev
	return
    }

    method {is valid} {} {
	return [expr {[array size uk] == 0}]
    }

    method start {args} {
	if {[llength $args] == 0} {
	    return $se
	}
	if {[llength $args] > 1} {
	    return -code error "wrong#args: $self start ?pe?"
	}
	set newse [lindex $args 0]
	Validate $newse
	set se   $newse
	return
    }

    method nonterminals {} {
	return [array names nt]
    }

    method {nonterminal add} {nts pae} {
	$self CheckNtKnown $nts
	Validate $pae
	$self NtAdd $nts $pae
	return
    }

    method {nonterminal mode} {nts args} {
	$self CheckNt $nts
	if {![llength $args]} {
	    return $mo($nts)
	} elseif {[llength $args] == 1} {
	    set mo($nts) [lindex $args 0]
	    return
	} else {
	    return -code error "wrong#args"
	}
	return
    }

    method {nonterminal delete} {nts args} {
	set args [linsert $args 0 $nts]
	foreach nts $args {
	    $self CheckNt $nts
	}

	foreach nts $args {
	    $self NtDelete $nts
	}
	return
    }

    method {nonterminal exists} {nts} {
	return [info exists nt($nts)]
    }

    method {nonterminal rename} {ntsold ntsnew} {
	$self CheckNt      $ntsold
	$self CheckNtKnown $ntsnew

	# Difficult. We have to go through all rules and rewrite their
	# RHS to use the new name of the nonterminal. We can however
	# restrict ourselves to the rules which actually use the
	# changed nonterminal.

	# We also have to update the used/user information. We know
	# that the validity of the grammar is unchanged by this
	# operation. The unknown information is unchanged as well, as
	# we cannot rename an unknown nonterminal. IOW we know that
	# 'ntsold' is not in 'uk', and so 'ntsnew' will not be in that
	# array either after the rename.

	set myusers $ir($ntsold)
	set myused  $re($ntsold)

	set nt($ntsnew) $nt($ntsold)
	unset            nt($ntsold)

	set mo($ntsnew) $mo($ntsold)
	unset            mo($ntsold)

	foreach x $myusers {
	    set nt($x) [Rename $nt($x) $ntsold $ntsnew]
	}

	# It is possible to use myself, and be used by myself.

	while {[set pos [lsearch -exact $myusers $ntsold]] >= 0} {
	    set myusers [lreplace $myusers $pos $pos $ntsnew]
	}
	while {[set pos [lsearch -exact $myused $ntsold]] >= 0} {
	    set myused [lreplace $myused $pos $pos $ntsnew]
	}

	set re($ntsnew) $myusers
	set ir($ntsnew) $myused

	unset            re($ntsold)
	unset            ir($ntsold)
	return
    }

    method {unknown nonterminals} {} {
	return [array names uk]
    }

    method {nonterminal rule} {nts} {
	$self CheckNt $nts
	return $nt($nts)
    }

    # ### ### ### ######### ######### #########
    ## Internal helper methods

    method NtAdd {nts pae} {
	# None of the symbols is known. We can add them to the
	# grammar. If however any of their PEs is known to the PE
	# storage then we had expressions refering to unknown
	# symbols. The grammar is most certainly invalid and may have
	# become valid right now. We have to invalidate the validity
	# cache.

	set nt($nts) $pae
	set mo($nts) value

	# Track users, uses, and unknowns.

	set references [References $pae]

	# We use the refered symbols
	set re($nts) $references

	# We are a user for the refered symbols
	# Record unknown symbols immediately.
	foreach x $references {
	    lappend ir($x) $nts
	    if {[info exists nt($x)]} continue
	    if {[catch {incr uk($x)}]} {set uk($x) 1}
	}

	# We are definitely not unknown.
	unset -nocomplain uk($nts)
	return
    }

    method NtDelete {nts} {
	set references $re($nts)

	# We are gone. We are not using anything anymore.
	unset    nt($nts)
	unset    re($nts)
	unset    mo($nts)

	# Our references loose us as their user.
	foreach x $references {
	    set pos [lsearch -exact $ir($x) $x]
	    if {$pos < 0} {error PANIC}
	    set ir($x) [lreplace $ir($x) $pos $pos]
	    if {[llength $ir($x)] == 0} {
		unset ir($x)
		# x is not referenced anywhere, cannot be unknown.
		unset -nocomplain uk($x)
	    }
	    if {[info exists uk($x)]} {
		incr uk($x) -1
	    }
	}

	# We might be used by others still, and therefore become
	# unknown.

	if {[info exists ir($nts]} {
	    set uk($nts) [llength $ir($nts)]
	}
	return
    }

    method CheckNt {nts} {
	if {![info exists nt($nts)]} {
	    return -code error "Invalid nonterminal \"$nts\""
	}
	return
    }

    method CheckNtKnown {nts} {
	if {[info exists nt($nts)]} {
	    return -code error "Nonterminal \"$nts\" is already known"
	}
	return
    }

    method CheckSerialization {value ntv mov sev} {
	# value is list/3 ('grammar::pegc' nonterminals start)
	# terminals is list of string.
	# nonterminals is doct (key is string, value is expr)
	# start is expr
	# terminals * nonterminals == empty
	# expr is parsing expression (Validate PE).

	upvar 1 \
	    $ntv ntvs \
	    $mov movs \
	    $sev sevs

	set prefix "error in serialization:"
	if {[llength $value] != 4} {
	    return -code error "$prefix list length not 4"
	}

	struct::list assign $value type nonterminals hints start
	if {$type ne "grammar::pegc"} {
	    return -code error "$prefix unknown type \"$type\""
	}

	ValidateSerial $start "$prefix invalid start expression"

	if {[llength $nonterminals] % 2 == 1} {
	    return -code error "$prefix nonterminal data is not a dictionary"
	}
	array set _nt $nonterminals
	if {[llength $nonterminals] != (2*[array size _nt])} {
	    return -code error "$prefix nonterminal data contains duplicate names, or misses some"
	}

	foreach {s e} $nonterminals {
	    ValidateSerial $start "$prefix nonterminal \"$s\", invalid parsing expression"
	}


	if {[llength $hints] % 2 == 1} {
	    return -code error "$prefix nonterminal modes is not a dictionary"
	}
	array set _mo $hints
	if {[llength $hints] != (2*[array size _mo])} {
	    return -code error "$prefix nonterminal modes contains duplicate names, or misses some"
	}
	foreach {s _} $hints {
	    if {![info exists _nt($s)]} {
		return -code error "$prefix nonterminal mode for unknown nonterminal \"$s\""
	    }
	}

	set ntvs $nonterminals
	set sevs $start
	set movs $hints
	return
    }

    # ### ### ### ######### ######### #########

    # ### ### ### ######### ######### #########
    ## Type API implementation.

    proc ValidateSerial {e prefix} {
	if {![catch {Validate $e} msg]} return
	return -code error "$prefix, $msg"
    }

    proc Validate {e} {
	if {[llength $e] == 0} {
	    return -code error "invalid empty expression list"
	}

	set op [lindex $e 0]
	set ar [lrange $e 1 end]

	switch -exact -- $op {
	    epsilon - alpha - alnum - dot {
		if {[llength $ar] > 0} {
		    return -code error "wrong#args for \"$op\""
		}
	    }
	    .. {
		if {[llength $ar] != 2} {
		    return -code error "wrong#args for \"$op\""
		}
		# Leaf, arguments are not expressions to validate.
	    }
	    n - t {
		if {[llength $ar] != 1} {
		    return -code error "wrong#args for \"$op\""
		}
		# Leaf, argument is not expression to validate.
	    }
	    & - ! - * - + - ? {
		if {[llength $ar] != 1} {
		    return -code error "wrong#args for \"$op\""
		}
		Validate [lindex $ar 0]
	    }
	    x - / {
		if {![llength $ar]} {
		    return -code error "wrong#args for \"$op\""
		}
		foreach e $ar {
		    Validate $e
		}
	    }
	    default {
		return -code error "invalid operator \"$op\""
	    }
	}
    }

    proc References {e} {
	set references {}

	set op [lindex $e 0]
	set ar [lrange $e 1 end]

	switch -exact -- $op {
	    epsilon - t - alpha - alnum - dot - .. {}
	    n {
		# Remember referenced nonterminal
		lappend references [lindex $ar 0]
	    }
	    & - ! - * - + - ? {
		foreach r [References [lindex $ar 0]] {
		    lappend references $r
		}
	    }
	    x - / {
		foreach e $ar {
		    foreach r [References $e] {
			lappend references $r
		    }
		}
	    }
	}
	return $references
    }

    proc Rename {e old new} {
	set op [lindex $e 0]
	set ar [lrange $e 1 end]

	switch -exact -- $op {
	    epsilon - t - alpha - alnum - dot - .. {return $e}
	    n {
		if {[lindex $ar 0] ne $old} {return $e}
		return [list n $new]
	    }
	    & - ! - * - + - ? {
		return [list $op [Rename [lindex $ar 0] $old $new]]
	    }
	    x - / {
		set res $op
		foreach e $ar {
		    lappend res [Rename $e $old $new]
		}
		return $res
	    }
	}
    }

    # ### ### ### ######### ######### #########
    ## Type Internals.

    # ### ### ### ######### ######### #########
}

# ### ### ### ######### ######### #########
## Package Management

package provide grammar::peg 0.3