File: Parsers.hs

package info (click to toggle)
ctklight 0.17.11-1
  • links: PTS
  • area: main
  • in suites: potato
  • size: 176 kB
  • ctags: 41
  • sloc: haskell: 514; makefile: 26; sh: 12
file content (392 lines) | stat: -rw-r--r-- 14,871 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
--  Compiler Toolkit: Self-optimizing LL(1) parser combinators
--
--  Author : Manuel M. T. Chakravarty
--  Created: 27 February 99
--
--  Version $Revision: 1.18 $ from $Date: 1999/09/22 09:36:35 $
--
--  Copyright (c) 1999 Manuel M. T. Chakravarty
--
--  This library is free software; you can redistribute it and/or
--  modify it under the terms of the GNU Library General Public
--  License as published by the Free Software Foundation; either
--  version 2 of the License, or (at your option) any later version.
--
--  This library is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
--  Library General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  This is a reimplementation of Swierstra/Duponcheel's parser combinators.
--  These are deterministic, self-optimizing LL(1) combinators, which generate
--  parse tables on-the-fly and come with better error recovery facilities
--  than traditional parser combinators.  The applied technique for
--  efficiently computing the parse tables makes essential use of the
--  memorization build into lazy evaluation.
--
--  The present implementation does not exactly follow S. D. Swierstra and
--  L. Duponcheel, ``Deterministic, Error-Correcting Combinator Parsers'', in
--  John Launchbury, Erik Meijer, and Tim Sheard (Eds.) "Advanced Functional
--  Programming", Springer-Verlag, Lecture Notes in Computer Science 1129,
--  184-207, 1996.  Instead, the data structure is used to keep track of the
--  possibility of emptiness of a parser (variant `Empty') and the first set
--  of a parser (the token component of the list stored in the variant
--  `Alts').  This is not unlike a revised version published by
--  S. D. Swierstra, but seems somewhat simpler than his version and adds
--  optimized `Skip' actions (actions where the token is discarded).
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98 & rank-2 polymorphism (existentially quantified type 
--	      variables)
--
--  Unlike conventional parser combinators, the combinators do not produce
--  parsers, but only specifications of parsers that can then be executed
--  using the function `parse'.
--
--  It is basically impossible to get this efficient without
--  universally-quantified data type fields (or existentially quantified type
--  variables) as soon as we encode the parsers in a data structure.  The
--  reason is that we cannot store the action functions in the structure
--  without that feature.
--
--  A user-defined state can be passed down in the parser and be threaded
--  through the individual actions.
--
--  Tokens:
--
--  * Tokens must contain a position and equality must be defined for them.
--    The equality determines whether they "match" during parsing, ie, whether
--    they are equal modulo their attributes (the position is, of course, an
--    attribute).  Tokens are, furthermore, printable (instance of `Show');
--    the resulting string should correspond to the lexeme of the token and
--    not the data constructor used to represent it internally.
--
--- TODO ----------------------------------------------------------------------
--
--  * Should tokens be an instance of `Ix' instead of only `Eq'?  Then, we
--    could use arrays to represent the alternatives in the parser
--    specification.  (Would at least considerably simplify the case where two 
--    `Alts' parsers are combined by a <|>.)
--
--  * Error correction is still missing.
--
--  * `execLexer': The extra case construct when we have `Empty' may be
--      inefficient.  We could introduce another variant of `Parser', 
--      `End x', which behaves like `Empty x (Alts [])', but can be easily
--      recognized by the pattern matching in `execLexer'.
--

module Parsers (Token, Parser, empty, token, skip, (<|>), (*$>), (*>), ($>),
		action, meta, opt, (-*>), (*->), many, list, many1, list1,
		sep, seplist, sep1, seplist1, execParser)
where

import Common (Position, Pos (posOf), nopos)
import Errors (interr, ErrorLvl(..), Error, makeError)


infix  5 `opt`
infixl 4 *>, -*>, *->, *$>, $>
infixl 3 `action`
infixl 2 <|>


-- data structures
-- ---------------

-- token class (EXPORTED)
--
class (Pos t, Show t, Eq t) => Token t

-- tree structure used to represent parsers specifications (EXPORTED
-- ABSTRACTLY) 
--
data Token t => 
     Parser a t r = -- Maybe end of input
		    --
		    Empty r			-- return if no input
			  (Parser a t r)	-- used if there is input
		    --
		    -- Selection of acceptable tokens paired with an action
		    --
		  | Alts [(t, Action a t r)]
		    --
		    -- top-down meta action transforming the threaded state;
		    -- the result of the state transformer (type `q') is
		    -- passed to the result of the following parser; the meta
		    -- action has to be executed before the parser applied, as 
		    -- the parser get's the internal state *after* transformed 
		    -- by the meta action
		    --
		  | forall q. Meta (a -> (a, q))
				   (Parser a t (q -> r))

-- actions
--
-- Note that the rank-2 polymorphism (existentially quantified type 
-- variable) is essential here to seperate the action function from the
-- parser (if we don't do that, the actions are pushed down in the parser
-- structure until they reach the `Empty' variant matching the end-of-file in
-- the actual parse - this makes the parser structure as deep as the input has
-- tokens!)
--
data Token t =>
     Action a t r = forall q. Action (t -> q -> r,
				      Parser a t q)
		  |           Skip   (Parser a t r)	    -- ignore token


-- basic combinators
-- -----------------

-- Without consuming any input, yield the given result value (EXPORTED)
--
empty   :: Token t => r -> Parser a t r
empty x  = Empty x (Alts [])

-- Consume a token that is equal to the given one; the consumed token is
-- returned as the result (EXPORTED) 
--
token   :: Token t => t -> Parser a t t
token t  = Alts [(t, Action (const, empty oops))]
	   where
	     oops = interr "Parsers.token: Touched untouchable value!"

-- Consume a token that is equal to the given one; the consumed token is
-- thrown away (EXPORTED) 
--
skip   :: Token t => t -> Parser a t ()
skip t  = Alts [(t, Skip (empty ()))]

-- Alternative parsers (EXPORTED)
--
(<|>) :: Token t => Parser a t r -> Parser a t r -> Parser a t r
--
-- * Alternatives require to merge the alternative sets of the two parsers.
--   The most interesting case is where both sets contain cases for the same
--   token.  In this case, we left factor over this token.  This requires some 
--   care with the actions, which (1) have to be pushed down into their
--   corresponding alternatives in a way that (2) they can get at the current
--   token, albeit with a delay, ie, we flip the arguments. 
--
-- * Meta actions can may never occur in alternatives.
--
(Empty _ _)  <|> (Empty _ _)  = interr "Parsers.<|>: Ambiguous grammar!"
(Empty x p)  <|> q	      = Empty x (p <|> q)
p            <|> (Empty x q)  = Empty x (p <|> q)
(Meta  g p)  <|> q            = Meta g (p <|> const $>q)
p            <|> (Meta  g q)  = Meta g (const $> p <|> q)
(Alts alts1) <|> (Alts alts2) = Alts (foldr addAlt alts1 alts2)
  where
    addAlt alt               []             = [alt]
    addAlt alt1@(t, a) (alt2@(t', a'):alts) 
      | t == t'   = (t, merge a a') : alts
      | otherwise = alt2 : addAlt alt1 alts

    merge (Skip p)        (Skip p')         = Skip (p <|> p')
    merge (Skip p)        (Action (a', p')) = 
      Action (\t tr -> tr t, const $> p <|> (flip a') $> p')
    merge (Action (a, p)) (Skip p')         = 
      Action (\t tr -> tr t, (flip a) $> p <|> const $> p')
    merge (Action (a, p)) (Action (a', p')) =
      Action (\t tr -> tr t, (flip a) $> p <|> (flip a') $> p')

-- Sequential parsers, where the result of the first is applied to the result
-- of the second (EXPORTED)
--
(*$>) :: Token t => Parser a t (s -> r) -> Parser a t s -> Parser a t r
(Empty f p) *$> q = p *$> q <|> (f $> q)
(Meta  h p) *$> q = Meta h (flip $> p *$> q)
(Alts alts) *$> q = Alts [(t, propagateTo a) | (t, a) <- alts] 
		    where
		      -- propagates the action into `p'
		      --
		      propagateTo (Skip   p     ) = Skip (p *$> q)
		      propagateTo (Action (a, p)) = 
		        Action (\t tr -> tr t, 
			        ((\b s t -> a t b s) $> p) *$> q)

-- Sequential parsers, where the overall result is the pair of the component
-- results (EXPORTED)
--
(*>) :: Token t => Parser a t s -> Parser a t r -> Parser a t (s, r)
(Empty x p) *> q = p *> q <|> ((\y -> (x, y)) $> q)
(Meta  h p) *> q = Meta h ((\(xs, r) x -> (xs x, r)) $> (p *> q))
(Alts alts) *> q = Alts [(t, propagateTo a) | (t, a) <- alts] 
		    where
		      -- propagates the action into `p'
		      --
		      propagateTo (Skip   p     ) = Skip (p *> q)
		      propagateTo (Action (a, p)) = 
		        Action (\t (ts, r) -> (ts t, r), ((flip a) $> p) *> q)

-- apply a function to the result yielded by a parser (EXPORTED)
--
($>) :: Token t => (s -> r) -> Parser a t s -> Parser a t r
f $> (Empty x p)  = Empty (f x) (f $> p)
f $> (Meta  g p)  = Meta g ((f .) $> p)
f $> (Alts  alts) = Alts [(t, applyTo a) | (t, a) <- alts]
		    where
		      applyTo (Skip   p     ) = Skip (f $> p)
		      applyTo (Action (a, p)) = Action (\t b -> f (a t b), p)

-- produces a parser that encapsulates a meta action manipulating the
-- threaded state (EXPORTED)
--
meta :: Token t => (a -> (a, r)) -> Parser a t r
meta g  = Meta g (empty id)


-- non-basic combinators
-- ---------------------

-- postfix action (EXPORTED)
--
action :: Token t => Parser a t s -> (s -> r) -> Parser a t r
action  = flip ($>)

-- optional parse (EXPORTED)
--
opt       :: Token t => Parser a t r -> r -> Parser a t r
p `opt` r  = p <|> empty r

-- sequential composition, where the result of the rhs is discarded (EXPORTED)
--
(*->)   :: Token t => Parser a t r -> Parser a t s -> Parser a t r
p *-> q  = const $> p *$> q

-- sequential composition, where the result of the lhs is discarded (EXPORTED)
--
(-*>)   :: Token t => Parser a t s -> Parser a t r -> Parser a t r
p -*> q  = flip const $> p *$> q

-- accept a sequence of productions from a nonterminal (EXPORTED)
--
-- * Uses a graphical structure to require only constant space, but this
--   behaviour is destroyed if the replicated parser is a `skip c'.
--
many       :: Token t => (r -> s -> s) -> s -> Parser a t r -> Parser a t s
--
-- * we need to build a cycle, to avoid building the parser structure over and 
--   over again
--
many f e p  = let me = (f $> p *$> me) `opt` e
	      in me

-- return the results of a sequence of productions from a nonterminal in a
-- list (EXPORTED) 
--
list :: Token t => Parser a t r -> Parser a t [r]
list  = many (:) [] 

-- accept a sequence consisting of at least one production from a nonterminal
-- (EXPORTED) 
--
many1     :: Token t => (r -> r -> r) -> Parser a t r -> Parser a t r
--many1 f p = p <|> (f <$> p <*> many1 f p)
many1 f p = let me = p <|> (f $> p *$> me)
	    in me

-- accept a sequence consisting of at least one production from a nonterminal
-- and return a list of results (EXPORTED) 
--
list1   :: Token t => Parser a t r -> Parser a t [r]
list1 p  = let me =     (\x -> [x]) $> p 
		    <|> ((:) $> p *$> me)
	   in me

-- accept a sequence of productions from a nonterminal, which are seperated by 
-- productions of another nonterminal (EXPORTED)
--
sep :: Token t 
    => (r -> u -> s -> s) 
    -> (r -> s) 
    -> s 
    -> Parser a t u 
    -> Parser a t r 
    -> Parser a t s
sep f g e sepp p  = let me = g $> p <|> (f $> p *$> sepp *$> me)
		    in me `opt` e

-- return the results of a sequence of productions from a nonterminal, which
-- are seperated by productions of another nonterminal, in a list (EXPORTED)
--
seplist :: Token t => Parser a t s -> Parser a t r -> Parser a t [r]
seplist  = sep (\h _ l -> h:l) (\x -> [x]) [] 

-- accept a sequence of productions from a nonterminal, which are seperated by 
-- productions of another nonterminal (EXPORTED)
--
sep1 :: Token t 
     => (r -> s -> r -> r) -> Parser a t s -> Parser a t r -> Parser a t r
sep1 f sepp p  = let me = p <|> (f $> p *$> sepp *$> me)
		 in me

-- accept a sequence consisting of at least one production from a nonterminal, 
-- which are separated by the productions of another nonterminal; the list of
-- results is returned (EXPORTED)
--
seplist1        :: Token t => Parser a t s -> Parser a t r -> Parser a t [r]
seplist1 sepp p = p *> list (sepp -*> p) `action` uncurry (:)
{- Is the above also space save?  Should be.  Contributed by Roman.
seplist1 sepp p  = let me =     (\x -> [x]) $> p 
		            <|> ((:) $> p *-> sepp *$> me)
	           in me
-}


-- execution of a parser
-- ---------------------

-- apply a parser to a token sequence (EXPORTED)
--
-- * Trailing tokens are returned in the third component of the result (the
--   longest match is found).
--
-- * Currently, all errors are fatal; thus, the result (first component of the 
--   returned pair) is undefined in case of an error (this changes when error
--   correction is added).
--
execParser      :: Token t => Parser a t r -> a -> [t] -> (r, [Error], [t])
--
-- Note that we cannot avoid the let-in in the `Action' case, as an
-- accumulator for the result value doesn't work here due to the use of an
-- existential type variable in the definition of `Action'.
--
execParser = exec
  where
    exec                    :: Token t 
			    => Parser a t r -> a -> [t] -> (r, [Error], [t])
    exec (Empty x _) _ []      = (x, [], [])
    exec (Empty x p) a ts      = case p of
				   Alts [] -> (x, [], ts)
				   _       -> exec p a ts
--    exec (Empty _ p) a ts      = exec p a ts
    exec (Meta  h p) a ts      = let (a', x)         = h a
				     (xr, errs, ts') = exec p a' ts
				 in
				 (xr x, errs, ts')
    exec (Alts alts) _ []      = (errtouched, 
				  [makeError FatalErr nopos eofErr], [])
    exec (Alts []  ) _ _       = (errtouched, 
				  [makeError FatalErr nopos trailErr], [])
    exec (Alts alts) a (t:ts)  = 
      case lookup t alts of
	Nothing              -> (errtouched, 
				 [makeError FatalErr (posOf t) (illErr t)], [])
	Just (Action (b, p)) -> let (r, errs,ts') = exec p a ts
				in
				(b t r, errs, ts')
	Just (Skip   p     ) -> exec p a ts

    eofErr   = ["Unexpected end of input!",
	        "The code at the end of the file seems truncated."]
    trailErr = ["Trailing garbage!",
	        "There seem to be characters behind the valid end of input."]
    illErr t = ["Syntax error!",
	        "The symbol `" ++ show t ++ "' does not fit here."]

errtouched = interr "Parsers.errtouch: Touched undefined result!"