File: REParser.hs

package info (click to toggle)
haskell-utils 1.6
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 104 kB
  • ctags: 13
  • sloc: haskell: 411; makefile: 87
file content (243 lines) | stat: -rw-r--r-- 9,366 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

module REParser (RETree(..), Collection, parse) where

import Data.Char (isDigit, isAlpha, isAlphaNum, isLower, isUpper,
                  isHexDigit, isSpace, isControl, isPrint)

type Collection = Int

data RETree = Epsilon
            | Char (Char -> Bool)       -- a, [abc]
            | Start                     -- ^
            | End                       -- $
            | Concat RETree RETree      -- a b
            | AtLeastOne RETree         -- a +
            | AnyNumber RETree          -- a *
            | Optional RETree           -- a ?
            | Exactly Int RETree        -- a { m }
            | AtLeast Int RETree        -- a { m, }
            | FromTo Int Int RETree     -- a { m, n }   (m >= n)
            | Or RETree RETree          -- a | b
            | Capture Collection RETree -- ( a )

instance Show RETree where
    show Epsilon = ""
    show (Char _) = "."
    show Start = "^"
    show End = "$"
    show (Concat t1 t2) = show t1 ++ show t2
    show (AtLeastOne t) = show t ++ "+"
    show (AnyNumber t) = show t ++ "*"
    show (Optional t) = show t ++ "?"
    show (Exactly t m) = show t ++ "{" ++ show m ++ "}"
    show (AtLeast t m) = show t ++ "{" ++ show m ++ ",}"
    show (FromTo t m n) = show t ++ "{" ++ show m ++ "," ++ show n ++ "}"
    show (Or t1 t2) = show t1 ++ "|" ++ show t2
    show (Capture _ t) = "(" ++ show t ++ ")"

parse :: String -> Either (RETree, Collection) String
parse xs = run_parser 0 (parse_regexp False <*  eoi) xs

parse_regexp :: Bool -> Parser RETree
parse_regexp in_parens
             = foldr1 Or <$> pSepList1 (pChar '|') (parse_branch in_parens)
           <|> pSucceed Epsilon

parse_branch :: Bool -> Parser RETree
parse_branch in_parens = foldr1 Concat <$> pMany1 (parse_atomop in_parens)
                     <|> pSucceed Epsilon

parse_atomop :: Bool -> Parser RETree
parse_atomop in_parens = flip ($) <$> parse_atom in_parens <*> parse_op
                     <|> Start <$  pChar '^'
                     <|> End <$  pChar '$'

parse_atom :: Bool -> Parser RETree
parse_atom in_parens
           = Capture <$> pCapture <*> pChar '('  *> parse_regexp True
                                  <*  pChar ')'
         <|> pChar '['  *> parse_group <*  pChar ']'
         <|> Char . (==) <$> pPred (\x -> x `notElem` special)
         <|> Char . (==) <$  pChar '\\' <*> pPred (\x -> x `elem` special)
         <|> Char ('\0' /=) <$  pChar '.'
    where special_base = ".[\\(*+?{|^$"
          special = if in_parens then ')':special_base else special_base

parse_group :: Parser RETree
parse_group = (Char . (not .)) <$  pChar '^' <*> parse_negated_group
          <|> Char <$> parse_negated_group
    where parse_negated_group :: Parser (Char -> Bool)
          parse_negated_group = (\p c -> c == ']' || p c)
                            <$  pChar ']'
                            <*> parse_group_body
                        <|>     parse_group_body
          parse_group_body :: Parser (Char -> Bool)
          parse_group_body = (\p1 p2 -> \c -> p1 c || p2 c)
                         <$> (    isAlphaNum     <$  pString "[:alnum:]"
                              <|> isAlpha        <$  pString "[:alpha:]"
                              <|> (`elem` " \t") <$  pString "[:blank:]"
                              <|> isControl      <$  pString "[:cntrl:]"
                              <|> isDigit        <$  pString "[:digit:]"
                              <|> (\c -> isPrint c && not (isSpace c))
                                                 <$  pString "[:graph:]"
                              <|> isLower        <$  pString "[:lower:]"
                              <|> isPrint        <$  pString "[:print:]"
                              <|> (\c -> isPrint c
                                      && not (isAlphaNum c || isSpace c))
                                                 <$  pString "[:punct:]"
                              <|> isSpace        <$  pString "[:space:]"
                              <|> isUpper        <$  pString "[:upper:]"
                              <|> isHexDigit     <$  pString "[:xdigit:]"
                              <|> pString "[:"  *> pFailCut
                              <|>     (\c1 c2 -> \c -> c1 <= c && c <= c2)
                                  <$> pPred (']' /=)
                                  <*  pChar '-'
                                  <*> pPred (']' /=)
                              <|> (==) <$> pPred (']' /=))
                          <*> parse_group_body
                      <|> pSucceed (const False)

parse_op :: Parser (RETree -> RETree)
parse_op = AtLeastOne <$  pChar '+'
       <|> AnyNumber <$  pChar '*'
       <|> Optional <$  pChar '?'
       <|>     (\m -> Exactly (read m))
           <$  pChar '{' <*> pMany1 pDigit <*  pChar '}'
       <|>     (\m -> AtLeast (read m))
           <$  pChar '{' <*> pMany1 pDigit <*  pString ",}"
       <|?>    ((\m n -> let m' = read m
                             n' = read n
                         in if m' <= n' then Just (FromTo m' n')
                                        else Nothing)
           <$  pChar '{' <*> pMany1 pDigit <*  pChar ','
                         <*> pMany1 pDigit <*  pChar '}')
       <|> pSucceed id

data Pos = Pos LinePos CharPos
         | EOI
    deriving (Eq, Ord)
type LinePos = Integer
type CharPos = Integer
type Parser a = Collection -> [(Char, Pos)] -> Res a
-- Pos in Succ is furthest we got before failing
data Res a = Succ !Pos Collection a [(Char, Pos)]
           | Fail !Pos
           | FailCut !Pos
    deriving Show

instance Show Pos where
    show (Pos l c) = "line " ++ show l ++ ", char " ++ show c
    show EOI = "end of input"

-- Primitives:

run_parser :: Collection -> Parser a -> String -> Either (a, Collection) String
run_parser n p xs = case p n (posify xs) of
                        Succ _ c x [] -> Left (x, c)
                        Succ pos _ _ _ -> Right (input_at pos)
                        Fail pos -> Right (err_at pos)
                        FailCut pos -> Right (err_at pos)
    where err_at pos = "Syntax error at " ++ show pos
          input_at pos = "Unconsumed input at " ++ show pos

posify :: String -> [(Char, Pos)]
posify = f (Pos 1 1)
    where f _ "" = []
          f p@(Pos l _) ('\n':xs) = ('\n', p):f (Pos (l+1) 1) xs
          f p@(Pos l c) (x:xs) = (x, p):f (Pos l (c+1)) xs
          f EOI _ = error "posify: Can't happen"

pos_of :: [(Char, Pos)] -> Pos
pos_of [] = EOI
pos_of ((_, p):_) = p

eoi :: Parser ()
eoi = \c xs -> case xs of
                   ((_, p):_) -> Fail p
                   [] -> Succ EOI c () []

pCapture :: Parser Collection
pCapture = \c xs -> Succ (pos_of xs) (c+1) c xs

(<*>) :: Parser (a -> b) -> Parser a -> Parser b
p <*> q = \c xs -> case p c xs of
                       Succ pp cp rp xs' ->
                          case q cp xs' of
                              Succ pq cq rq xs'' ->
                                  Succ (pp `max` pq) cq (rp rq) xs''
                              Fail pos -> Fail (pp `max` pos)
                              FailCut pos -> FailCut (pp `max` pos)
                       Fail pos -> Fail pos
                       FailCut pos -> FailCut pos

(<|>) :: Parser a -> Parser a -> Parser a
p <|> q = \c xs -> case p c xs of
                       FailCut p1 -> FailCut p1
                       Fail p1 ->
                           case q c xs of
                               Fail p2 -> Fail (p1 `max` p2)
                               FailCut p2 -> FailCut (p1 `max` p2)
                               Succ p2 c' x xs' -> Succ (p1 `max` p2) c' x xs'
                       s -> s

(<|?>) :: Parser a -> Parser (Maybe a) -> Parser a
p <|?> q = \c xs -> case p c xs of
                        FailCut p1 -> FailCut p1
                        Fail p1 ->
                            case q c xs of
                                Fail p2 -> Fail (p1 `max` p2)
                                FailCut p2 -> FailCut (p1 `max` p2)
                                Succ p2 _ Nothing _ -> Fail (p1 `max` p2)
                                Succ p2 c' (Just x) xs' ->
                                    Succ (p1 `max` p2) c' x xs'
                        s -> s

pSucceed :: a -> Parser a
pSucceed x = \c xs -> Succ (pos_of xs) c x xs

pFailCut :: Parser a
pFailCut = \_ xs -> FailCut (pos_of xs)

pPred :: (Char -> Bool) -> Parser Char
pPred p = \c xs -> case xs of
                       ((x, _):xs')
                        | p x -> Succ (pos_of xs') c x xs'
                       _ -> Fail (pos_of xs)

-- Derived:

(<$>) :: (a -> b) -> Parser a -> Parser b
f <$> q = pSucceed f <*> q

(<*) :: Parser a -> Parser b -> Parser a
p <* q = const <$> p <*> q

(*>) :: Parser a -> Parser b -> Parser b
p *> q = flip const <$> p <*> q

(<$) :: a -> Parser b -> Parser a
f <$ q = pSucceed f <*  q

pChar :: Char -> Parser Char
pChar c = pPred (c ==)

pString :: String -> Parser String
pString "" = pSucceed ""
pString (x:xs) = (:) <$> pChar x <*> pString xs

pDigit :: Parser Char
pDigit = pPred isDigit

pSepList1 :: Parser a -> Parser b -> Parser [b]
pSepList1 s p = (:) <$> p <*> pMany (s  *> p)

pMany1 :: Parser a -> Parser [a]
pMany1 p = (:) <$> p <*> pMany p

pMany :: Parser a -> Parser [a]
pMany p = (:) <$> p <*> pMany p
      <|> pSucceed []

infixl 3 <|>, <|?>
infixl 5 <*>, <$>, <*, <$