1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
|
module REParser (RETree(..), Collection, parse) where
import Data.Char (isDigit, isAlpha, isAlphaNum, isLower, isUpper,
isHexDigit, isSpace, isControl, isPrint)
type Collection = Int
data RETree = Epsilon
| Char (Char -> Bool) -- a, [abc]
| Start -- ^
| End -- $
| Concat RETree RETree -- a b
| AtLeastOne RETree -- a +
| AnyNumber RETree -- a *
| Optional RETree -- a ?
| Exactly Int RETree -- a { m }
| AtLeast Int RETree -- a { m, }
| FromTo Int Int RETree -- a { m, n } (m >= n)
| Or RETree RETree -- a | b
| Capture Collection RETree -- ( a )
instance Show RETree where
show Epsilon = ""
show (Char _) = "."
show Start = "^"
show End = "$"
show (Concat t1 t2) = show t1 ++ show t2
show (AtLeastOne t) = show t ++ "+"
show (AnyNumber t) = show t ++ "*"
show (Optional t) = show t ++ "?"
show (Exactly t m) = show t ++ "{" ++ show m ++ "}"
show (AtLeast t m) = show t ++ "{" ++ show m ++ ",}"
show (FromTo t m n) = show t ++ "{" ++ show m ++ "," ++ show n ++ "}"
show (Or t1 t2) = show t1 ++ "|" ++ show t2
show (Capture _ t) = "(" ++ show t ++ ")"
parse :: String -> Either (RETree, Collection) String
parse xs = run_parser 0 (parse_regexp False <* eoi) xs
parse_regexp :: Bool -> Parser RETree
parse_regexp in_parens
= foldr1 Or <$> pSepList1 (pChar '|') (parse_branch in_parens)
<|> pSucceed Epsilon
parse_branch :: Bool -> Parser RETree
parse_branch in_parens = foldr1 Concat <$> pMany1 (parse_atomop in_parens)
<|> pSucceed Epsilon
parse_atomop :: Bool -> Parser RETree
parse_atomop in_parens = flip ($) <$> parse_atom in_parens <*> parse_op
<|> Start <$ pChar '^'
<|> End <$ pChar '$'
parse_atom :: Bool -> Parser RETree
parse_atom in_parens
= Capture <$> pCapture <*> pChar '(' *> parse_regexp True
<* pChar ')'
<|> pChar '[' *> parse_group <* pChar ']'
<|> Char . (==) <$> pPred (\x -> x `notElem` special)
<|> Char . (==) <$ pChar '\\' <*> pPred (\x -> x `elem` special)
<|> Char ('\0' /=) <$ pChar '.'
where special_base = ".[\\(*+?{|^$"
special = if in_parens then ')':special_base else special_base
parse_group :: Parser RETree
parse_group = (Char . (not .)) <$ pChar '^' <*> parse_negated_group
<|> Char <$> parse_negated_group
where parse_negated_group :: Parser (Char -> Bool)
parse_negated_group = (\p c -> c == ']' || p c)
<$ pChar ']'
<*> parse_group_body
<|> parse_group_body
parse_group_body :: Parser (Char -> Bool)
parse_group_body = (\p1 p2 -> \c -> p1 c || p2 c)
<$> ( isAlphaNum <$ pString "[:alnum:]"
<|> isAlpha <$ pString "[:alpha:]"
<|> (`elem` " \t") <$ pString "[:blank:]"
<|> isControl <$ pString "[:cntrl:]"
<|> isDigit <$ pString "[:digit:]"
<|> (\c -> isPrint c && not (isSpace c))
<$ pString "[:graph:]"
<|> isLower <$ pString "[:lower:]"
<|> isPrint <$ pString "[:print:]"
<|> (\c -> isPrint c
&& not (isAlphaNum c || isSpace c))
<$ pString "[:punct:]"
<|> isSpace <$ pString "[:space:]"
<|> isUpper <$ pString "[:upper:]"
<|> isHexDigit <$ pString "[:xdigit:]"
<|> pString "[:" *> pFailCut
<|> (\c1 c2 -> \c -> c1 <= c && c <= c2)
<$> pPred (']' /=)
<* pChar '-'
<*> pPred (']' /=)
<|> (==) <$> pPred (']' /=))
<*> parse_group_body
<|> pSucceed (const False)
parse_op :: Parser (RETree -> RETree)
parse_op = AtLeastOne <$ pChar '+'
<|> AnyNumber <$ pChar '*'
<|> Optional <$ pChar '?'
<|> (\m -> Exactly (read m))
<$ pChar '{' <*> pMany1 pDigit <* pChar '}'
<|> (\m -> AtLeast (read m))
<$ pChar '{' <*> pMany1 pDigit <* pString ",}"
<|?> ((\m n -> let m' = read m
n' = read n
in if m' <= n' then Just (FromTo m' n')
else Nothing)
<$ pChar '{' <*> pMany1 pDigit <* pChar ','
<*> pMany1 pDigit <* pChar '}')
<|> pSucceed id
data Pos = Pos LinePos CharPos
| EOI
deriving (Eq, Ord)
type LinePos = Integer
type CharPos = Integer
type Parser a = Collection -> [(Char, Pos)] -> Res a
-- Pos in Succ is furthest we got before failing
data Res a = Succ !Pos Collection a [(Char, Pos)]
| Fail !Pos
| FailCut !Pos
deriving Show
instance Show Pos where
show (Pos l c) = "line " ++ show l ++ ", char " ++ show c
show EOI = "end of input"
-- Primitives:
run_parser :: Collection -> Parser a -> String -> Either (a, Collection) String
run_parser n p xs = case p n (posify xs) of
Succ _ c x [] -> Left (x, c)
Succ pos _ _ _ -> Right (input_at pos)
Fail pos -> Right (err_at pos)
FailCut pos -> Right (err_at pos)
where err_at pos = "Syntax error at " ++ show pos
input_at pos = "Unconsumed input at " ++ show pos
posify :: String -> [(Char, Pos)]
posify = f (Pos 1 1)
where f _ "" = []
f p@(Pos l _) ('\n':xs) = ('\n', p):f (Pos (l+1) 1) xs
f p@(Pos l c) (x:xs) = (x, p):f (Pos l (c+1)) xs
f EOI _ = error "posify: Can't happen"
pos_of :: [(Char, Pos)] -> Pos
pos_of [] = EOI
pos_of ((_, p):_) = p
eoi :: Parser ()
eoi = \c xs -> case xs of
((_, p):_) -> Fail p
[] -> Succ EOI c () []
pCapture :: Parser Collection
pCapture = \c xs -> Succ (pos_of xs) (c+1) c xs
(<*>) :: Parser (a -> b) -> Parser a -> Parser b
p <*> q = \c xs -> case p c xs of
Succ pp cp rp xs' ->
case q cp xs' of
Succ pq cq rq xs'' ->
Succ (pp `max` pq) cq (rp rq) xs''
Fail pos -> Fail (pp `max` pos)
FailCut pos -> FailCut (pp `max` pos)
Fail pos -> Fail pos
FailCut pos -> FailCut pos
(<|>) :: Parser a -> Parser a -> Parser a
p <|> q = \c xs -> case p c xs of
FailCut p1 -> FailCut p1
Fail p1 ->
case q c xs of
Fail p2 -> Fail (p1 `max` p2)
FailCut p2 -> FailCut (p1 `max` p2)
Succ p2 c' x xs' -> Succ (p1 `max` p2) c' x xs'
s -> s
(<|?>) :: Parser a -> Parser (Maybe a) -> Parser a
p <|?> q = \c xs -> case p c xs of
FailCut p1 -> FailCut p1
Fail p1 ->
case q c xs of
Fail p2 -> Fail (p1 `max` p2)
FailCut p2 -> FailCut (p1 `max` p2)
Succ p2 _ Nothing _ -> Fail (p1 `max` p2)
Succ p2 c' (Just x) xs' ->
Succ (p1 `max` p2) c' x xs'
s -> s
pSucceed :: a -> Parser a
pSucceed x = \c xs -> Succ (pos_of xs) c x xs
pFailCut :: Parser a
pFailCut = \_ xs -> FailCut (pos_of xs)
pPred :: (Char -> Bool) -> Parser Char
pPred p = \c xs -> case xs of
((x, _):xs')
| p x -> Succ (pos_of xs') c x xs'
_ -> Fail (pos_of xs)
-- Derived:
(<$>) :: (a -> b) -> Parser a -> Parser b
f <$> q = pSucceed f <*> q
(<*) :: Parser a -> Parser b -> Parser a
p <* q = const <$> p <*> q
(*>) :: Parser a -> Parser b -> Parser b
p *> q = flip const <$> p <*> q
(<$) :: a -> Parser b -> Parser a
f <$ q = pSucceed f <* q
pChar :: Char -> Parser Char
pChar c = pPred (c ==)
pString :: String -> Parser String
pString "" = pSucceed ""
pString (x:xs) = (:) <$> pChar x <*> pString xs
pDigit :: Parser Char
pDigit = pPred isDigit
pSepList1 :: Parser a -> Parser b -> Parser [b]
pSepList1 s p = (:) <$> p <*> pMany (s *> p)
pMany1 :: Parser a -> Parser [a]
pMany1 p = (:) <$> p <*> pMany p
pMany :: Parser a -> Parser [a]
pMany p = (:) <$> p <*> pMany p
<|> pSucceed []
infixl 3 <|>, <|?>
infixl 5 <*>, <$>, <*, <$
|