1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
|
module RE (match) where
import Data.Array
import Data.Set (Set, elementOf, emptySet, addToSet)
import REParser (RETree(..), Collection, parse)
import StateM (StateM, get, put, evalStateM)
import Control.Monad (foldM, liftM)
type State = Int
type Collectors = Array Collection (Collecting, Collected)
type Collected = String
type Collecting = Bool
data AState = AState !State !Collectors -- Automaton state
deriving (Eq, Ord)
data Action = TakeChar (Char -> [State])
| BeginCollecting Collection [State]
| StopCollecting Collection [State]
| AtStart [State]
| AtEnd [State]
type Table = Array State Action
-- Can't have a real show due to the (Char -> [State])
instance Show Action where
show (TakeChar f) = show ('a', f 'a', 'b', f 'b')
show (BeginCollecting n ss) = "Begin " ++ show n ++ " " ++ show ss
show (StopCollecting n ss) = "Stop " ++ show n ++ " " ++ show ss
show (AtStart ss) = "Start " ++ show ss
show (AtEnd ss) = "End " ++ show ss
mkAStates :: [State] -> [Collectors] -> [AState]
mkAStates = zipWith AState
-- Eliminate some cases (AtLeastOne, Exactly, AtLeast, FromTo) to make
-- gen_state_trans easier
simplify :: RETree -> RETree
simplify Epsilon = Epsilon
simplify (Char p) = Char p
simplify Start = Start
simplify End = End
simplify (Concat t1 t2) = Concat (simplify t1) (simplify t2)
simplify (AtLeastOne t) = let t' = simplify t in Concat t' (AnyNumber t')
simplify (AnyNumber t) = AnyNumber (simplify t)
simplify (Optional t) = Optional (simplify t)
simplify (Exactly m t) = let t' = simplify t
in foldr Concat Epsilon (replicate m t')
simplify (AtLeast m t) = foldr Concat (AnyNumber t') (replicate m t')
where t' = simplify t
simplify (FromTo m n t)
| m > n = error "simplify: Can't happen: m > n"
| m == n = foldr Concat Epsilon (replicate m t')
| otherwise = foldr Concat opts (replicate m t')
where t' = simplify t
opts = foldr (\x y -> Optional (Concat x y)) Epsilon
(replicate (n - m) t')
simplify (Or t1 t2) = Or (simplify t1) (simplify t2)
simplify (Capture n t) = Capture n (simplify t)
-- Generate the states and state transition table for a regexp tree
gen_state_trans
:: State -- The next available state number
-> [State] -- The end states for after this tree
-> [(State, Action)] -- Accumulate main result
-> RETree -- The tree to work on
-> ([(State, Action)], -- for the array
State, -- The next available state number after this tree
[State]) -- The start states for this tree
gen_state_trans snum after acc Epsilon = (acc, snum, after)
gen_state_trans snum after acc (Char p)
= ((snum, TakeChar (\x -> if p x then after else [])):acc, snum + 1, [snum])
gen_state_trans snum after acc Start
= ((snum, AtStart after):acc, snum + 1, [snum])
gen_state_trans snum after acc End
= ((snum, AtEnd after):acc, snum + 1, [snum])
gen_state_trans snum after acc (Concat t1 t2)
= let (acc1, snum1, starts1) = gen_state_trans snum starts2 acc t1
(acc2, snum2, starts2) = gen_state_trans snum1 after acc1 t2
in (acc2, snum2, starts1)
gen_state_trans _ _ _ (AtLeastOne _)
= error "gen_state_trans: Can't happen: AtLeastOne"
gen_state_trans snum after acc (AnyNumber t)
= let (acc', snum', starts) = gen_state_trans snum starts' acc t
starts' = starts ++ after
in (acc', snum', starts')
gen_state_trans snum after acc (Optional t)
= case gen_state_trans snum after acc t of
(t', after', starts) -> (t', after', starts ++ after)
gen_state_trans _ _ _ (Exactly _ _)
= error "gen_state_trans: Can't happen: Exactly"
gen_state_trans _ _ _ (AtLeast _ _)
= error "gen_state_trans: Can't happen: AtLeast"
gen_state_trans _ _ _ (FromTo _ _ _)
= error "gen_state_trans: Can't happen: FromTo"
gen_state_trans snum after acc (Or t1 t2)
= let (acc1, snum1, starts1) = gen_state_trans snum after acc t1
(acc2, snum2, starts2) = gen_state_trans snum1 after acc1 t2
in (acc2, snum2, starts1 ++ starts2)
gen_state_trans snum after acc (Capture n t) = (acc'', snum', [snum])
where acc' = (snum, BeginCollecting n starts):
(snum+1, StopCollecting n after):acc
(acc'', snum', starts) = gen_state_trans (snum+2) [snum+1] acc' t
-- Given a RE tree return the list of start states and the lookup table
mk_table :: RETree -> ([State], Table)
mk_table t = (starts', table')
where (acc, states, starts)
= gen_state_trans 1 [0] [(0, TakeChar (\_ -> []))] (simplify t)
table = array (0, upper_bound) acc
table' = table // extra_acc
upper_bound = (states-1) {- in acc -} + 1 {- max. in extra_acc -}
(extra_acc, starts') = fix_start table states starts
-- If we can accept something not anchored to the start of the input
-- then we need to put a non-greedy .* on the front
fix_start :: Table -> State -> [State] -> ([(State, Action)], [State])
fix_start t snum starts
= if all (is_start_anchored t) starts
then ([], starts)
else let new_starts = starts ++ [snum]
in ([(snum, TakeChar (\_ -> new_starts))], new_starts)
-- Given a RE and input, return whether it matches and the list of
-- captures if it does
match :: String -> String -> Maybe [String]
match re xs
= case parse re of
Left (re_tree, num_collections) ->
let -- Allow anything after a match
re_tree' = Concat re_tree (AnyNumber (Char (const True)))
-- Initially all parentheses have captured nothing and
-- haven't started capturing
init_collectors
= listArray (0, num_collections - 1)
(replicate num_collections (False, ""))
-- Make the state transition table for the regexp
(init_states, table) = mk_table re_tree'
-- The initial automaton states are the initial states
-- with the above initial collectors. Get them ready for
-- the first input character.
init_astates = mkAStates init_states (repeat init_collectors)
init_astates' = make_want_first_char table init_astates
in do_match table init_astates' xs
Right err -> error err
do_match :: Table -> [AState] -> String -> Maybe [String]
-- If we have no active automaton states then we failed to match the input
do_match _ [] _ = Nothing
do_match t as xs
-- If we have reached and end state, return its corresponding collectors
= case get_winner as of
Just w -> Just w
Nothing ->
case xs of
-- Otherwise, if we have run out of input then return
-- whatever result we get by having an end of input here
"" -> get_winner $ at_end_of_input t as
-- Otherwise apply the next char and push the states
-- through ready for the next iteration
(x:xs') -> let as' = make_want_char t
$ concatMap (apply_char t x) as
in do_match t as' xs'
-- If we have reached an accepting state (which we know always means
-- state 0) then return its (the first one's) collectors
get_winner :: [AState] -> Maybe [String]
get_winner as = case filter (\(AState s _) -> s == 0) as of
[] -> Nothing
(AState _ cs:_) -> Just (map (reverse . snd) (elems cs))
type SeenM a = StateM (Set AState) a
type AStateAccum = [AState] -> [AState]
seeing :: AState -> SeenM Bool
seeing a = do seen <- get
if a `elementOf` seen
then return True
else do put (addToSet seen a)
return False
-- Push automaton states through to the point where they need the next
-- (not the first) character or end of input to proceed
make_want_char :: Table -> [AState] -> [AState]
make_want_char t xs = mwc t False False xs
-- Push automaton states through to the point where they need the first
-- character or end of input to proceed
make_want_first_char :: Table -> [AState] -> [AState]
make_want_first_char t xs = mwc t True False xs
-- Push automaton states through to the point after they get the end
-- of the input
at_end_of_input :: Table -> [AState] -> [AState]
at_end_of_input t xs = mwc t False True xs
mwc :: Table -> Bool -> Bool -> [AState] -> [AState]
mwc t pass_through_start pass_through_end as
= evalStateM (foldM f id as) emptySet []
where
f :: AStateAccum -> AState -> SeenM AStateAccum
f acc a@(AState s cs)
= do seen <- seeing a
if seen
then return acc
else case t ! s of
TakeChar _ -> return (acc . (a:))
BeginCollecting c ss ->
let cs' = cs // [(c, (True, ""))]
in recurse (mkAStates ss (repeat cs'))
StopCollecting c ss ->
let collected = snd (cs ! c)
cs' = cs // [(c, (False, collected))]
in recurse (mkAStates ss (repeat cs'))
AtStart ss
| pass_through_start -> recurse (mkAStates ss (repeat cs))
| otherwise -> return acc
AtEnd ss
| pass_through_end -> recurse (mkAStates ss (repeat cs))
| otherwise -> return (acc . (a:))
where recurse = foldM f acc
-- Return whether a given start state always needs to match at the start
-- of the input
is_start_anchored :: Table -> State -> Bool
is_start_anchored t init_s = evalStateM (isa init_s) emptySet
where isa :: State -> StateM (Set State) Bool
isa s = do seen <- get
if s `elementOf` seen
then return True
else do put (addToSet seen s)
case t ! s of
TakeChar _ -> return False
BeginCollecting _ ss ->
liftM and $ mapM isa ss
StopCollecting _ ss ->
liftM and $ mapM isa ss
AtStart _ -> return True
AtEnd _ -> return False
-- Do the single transition when a character is applied
apply_char :: Table -> Char -> AState -> [AState]
apply_char t x (AState s cs) = case t ! s of
TakeChar f -> [ AState s' cs' | s' <- f x ]
AtEnd _ -> []
_ -> error "Can't happen: apply_char"
where cs' = fmap (\(collecting, collected) ->
if collecting then (collecting, x:collected)
else (collecting, collected)) cs
|