File: RE.hs

package info (click to toggle)
haskell-utils 1.6
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 104 kB
  • ctags: 13
  • sloc: haskell: 411; makefile: 87
file content (248 lines) | stat: -rw-r--r-- 10,881 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

module RE (match) where

import Data.Array
import Data.Set (Set, elementOf, emptySet, addToSet)
import REParser (RETree(..), Collection, parse)
import StateM (StateM, get, put, evalStateM)
import Control.Monad (foldM, liftM)

type State = Int
type Collectors = Array Collection (Collecting, Collected)
type Collected = String
type Collecting = Bool
data AState = AState !State !Collectors -- Automaton state
    deriving (Eq, Ord)
data Action = TakeChar (Char -> [State])
            | BeginCollecting Collection [State]
            | StopCollecting Collection [State]
            | AtStart [State]
            | AtEnd [State]
type Table = Array State Action

-- Can't have a real show due to the (Char -> [State])
instance Show Action where
    show (TakeChar f) = show ('a', f 'a', 'b', f 'b')
    show (BeginCollecting n ss) = "Begin " ++ show n ++ " " ++ show ss
    show (StopCollecting n ss) = "Stop " ++ show n ++ " " ++ show ss
    show (AtStart ss) = "Start " ++ show ss
    show (AtEnd ss) = "End " ++ show ss

mkAStates :: [State] -> [Collectors] -> [AState]
mkAStates = zipWith AState

-- Eliminate some cases (AtLeastOne, Exactly, AtLeast, FromTo) to make
-- gen_state_trans easier
simplify :: RETree -> RETree
simplify Epsilon = Epsilon
simplify (Char p) = Char p
simplify Start = Start
simplify End = End
simplify (Concat t1 t2) = Concat (simplify t1) (simplify t2)
simplify (AtLeastOne t) = let t' = simplify t in Concat t' (AnyNumber t')
simplify (AnyNumber t) = AnyNumber (simplify t)
simplify (Optional t) = Optional (simplify t)
simplify (Exactly m t) = let t' = simplify t
                         in foldr Concat Epsilon (replicate m t')
simplify (AtLeast m t) = foldr Concat (AnyNumber t') (replicate m t')
    where t' = simplify t
simplify (FromTo m n t)
 | m > n = error "simplify: Can't happen: m > n"
 | m == n = foldr Concat Epsilon (replicate m t')
 | otherwise = foldr Concat opts (replicate m t')
    where t' = simplify t
          opts = foldr (\x y -> Optional (Concat x y)) Epsilon
                       (replicate (n - m) t')
simplify (Or t1 t2) = Or (simplify t1) (simplify t2)
simplify (Capture n t) = Capture n (simplify t)

-- Generate the states and state transition table for a regexp tree
gen_state_trans
    :: State               -- The next available state number
    -> [State]             -- The end states for after this tree
    -> [(State, Action)]   -- Accumulate main result
    -> RETree              -- The tree to work on
    -> ([(State, Action)], -- for the array
        State,             -- The next available state number after this tree
        [State])           -- The start states for this tree
gen_state_trans snum after acc Epsilon = (acc, snum, after)
gen_state_trans snum after acc (Char p)
 = ((snum, TakeChar (\x -> if p x then after else [])):acc, snum + 1, [snum])
gen_state_trans snum after acc Start
 = ((snum, AtStart after):acc, snum + 1, [snum])
gen_state_trans snum after acc End
 = ((snum, AtEnd after):acc, snum + 1, [snum])
gen_state_trans snum after acc (Concat t1 t2)
 = let (acc1, snum1, starts1) = gen_state_trans snum starts2 acc t1
       (acc2, snum2, starts2) = gen_state_trans snum1 after acc1 t2
   in (acc2, snum2, starts1)
gen_state_trans _ _ _ (AtLeastOne _)
 = error "gen_state_trans: Can't happen: AtLeastOne"
gen_state_trans snum after acc (AnyNumber t)
 = let (acc', snum', starts) = gen_state_trans snum starts' acc t
       starts' = starts ++ after
   in (acc', snum', starts')
gen_state_trans snum after acc (Optional t)
 = case gen_state_trans snum after acc t of
       (t', after', starts) -> (t', after', starts ++ after)
gen_state_trans _ _ _ (Exactly _ _)
 = error "gen_state_trans: Can't happen: Exactly"
gen_state_trans _ _ _ (AtLeast _ _)
 = error "gen_state_trans: Can't happen: AtLeast"
gen_state_trans _ _ _ (FromTo _ _ _)
 = error "gen_state_trans: Can't happen: FromTo"
gen_state_trans snum after acc (Or t1 t2)
 = let (acc1, snum1, starts1) = gen_state_trans snum  after acc  t1
       (acc2, snum2, starts2) = gen_state_trans snum1 after acc1 t2
   in (acc2, snum2, starts1 ++ starts2)
gen_state_trans snum after acc (Capture n t) = (acc'', snum', [snum])
    where acc' = (snum, BeginCollecting n starts):
                 (snum+1, StopCollecting n after):acc
          (acc'', snum', starts) = gen_state_trans (snum+2) [snum+1] acc' t

-- Given a RE tree return the list of start states and the lookup table
mk_table :: RETree -> ([State], Table)
mk_table t = (starts', table')
    where (acc, states, starts)
              = gen_state_trans 1 [0] [(0, TakeChar (\_ -> []))] (simplify t)
          table = array (0, upper_bound) acc
          table' = table // extra_acc
          upper_bound = (states-1) {- in acc -} + 1 {- max. in extra_acc -}
          (extra_acc, starts') = fix_start table states starts

-- If we can accept something not anchored to the start of the input
-- then we need to put a non-greedy .* on the front
fix_start :: Table -> State -> [State] -> ([(State, Action)], [State])
fix_start t snum starts
 = if all (is_start_anchored t) starts
   then ([], starts)
   else let new_starts = starts ++ [snum]
        in ([(snum, TakeChar (\_ -> new_starts))], new_starts)

-- Given a RE and input, return whether it matches and the list of
-- captures if it does
match :: String -> String -> Maybe [String]
match re xs
 = case parse re of
       Left (re_tree, num_collections) ->
           let -- Allow anything after a match
               re_tree' = Concat re_tree (AnyNumber (Char (const True)))
               -- Initially all parentheses have captured nothing and
               -- haven't started capturing
               init_collectors
                = listArray (0, num_collections - 1)
                            (replicate num_collections (False, ""))
               -- Make the state transition table for the regexp
               (init_states, table) = mk_table re_tree'
               -- The initial automaton states are the initial states
               -- with the above initial collectors. Get them ready for
               -- the first input character.
               init_astates = mkAStates init_states (repeat init_collectors)
               init_astates' = make_want_first_char table init_astates
           in do_match table init_astates' xs
       Right err -> error err

do_match :: Table -> [AState] -> String -> Maybe [String]
-- If we have no active automaton states then we failed to match the input
do_match _ [] _ = Nothing
do_match t as xs
 -- If we have reached and end state, return its corresponding collectors
 = case get_winner as of
       Just w -> Just w
       Nothing ->
           case xs of
               -- Otherwise, if we have run out of input then return
               -- whatever result we get by having an end of input here
               "" -> get_winner $ at_end_of_input t as
               -- Otherwise apply the next char and push the states
               -- through ready for the next iteration
               (x:xs') -> let as' = make_want_char t
                                  $ concatMap (apply_char t x) as
                          in do_match t as' xs'

-- If we have reached an accepting state (which we know always means
-- state 0) then return its (the first one's) collectors
get_winner :: [AState] -> Maybe [String]
get_winner as = case filter (\(AState s _) -> s == 0) as of
                    [] -> Nothing
                    (AState _ cs:_) -> Just (map (reverse . snd) (elems cs))

type SeenM a = StateM (Set AState) a
type AStateAccum = [AState] -> [AState]

seeing :: AState -> SeenM Bool
seeing a = do seen <- get
              if a `elementOf` seen
               then return True
               else do put (addToSet seen a)
                       return False

-- Push automaton states through to the point where they need the next
-- (not the first) character or end of input to proceed
make_want_char :: Table -> [AState] -> [AState]
make_want_char t xs = mwc t False False xs

-- Push automaton states through to the point where they need the first
-- character or end of input to proceed
make_want_first_char :: Table -> [AState] -> [AState]
make_want_first_char t xs = mwc t True False xs

-- Push automaton states through to the point after they get the end
-- of the input
at_end_of_input :: Table -> [AState] -> [AState]
at_end_of_input t xs = mwc t False True xs

mwc :: Table -> Bool -> Bool -> [AState] -> [AState]
mwc t pass_through_start pass_through_end as
    = evalStateM (foldM f id as) emptySet []
 where
  f :: AStateAccum -> AState -> SeenM AStateAccum
  f acc a@(AState s cs)
   = do seen <- seeing a
        if seen
         then return acc
         else case t ! s of
                  TakeChar _ -> return (acc . (a:))
                  BeginCollecting c ss ->
                      let cs' = cs // [(c, (True, ""))]
                      in recurse (mkAStates ss (repeat cs'))
                  StopCollecting c ss ->
                      let collected = snd (cs ! c)
                          cs' = cs // [(c, (False, collected))]
                      in recurse (mkAStates ss (repeat cs'))
                  AtStart ss
                   | pass_through_start -> recurse (mkAStates ss (repeat cs))
                   | otherwise -> return acc
                  AtEnd ss
                   | pass_through_end -> recurse (mkAStates ss (repeat cs))
                   | otherwise -> return (acc . (a:))
      where recurse = foldM f acc

-- Return whether a given start state always needs to match at the start
-- of the input
is_start_anchored :: Table -> State -> Bool
is_start_anchored t init_s = evalStateM (isa init_s) emptySet
 where isa :: State -> StateM (Set State) Bool
       isa s = do seen <- get
                  if s `elementOf` seen
                   then return True
                   else do put (addToSet seen s)
                           case t ! s of
                               TakeChar _ -> return False
                               BeginCollecting _ ss ->
                                   liftM and $ mapM isa ss
                               StopCollecting _ ss ->
                                   liftM and $ mapM isa ss
                               AtStart _ -> return True
                               AtEnd _ -> return False

-- Do the single transition when a character is applied
apply_char :: Table -> Char -> AState -> [AState]
apply_char t x (AState s cs) = case t ! s of
                                   TakeChar f -> [ AState s' cs' | s' <- f x ]
                                   AtEnd _ -> []
                                   _ -> error "Can't happen: apply_char"
    where cs' = fmap (\(collecting, collected) ->
                          if collecting then (collecting, x:collected)
                                        else (collecting,   collected)) cs