1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
-- | This is a POSIX version of parseRegex that allows NUL characters.
-- Lazy\/Possessive\/Backrefs are not recognized. Anchors \^ and \$ are
-- recognized.
--
-- A 'PGroup' returned always has @(Maybe 'GroupIndex')@ set to @(Just _)@
-- and never to @Nothing@.
module Text.Regex.TDFA.ReadRegex (parseRegex) where
{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
import Text.Regex.TDFA.Pattern {- all -}
import Text.ParserCombinators.Parsec((<|>), (<?>),
try, runParser, many, getState, setState, CharParser, ParseError,
sepBy1, option, notFollowedBy, many1, lookAhead, eof, between,
string, noneOf, digit, char, anyChar)
import Control.Monad (liftM, guard)
import Data.Foldable (asum)
import qualified Data.Set as Set(fromList)
-- | An element inside @[...]@, denoting a character class.
data BracketElement
= BEChar Char -- ^ A single character.
| BERange Char Char -- ^ A character range (e.g. @a-z@).
| BEColl String -- ^ @foo@ in @[.foo.]@.
| BEEquiv String -- ^ @bar@ in @[=bar=]@.
| BEClass String -- ^ A POSIX character class (candidate), e.g. @alpha@ parsed from @[:alpha:]@.
-- | Return either an error message or a tuple of the Pattern and the
-- largest group index and the largest DoPa index (both have smallest
-- index of 1). Since the regular expression is supplied as [Char] it
-- automatically supports unicode and @\\NUL@ characters.
parseRegex :: String -> Either ParseError (Pattern,(GroupIndex,DoPa))
parseRegex x = runParser (do pat <- p_regex
eof
(lastGroupIndex,lastDopa) <- getState
return (pat,(lastGroupIndex,DoPa lastDopa))) (0,0) x x
type P = CharParser (GroupIndex, Int)
p_regex :: P Pattern
p_regex = liftM POr $ sepBy1 p_branch (char '|')
-- man re_format helps a lot, it says one-or-more pieces so this is
-- many1 not many. Use "()" to indicate an empty piece.
p_branch :: P Pattern
p_branch = liftM PConcat $ many1 p_piece
p_piece :: P Pattern
p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification
p_atom :: P Pattern
p_atom = p_group <|> p_bracket <|> p_char <?> "an atom"
group_index :: P (Maybe GroupIndex)
group_index = do
(gi,ci) <- getState
let index = succ gi
setState (index,ci)
return (Just index)
p_group :: P Pattern
p_group = lookAhead (char '(') >> do
index <- group_index
liftM (PGroup index) $ between (char '(') (char ')') p_regex
-- p_post_atom takes the previous atom as a parameter
p_post_atom :: Pattern -> P Pattern
p_post_atom atom = (char '?' >> return (PQuest atom))
<|> (char '+' >> return (PPlus atom))
<|> (char '*' >> return (PStar True atom))
<|> p_bound atom
<|> return atom
p_bound :: Pattern -> P Pattern
p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom)
p_bound_spec :: Pattern -> P Pattern
p_bound_spec atom = do lowS <- many1 digit
let lowI = read lowS
highMI <- option (Just lowI) $ try $ do
_ <- char ','
-- parsec note: if 'many digits' fails below then the 'try' ensures
-- that the ',' will not match the closing '}' in p_bound, same goes
-- for any non '}' garbage after the 'many digits'.
highS <- many digit
if null highS then return Nothing -- no upper bound
else do let highI = read highS
guard (lowI <= highI)
return (Just (read highS))
return (PBound lowI highMI atom)
-- An anchor cannot be modified by a repetition specifier
p_anchor :: P Pattern
p_anchor = (char '^' >> liftM PCarat char_index)
<|> (char '$' >> liftM PDollar char_index)
<|> try (do _ <- string "()"
index <- group_index
return $ PGroup index PEmpty)
<?> "empty () or anchor ^ or $"
char_index :: P DoPa
char_index = do (gi,ci) <- getState
let ci' = succ ci
setState (gi,ci')
return (DoPa ci')
p_char :: P Pattern
p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where
p_dot = char '.' >> char_index >>= return . PDot
p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{'))
p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c)
p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c)
where specials = "^.[$()|*+?{\\"
-- parse [bar] and [^bar] sets of characters
p_bracket :: P Pattern
p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) )
p_set :: Bool -> P Pattern
p_set invert = do initial <- option "" (char ']' >> return "]")
values <- if null initial then many1 p_set_elem else many p_set_elem
_ <- char ']'
ci <- char_index
let chars = maybe'set $ concat $
initial :
[ c | BEChar c <- values ] :
[ [start..end] | BERange start end <- values ]
colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ]
equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values]
class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values]
maybe'set x = if null x then Nothing else Just (Set.fromList x)
sets = PatternSet chars class's colls equivs
sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets
-- From here down the code is the parser and functions for pattern [ ] set things
p_set_elem :: P BracketElement
p_set_elem = checkBracketElement =<< asum
[ p_set_elem_class
, p_set_elem_equiv
, p_set_elem_coll
, p_set_elem_range
, p_set_elem_char
, fail "Failed to parse bracketed string"
]
p_set_elem_class :: P BracketElement
p_set_elem_class = liftM BEClass $
try (between (string "[:") (string ":]") (many1 $ noneOf ":]"))
p_set_elem_equiv :: P BracketElement
p_set_elem_equiv = liftM BEEquiv $
try (between (string "[=") (string "=]") (many1 $ noneOf "=]"))
p_set_elem_coll :: P BracketElement
p_set_elem_coll = liftM BEColl $
try (between (string "[.") (string ".]") (many1 $ noneOf ".]"))
p_set_elem_range :: P BracketElement
p_set_elem_range = try $ do
start <- noneOf "]"
_ <- char '-'
end <- noneOf "]"
return $ BERange start end
p_set_elem_char :: P BracketElement
p_set_elem_char = do
c <- noneOf "]"
return (BEChar c)
-- | Fail when 'BracketElement' is invalid, e.g. empty range @1-0@.
-- This failure should not be caught.
--
checkBracketElement :: BracketElement -> P BracketElement
checkBracketElement e =
case e of
BERange start end
| start > end -> fail $ unwords
[ "End point"
, show end
, "of dashed character range is less than starting point"
, show start
]
| otherwise -> ok
BEChar _ -> ok
BEClass _ -> ok
BEColl _ -> ok
BEEquiv _ -> ok
where
ok = return e
|