1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
\begin{code}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 800
{-# LANGUAGE TemplateHaskellQuotes #-}
#else
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
#endif
module Text.RE.ZeInternals.NamedCaptures
( cp
, extractNamedCaptures
, idFormatTokenREOptions
, Token(..)
, validToken
, formatTokens
, formatTokens'
, formatTokens0
, scan
) where
import Data.Char
import qualified Data.HashMap.Strict as HM
import qualified Data.Text as T
import GHC.Generics
import qualified Language.Haskell.TH as TH
import Language.Haskell.TH.Quote
import Text.RE.ZeInternals.PreludeMacros
import Text.RE.ZeInternals.QQ
import Text.RE.ZeInternals.TestBench
import Text.RE.ZeInternals.Tools.Lex
import Text.RE.ZeInternals.Types.CaptureID
import Text.RE.ZeInternals.Types.Match
import Text.RE.ZeInternals.Types.Poss
import Text.Regex.TDFA
-- | quasi quoter for CaptureID: @[cp|0|]@, @[cp|0|]@, etc.,
-- indexing captures by classic positional numbers, and @[cp|foo|]@,
-- etc., referencing a named capture @[re| ... ${foo}( ... ) ... |]@.
cp :: QuasiQuoter
cp =
(qq0 "cp")
{ quoteExp = parse_capture
}
-- | extract the CaptureNames from an RE or return an error diagnostic
-- if the RE is not well formed; also returns the total number of captures
-- in the RE
extractNamedCaptures :: String -> Either String ((Int,CaptureNames),String)
extractNamedCaptures s = Right (analyseTokens tks,formatTokens tks)
where
tks = scan s
\end{code}
Token
-----
\begin{code}
-- | our RE scanner returns a list of these tokens
data Token
= ECap (Maybe String)
| PGrp
| PCap
| Bra
| BS Char
| Other Char
deriving (Show,Generic,Eq)
-- | check that a token is well formed
validToken :: Token -> Bool
validToken tkn = case tkn of
ECap mb -> maybe True check_ecap mb
PGrp -> True
PCap -> True
Bra -> True
BS c -> is_dot c
Other c -> is_dot c
where
check_ecap s = not (null s) && all not_br s
is_dot c = c/='\n'
not_br c = not $ c `elem` "{}\n"
\end{code}
Analysing [Token] -> CaptureNames
---------------------------------
\begin{code}
-- | analyse a token stream, returning the number of captures and the
-- 'CaptureNames'
analyseTokens :: [Token] -> (Int,CaptureNames)
analyseTokens tks0 = case count_em 1 tks0 of
(n,as) -> (n-1, HM.fromList as)
where
count_em n [] = (n,[])
count_em n (tk:tks) = case count_em (n `seq` n+d) tks of
(n',as) -> (n',bd++as)
where
(d,bd) = case tk of
ECap (Just nm) -> (,) 1 [(CaptureName $ T.pack nm,CaptureOrdinal n)]
ECap Nothing -> (,) 1 []
PGrp -> (,) 0 []
PCap -> (,) 1 []
Bra -> (,) 1 []
BS _ -> (,) 0 []
Other _ -> (,) 0 []
\end{code}
Scanning Regex Strings
----------------------
\begin{code}
-- | scan a RE string into a list of RE Token
scan :: String -> [Token]
scan = alex' match al $ oops "top"
where
al :: [(Regex,Match String->Maybe Token)]
al =
[ mk "\\$\\{([^{}]+)\\}\\(" $ ECap . Just . x_1
, mk "\\$\\(" $ const $ ECap Nothing
, mk "\\(\\?:" $ const PGrp
, mk "\\(\\?" $ const PCap
, mk "\\(" $ const Bra
, mk "\\\\(.)" $ BS . s2c . x_1
, mk "(.|\n)" $ Other . s2c . x_1
]
x_1 = captureText $ IsCaptureOrdinal $ CaptureOrdinal 1
s2c [c] = c
s2c _ = oops "s2c"
mk s f = (poss error id $ makeRegexM s,Just . f)
oops m = error $ "NamedCaptures.scan: " ++ m
\end{code}
Parsing captures
----------------
\begin{code}
parse_capture :: String -> TH.Q TH.Exp
parse_capture s = case all isDigit s of
True -> [|IsCaptureOrdinal $ CaptureOrdinal $ read s|]
False -> [|IsCaptureName $ CaptureName $ T.pack s|]
\end{code}
Formatting [Token]
------------------
\begin{code}
-- | format [Token] into an RE string
formatTokens :: [Token] -> String
formatTokens = formatTokens' defFormatTokenREOptions
-- | options for the general Token formatter below
data FormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type :: Maybe RegexType -- ^ Posix, PCRE or indeterminate REs?
, _fto_min_caps :: Bool -- ^ remove captures where possible
, _fto_incl_caps :: Bool -- ^ include the captures in the output
}
deriving (Show)
-- | the default configuration for the Token formatter
defFormatTokenREOptions :: FormatTokenREOptions
defFormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type = Nothing
, _fto_min_caps = False
, _fto_incl_caps = False
}
-- | a configuration that will preserve the parsed regular expression
-- in the output
idFormatTokenREOptions :: FormatTokenREOptions
idFormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type = Nothing
, _fto_min_caps = False
, _fto_incl_caps = True
}
-- | the general Token formatter, generating REs according to the options
formatTokens' :: FormatTokenREOptions -> [Token] -> String
formatTokens' FormatTokenREOptions{..} = foldr f ""
where
f tk tl = t_s ++ tl
where
t_s = case tk of
ECap mb -> ecap mb
PGrp -> if maybe False isTDFA _fto_regex_type then "(" else "(?:"
PCap -> "(?"
Bra -> bra _fto_min_caps
BS c -> "\\" ++ [c]
Other c -> [c]
ecap mb = case _fto_incl_caps of
True -> case mb of
Nothing -> "$("
Just nm -> "${"++nm++"}("
False -> bra _fto_min_caps
bra mc = case mc && maybe False isPCRE _fto_regex_type of
True -> "(?:"
False -> "("
\end{code}
\begin{code}
-- this is a reference of formatTokens defFormatTokenREOptions,
-- used for testing the latter
formatTokens0 :: [Token] -> String
formatTokens0 = foldr f ""
where
f tk tl = t_s ++ tl
where
t_s = case tk of
ECap _ -> "("
PGrp -> "(?:"
PCap -> "(?"
Bra -> "("
BS c -> "\\" ++ [c]
Other c -> [c]
\end{code}
|