1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
|
\begin{code}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE MonoLocalBinds #-}
\end{code}
\begin{code}
module Text.RE.ZeInternals.Types.Match
( Match(..)
, noMatch
, emptyMatchArray
, matched
, matchedText
, matchCapture
, matchCaptures
, (!$$)
, captureText
, (!$$?)
, captureTextMaybe
, (!$)
, capture
, (!$?)
, captureMaybe
, RegexFix(..)
, convertMatchText
) where
\end{code}
\begin{code}
import Data.Array
import Data.Bits
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Maybe
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as LT
import Data.Typeable
import Data.Word
import Text.RE.ZeInternals.Types.Capture
import Text.RE.ZeInternals.Types.CaptureID
import Text.Regex.Base
import qualified Text.Regex.PCRE2 as PCRE
import qualified Text.Regex.TDFA as TDFA
infixl 9 !$, !$$
\end{code}
\begin{code}
-- | The outcome of matching a RE against a text exactly once (as with
-- @?=~@); the text that was searched is kept alongside the captures.
data Match a = Match
    { matchSource  :: !a
      -- ^ the complete source text that was searched
    , captureNames :: !CaptureNames
      -- ^ the capture names belonging to the RE
    , matchArray   :: !(Array CaptureOrdinal (Capture a))
      -- ^ captures 0..n-1; the first one is the text matched by the
      -- whole RE
    }
  deriving (Show,Eq,Typeable)
\end{code}
\begin{code}
-- | Build a 'Match' over the given source text that holds no captures
-- and no capture names, i.e. one for which 'matched' is 'False'.
noMatch :: a -> Match a
noMatch src =
    Match
      { matchSource  = src
      , captureNames = noCaptureNames
      , matchArray   = emptyMatchArray
      }
-- | A capture array with no elements: its bounds run from ordinal 0 up
-- to ordinal -1, which is an empty index range.
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (lo,hi) []
  where
    lo = CaptureOrdinal 0
    hi = CaptureOrdinal (-1)
\end{code}
\begin{code}
-- | Mapping over a 'Match' applies the function to the source text and,
-- through the 'Functor' instance of 'Capture', inside every element of
-- the capture array; the capture names are carried over unchanged.
instance Functor Match where
  fmap f (Match src names caps) =
    Match (f src) names (fmap (fmap f) caps)
\end{code}
\begin{code}
-- | 'True' exactly when the RE matched the source text, i.e. when the
-- 'Match' has a top-level capture.
matched :: Match a -> Bool
matched mtch = isJust (matchCapture mtch)
-- | The text matched by the whole RE, or 'Nothing' when there was no
-- match.
matchedText :: Match a -> Maybe a
matchedText mtch = fmap capturedText (matchCapture mtch)
-- | The top-level capture when the source text matched the RE,
-- 'Nothing' otherwise; this is the first component of 'matchCaptures'.
matchCapture :: Match a -> Maybe (Capture a)
matchCapture mtch = fmap fst (matchCaptures mtch)
-- | The top-level capture (capture 0) paired with the remaining
-- sub-captures when the text matched the RE; @Nothing@ when the capture
-- array is empty.
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{ matchArray = caps }
    | rangeSize (bounds caps) == 0 = Nothing
    | otherwise                    = Just (caps ! 0, drop 1 (elems caps))
-- | Infix form of 'captureText' that takes the 'Match' on the left.
(!$$) :: Match a -> CaptureID -> a
mtch !$$ cid = captureText cid mtch
-- | The text of the identified capture: capture 0 is the match of the
-- whole RE against the source text, capture 1 the first bracketed
-- sub-expression to be matched, and so on. The capture itself is
-- obtained with 'capture'.
captureText :: CaptureID -> Match a -> a
captureText cid = capturedText . capture cid
-- | Infix form of 'captureTextMaybe' that takes the 'Match' on the left.
(!$$?) :: Match a -> CaptureID -> Maybe a
mtch !$$? cid = captureTextMaybe cid mtch
-- | The text of the identified capture (0 being the match of the
-- whole RE), or 'Nothing' when the 'Match' does not contain that
-- capture or the capture did not capture anything.
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch = case mtch !$? cid of
    Just cap | hasCaptured cap -> Just (capturedText cap)
    _                          -> Nothing
-- | Infix form of 'capture' that takes the 'Match' on the left.
(!$) :: Match a -> CaptureID -> Capture a
mtch !$ cid = capture cid mtch
-- | The identified capture: capture 0 is the match of the whole RE
-- against the source text, capture 1 the first bracketed
-- sub-expression to be matched, and so on.
--
-- Calls 'error' when the lookup via '!$?' yields 'Nothing'.
capture :: CaptureID -> Match a -> Capture a
capture cid mtch = case mtch !$? cid of
    Just cap -> cap
    Nothing  -> error ("capture: out of bounds (" ++ show cid ++ ")")
-- | Infix form of 'captureMaybe' that takes the 'Match' on the left.
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
mtch !$? cid = captureMaybe cid mtch
-- | The identified capture (0 being the match of the whole RE against
-- the source text, 1 the first bracketed sub-expression to be matched,
-- and so on). Yields 'Nothing' when the capture ID cannot be resolved,
-- when the resolved ordinal lies outside the bounds of the capture
-- array, or when the capture failed to capture anything (being in a
-- failed alternate).
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{ matchArray = caps } =
    lookupCaptureID cid mtch >>= fetch >>= keepIfCaptured
  where
    -- index the array only when the ordinal is within its bounds
    fetch i
      | inRange (bounds caps) i = Just (caps ! i)
      | otherwise               = Nothing

    -- drop captures that did not capture anything
    keepIfCaptured cap
      | hasCaptured cap = Just cap
      | otherwise       = Nothing
-- | Resolve a 'CaptureID' to its ordinal using the capture names stored
-- in the 'Match'; a 'Left' result from 'findCaptureID' becomes
-- 'Nothing'.
lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid mtch =
    case findCaptureID cid (captureNames mtch) of
      Left  _ -> Nothing
      Right i -> Just i
\end{code}
\begin{code}
-- | This instance plugs 'Match' into regex-base as a result type: the
-- match is run at the 'AllTextSubmatches' result type and the resulting
-- array is converted with 'convertMatchText'. Regex consumers need not
-- worry about any of this.
instance
    ( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
    , RegexLike    regex source
    , RegexFix     regex source
    ) =>
  RegexContext regex source (Match source) where
    match r s =
      convertMatchText r s (getAllTextSubmatches (match r s))

    matchM r s =
      matchM r s >>= \subs ->
        return (convertMatchText r s (getAllTextSubmatches subs))
\end{code}
\begin{code}
-- | Convert a regex-base native 'MatchText' into a regex 'Match'.
--
-- The result keeps the source text, has no capture names, and holds one
-- 'Capture' per element of the input array, re-indexed from 'Int' to
-- 'CaptureOrdinal' over the same bounds. The offset and length that
-- regex-base reports for each capture are passed through
-- 'utf8_correct' before being stored.
convertMatchText :: RegexFix regex source
                 => regex     -- ^ the RE (selects the 'RegexFix' instance)
                 -> source    -- ^ the text that was searched
                 -> MatchText source
                              -- ^ the captures as reported by regex-base
                 -> Match source
convertMatchText re hay arr =
    Match
      { matchSource  = hay
      , captureNames = noCaptureNames
      , matchArray   =
          ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal $
            fmap toCapture arr
      }
  where
    (lo,hi) = bounds arr

    -- Bound once, outside 'toCapture', so that whatever work an
    -- instance does on the source before looking at the offsets (the
    -- PCRE instances re-encode it as UTF-8) can be shared by all of the
    -- captures instead of being redone for each one.
    fixRange :: Int -> Int -> CharRange
    fixRange = utf8_correct re hay

    toCapture (ndl,(off_,len_)) =
        Capture
          { captureSource = hay
          , capturedText  = ndl
          , captureOffset = off
          , captureLength = len
          }
      where
        CharRange off len = fixRange off_ len_
\end{code}
\begin{code}
-- | An offset paired with a length (in that order), as produced by
-- 'utf8_correct'.
data CharRange = CharRange !Int !Int
  deriving Show
-- | Per back end and source type hook used when building captures: it
-- maps the offset and length reported for a capture to the 'CharRange'
-- that is stored in the capture. The default implementation returns
-- the offset and length unchanged.
class RegexFix regex source where
  utf8_correct :: regex -> source -> Int -> Int -> CharRange
  utf8_correct _ _ off len = CharRange off len
-- The TDFA instances all rely on the class default, so the reported
-- offsets and lengths are used as they are.
instance RegexFix TDFA.Regex [Char]

instance RegexFix TDFA.Regex B.ByteString

instance RegexFix TDFA.Regex LBS.ByteString

instance RegexFix TDFA.Regex T.Text

instance RegexFix TDFA.Regex LT.Text

instance RegexFix TDFA.Regex (S.Seq Char)
-- For the PCRE back end the 'String' and 'Text' sources are encoded as
-- a UTF-8 'B.ByteString' and the reported range is converted with
-- 'utf8_correct_bs'; the 'ByteString' and 'S.Seq' sources rely on the
-- class default.
instance RegexFix PCRE.Regex [Char] where
  utf8_correct _ hay = utf8_correct_bs (B.fromString hay)

instance RegexFix PCRE.Regex B.ByteString

instance RegexFix PCRE.Regex LBS.ByteString

instance RegexFix PCRE.Regex T.Text where
  utf8_correct _ hay = utf8_correct_bs (T.encodeUtf8 hay)

instance RegexFix PCRE.Regex LT.Text where
  utf8_correct _ hay = utf8_correct_bs (T.encodeUtf8 (LT.toStrict hay))

instance RegexFix PCRE.Regex (S.Seq Char)
-- Convert a byte offset+length in a UTF-8-encoded ByteString into a
-- character offset+length.
--
-- The string is walked one encoded character at a time, first up to the
-- byte offset and then across the byte length, counting the
-- continuation bytes met in each part; those counts are subtracted from
-- the byte offset and the byte length respectively.
--
-- An offset of -1 (the magic number used to indicate failure to match)
-- is returned unchanged together with its length. 'error' is called
-- when offset+length exceeds the length of the ByteString, when the
-- offset or the end of the range falls inside an encoded character, and
-- (via 'u8_width') when a byte is not a valid leading byte.
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0
    | ix0 + ln0 > BW.length bs =
        error "utf8_correct_bs: index+length out of range"
    | otherwise = skip 0 0 -- BW.index calls below should not fail
  where
    -- number of continuation bytes of the character starting at ix
    extraBytes :: Int -> Int
    extraBytes ix = case u8_width (BW.index bs ix) of
      Single    -> 0
      Double    -> 1
      Triple    -> 2
      Quadruple -> 3

    -- advance to the byte offset; di counts the continuation bytes
    -- passed on the way
    skip ix di
      | ix <  ix0   = let k = extraBytes ix in skip (ix + 1 + k) (di + k)
      | ix == ix0   = count ix di 0 ln0
      -- overshot the offset: fine only for the no-match marker
      | ix0 == (-1) = CharRange ix0 ln0
      | otherwise   = error "utf8_correct_bs: UTF-8 decoding error"

    -- consume the c bytes of the range; dl counts the continuation
    -- bytes found inside it
    count ix di dl c
      | c <  0    = error "utf8_correct_bs: length ends inside character"
      | c == 0    = CharRange (ix0 - di) (ln0 - dl)
      | otherwise =
          let k = extraBytes ix
          in  count (ix + 1 + k) di (dl + k) (c - 1 - k)
-- | The number of bytes occupied by one UTF-8 encoded character.
data UTF8Size = Single | Double | Triple | Quadruple
  deriving (Show)

-- | Classify the leading byte of a UTF-8 encoded character by its high
-- bits: @0xxxxxxx@ is 'Single', @110xxxxx@ is 'Double', @1110xxxx@ is
-- 'Triple' and @11110xxx@ is 'Quadruple'. Any other byte calls 'error'.
u8_width :: Word8 -> UTF8Size
u8_width w8
    | w8 .&. 0x80 == 0x00 = Single
    | w8 .&. 0xE0 == 0xC0 = Double
    | w8 .&. 0xF0 == 0xE0 = Triple
    | w8 .&. 0xF8 == 0xF0 = Quadruple
    | otherwise           = error "u8_width: UTF-8 decoding error"
\end{code}
|