File: Match.lhs

package info (click to toggle)
haskell-regex 1.1.0.2-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 424 kB
  • sloc: haskell: 4,533; makefile: 3
file content (282 lines) | stat: -rw-r--r-- 9,441 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
\begin{code}
{-# LANGUAGE RecordWildCards            #-}
{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE FlexibleInstances          #-}
{-# LANGUAGE UndecidableInstances       #-}
{-# LANGUAGE MultiParamTypeClasses      #-}
{-# LANGUAGE DeriveDataTypeable         #-}
{-# LANGUAGE MonoLocalBinds             #-}
\end{code}

\begin{code}
module Text.RE.ZeInternals.Types.Match
  ( Match(..)
  , noMatch
  , emptyMatchArray
  , matched
  , matchedText
  , matchCapture
  , matchCaptures
  , (!$$)
  , captureText
  , (!$$?)
  , captureTextMaybe
  , (!$)
  , capture
  , (!$?)
  , captureMaybe
  , RegexFix(..)
  , convertMatchText
  ) where
\end{code}

\begin{code}
import           Data.Array
import           Data.Bits
import qualified Data.ByteString                as BW
import qualified Data.ByteString.Char8          as B
import qualified Data.ByteString.Lazy.Char8     as LBS
import qualified Data.ByteString.UTF8           as B
import           Data.Maybe
import qualified Data.Sequence                  as S
import qualified Data.Text                      as T
import qualified Data.Text.Encoding             as T
import qualified Data.Text.Lazy                 as LT
import           Data.Typeable
import           Data.Word
import           Text.RE.ZeInternals.Types.Capture
import           Text.RE.ZeInternals.Types.CaptureID
import           Text.Regex.Base
import qualified Text.Regex.PCRE2               as PCRE
import qualified Text.Regex.TDFA                as TDFA

infixl 9 !$, !$$
\end{code}

\begin{code}
-- | The result of matching a RE against a text once (with @?=~@),
-- retaining the text that was matched against. All three fields are
-- strict. A 'Match' whose 'matchArray' is empty represents a failed
-- match (see 'noMatch' and 'matchCaptures').
data Match a =
  Match
    { matchSource  :: !a                -- ^ the whole source text
    , captureNames :: !CaptureNames     -- ^ the RE's capture names
    , matchArray   :: !(Array CaptureOrdinal (Capture a))
                                        -- ^ 0..n-1 captures,
                                        -- starting with the
                                        -- text matched by the
                                        -- whole RE; empty when
                                        -- nothing matched
    }
  deriving (Show,Eq,Typeable)
\end{code}

\begin{code}
-- | Build a 'Match' over the given source text that records no
-- captures at all: it carries 'noCaptureNames' and the
-- 'emptyMatchArray', so 'matched' is @False@ for it.
noMatch :: a -> Match a
noMatch src =
  Match
    { matchSource  = src
    , captureNames = noCaptureNames
    , matchArray   = emptyMatchArray
    }

-- | An array of 'Capture' containing no elements; its upper bound
-- (-1) lies below its lower bound (0), so its range is empty.
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (lo,hi) []
  where
    lo = CaptureOrdinal 0
    hi = CaptureOrdinal (-1)
\end{code}

\begin{code}
-- | Mapping over a 'Match' transforms the source text and the text
-- held inside every capture; the capture names are carried over
-- unchanged.
instance Functor Match where
  fmap f Match{ matchSource = src, captureNames = names, matchArray = caps } =
    Match
      { matchSource  = f src
      , captureNames = names
      , matchArray   = fmap (fmap f) caps
      }
\end{code}

\begin{code}
-- | Tests whether the RE matched the source text at all, i.e. whether
-- the 'Match' has a top-level capture.
matched :: Match a -> Bool
matched mtch = case matchCapture mtch of
  Just _  -> True
  Nothing -> False

-- | Yields the text matched by the whole RE, or @Nothing@ if there
-- was no match.
matchedText :: Match a -> Maybe a
matchedText mtch = case matchCapture mtch of
  Just cap -> Just (capturedText cap)
  Nothing  -> Nothing

-- | The top-level capture if the source text matched the RE,
-- @Nothing@ otherwise. This is the first component of
-- 'matchCaptures'.
matchCapture :: Match a -> Maybe (Capture a)
matchCapture mtch = case matchCaptures mtch of
  Just caps -> Just (fst caps)
  Nothing   -> Nothing

-- | The main top-level capture (capture 0) paired with the list of
-- sub-captures if the text matched the RE, @Nothing@ otherwise. An
-- empty capture array is what signals "no match".
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{ matchArray = caps }
  | rangeSize (bounds caps) == 0 = Nothing
  | otherwise                    = Just (caps ! 0,drop 1 (elems caps))

-- | Operator form of 'captureText', taking the 'Match' on the left
-- and the 'CaptureID' on the right.
(!$$) :: Match a -> CaptureID -> a
mtch !$$ cid = captureText cid mtch

-- | Look up the text of the nth capture: 0 is the match of the whole
-- RE against the source text, 1 the first bracketed sub-expression to
-- be matched, and so on. The lookup goes through 'capture', so it
-- raises an error when the capture cannot be found.
captureText :: CaptureID -> Match a -> a
captureText cid = capturedText . capture cid

-- | Operator form of 'captureTextMaybe', taking the 'Match' on the
-- left and the 'CaptureID' on the right.
(!$$?) :: Match a -> CaptureID -> Maybe a
mtch !$$? cid = captureTextMaybe cid mtch

-- | Look up the text of the nth capture (0 being the match of the
-- whole RE), returning @Nothing@ if the 'Match' does not contain the
-- capture or the capture did not capture anything.
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch =
    case mtch !$? cid of
      Just cap | hasCaptured cap -> Just (capturedText cap)
      _                          -> Nothing

-- | Operator form of 'capture', taking the 'Match' on the left and
-- the 'CaptureID' on the right.
(!$) :: Match a -> CaptureID -> Capture a
mtch !$ cid = capture cid mtch

-- | Look up the nth capture: 0 is the match of the whole RE against
-- the source text, 1 the first bracketed sub-expression to be
-- matched, and so on. Calls 'error' when 'captureMaybe' finds no such
-- capture.
capture :: CaptureID -> Match a -> Capture a
capture cid mtch =
    case mtch !$? cid of
      Just cap -> cap
      Nothing  -> error $ "capture: out of bounds (" ++ show cid ++ ")"

-- | Operator form of 'captureMaybe', taking the 'Match' on the left
-- and the 'CaptureID' on the right.
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
mtch !$? cid = captureMaybe cid mtch

-- | Look up the nth capture (0 being the match of the whole RE
-- against the source text, 1 the first bracketed sub-expression to be
-- matched, and so on). Yields @Nothing@ when the 'CaptureID' cannot
-- be resolved, when the resolved ordinal lies outside the bounds of
-- the capture array, or when the capture failed to capture anything
-- (being in a failed alternate).
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{ matchArray = caps } =
    lookupCaptureID cid mtch >>= fetch >>= keep
  where
    -- only index the array once the ordinal is known to be in bounds
    fetch i
      | inRange (bounds caps) i = Just (caps ! i)
      | otherwise               = Nothing

    -- drop captures that did not actually capture anything
    keep cap
      | hasCaptured cap = Just cap
      | otherwise       = Nothing

-- resolve a CaptureID to its ordinal using the capture names stored
-- in the Match; any failure reported by findCaptureID becomes Nothing
lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid Match{ captureNames = names } =
    case findCaptureID cid names of
      Left  _ -> Nothing
      Right i -> Just i
\end{code}


\begin{code}
-- | This instance hooks 'Match' into regex-base, so regex consumers
-- need not worry about any of this. Both methods run the underlying
-- match at the 'AllTextSubmatches' result type and pass the unwrapped
-- array to 'convertMatchText'.
instance
    ( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
    , RegexLike    regex source
    , RegexFix     regex source
    ) =>
  RegexContext regex source (Match source) where
    match  r s = convertMatchText r s (getAllTextSubmatches (match r s))
    matchM r s =
      matchM r s >>= \subs ->
        return (convertMatchText r s (getAllTextSubmatches subs))
\end{code}

\begin{code}
-- | Convert a regex-base native 'MatchText' into a regex 'Match'.
-- The source text is retained, the capture names are left empty
-- ('noCaptureNames'), and every @(text,(offset,length))@ entry becomes
-- a 'Capture' whose offset and length have been passed through the
-- back end's 'utf8_correct'.
convertMatchText :: RegexFix regex source
                 => regex
                 -> source
                 -> MatchText source
                 -> Match source
convertMatchText re hay arr =
    Match
      { matchSource  = hay
      , captureNames = noCaptureNames
      , matchArray   = caps
      }
  where
    (lo,hi) = bounds arr

    -- same elements as arr, re-indexed by CaptureOrdinal instead of Int
    caps =
      ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal
            (fmap mkCapture arr)

    mkCapture (txt,(rawOff,rawLen)) =
        Capture
          { captureSource = hay
          , capturedText  = txt
          , captureOffset = off
          , captureLength = len
          }
      where
        -- let the RegexFix instance adjust the range the back end reported
        CharRange off len = utf8_correct re hay rawOff rawLen
\end{code}

\begin{code}
-- a strict pair of Ints produced by utf8_correct; convertMatchText
-- uses the first as a capture's offset and the second as its length
data CharRange = CharRange !Int !Int
  deriving (Show)

-- | Per-back-end, per-text-type adjustment of the offset and length
-- reported for a capture. The default implementation ignores the
-- regex and the source text and returns the offset and length
-- unchanged.
class RegexFix regex source where
  utf8_correct :: regex -> source -> Int -> Int -> CharRange
  utf8_correct _re _hay off len = CharRange off len

-- every TDFA instance relies on the class default, which leaves the
-- reported offset and length untouched
instance RegexFix TDFA.Regex [Char]
instance RegexFix TDFA.Regex B.ByteString
instance RegexFix TDFA.Regex LBS.ByteString
instance RegexFix TDFA.Regex T.Text
instance RegexFix TDFA.Regex LT.Text
instance RegexFix TDFA.Regex (S.Seq Char)

-- for String and (strict or lazy) Text the source is UTF-8 encoded and
-- the reported range is converted with utf8_correct_bs; the ByteString
-- and Seq Char instances rely on the class default
instance RegexFix PCRE.Regex [Char] where
  utf8_correct _ hay = utf8_correct_bs (B.fromString hay)
instance RegexFix PCRE.Regex B.ByteString
instance RegexFix PCRE.Regex LBS.ByteString
instance RegexFix PCRE.Regex T.Text where
  utf8_correct _ hay = utf8_correct_bs (T.encodeUtf8 hay)
instance RegexFix PCRE.Regex LT.Text where
  utf8_correct _ hay = utf8_correct_bs (T.encodeUtf8 (LT.toStrict hay))
instance RegexFix PCRE.Regex (S.Seq Char)

-- convert a byte offset+length in a UTF-8-encoded ByteString
-- into a character offset+length
--
-- The bytes before the offset are walked first, then the bytes inside
-- the range; each multi-byte character contributes (width-1) surplus
-- bytes, and those surpluses are subtracted from the byte offset and
-- byte length. Calls 'error' if offset+length exceeds the ByteString,
-- if the offset or the end of the range falls inside a character, or
-- if u8_width rejects a byte. An offset of -1 is passed through
-- unchanged together with its length.
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0
    | ix0 + ln0 > BW.length bs =
        error "utf8_correct_bs: index+length out of range"
    | otherwise = skip 0 0     -- BW.index calls below should not fail
  where
    -- walk from byte 0 up to ix0; di accumulates the surplus bytes seen
    skip ix di
      | ix <  ix0   = let k = extra ix in skip (ix + 1 + k) (di + k)
      | ix == ix0   = count ix di 0 ln0
      -- -1 is used as a magic number to indicate failure to match
      | ix0 == (-1) = CharRange ix0 ln0
      | otherwise   = error "utf8_correct_bs: UTF-8 decoding error"

    -- walk the ln0 bytes of the range; c is the number of bytes still
    -- to cover and dl accumulates the surplus bytes inside the range
    count ix di dl c
      | c >  0    = let k = extra ix in count (ix + 1 + k) di (dl + k) (c - 1 - k)
      | c == 0    = CharRange (ix0 - di) (ln0 - dl)
      | otherwise = error "utf8_correct_bs: length ends inside character"

    -- number of bytes beyond the first in the character starting at ix
    extra ix = case u8_width (BW.index bs ix) of
      Single    -> 0
      Double    -> 1
      Triple    -> 2
      Quadruple -> 3

-- the number of bytes occupied by one UTF-8 encoded character
data UTF8Size = Single | Double | Triple | Quadruple
  deriving (Show)

-- classify the first byte of a UTF-8 encoded character by its high
-- bits: 0xxxxxxx is Single, 110xxxxx Double, 1110xxxx Triple and
-- 11110xxx Quadruple; any other byte raises an error
u8_width :: Word8 -> UTF8Size
u8_width w8
  | w8 .&. 0x80 == 0x00 = Single
  | w8 .&. 0xE0 == 0xC0 = Double
  | w8 .&. 0xF0 == 0xE0 = Triple
  | w8 .&. 0xF8 == 0xF0 = Quadruple
  | otherwise           = error "u8_width: UTF-8 decoding error"
\end{code}