1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
{-# LANGUAGE BangPatterns, EmptyDataDecls, MagicHash, RecordWildCards,
ScopedTypeVariables #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
-- |
-- Module : Data.Text.ICU.Regex
-- Copyright : (c) 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- Regular expression support for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The syntax and behaviour of ICU regular expressions are Perl-like.
-- For complete details, see the ICU User Guide entry at
-- <http://userguide.icu-project.org/strings/regexp>.
--
-- /Note/: The functions in this module are not thread safe. For
-- thread safe use, see 'clone' below, or use the pure functions in
-- "Data.Text.ICU".
module Data.Text.ICU.Regex
(
-- * Types
MatchOption(..)
, ParseError(errError, errLine, errOffset)
, Regex
-- * Functions
-- ** Construction
, regex
, regex'
, clone
-- ** Managing text to search
, setText
, getText
-- ** Inspection
, pattern
-- ** Searching
, find
, findNext
-- ** Match groups
-- $groups
, groupCount
, start
, end
, start_
, end_
) where
import Data.Text.ICU.Regex.Internal
import qualified Control.Exception as E
import Data.IORef (newIORef, readIORef, writeIORef)
import Data.Text (Text)
import qualified Data.Text.Foreign as T
import Data.Text.Foreign (I16)
import Data.Text.ICU.Internal (asBool)
import Data.Text.ICU.Error.Internal (ParseError(..), handleError)
import Data.Word (Word16)
import Foreign.ForeignPtr (ForeignPtr, newForeignPtr, withForeignPtr)
import Foreign.Marshal.Alloc (alloca)
import Foreign.Storable (peek)
import System.IO.Unsafe (unsafePerformIO)
-- | A 'Regex' is shown as the word @Regex@, a space, and then the
-- 'show'n form of its source 'pattern'.
instance Show Regex where
    show = ("Regex " ++) . show . pattern
-- $groups
--
-- Capturing groups are numbered starting from zero. Group zero is
-- always the entire matching text. Groups greater than zero contain
-- the text matching each capturing group in a regular expression.
-- | Compile a regular expression with the given options, reporting a
-- malformed pattern as a value instead of an exception.  This is the
-- safest entry point when the pattern is constructed at run time.
--
-- A 'ParseError' thrown by 'regex' is returned as 'Left'; a
-- successfully compiled expression is returned as 'Right'.  Any other
-- exception propagates unchanged.
regex' :: [MatchOption] -> Text -> IO (Either ParseError Regex)
regex' opts pat = E.try (regex opts pat)
-- | Set the subject text that the regular expression will be matched
-- against.  This may be called any number of times, so one compiled
-- pattern can be applied to many different strings.
--
-- The text is copied into a foreign buffer, the buffer is handed to
-- ICU, and the buffer together with its length is then stored in the
-- 'Regex' so that 'getText' can return it later.
setText :: Regex -> Text -> IO ()
setText Regex{..} subject = do
    (buf, len) <- T.asForeignPtr subject
    withForeignPtr reRe $ \re ->
        withForeignPtr buf $ \p ->
            handleError (uregex_setText re p (fromIntegral len))
    -- Record the buffer only after ICU has accepted it; the constructor
    -- is forced so that no thunk is stored in the reference.
    writeIORef reText $! H buf len
-- | Get the subject text currently associated with this regular
-- expression object, as the foreign buffer and length most recently
-- stored in it.
getText :: Regex -> IO (ForeignPtr Word16, I16)
getText Regex{..} =
    readIORef reText >>= \(H buf len) -> return (buf, len)
-- | Return the source form of the pattern used to construct this
-- regular expression or match.
--
-- NOTE(review): this is exposed as a pure value via 'unsafePerformIO'.
-- That is only sound if the pattern text of a compiled expression
-- never changes and the query has no observable side effects; confirm
-- against the ICU documentation for @uregex_pattern@.
pattern :: Regex -> Text
pattern Regex{..} = unsafePerformIO $
    withForeignPtr reRe $ \re ->
        alloca $ \lenOut -> do
            -- ICU returns a pointer to the pattern text and writes its
            -- length through the out-parameter.
            src <- handleError (uregex_pattern re lenOut)
            n   <- peek lenOut
            T.fromPtr src (fromIntegral n)
-- | Find the first substring of the input string that matches the
-- pattern.
--
-- If /n/ is non-negative, the search for a match begins at the
-- specified index, and any match region is reset.
--
-- If /n/ is -1, the search begins at the start of the input region,
-- or at the start of the full string if no region has been specified.
--
-- Returns 'True' if a match was found, in which case 'start' and
-- 'end' provide more information about the match.
find :: Regex -> I16 -> IO Bool
find Regex{..} n = do
    found <- withForeignPtr reRe $ \re ->
        handleError (uregex_find re (fromIntegral n))
    return (asBool found)
-- | Find the next pattern match in the input string.  Searching
-- begins at the location following the end of the previous match, or
-- at the start of the string (or region) if there is no previous
-- match.
--
-- Returns 'True' if a match was found, in which case 'start' and
-- 'end' provide more information about the match.
findNext :: Regex -> IO Bool
findNext Regex{..} =
    asBool `fmap` withForeignPtr reRe (\re -> handleError (uregex_findNext re))
-- | Make a copy of a compiled regular expression.  Cloning a regular
-- expression is faster than opening a second instance from the source
-- form of the expression, and requires less memory.
--
-- Note that the current input string and the position of any matched
-- text within it are not cloned; only the pattern itself and the
-- match mode flags are copied.  The copy starts out with an empty
-- subject text.
--
-- Cloning can be particularly useful to threaded applications that
-- perform multiple match operations in parallel.  Each concurrent RE
-- operation requires its own instance of a 'Regex'.
clone :: Regex -> IO Regex
{-# INLINE clone #-}
clone Regex{..} = do
    raw <- withForeignPtr reRe $ \re -> handleError (uregex_clone re)
    -- Attach the ICU close routine as a finalizer on the new handle.
    fp  <- newForeignPtr uregex_close raw
    ref <- newIORef (H emptyForeignPtr 0)
    return (Regex fp ref)
-- | Return the number of capturing groups in this regular
-- expression's pattern.
groupCount :: Regex -> IO Int
groupCount Regex{..} = do
    n <- withForeignPtr reRe $ \re -> handleError (uregex_groupCount re)
    return (fromIntegral n)
-- | Return the index in the input string of the start of the text
-- matched by capture group /n/ during the previous match operation.
-- Returns @-1@ if the capture group was not part of the last match.
--
-- See 'start' for a variant that reports this case as 'Nothing'.
start_ :: Regex -> Int -> IO I16
start_ Regex{..} n = do
    ix <- withForeignPtr reRe $ \re ->
        handleError (uregex_start re (fromIntegral n))
    return (fromIntegral ix)
-- | Return the index in the input string of the end of the text
-- matched by capture group /n/ during the previous match operation.
-- Returns @-1@ if the capture group was not part of the last match.
--
-- See 'end' for a variant that reports this case as 'Nothing'.
end_ :: Regex -> Int -> IO I16
end_ Regex{..} n = do
    ix <- withForeignPtr reRe $ \re ->
        handleError (uregex_end re (fromIntegral n))
    return (fromIntegral ix)
-- | Return the index in the input string of the start of the text
-- matched by capture group /n/ during the previous match operation,
-- or 'Nothing' if the capture group was not part of the last match.
--
-- This is 'start_' with its @-1@ result mapped to 'Nothing'.
start :: Regex -> Int -> IO (Maybe I16)
start r n = do
    ix <- start_ r n
    return (check ix)
-- | Return the index in the input string of the end of the text
-- matched by capture group /n/ during the previous match operation,
-- or 'Nothing' if the capture group was not part of the last match.
--
-- This is 'end_' with its @-1@ result mapped to 'Nothing'.
end :: Regex -> Int -> IO (Maybe I16)
end r n = do
    ix <- end_ r n
    return (check ix)
-- Map the @-1@ result used by 'start_' and 'end_' to 'Nothing'; any
-- other index is wrapped, already evaluated, in 'Just'.
check :: I16 -> Maybe I16
check k
    | k == -1   = Nothing
    | otherwise = Just $! k
|