File: Light.hs

package info (click to toggle)
haskell-pcre-light 0.4-6
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 220 kB
  • ctags: 2
  • sloc: haskell: 3,615; makefile: 10; sh: 5
file content (334 lines) | stat: -rw-r--r-- 10,768 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
{-# LANGUAGE CPP #-}
--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + CPP
--
--------------------------------------------------------------------
-- 
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via strict ByteStrings.
--

module Text.Regex.PCRE.Light (

        -- * The abstract PCRE Regex type
          Regex

        -- * ByteString interface
        , compile, compileM
        , match
        , captureCount

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import Text.Regex.PCRE.Light.Base

-- Strings
import qualified Data.ByteString          as S

#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Unsafe   as S
#else
import qualified Data.ByteString.Base     as S
#endif

-- Foreigns
import Foreign
import Foreign.Ptr
import Foreign.C.Types
import Foreign.C.String
import Foreign.Storable
import Foreign.Marshal.Alloc

-- | 'compile'
--
-- Compile a perl-compatible regular expression stored in a strict bytestring.
--
-- An example
--
-- > let r = compile (pack "^(b+|a){1,2}?bc") []
--
-- Or using GHC's -XOverloadedStrings flag, and importing
-- Data.ByteString.Char8, we can avoid the pack:
--
-- > let r = compile "^(b+|a){1,2}?bc" []
--
-- If the regular expression is invalid, an exception is thrown.
-- If this is unsuitable, 'compileM' is availlable, which returns failure 
-- in a monad.
--
-- To do case insentive matching,
--
-- > compile "^(b+|a){1,2}?bc" [caseless]
--
-- Other flags are documented below.
--
-- The resulting abstract regular expression can be passed to 'match'
-- for matching against a subject string.
--
-- The arguments are:
--
-- * 'pat': A ByteString containing the regular expression to be compiled. 
--
-- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'     - \\R matches all Unicode line endings
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be  before  newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * 'newline_any'     - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- See 'man pcreapi for more details.
--
-- Caveats: patterns with embedded nulls, such as "\0*" seem to be
-- mishandled, as this won't currently match the subject "\0\0\0".
--
compile :: S.ByteString -> [PCREOption] -> Regex
compile s o = case compileM s o of
    Right r -> r
    Left  e -> error ("Text.Regex.PCRE.Light: Error in regex: " ++ e)

------------------------------------------------------------------------

-- | 'compileM'
-- A safe version of 'compile' with failure wrapped in an Either.
--
-- Examples,
--
-- > > compileM ".*" [] :: Either String Regex
-- > Right (Regex 0x000000004bb5b980 ".*")
--
-- > > compileM "*" [] :: Either String Regex
-- > Left "nothing to repeat"
--
compileM :: S.ByteString -> [PCREOption] -> Either String Regex
compileM str os = unsafePerformIO $
  S.useAsCString str $ \pattern -> do
    alloca $ \errptr       -> do
    alloca $ \erroffset    -> do
        pcre_ptr <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
        if pcre_ptr == nullPtr
            then do
                err <- peekCString =<< peek errptr
                return (Left err)
            else do
                reg <- newForeignPtr finalizerFree pcre_ptr -- release with free()
                return (Right (Regex reg str))

-- Possible improvements: an 'IsString' instance could be defined
-- for 'Regex', which would allow the compiler to insert calls to
-- 'compile' based on the type:
--
-- The following would be valid:
--
-- > match "a.*b" "abcdef" []
--
-- and equivalent to:
--
-- > match (either error id (compile "a.*b")) "abcdef" []

-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * 'exec_newline_any'     - Recognize any Unicode newline sequence
--
-- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
match :: Regex -> S.ByteString -> [PCREExecOption] -> Maybe [S.ByteString]
match (Regex pcre_fp _) subject os = unsafePerformIO $ do
  withForeignPtr pcre_fp $ \pcre_ptr -> do
    n_capt <- captureCount' pcre_ptr

    -- The smallest  size  for ovector that will allow for n captured
    -- substrings, in addition to the offsets  of  the  substring
    -- matched by the whole pattern, is (n+1)*3. (man pcreapi)

    let ovec_size = (n_capt + 1) * 3
        ovec_bytes = ovec_size * size_of_cint

    allocaBytes ovec_bytes $ \ovec -> do

        let (str_fp, off, len) = S.toForeignPtr subject
        withForeignPtr str_fp $ \cstr -> do
            r <- c_pcre_exec
                         pcre_ptr
                         nullPtr
                         (cstr `plusPtr` off) -- may contain binary zero bytes.
                         (fromIntegral len)
                         0
                         (combineExecOptions os)
                         ovec
                         (fromIntegral ovec_size)

            if r < 0 -- errors, or error_no_match
                then return Nothing
                else let loop n o acc =
                            if n == r
                              then return (Just (reverse acc))
                              else do
                                    i <- peekElemOff ovec $! o
                                    j <- peekElemOff ovec (o+1)
                                    let s = substring i j subject
                                    s `seq` loop (n+1) (o+2) (s : acc)
                     in loop 0 0 []

    -- The  first  two-thirds  of ovec is used to pass back captured
    -- substrings When  a  match  is  successful, information about captured
    -- substrings is returned in pairs of integers,  starting  at the
    -- beginning of ovector, and continuing up to two-thirds of its length at
    -- the most.  The first pair, ovector[0] and ovector[1], identify the
    -- portion of the subject string matched  by  the entire pattern.  The next
    -- pair is used for the first capturing subpattern,  and  so on.  The
    -- value returned  by pcre_exec() is one more than the highest num- bered
    -- pair that has been set. For  example,  if  two  sub- strings  have been
    -- captured, the returned value is 3. 

  where
    -- The first element of a pair is set  to  the offset of the first
    -- character in a substring, and the second is set to the offset of the
    -- first character after  the  end of a substring.
    substring :: CInt -> CInt -> S.ByteString -> S.ByteString
    substring x y _ | x == y = S.empty -- XXX an unset subpattern
    substring a b s = end -- note that we're not checking...
        where
            start = S.unsafeDrop (fromIntegral a) s
            end   = S.unsafeTake (fromIntegral (b-a)) start


captureCount :: Regex -> Int
captureCount (Regex pcre_fp _) = unsafePerformIO $ do
  withForeignPtr pcre_fp $ \pcre_ptr -> do
    captureCount' pcre_ptr

captureCount' pcre_fp =
    alloca $ \n_ptr -> do -- (st :: Ptr CInt)
      c_pcre_fullinfo pcre_fp nullPtr info_capturecount n_ptr
      return . fromIntegral =<< peek (n_ptr :: Ptr CInt)