File: Char8.hs

package info (click to toggle)
haskell-pcre-light 0.4-6
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 220 kB
  • ctags: 2
  • sloc: haskell: 3,615; makefile: 10; sh: 5
file content (205 lines) | stat: -rw-r--r-- 6,163 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light.Char8
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + FFI
--
--------------------------------------------------------------------
-- 
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via 8-bit latin1 Strings.
--

module Text.Regex.PCRE.Light.Char8 (

        -- * The abstract PCRE Regex type
          Regex

        -- * String interface
        , compile, compileM
        , match

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import qualified Data.ByteString.Char8 as S
import qualified Text.Regex.PCRE.Light as S
import Text.Regex.PCRE.Light hiding (match, compile, compileM)

-- | Compile a perl-compatible regular expression supplied as a 'String'.
--
-- This is the 'String' counterpart of the strict-ByteString @compile@
-- from "Text.Regex.PCRE.Light": the pattern is packed with
-- "Data.ByteString.Char8" and handed on, together with the flags,
-- unchanged. As the module header says, strings are treated as 8-bit
-- latin1.
--
-- The arguments are:
--
-- * @pat@: the regular expression to be compiled.
--
-- * @flags@: a list of compile-time bit flags; the list may be empty.
--
-- Compile-time flags exported by this module:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - @$@ not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be before newline
--
-- * 'multiline'       - @^@ and @$@ match newlines within data
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- The following PCRE flags are currently commented out of this module's
-- export list and therefore cannot be named through it:
--
-- * @bsr_anycrlf@     - \\R matches only CR, LF, or CRLF
--
-- * @bsr_unicode@     - \\R matches all Unicode line endings
--
-- * @newline_any@     - Recognize any Unicode newline sequence
--
-- * @newline_anycrlf@ - Recognize CR, LF, and CRLF as newline sequences
--
-- The result type is a bare 'Regex', so a pattern that fails to compile
-- cannot be reported through the return value; what happens in that
-- case is decided by the underlying @compile@. Use 'compileM' to receive
-- the error string in a 'Left' instead.
--
-- The compiled regex is allocated via malloc on the C side, and is
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- Regexes are often defined statically. GHC compiles such literals to
-- null-terminated, strict C strings, which enables compilation of the
-- pattern without copying. This may be useful for very large patterns.
--
-- See man pcreapi for more details.
--
compile :: String -> [PCREOption] -> Regex
compile pat flags = S.compile packed flags
  where
    -- The String pattern as the strict ByteString the core API expects.
    packed = S.pack pat
{-# INLINE compile #-}

-- | A safe version of 'compile', with failure lifted into an 'Either'.
--
-- The 'String' pattern is packed with "Data.ByteString.Char8" and passed,
-- together with the compile-time flags, to the @compileM@ of
-- "Text.Regex.PCRE.Light".
--
-- If compilation of the pattern fails, the 'Left' constructor is
-- returned with the error string. Otherwise the compiled regular
-- expression is returned in a 'Right'.
compileM :: String -> [PCREOption] -> Either String Regex
compileM pat flags = S.compileM packed flags
  where
    -- The String pattern as the strict ByteString the core API expects.
    packed = S.pack pat
{-# INLINE compileM #-}


-- | Match a compiled regular expression against a subject 'String'.
--
-- The matching algorithm is similar to Perl's. If the subject string
-- doesn't match the regular expression, 'Nothing' is returned;
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- This is the 'String' counterpart of the strict-ByteString @match@ from
-- "Text.Regex.PCRE.Light": the subject is packed with
-- "Data.ByteString.Char8" before matching, and every ByteString in a
-- successful result is unpacked back to a 'String'.
--
-- The arguments are:
--
-- * @regex@, a PCRE regular expression value produced by 'compile' or
--   'compileM'
--
-- * @subject@, the subject string to match against
--
-- * @opts@, a list of exec-time flags; the list may be empty.
--
-- Exec-time flags exported by this module:
--
-- * 'exec_anchored'      - Match only at the first position
--
-- * 'exec_newline_cr'    - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'  - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'    - Set LF as the newline sequence
--
-- * 'exec_notbol'        - Subject is not the beginning of a line
--
-- * 'exec_noteol'        - Subject is not the end of a line
--
-- * 'exec_notempty'      - An empty string is not a valid match
--
-- * 'exec_no_utf8_check' - Do not check the subject for UTF-8
--
-- * 'exec_partial'       - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The following PCRE runtime options are not exported by this module
-- (the @exec_newline_any*@ entries are commented out of the export list):
--
-- * @bsr_anycrlf@          - \\R matches only CR, LF, or CRLF
--
-- * @bsr_unicode@          - \\R matches all Unicode line endings
--
-- * @exec_newline_any@     - Recognize any Unicode newline sequence
--
-- * @exec_newline_anycrlf@ - Recognize CR, LF, and CRLF as newline sequences
--
-- The result value, and any captured subpatterns, are returned.
-- 'Nothing' is returned exactly when the underlying @match@ returns
-- 'Nothing'. Earlier documentation stated that this includes an invalid
-- regex or an empty subject string.
-- NOTE(review): confirm the empty-subject case against
-- "Text.Regex.PCRE.Light".
--
match :: Regex -> String -> [PCREExecOption] -> Maybe [String]
match regex subject opts =
    -- Run the ByteString matcher, then convert each returned piece back.
    fmap (map S.unpack) (S.match regex (S.pack subject) opts)
{-# INLINE match #-}