1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
|
{-# LANGUAGE CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving #-}
--------------------------------------------------------------------
-- |
-- Module : Text.Regex.PCRE.Light.Base
-- Copyright: Copyright (c) 2007-2008, Don Stewart
--
-- Documentation based on /man pcreapi/, written by Philip Hazel, 2007.
--
-- License : BSD3
-- Maintainer: Don Stewart <dons@galois.com>
-- Stability : experimental
-- Portability: CPP, FFI
-- Tested with: GHC 6.8.2
--
-- Raw FFI bindings to PCRE functions and constants.
--
module Text.Regex.PCRE.Light.Base (
-- * A PCRE structure
PCRE
, Regex(..)
-- * C exports
, c_pcre_compile
, c_pcre_exec
, c_pcre_fullinfo
------------------------------------------------------------------------
-- * PCRE Options, an abstract newtyped Num wrapper over a CInt
, PCREOption
, combineOptions
, anchored , auto_callout {-, bsr_anycrlf-}
{-, bsr_unicode-} , caseless , dollar_endonly
, dotall , dupnames , extended
, extra , firstline , multiline
{-, newline_any-} {-, newline_anycrlf-} , newline_cr
, newline_crlf , newline_lf , no_auto_capture
, ungreedy , utf8 , no_utf8_check
-- * PCRE exec-time options, an abstract, newtyped Num wrapper over CInt
, PCREExecOption
, combineExecOptions
, exec_anchored {-, exec_newline_any , exec_newline_anycrlf-}
, exec_newline_cr , exec_newline_crlf , exec_newline_lf
, exec_notbol , exec_noteol , exec_notempty
, exec_no_utf8_check , exec_partial
------------------------------------------------------------------------
-- * PCRE Errors
, PCREError
, error_nomatch , error_null , error_badoption
, error_badmagic {-, error_unknown_opcode-} , error_unknown_node
, error_nomemory , error_nosubstring , error_matchlimit
, error_callout , error_badutf8 , error_badutf8_offset
, error_partial , error_badpartial , error_internal
, error_badcount , error_dfa_uitem , error_dfa_ucond
, error_dfa_umlimit , error_dfa_wssize , error_dfa_recurse
, error_recursionlimit {-, error_nullwslimit-} {-, error_badnewline-}
-- * PCRE Info
, PCREInfo
, info_options , info_size , info_capturecount
, info_backrefmax , info_firstbyte , info_firstchar
, info_firsttable , info_lastliteral , info_nameentrysize
, info_namecount , info_nametable , info_studysize
, info_default_tables {-, info_okpartial-} {-, info_jchanged-}
{-, info_hascrorlf-}
-- * PCRE Configuration
, PCREConfig
, config_utf8 , config_newline , config_link_size
, config_posix_malloc_threshold , config_match_limit
, config_stackrecurse , config_unicode_properties
, config_match_limit_recursion {-, config_bsr-}
-- * PCRE Extra
, PCREExtraFlags
, extra_study_data , extra_match_limit , extra_callout_data
, extra_tables , extra_match_limit_recursion,
------------------------------------------------------------------------
size_of_cint
) where
-- Foreigns
import Foreign
import Foreign.Ptr
import Foreign.C.Types
import Foreign.C.String
import qualified Data.ByteString.Char8 as S
#include <pcre.h>
-- | Get sizeof CInt from hsc2hs
size_of_cint :: Int
size_of_cint = #const (sizeof(int))
------------------------------------------------------------------------
-- Types
-- | An abstract pointer to a compiled PCRE Regex structure
-- The structure allocated by the PCRE library will be deallocated
-- automatically by the Haskell storage manager.
--
data Regex = Regex {-# UNPACK #-} !(ForeignPtr PCRE)
{-# UNPACK #-} !S.ByteString
deriving (Eq, Ord, Show)
type PCRE = ()
------------------------------------------------------------------------
-- | A type for PCRE compile-time options. These are newtyped CInts,
-- which can be bitwise-or'd together, using '(Data.Bits..|.)'
--
newtype PCREOption = PCREOption { unPCREOption :: PCREOption_ }
#if __GLASGOW_HASKELL__
deriving (Eq,Ord,Show,Read)
#endif
-- | Combine a list of options into a single option, using bitwise (.|.)
combineOptions :: [PCREOption] -> PCREOption
combineOptions = PCREOption . foldr ((.|.) . unPCREOption) 0
-- Now follows the user-visible options to _exec and _compile.
-- To avoid type errors, we newtype the underlying CInts, and
-- statically differentiate PCREOptions from non-PCREOptions
--
-- The safety can still be defeated using numeric literals though,
-- and other Num operations. We could do more to protect against this.
-- (a smart constructor for .|.)
-- | 'anchored'
--
-- If this bit is set, the pattern is forced to be /anchored/, that is,
-- it is constrained to match only at the first matching point in the
-- string that is being searched (the /subject string/). This effect can
-- also be achieved by appropriate constructs in the pattern itself, which
-- is the only way to do it in Perl.
--
anchored :: PCREOption
anchored = PCREOption anchored_cint
-- | 'auto_callout'
--
-- If this bit is set, "compile" automatically inserts callout
-- items, all with number 255, before each pattern item. For discussion
-- of the callout facility, see the man pcrecallout documentation
--
auto_callout :: PCREOption
auto_callout = PCREOption auto_callout_cint
-- | 'bsr_anycrlf' and 'bsr_unicode'
--
-- These options (which are mutually exclusive) control what the \\R escape
-- sequence matches. The choice is either to match only CR, LF, or CRLF, or to
-- match any Unicode new- line sequence. The default is specified when PCRE is
-- built. It can be overridden from within the pattern, or by setting an option
-- when a compiled pattern is matched.
--
-- bsr_anycrlf :: PCREOption
-- bsr_anycrlf = PCREOption bsr_anycrlf_cint
-- | 'bsr_unicode'. See 'bse_anycrlf'
--
-- bsr_unicode :: PCREOption
-- bsr_unicode = PCREOption bsr_unicode_cint
-- | 'caseless'
--
-- If this bit is set, letters in the pattern match both upper and lower case
-- letters. It is equivalent to Perl's \/i option, and it can be changed within a
-- pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
-- concept of case for characters whose values are less than 128, so caseless
-- matching is always possible. For characters with higher values, the concept of
-- case is supported if PCRE is compiled with Unicode property sup- port, but not
-- otherwise. If you want to use caseless matching for characters 128 and above,
-- you must ensure that PCRE is compiled with Unicode property support as well as
-- with UTF-8 support.
--
caseless :: PCREOption
caseless = PCREOption caseless_cint
-- | 'dollar_endonly'
--
-- If this bit is set, a dollar metacharacter in the pattern matches only at
-- the end of the subject string. Without this option, a dollar also matches
-- immediately before a newline at the end of the string (but not before any other
-- newlines). The 'dollar_endonly' option is ignored if 'multiline'
-- is set. There is no equivalent to this option in Perl, and no way to set it
-- within a pattern.
--
dollar_endonly :: PCREOption
dollar_endonly = PCREOption dollar_endonly_cint
-- | 'dotall'
--
-- If this bit is set, a dot metacharater in the pattern matches all
-- characters, including those that indicate newline. Without it, a dot does
-- not match when the current position is at a newline. This option is
-- equivalent to Perl's \/s option, and it can be changed within a pattern by a
-- (?s) option setting. A negative class such as [^a] always matches newline
-- characters, independent of the setting of this option.
--
dotall :: PCREOption
dotall = PCREOption dotall_cint
-- | 'dupnames'
--
-- If this bit is set, names used to identify capturing subpatterns need not be
-- unique. This can be helpful for certain types of pattern when it is known
-- that only one instance of the named subpattern can ever be matched. There are
-- more details of named subpatterns in the /man pcreapi/ documentation.
--
dupnames :: PCREOption
dupnames = PCREOption dupnames_cint
-- | 'extended'
--
-- If this bit is set, whitespace data characters in the pattern are totally
-- ignored except when escaped or inside a character class. Whitespace does not
-- include the VT character (code 11). In addition, characters between an
-- unescaped \# outside a character class and the next newline, inclusive, are
-- also ignored. This is equivalent to Perl's \/x option, and it can be changed
--within a pattern by a (?x) option setting.
--
-- This option makes it possible to include comments inside complicated
-- patterns. Note, however, that this applies only to data characters. Whitespace
-- characters may never appear within special character sequences in a pattern,
-- for example within the sequence (?( which introduces a conditional subpattern.
--
extended :: PCREOption
extended = PCREOption extended_cint
-- | 'extra'
--
-- This option was invented in order to turn on additional functionality of
-- PCRE that is incompatible with Perl, but it is currently of very little use.
-- When set, any backslash in a pattern that is followed by a letter that has no
-- special meaning causes an error, thus reserving these combinations for future
-- expansion. By default, as in Perl, a backslash followed by a letter with no
-- special meaning is treated as a literal. (Perl can, however, be persuaded to
-- give a warning for this.) There are at present no other features controlled by
-- this option. It can also be set by a (?X) option setting within a pattern.
--
extra :: PCREOption
extra = PCREOption extra_cint
-- | 'firstline'
--
-- If this option is set, an unanchored pattern is required to match before or
-- at the first newline in the subject string, though the matched text may
--continue over the newline.
--
firstline :: PCREOption
firstline = PCREOption firstline_cint
-- | 'multiline'
--
-- By default, PCRE treats the subject string as consisting of a single line
-- of characters (even if it actually contains newlines). The /start of line/
-- metacharacter (^) matches only at the start of the string, while the /end of line/
-- metacharacter ($) matches only at the end of the string, or before a
-- terminating newline (unless 'dollar_endonly' is set). This is the same
-- as Perl.
--
-- When 'multiline' it is set, the /start of line/ and /end of line/
-- constructs match immediately following or immediately before internal newlines
-- in the subject string, respectively, as well as at the very start and end. This
-- is equivalent to Perl's \/m option, and it can be changed within a pattern by a
-- (?m) option setting. If there are no newlines in a subject string, or no occur-
-- rences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
--
multiline :: PCREOption
multiline = PCREOption multiline_cint
-- | newline_cr', 'newline_lf', 'newline_crlf',
-- 'newline_anycrlf', 'newline_any'
--
-- These options override the default newline definition that
-- was chosen when PCRE was built. Setting the first or the
-- second specifies that a newline is indicated by a single
-- character (CR or LF, respectively). Setting 'newline_crlf' specifies
-- that a newline is indicated by the two-character CRLF sequence.
-- Setting 'newline_anycrlf'
-- specifies that any of the three preceding sequences should
-- be recognized. Setting 'newline_any' specifies that any
-- Unicode newline sequence should be recognized. The Unicode
-- newline sequences are the three just mentioned, plus the
-- single characters VT (vertical tab, U+000B), FF (formfeed,
-- U+000C), NEL (next line, U+0085), LS (line separator,
-- U+2028), and PS (paragraph separator, U+2029). The last
-- two are recognized only in UTF-8 mode.
--
-- The newline setting in the options word uses three bits
-- that are treated as a number, giving eight possibilities.
-- Currently only six are used (default plus the five values
-- above). This means that if you set more than one newline
-- option, the combination may or may not be sensible. For
-- example, 'newline_cr' with 'newline_lf' is equivalent to
-- 'newline_crlf', but other combinations may yield unused numbers and
-- cause an error.
--
-- The only time that a line break is specially recognized
-- when compiling a pattern is if 'extended' is set, and
-- an unescaped \# outside a character class is encountered.
-- This indicates a comment that lasts until after the next
-- line break sequence. In other circumstances, line break
-- sequences are treated as literal data, except that in
-- 'extended' mode, both CR and LF are treated as whitespace characters
-- and are therefore ignored. --
--
-- The newline option that is set at compile time becomes the
-- default that is used for 'exec' but it can be overridden.
--
-- newline_any :: PCREOption
-- newline_any = PCREOption newline_any_cint
-- | 'newline_anycrlf', see 'newline_any'
-- newline_anycrlf :: PCREOption
-- newline_anycrlf = PCREOption newline_anycrlf_cint
-- | 'newline_cr', see 'newline_any'
newline_cr :: PCREOption
newline_cr = PCREOption newline_cr_cint
-- | 'newline_crlf', see 'newline_any'
newline_crlf :: PCREOption
newline_crlf = PCREOption newline_crlf_cint
-- | 'newline_lf', see 'newline_any'
newline_lf :: PCREOption
newline_lf = PCREOption newline_lf_cint
-- | 'no_auto_capture'
--
-- If this option is set, it disables the use of numbered
-- capturing parentheses in the pattern. Any opening paren-
-- thesis that is not followed by ? behaves as if it were
-- followed by ?: but named parentheses can still be used for
-- capturing (and they acquire numbers in the usual way).
-- There is no equivalent of this option in Perl.
--
no_auto_capture :: PCREOption
no_auto_capture = PCREOption no_auto_capture_cint
-- | 'ungreedy'
--
-- This option inverts the /greediness/ of the quantifiers so
-- that they are not greedy by default, but become greedy if
-- followed by /?/. It is not compatible with Perl. It can
-- also be set by a (?U) option setting within the pattern.
--
ungreedy :: PCREOption
ungreedy = PCREOption ungreedy_cint
-- | 'utf8'
--
-- This option causes PCRE to regard both the pattern and the
-- subject as strings of UTF-8 characters instead of single-byte character
-- strings. However, it is available only when
-- PCRE is built to include UTF-8 support. If not, the use of
-- this option provokes an error. Details of how this option
-- changes the behaviour of PCRE are given in the section on
-- UTF-8 support in the main pcre page.
--
utf8 :: PCREOption
utf8 = PCREOption utf8_cint
-- | 'no_utf8_check'
--
-- When PCRE_UTF8 is set, the validity of the pattern as a
-- UTF-8 string is automatically checked. There is a discussion
-- about the validity of UTF-8 strings in the main pcre
-- page. If an invalid UTF-8 sequence of bytes is found,
-- compile() returns an error. If you already know that
-- your pattern is valid, and you want to skip this check for
-- performance reasons, you can set the 'no_utf8_check'
-- option. When it is set, the effect of passing an invalid
-- UTF-8 string as a pattern is undefined. It may cause your
-- program to crash. Note that this option can also be passed
-- to 'exec', to suppress the UTF-8 validity checking of subject strings.
--
no_utf8_check :: PCREOption
no_utf8_check = PCREOption no_utf8_check_cint
-- Internal name for hsc2hs to bind to.
type PCREOption_ = CInt
-- PCRE compile options, as CInts
#{enum PCREOption_,
, anchored_cint = PCRE_ANCHORED
, auto_callout_cint = PCRE_AUTO_CALLOUT
, caseless_cint = PCRE_CASELESS
, dollar_endonly_cint = PCRE_DOLLAR_ENDONLY
, dotall_cint = PCRE_DOTALL
, dupnames_cint = PCRE_DUPNAMES
, extended_cint = PCRE_EXTENDED
, extra_cint = PCRE_EXTRA
, firstline_cint = PCRE_FIRSTLINE
, multiline_cint = PCRE_MULTILINE
, newline_cr_cint = PCRE_NEWLINE_CR
, newline_crlf_cint = PCRE_NEWLINE_CRLF
, newline_lf_cint = PCRE_NEWLINE_LF
, no_auto_capture_cint = PCRE_NO_AUTO_CAPTURE
, ungreedy_cint = PCRE_UNGREEDY
, utf8_cint = PCRE_UTF8
, no_utf8_check_cint = PCRE_NO_UTF8_CHECK
}
-- , bsr_anycrlf_cint = PCRE_BSR_ANYCRLF
-- , bsr_unicode_cint = PCRE_BSR_UNICODE
-- , newline_any_cint = PCRE_NEWLINE_ANY
-- , newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF
------------------------------------------------------------------------
-- | PCRE exec options, to be passed to exec
newtype PCREExecOption = PCREExecOption { unPCREExecOption :: PCREExecOption_ }
#if __GLASGOW_HASKELL__
deriving (Eq,Ord,Show,Read)
#endif
-- | Combine a list of exec options into a single option, using bitwise (.|.)
combineExecOptions :: [PCREExecOption] -> PCREExecOption
combineExecOptions = PCREExecOption . foldr ((.|.) . unPCREExecOption) 0
-- | 'anchored'.
--
-- The 'anchored' option limits 'exec' to matching at
-- the first matching position. If a pattern was compiled
-- with 'anchored', or turned out to be anchored by virtue
-- of its contents, it cannot be made unachored at matching
-- time.
exec_anchored :: PCREExecOption
exec_anchored = PCREExecOption exec_anchored_cint
-- | 'newline_cr', 'newline_lf',
-- 'newline_crlf', 'newline_anycrlf', 'newline_any'
--
-- These options override the newline definition that was
-- chosen or defaulted when the pattern was compiled. For
-- details, see the description of 'compile' above. Dur-
-- ing matching, the newline choice affects the behaviour of
-- the dot, circumflex, and dollar metacharacters. It may
-- also alter the way the match position is advanced after a
-- match failure for an unanchored pattern.
--
-- When 'newline_crlf', 'newline_anycrlf', or 'newline_any'
-- is set, and a match attempt for an unanchored
-- pattern fails when the current position is at a CRLF
-- sequence, and the pattern contains no explicit matches for
-- CR or LF characters, the match position is advanced by two
-- characters instead of one, in other words, to after the
-- CRLF.
--
-- The above rule is a compromise that makes the most common
-- cases work as expected. For example, if the pattern is .+A
-- (and the 'dotall' option is not set), it does not match
-- the string /\\r\\nA/ because, after failing at the start, it
-- skips both the CR and the LF before retrying. However, the
-- pattern /[\\r\\n]A/ does match that string, because it contains
-- an explicit CR or LF reference, and so advances only
-- by one character after the first failure.
--
-- An explicit match for CR of LF is either a literal appear-
-- ance of one of those characters, or one of the \\r or \\n
-- escape sequences. Implicit matches such as [^X] do not
-- count, nor does \\s (which includes CR and LF in the char-
-- acters that it matches).
--
-- Notwithstanding the above, anomalous effects may still
-- occur when CRLF is a valid newline sequence and explicit
-- \\r or \\n escapes appear in the pattern.
--
-- exec_newline_any :: PCREExecOption
-- exec_newline_any = PCREExecOption exec_newline_any_cint
-- | 'exec_newline_anycrlf', see 'exec_newline_any'
-- exec_newline_anycrlf :: PCREExecOption
-- exec_newline_anycrlf = PCREExecOption exec_newline_anycrlf_cint
-- | 'exec_newline_cr', see 'exec_newline_any'
exec_newline_cr :: PCREExecOption
exec_newline_cr = PCREExecOption exec_newline_cr_cint
-- | 'exec_newline_crlf', see 'exec_newline_any'
exec_newline_crlf :: PCREExecOption
exec_newline_crlf = PCREExecOption exec_newline_crlf_cint
-- | 'exec_newline_lf', see 'exec_newline_any'
exec_newline_lf :: PCREExecOption
exec_newline_lf = PCREExecOption exec_newline_lf_cint
-- | 'PCRE_NOTBOL'
--
-- This option specifies that first character of the subject
-- string is not the beginning of a line, so the circumflex
-- metacharacter should not match before it. Setting this
-- without 'multiline' (at compile time) causes circumflex
-- never to match. This option affects only the behaviour of
-- the circumflex metacharacter. It does not affect \\A.
--
exec_notbol :: PCREExecOption
exec_notbol = PCREExecOption exec_notbol_cint
-- | 'noteol'
--
-- This option specifies that the end of the subject string
-- is not the end of a line, so the dollar metacharacter
-- should not match it nor (except in multiline mode) a newline
-- immediately before it. Setting this without 'multiline'
-- (at compile time) causes dollar never to match.
-- This option affects only the behaviour of the dollar
-- metacharacter. It does not affect \\Z or \\z.
--
exec_noteol :: PCREExecOption
exec_noteol = PCREExecOption exec_noteol_cint
-- | PCRE_NOTEMPTY
--
-- An empty string is not considered to be a valid match if
-- this option is set. If there are alternatives in the pattern,
-- they are tried. If all the alternatives match the
-- empty string, the entire match fails. For example, if the
-- pattern
--
-- > a?b?
--
-- is applied to a string not beginning with /a/ or /b/, it
-- matches the empty string at the start of the subject. With
-- 'notempty' set, this match is not valid, so 'PCRE
-- searches further into the string for occurrences of /a/ or
-- /b/.
--
-- Perl has no direct equivalent of 'notempty', but it
-- does make a special case of a pattern match of the empty
-- string within its split() function, and when using the \/g
-- modifier. It is possible to emulate Perl's behaviour after
-- matching a null string by first trying the match again at
-- the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and
-- then if that fails by advancing the starting offset (see
-- below) and trying an ordinary match again. There is some
-- code that demonstrates how to do this in the pcredemo.c
-- sample program.
--
exec_notempty :: PCREExecOption
exec_notempty = PCREExecOption exec_notempty_cint
-- | 'no_utf8_check'
--
-- When 'utf8' is set at compile time, the validity of the
-- subject as a UTF-8 string is automatically checked when
-- exec() is subsequently called. The value of
-- startoffset is also checked to ensure that it points to
-- the start of a UTF-8 character. There is a discussion
-- about the validity of UTF-8 strings in the section on
-- UTF-8 support in the main pcre page. If an invalid UTF-8
-- sequence of bytes is found, exec() returns the error
-- 'error_badutf8'. If startoffset contains an invalid
-- value, 'error_badutf8_offset' is returned.
--
-- If you already know that your subject is valid, and you
-- want to skip these checks for performance reasons, you can
-- set the 'no_utf8_check' option when calling
-- 'exec'. You might want to do this for the second and
-- subsequent calls to exec() if you are making repeated
-- calls to find all the matches in a single subject string.
-- However, you should be sure that the value of startoffset
-- points to the start of a UTF-8 character. When
-- 'no_utf8_check' is set, the effect of passing an
-- invalid UTF-8 string as a subject, or a value of startoff-
-- set that does not point to the start of a UTF-8 character,
-- is undefined. Your program may crash.
--
exec_no_utf8_check :: PCREExecOption
exec_no_utf8_check = PCREExecOption exec_no_utf8_check_cint
-- | 'partial'
--
-- This option turns on the partial matching feature. If the
-- subject string fails to match the pattern, but at some
-- point during the matching process the end of the subject
-- was reached (that is, the subject partially matches the
-- pattern and the failure to match occurred only because
-- there were not enough subject characters), 'exec'
-- returns 'error_partial' instead of 'error_nomatch'.
-- When 'partial' is used, there are restrictions on what
-- may appear in the pattern. These are discussed in the
-- pcrepartial documentation.
--
exec_partial :: PCREExecOption
exec_partial = PCREExecOption exec_partial_cint
-- Internal name for hsc2hs to bind to.
type PCREExecOption_ = CInt
-- PCRE exec options
#{enum PCREExecOption_,
, exec_anchored_cint = PCRE_ANCHORED
, exec_newline_cr_cint = PCRE_NEWLINE_CR
, exec_newline_crlf_cint = PCRE_NEWLINE_CRLF
, exec_newline_lf_cint = PCRE_NEWLINE_LF
, exec_notbol_cint = PCRE_NOTBOL
, exec_noteol_cint = PCRE_NOTEOL
, exec_notempty_cint = PCRE_NOTEMPTY
, exec_no_utf8_check_cint = PCRE_NO_UTF8_CHECK
, exec_partial_cint = PCRE_PARTIAL
}
-- , exec_newline_any_cint = PCRE_NEWLINE_ANY
-- , exec_newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF
-- , dfa_shortest = PCRE_DFA_SHORTEST
-- , dfa_restart = PCRE_DFA_RESTART
------------------------------------------------------------------------
-- | A type for PCRE Errors: exec-time error codes.
type PCREError = CInt
#{enum PCREError,
, error_nomatch = PCRE_ERROR_NOMATCH
, error_null = PCRE_ERROR_NULL
, error_badoption = PCRE_ERROR_BADOPTION
, error_badmagic = PCRE_ERROR_BADMAGIC
, error_unknown_node = PCRE_ERROR_UNKNOWN_NODE
, error_nomemory = PCRE_ERROR_NOMEMORY
, error_nosubstring = PCRE_ERROR_NOSUBSTRING
, error_matchlimit = PCRE_ERROR_MATCHLIMIT
, error_callout = PCRE_ERROR_CALLOUT
, error_badutf8 = PCRE_ERROR_BADUTF8
, error_badutf8_offset = PCRE_ERROR_BADUTF8_OFFSET
, error_partial = PCRE_ERROR_PARTIAL
, error_badpartial = PCRE_ERROR_BADPARTIAL
, error_internal = PCRE_ERROR_INTERNAL
, error_badcount = PCRE_ERROR_BADCOUNT
, error_dfa_uitem = PCRE_ERROR_DFA_UITEM
, error_dfa_ucond = PCRE_ERROR_DFA_UCOND
, error_dfa_umlimit = PCRE_ERROR_DFA_UMLIMIT
, error_dfa_wssize = PCRE_ERROR_DFA_WSSIZE
, error_dfa_recurse = PCRE_ERROR_DFA_RECURSE
, error_recursionlimit = PCRE_ERROR_RECURSIONLIMIT
}
-- , error_unknown_opcode = PCRE_ERROR_UNKNOWN_OPCODE
-- , error_nullwslimit = PCRE_ERROR_NULLWSLIMIT
-- , error_badnewline = PCRE_ERROR_BADNEWLINE
------------------------------------------------------------------------
-- Request types for fullinfo() */
-- | PCRE Info requests -- provides information about the compiled pattern.
type PCREInfo = CInt
#{enum PCREInfo,
, info_options = PCRE_INFO_OPTIONS
, info_size = PCRE_INFO_SIZE
, info_capturecount = PCRE_INFO_CAPTURECOUNT
, info_backrefmax = PCRE_INFO_BACKREFMAX
, info_firstbyte = PCRE_INFO_FIRSTBYTE
, info_firstchar = PCRE_INFO_FIRSTCHAR
, info_firsttable = PCRE_INFO_FIRSTTABLE
, info_lastliteral = PCRE_INFO_LASTLITERAL
, info_nameentrysize = PCRE_INFO_NAMEENTRYSIZE
, info_namecount = PCRE_INFO_NAMECOUNT
, info_nametable = PCRE_INFO_NAMETABLE
, info_studysize = PCRE_INFO_STUDYSIZE
, info_default_tables = PCRE_INFO_DEFAULT_TABLES
}
-- , info_okpartial = PCRE_INFO_OKPARTIAL
-- , info_jchanged = PCRE_INFO_JCHANGED
-- , info_hascrorlf = PCRE_INFO_HASCRORLF
------------------------------------------------------------------------
-- | Request types for config()
type PCREConfig = CInt
#{enum PCREConfig,
, config_utf8 = PCRE_CONFIG_UTF8
, config_newline = PCRE_CONFIG_NEWLINE
, config_link_size = PCRE_CONFIG_LINK_SIZE
, config_posix_malloc_threshold = PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
, config_match_limit = PCRE_CONFIG_MATCH_LIMIT
, config_stackrecurse = PCRE_CONFIG_STACKRECURSE
, config_unicode_properties = PCRE_CONFIG_UNICODE_PROPERTIES
, config_match_limit_recursion = PCRE_CONFIG_MATCH_LIMIT_RECURSION
}
-- Not portable
-- , config_bsr = PCRE_CONFIG_BSR
------------------------------------------------------------------------
-- | PCREExtra.
-- A extra structure contains the following fields:
--
-- * flags Bits indicating which fields are set
-- * study_data Opaque data from study()
-- * match_limit Limit on internal resource use
-- * match_limit_recursion Limit on internal recursion depth
-- * callout_data Opaque data passed back to callouts
-- * tables Points to character tables or is NULL
--
type PCREExtra = ()
-- | PCREExtraFlags. bit flags for extra structure.
type PCREExtraFlags = CInt
-- Bit flags for the extra structure. Do not re-arrange or redefine
-- these bits, just add new ones on the end, in order to remain compatible. */
#{enum PCREExtraFlags,
, extra_study_data = PCRE_EXTRA_STUDY_DATA
, extra_match_limit = PCRE_EXTRA_MATCH_LIMIT
, extra_callout_data = PCRE_EXTRA_CALLOUT_DATA
, extra_tables = PCRE_EXTRA_TABLES
, extra_match_limit_recursion = PCRE_EXTRA_MATCH_LIMIT_RECURSION
}
-- PCRE_EXP_DECL pcre *compile(const char *, int, const char **, int *, const unsigned char *);
-- PCRE_EXP_DECL int config(int, void *);
-- PCRE_EXP_DECL int exec(const pcre *, const extra *, PCRE_SPTR, int, int, int, int *, int);
------------------------------------------------------------------------
-- C api
{-
pcre *pcre_compile(const char *pattern, int options,
const char **errptr, int *erroffset,
const unsigned char *tableptr);
-}
-- | Compile a pattern to an internal form. The pattern is a C string
-- terminated by a binary zero. A pointer to a single block of memory that is
-- obtained via pcre_malloc is returned. It is up to the caller to free
-- the memory (via pcre_free) when it is no longer required
--
-- The options argument contains various bit settings that affect the
-- compilation. It should be zero if no options are required.
--
-- If errptr is NULL, pcre_compile() returns NULL immediately.
-- Otherwise, if compilation of a pattern fails, pcre_compile() returns NULL,
-- and sets the variable pointed to by errptr to point to a textual error
-- message.
--
-- The offset from the start of the pattern to the character where the error
-- was discovered is placed in the variable pointed to by erroffset, which must
-- not be NULL.
--
foreign import ccall unsafe "pcre.h pcre_compile"
c_pcre_compile :: CString
-> PCREOption
-> Ptr CString
-> Ptr CInt
-> Ptr Word8
-> IO (Ptr PCRE)
-- Additional fields to c_pcre_compile:
--
-- errptr Where to put an error message
-- erroffset Offset in pattern where error was found
-- tableptr Pointer to character tables, or NULL to to use built in
{-
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
-}
-- | This function matches a compiled regular expression
-- against a given subject string, using a matching algorithm
-- that is similar to Perl's. It returns offsets to captured
-- substrings.
--
-- Its arguments are, in order:
--
-- * 'code' Points to the compiled pattern (result of pcre_compile)
--
-- * 'extra' Points to an associated pcre_extra structure (result of pcre_study), or is NULL
--
-- * 'subject' Points to the subject string
--
-- * 'length' Length of the subject string, in bytes
--
-- * 'startoffset' Offset in bytes in the subject at which to start matching
--
-- * 'options' Option bits
--
-- * 'ovector' Points to a vector of ints for result substrings
--
-- * 'ovecsize' Number of elements in the vector (a multiple of 3)
--
-- Note, subject not required to be null terminated.
--
foreign import ccall unsafe "pcre.h pcre_exec"
c_pcre_exec :: Ptr PCRE
-> Ptr PCREExtra
-> Ptr Word8
-> CInt
-> CInt
-> PCREExecOption
-> Ptr CInt
-> CInt
-> IO CInt
-- | Return information about a compiled pattern
foreign import ccall unsafe "pcre.h pcre_fullinfo"
c_pcre_fullinfo :: Ptr PCRE
-> Ptr PCREExtra
-> PCREInfo
-> Ptr a
-> IO CInt
|