1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
|
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide, prune #-}
-- |
-- Module : Data.ByteString.Search.Internal.BoyerMoore
-- Copyright : Daniel Fischer
-- Chris Kuklewicz
-- Licence : BSD3
-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability : Provisional
-- Portability : non-portable (BangPatterns)
--
-- Fast overlapping Boyer-Moore search of both strict and lazy
-- 'S.ByteString' values. Breaking, splitting and replacing
-- using the Boyer-Moore algorithm.
--
-- Descriptions of the algorithm can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140>
-- and
-- <http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm>
--
-- Original authors: Daniel Fischer (daniel.is.fischer at googlemail.com) and
-- Chris Kuklewicz (haskell at list.mightyreason.com).
module Data.ByteString.Search.Internal.BoyerMoore (
matchLS
, matchSS
-- Non-overlapping
, matchNOS
-- Replacing substrings
-- replacing
, replaceAllS
-- Breaking on substrings
-- breaking
, breakSubstringS
, breakAfterS
-- Splitting on substrings
-- splitting
, splitKeepEndS
, splitKeepFrontS
, splitDropS
) where
import Data.ByteString.Search.Internal.Utils
(occurs, suffShifts, strictify)
import Data.ByteString.Search.Substitution
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Internal as LI
import Data.ByteString.Unsafe (unsafeIndex)
import Data.Array.Base (unsafeAt)
import Data.Word (Word8)
-- overview
--
-- This module exports three search functions for searching in strict
-- ByteStrings. One for searching non-overlapping occurrences of a strict
-- pattern and one each for possibly overlapping occurrences of a lazy
-- resp. strict pattern. The common base name is @match@, the suffix
-- indicates the type of search to perform. These functions
-- return (for a non-empty pattern) a list of all the indices of the target
-- string where an occurrence of the pattern begins, if some occurrences
-- overlap, all starting indices are reported. The list is produced lazily,
-- so not necessarily the entire target string is searched.
--
-- The behaviour of these functions when given an empty pattern has changed.
-- Formerly, the @matchXY@ functions returned an empty list then, now it's
-- @[0 .. 'length' target]@.
--
-- Newly added are functions to replace all (non-overlapping) occurrences
-- of a pattern within a string, functions to break ByteStrings at the first
-- occurrence of a pattern and functions to split ByteStrings at each
-- occurrence of a pattern. None of these functions does copying, so they
-- don't introduce large memory overhead.
--
-- Internally, a lazy pattern is always converted to a strict ByteString,
-- which is necessary for an efficient implementation of the algorithm.
-- The limit this imposes on the length of the pattern is probably
-- irrelevant in practice, but perhaps it should be mentioned.
-- This also means that the @matchL*@ functions are mere convenience wrappers.
-- Except for the initial 'strictify'ing, there's no difference between lazy
-- and strict patterns, they call the same workers. There is, however, a
-- difference between strict and lazy target strings.
-- For the new functions, no such wrappers are provided, you have to
-- 'strictify' lazy patterns yourself.
-- caution
--
-- When working with a lazy target string, the relation between the pattern
-- length and the chunk size can play a big rôle.
-- Crossing chunk boundaries is relatively expensive, so when that becomes
-- a frequent occurrence, as may happen when the pattern length is close
-- to or larger than the chunk size, performance is likely to degrade.
-- If it is needed, steps can be taken to ameliorate that effect, but unless
-- entirely separate functions are introduced, that would hurt the
-- performance for the more common case of patterns much shorter than
-- the default chunk size.
-- performance
--
-- In general, the Boyer-Moore algorithm is the most efficient method to
-- search for a pattern inside a string, so most of the time, you'll want
-- to use the functions of this module, hence this is where the most work
-- has gone. Very short patterns are an exception to this, for those you
-- should consider using a finite automaton
-- ("Data.ByteString.Search.DFA.Array"). That is also often the better
-- choice for searching longer periodic patterns in a lazy ByteString
-- with many matches.
--
-- Operating on a strict target string is mostly faster than on a lazy
-- target string, but the difference is usually small (according to my
-- tests).
--
-- The known exceptions to this rule of thumb are
--
-- [long targets] Then the smaller memory footprint of a lazy target often
-- gives (much) better performance.
--
-- [high number of matches] When there are very many matches, strict target
-- strings are much faster, especially if the pattern is periodic.
--
-- If both conditions hold, either may outweigh the other.
-- complexity
--
-- Preprocessing the pattern is /O/(@patternLength@ + σ) in time and
-- space (σ is the alphabet size, 256 here) for all functions.
-- The time complexity of the searching phase for @matchXY@
-- is /O/(@targetLength@ \/ @patternLength@) in the best case.
-- For non-periodic patterns, the worst case complexity is
-- /O/(@targetLength@), but for periodic patterns, the worst case complexity
-- is /O/(@targetLength@ * @patternLength@) for the original Boyer-Moore
-- algorithm.
--
-- The searching functions in this module now contain a modification which
-- drastically improves the performance for periodic patterns.
-- I believe that for strict target strings, the worst case is now
-- /O/(@targetLength@) also for periodic patterns and for lazy target strings,
-- my semi-educated guess is
-- /O/(@targetLength@ * (1 + @patternLength@ \/ @chunkSize@)).
-- I may be very wrong, though.
--
-- The other functions don't have to deal with possible overlapping
-- patterns, hence the worst case complexity for the processing phase
-- is /O/(@targetLength@) (respectively /O/(@firstIndex + patternLength@)
-- for the breaking functions if the pattern occurs).
-- currying
--
-- These functions can all be usefully curried. Given only a pattern
-- the curried version will compute the supporting lookup tables only
-- once, allowing for efficient re-use. Similarly, the curried
-- 'matchLL' and 'matchLS' will compute the concatenated pattern only
-- once.
-- overflow
--
-- The current code uses @Int@ to keep track of the locations in the
-- target string. If the length of the pattern plus the length of any
-- strict chunk of the target string is greater than
-- @'maxBound' :: 'Int'@ then this will overflow causing an error. We
-- try to detect this and call 'error' before a segfault occurs.
------------------------------------------------------------------------------
-- Wrappers --
------------------------------------------------------------------------------
-- matching
--
-- These functions find the indices of all (possibly overlapping)
-- occurrences of a pattern in a target string.
-- If the pattern is empty, the result is @[0 .. length target]@.
-- If the pattern is much shorter than the target string
-- and the pattern does not occur very near the beginning of the target,
--
-- > not . null $ matchSS pattern target
--
-- is a much more efficient version of 'S.isInfixOf'.
-- | @'matchLS'@ finds the starting indices of all possibly overlapping
-- occurrences of the pattern in the target string.
-- It is a simple wrapper for 'Data.ByteString.Search.indices'.
-- If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE matchLS #-}
matchLS :: L.ByteString -- ^ Lazy pattern
-> S.ByteString -- ^ Strict target string
-> [Int] -- ^ Offsets of matches
matchLS pat = search
where
search = strictSearcher True (strictify pat)
-- | @'matchSS'@ finds the starting indices of all possibly overlapping
-- occurrences of the pattern in the target string.
-- It is an alias for 'Data.ByteString.Search.indices'.
-- If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE matchSS #-}
matchSS :: S.ByteString -- ^ Strict pattern
-> S.ByteString -- ^ Strict target string
-> [Int] -- ^ Offsets of matches
matchSS pat = search
where
search = strictSearcher True pat
-- | @'matchNOS'@ finds the indices of all non-overlapping occurrences
-- of the pattern in the Strict target string.
{-# INLINE matchNOS #-}
matchNOS :: S.ByteString -- ^ Strict pattern
-> S.ByteString -- ^ Strict target string
-> [Int] -- ^ Offsets of matches
matchNOS pat = search
where
search = strictSearcher False pat
-- replacing
--
-- These functions replace all (non-overlapping) occurrences of a pattern
-- in the target string. If some occurrences overlap, the earliest is
-- replaced and replacing continues at the index after the replaced
-- occurrence, for example
--
-- > replaceAllL \"ana\" \"olog\" \"banana\" == \"bologna\",
-- > replaceAllS \"abacab\" \"u\" \"abacabacabacab\" == \"uacu\",
-- > replaceAllS \"aa\" \"aaa\" \"aaaa\" == \"aaaaaa\".
--
-- Equality of pattern and substitution is not checked, but
--
-- > pat == sub => 'strictify' (replaceAllS pat sub str) == str,
-- > pat == sub => replaceAllL pat sub str == str.
--
-- The result is a lazily generated lazy ByteString, the first chunks will
-- generally be available before the entire target has been scanned.
-- If the pattern is empty, but not the substitution, the result is
-- equivalent to @'cycle' sub@.
{-# INLINE replaceAllS #-}
replaceAllS :: Substitution rep
=> S.ByteString -- ^ Pattern to replace
-> rep -- ^ Substitution string
-> S.ByteString -- ^ Target string
-> L.ByteString -- ^ Lazy result
replaceAllS pat
| S.null pat = \sub -> prependCycle sub . flip LI.chunk LI.Empty
| otherwise =
let repl = strictRepl pat
in \sub -> L.fromChunks . repl (substitution sub)
-- breaking
--
-- Break a string on a pattern. The first component of the result
-- contains the prefix of the string before the first occurrence of the
-- pattern, the second component contains the remainder.
-- The following relations hold:
--
-- > breakSubstringX \"\" str = (\"\", str)
-- > not (pat `isInfixOf` str) == null (snd $ breakSunbstringX pat str)
-- > True == case breakSubstringX pat str of
-- > (x, y) -> not (pat `isInfixOf` x)
-- > && (null y || pat `isPrefixOf` y)
-- | This function has the same semantics as 'S.breakSubstring'
-- but is generally much faster.
{-# INLINE breakSubstringS #-}
breakSubstringS :: S.ByteString -- ^ Pattern to break on
-> S.ByteString -- ^ String to break up
-> (S.ByteString, S.ByteString)
-- ^ Prefix and remainder of broken string
breakSubstringS = strictBreak
breakAfterS :: S.ByteString
-> S.ByteString
-> (S.ByteString, S.ByteString)
breakAfterS pat
| S.null pat = \str -> (S.empty, str)
breakAfterS pat = breaker
where
!patLen = S.length pat
searcher = strictSearcher False pat
breaker str = case searcher str of
[] -> (str, S.empty)
(i:_) -> S.splitAt (i + patLen) str
-- splitting
--
-- These functions implement various splitting strategies.
--
-- If the pattern to split on is empty, all functions return an
-- infinite list of empty ByteStrings.
-- Otherwise, the names are rather self-explanatory.
--
-- For nonempty patterns, the following relations hold:
--
-- > concat (splitKeepXY pat str) == str
-- > concat ('Data.List.intersperse' pat (splitDropX pat str)) == str.
--
-- All fragments except possibly the last in the result of
-- @splitKeepEndX pat@ end with @pat@, none of the fragments contains
-- more than one occurrence of @pat@ or is empty.
--
-- All fragments except possibly the first in the result of
-- @splitKeepFrontX pat@ begin with @pat@, none of the fragments
-- contains more than one occurrence of @patq or is empty.
--
-- > splitDropX pat str == map dropPat (splitKeepFrontX pat str)
-- > where
-- > patLen = length pat
-- > dropPat frag
-- > | pat `isPrefixOf` frag = drop patLen frag
-- > | otherwise = frag
--
-- but @splitDropX@ is a little more efficient than that.
{-# INLINE splitKeepEndS #-}
splitKeepEndS :: S.ByteString -- ^ Pattern to split on
-> S.ByteString -- ^ String to split
-> [S.ByteString] -- ^ List of fragments
splitKeepEndS = strictSplitKeepEnd
{-# INLINE splitKeepFrontS #-}
splitKeepFrontS :: S.ByteString -- ^ Pattern to split on
-> S.ByteString -- ^ String to split
-> [S.ByteString] -- ^ List of fragments
splitKeepFrontS = strictSplitKeepFront
{-# INLINE splitDropS #-}
splitDropS :: S.ByteString -- ^ Pattern to split on
-> S.ByteString -- ^ String to split
-> [S.ByteString] -- ^ List of fragments
splitDropS = strictSplitDrop
------------------------------------------------------------------------------
-- Search Functions --
------------------------------------------------------------------------------
strictSearcher :: Bool -> S.ByteString -> S.ByteString -> [Int]
strictSearcher _ !pat
| S.null pat = enumFromTo 0 . S.length
| S.length pat == 1 = let !w = S.head pat in S.elemIndices w
strictSearcher !overlap pat = searcher
where
{-# INLINE patAt #-}
patAt :: Int -> Word8
patAt !i = unsafeIndex pat i
!patLen = S.length pat
!patEnd = patLen - 1
!maxLen = maxBound - patLen
!occT = occurs pat -- for bad-character-shift
!suffT = suffShifts pat -- for good-suffix-shift
!skip = if overlap then unsafeAt suffT 0 else patLen
-- shift after a complete match
!kept = patLen - skip -- length of known prefix after full match
!pe = patAt patEnd -- last pattern byte for fast comparison
{-# INLINE occ #-}
occ !w = unsafeAt occT (fromIntegral w)
{-# INLINE suff #-}
suff !i = unsafeAt suffT i
searcher str
| maxLen < strLen
= error "Overflow in BoyerMoore.strictSearcher"
| maxDiff < 0 = []
| otherwise = checkEnd patEnd
where
!strLen = S.length str
!strEnd = strLen - 1
!maxDiff = strLen - patLen
{-# INLINE strAt #-}
strAt !i = unsafeIndex str i
-- After a full match, we know how long a prefix of the pattern
-- still matches. Do not re-compare the prefix to prevent O(m*n)
-- behaviour for periodic patterns.
afterMatch !diff !patI =
case strAt (diff + patI) of
!c | c == patAt patI ->
if patI == kept
then diff : let !diff' = diff + skip
in if maxDiff < diff'
then []
else afterMatch diff' patEnd
else afterMatch diff (patI - 1)
| patI == patEnd ->
checkEnd (diff + 2*patEnd + occ c)
| otherwise ->
let {-# INLINE badShift #-}
badShift = patI + occ c
{-# INLINE goodShift #-}
goodShift = suff patI
!diff' = diff + max badShift goodShift
in if maxDiff < diff'
then []
else checkEnd (diff + patEnd)
-- While comparing the last byte of the pattern, the bad-
-- character-shift is always at least as large as the good-
-- suffix-shift. Eliminating the unnecessary memory reads and
-- comparison speeds things up noticeably.
checkEnd !sI -- index in string to compare to last of pattern
| strEnd < sI = []
| otherwise =
case strAt sI of
!c | c == pe -> findMatch (sI - patEnd) (patEnd - 1)
| otherwise -> checkEnd (sI + patEnd + occ c)
-- Once the last byte has matched, we enter the full matcher
-- diff is the offset of the window, patI the index of the
-- pattern byte to compare next.
findMatch !diff !patI =
case strAt (diff + patI) of
!c | c == patAt patI ->
if patI == 0 -- full match, report
then diff : let !diff' = diff + skip
in if maxDiff < diff'
then []
else
if skip == patLen
then
checkEnd (diff' + patEnd)
else
afterMatch diff' patEnd
else findMatch diff (patI - 1)
| otherwise ->
let !diff' = diff + max (patI + occ c) (suff patI)
in if maxDiff < diff'
then []
else checkEnd (diff' + patEnd)
------------------------------------------------------------------------------
-- Breaking Functions --
------------------------------------------------------------------------------
strictBreak :: S.ByteString -> S.ByteString -> (S.ByteString, S.ByteString)
strictBreak pat
| S.null pat = \str -> (S.empty, str)
| otherwise = breaker
where
searcher = strictSearcher False pat
breaker str = case searcher str of
[] -> (str, S.empty)
(i:_) -> S.splitAt i str
------------------------------------------------------------------------------
-- Splitting Functions --
------------------------------------------------------------------------------
strictSplitKeepFront :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitKeepFront pat
| S.null pat = const (repeat S.empty)
strictSplitKeepFront pat = splitter
where
!patLen = S.length pat
searcher = strictSearcher False pat
splitter str
| S.null str = []
| otherwise =
case searcher str of
[] -> [str]
(i:_)
| i == 0 -> psplitter str
| otherwise -> S.take i str : psplitter (S.drop i str)
psplitter !str
| S.null str = []
| otherwise =
case searcher (S.drop patLen str) of
[] -> [str]
(i:_) -> S.take (i + patLen) str :
psplitter (S.drop (i + patLen) str)
strictSplitKeepEnd :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitKeepEnd pat
| S.null pat = const (repeat S.empty)
strictSplitKeepEnd pat = splitter
where
!patLen = S.length pat
searcher = strictSearcher False pat
splitter str
| S.null str = []
| otherwise =
case searcher str of
[] -> [str]
(i:_) -> S.take (i + patLen) str :
splitter (S.drop (i + patLen) str)
strictSplitDrop :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitDrop pat
| S.null pat = const (repeat S.empty)
strictSplitDrop pat = splitter'
where
!patLen = S.length pat
searcher = strictSearcher False pat
splitter' str
| S.null str = []
| otherwise = splitter str
splitter str
| S.null str = [S.empty]
| otherwise =
case searcher str of
[] -> [str]
(i:_) -> S.take i str : splitter (S.drop (i + patLen) str)
------------------------------------------------------------------------------
-- Replacing Functions --
------------------------------------------------------------------------------
-- replacing loop for strict ByteStrings, called only for
-- non-empty patterns and substitutions
strictRepl :: S.ByteString -> ([S.ByteString] -> [S.ByteString])
-> S.ByteString -> [S.ByteString]
strictRepl pat = repl
where
!patLen = S.length pat
searcher = strictSearcher False pat
repl sub = replacer
where
replacer str
| S.null str = []
| otherwise =
case searcher str of
[] -> [str]
(i:_)
| i == 0 -> sub $ replacer (S.drop patLen str)
| otherwise ->
S.take i str : sub (replacer (S.drop (i + patLen) str))
|