File: BoyerMoore.hs

package info (click to toggle)
haskell-stringsearch 0.3.6.6-13
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 284 kB
  • sloc: haskell: 1,915; makefile: 2
file content (532 lines) | stat: -rw-r--r-- 22,290 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide, prune #-}
-- |
-- Module         : Data.ByteString.Search.Internal.BoyerMoore
-- Copyright      : Daniel Fischer
--                  Chris Kuklewicz
-- Licence        : BSD3
-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability      : Provisional
-- Portability    : non-portable (BangPatterns)
--
-- Fast overlapping Boyer-Moore search of both strict and lazy
-- 'S.ByteString' values. Breaking, splitting and replacing
-- using the Boyer-Moore algorithm.
--
-- Descriptions of the algorithm can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140>
-- and
-- <http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm>
--
-- Original authors: Daniel Fischer (daniel.is.fischer at googlemail.com) and
-- Chris Kuklewicz (haskell at list.mightyreason.com).

module Data.ByteString.Search.Internal.BoyerMoore (
                                           matchLS
                                         , matchSS

                                           --  Non-overlapping
                                         , matchNOS

                                            --  Replacing substrings
                                            -- replacing
                                         , replaceAllS
                                            --  Breaking on substrings
                                            -- breaking
                                         , breakSubstringS
                                         , breakAfterS
                                            --  Splitting on substrings
                                            -- splitting
                                         , splitKeepEndS
                                         , splitKeepFrontS
                                         , splitDropS
                                         ) where


import Data.ByteString.Search.Internal.Utils
                (occurs, suffShifts, strictify)
import Data.ByteString.Search.Substitution

import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Internal as LI
import Data.ByteString.Unsafe (unsafeIndex)

import Data.Array.Base (unsafeAt)

import Data.Word (Word8)

-- overview
--
-- This module exports three search functions for searching in strict
-- ByteStrings. One for searching non-overlapping occurrences of a strict
-- pattern and one each for possibly overlapping occurrences of a lazy
-- resp. strict pattern. The common base name is @match@, the suffix
-- indicates the type of search to perform. These functions
-- return (for a non-empty pattern) a list of all the indices of the target
-- string where an occurrence of the pattern begins, if some occurrences
-- overlap, all starting indices are reported. The list is produced lazily,
-- so not necessarily the entire target string is searched.
--
-- The behaviour of these functions when given an empty pattern has changed.
-- Formerly, the @matchXY@ functions returned an empty list then, now it's
-- @[0 .. 'length' target]@.
--
-- Newly added are functions to replace all (non-overlapping) occurrences
-- of a pattern within a string, functions to break ByteStrings at the first
-- occurrence of a pattern and functions to split ByteStrings at each
-- occurrence of a pattern. None of these functions does copying, so they
-- don't introduce large memory overhead.
--
-- Internally, a lazy pattern is always converted to a strict ByteString,
-- which is necessary for an efficient implementation of the algorithm.
-- The limit this imposes on the length of the pattern is probably
-- irrelevant in practice, but perhaps it should be mentioned.
-- This also means that the @matchL*@ functions are mere convenience wrappers.
-- Except for the initial 'strictify'ing, there's no difference between lazy
-- and strict patterns, they call the same workers. There is, however, a
-- difference between strict and lazy target strings.
-- For the new functions, no such wrappers are provided, you have to
-- 'strictify' lazy patterns yourself.

-- caution
--
-- When working with a lazy target string, the relation between the pattern
-- length and the chunk size can play a big r&#244;le.
-- Crossing chunk boundaries is relatively expensive, so when that becomes
-- a frequent occurrence, as may happen when the pattern length is close
-- to or larger than the chunk size, performance is likely to degrade.
-- If it is needed, steps can be taken to ameliorate that effect, but unless
-- entirely separate functions are introduced, that would hurt the
-- performance for the more common case of patterns much shorter than
-- the default chunk size.

-- performance
--
-- In general, the Boyer-Moore algorithm is the most efficient method to
-- search for a pattern inside a string, so most of the time, you'll want
-- to use the functions of this module, hence this is where the most work
-- has gone. Very short patterns are an exception to this, for those you
-- should consider using a finite automaton
-- ("Data.ByteString.Search.DFA.Array"). That is also often the better
-- choice for searching longer periodic patterns in a lazy ByteString
-- with many matches.
--
-- Operating on a strict target string is mostly faster than on a lazy
-- target string, but the difference is usually small (according to my
-- tests).
--
-- The known exceptions to this rule of thumb are
--
-- [long targets] Then the smaller memory footprint of a lazy target often
-- gives (much) better performance.
--
-- [high number of matches] When there are very many matches, strict target
-- strings are much faster, especially if the pattern is periodic.
--
-- If both conditions hold, either may outweigh the other.

-- complexity
--
-- Preprocessing the pattern is /O/(@patternLength@ + &#963;) in time and
-- space (&#963; is the alphabet size, 256 here) for all functions.
-- The time complexity of the searching phase for @matchXY@
-- is /O/(@targetLength@ \/ @patternLength@) in the best case.
-- For non-periodic patterns, the worst case complexity is
-- /O/(@targetLength@), but for periodic patterns, the worst case complexity
-- is /O/(@targetLength@ * @patternLength@) for the original Boyer-Moore
-- algorithm.
--
-- The searching functions in this module now contain a modification which
-- drastically improves the performance for periodic patterns.
-- I believe that for strict target strings, the worst case is now
-- /O/(@targetLength@) also for periodic patterns and for lazy target strings,
-- my semi-educated guess is
-- /O/(@targetLength@ * (1 + @patternLength@ \/ @chunkSize@)).
-- I may be very wrong, though.
--
-- The other functions don't have to deal with possible overlapping
-- patterns, hence the worst case complexity for the processing phase
-- is /O/(@targetLength@) (respectively /O/(@firstIndex + patternLength@)
-- for the breaking functions if the pattern occurs).

-- currying
--
-- These functions can all be usefully curried. Given only a pattern
-- the curried version will compute the supporting lookup tables only
-- once, allowing for efficient re-use.  Similarly, the curried
-- 'matchLL' and 'matchLS' will compute the concatenated pattern only
-- once.

-- overflow
--
-- The current code uses @Int@ to keep track of the locations in the
-- target string.  If the length of the pattern plus the length of any
-- strict chunk of the target string is greater than
-- @'maxBound' :: 'Int'@ then this will overflow causing an error.  We
-- try to detect this and call 'error' before a segfault occurs.

------------------------------------------------------------------------------
--                                 Wrappers                                 --
------------------------------------------------------------------------------

-- matching
--
-- These functions find the indices of all (possibly overlapping)
-- occurrences of a pattern in a target string.
-- If the pattern is empty, the result is @[0 .. length target]@.
-- If the pattern is much shorter than the target string
-- and the pattern does not occur very near the beginning of the target,
--
-- > not . null $ matchSS pattern target
--
-- is a much more efficient version of 'S.isInfixOf'.

-- | @'matchLS'@ finds the starting indices of all possibly overlapping
--   occurrences of the pattern in the target string.
--   It is a simple wrapper for 'Data.ByteString.Search.indices'.
--   If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE matchLS #-}
matchLS :: L.ByteString     -- ^ Lazy pattern
        -> S.ByteString     -- ^ Strict target string
        -> [Int]            -- ^ Offsets of matches
matchLS pat = search
  where
    search = strictSearcher True (strictify pat)

-- | @'matchSS'@ finds the starting indices of all possibly overlapping
--   occurrences of the pattern in the target string.
--   It is an alias for 'Data.ByteString.Search.indices'.
--   If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE matchSS #-}
matchSS :: S.ByteString     -- ^ Strict pattern
        -> S.ByteString     -- ^ Strict target string
        -> [Int]            -- ^ Offsets of matches
matchSS pat = search
  where
    search = strictSearcher True pat

-- | @'matchNOS'@ finds the indices of all non-overlapping occurrences
--   of the pattern in the Strict target string.
{-# INLINE matchNOS #-}
matchNOS :: S.ByteString    -- ^ Strict pattern
         -> S.ByteString    -- ^ Strict target string
         -> [Int]           -- ^ Offsets of matches
matchNOS pat = search
  where
    search = strictSearcher False pat

-- replacing
--
--   These functions replace all (non-overlapping) occurrences of a pattern
--   in the target string. If some occurrences overlap, the earliest is
--   replaced and replacing continues at the index after the replaced
--   occurrence, for example
--
-- > replaceAllL \"ana\" \"olog\" \"banana\" == \"bologna\",
-- > replaceAllS \"abacab\" \"u\" \"abacabacabacab\" == \"uacu\",
-- > replaceAllS \"aa\" \"aaa\" \"aaaa\" == \"aaaaaa\".
--
--   Equality of pattern and substitution is not checked, but
--
-- > pat == sub => 'strictify' (replaceAllS pat sub str) == str,
-- > pat == sub => replaceAllL pat sub str == str.
--
--   The result is a lazily generated lazy ByteString, the first chunks will
--   generally be available before the entire target has been scanned.
--   If the pattern is empty, but not the substitution, the result is
--   equivalent to @'cycle' sub@.

{-# INLINE replaceAllS #-}
replaceAllS :: Substitution rep
            => S.ByteString  -- ^ Pattern to replace
            -> rep           -- ^ Substitution string
            -> S.ByteString  -- ^ Target string
            -> L.ByteString  -- ^ Lazy result
replaceAllS pat
    | S.null pat = \sub -> prependCycle sub . flip LI.chunk LI.Empty
    | otherwise =
      let repl = strictRepl pat
      in \sub -> L.fromChunks . repl (substitution sub)

-- breaking
--
-- Break a string on a pattern. The first component of the result
-- contains the prefix of the string before the first occurrence of the
-- pattern, the second component contains the remainder.
-- The following relations hold:
--
-- > breakSubstringX \"\" str = (\"\", str)
-- > not (pat `isInfixOf` str) == null (snd $ breakSunbstringX pat str)
-- > True == case breakSubstringX pat str of
-- >          (x, y) -> not (pat `isInfixOf` x)
-- >                       && (null y || pat `isPrefixOf` y)

-- | This function has the same semantics as 'S.breakSubstring'
--   but is generally much faster.
{-# INLINE breakSubstringS #-}
breakSubstringS :: S.ByteString  -- ^ Pattern to break on
                -> S.ByteString  -- ^ String to break up
                -> (S.ByteString, S.ByteString)
                    -- ^ Prefix and remainder of broken string
breakSubstringS = strictBreak

breakAfterS :: S.ByteString
            -> S.ByteString
            -> (S.ByteString, S.ByteString)
breakAfterS pat
  | S.null pat = \str -> (S.empty, str)
breakAfterS pat = breaker
  where
    !patLen  = S.length pat
    searcher = strictSearcher False pat
    breaker str = case searcher str of
                    []    -> (str, S.empty)
                    (i:_) -> S.splitAt (i + patLen) str

-- splitting
--
-- These functions implement various splitting strategies.
--
-- If the pattern to split on is empty, all functions return an
-- infinite list of empty ByteStrings.
-- Otherwise, the names are rather self-explanatory.
--
-- For nonempty patterns, the following relations hold:
--
-- > concat (splitKeepXY pat str) == str
-- > concat ('Data.List.intersperse' pat (splitDropX pat str)) == str.
--
-- All fragments except possibly the last in the result of
-- @splitKeepEndX pat@ end with @pat@, none of the fragments contains
-- more than one occurrence of @pat@ or is empty.
--
-- All fragments except possibly the first in the result of
-- @splitKeepFrontX pat@ begin with @pat@, none of the fragments
-- contains more than one occurrence of @patq or is empty.
--
-- > splitDropX pat str == map dropPat (splitKeepFrontX pat str)
-- >   where
-- >     patLen = length pat
-- >     dropPat frag
-- >        | pat `isPrefixOf` frag = drop patLen frag
-- >        | otherwise             = frag
--
-- but @splitDropX@ is a little more efficient than that.


{-# INLINE splitKeepEndS #-}
splitKeepEndS :: S.ByteString    -- ^ Pattern to split on
              -> S.ByteString    -- ^ String to split
              -> [S.ByteString]  -- ^ List of fragments
splitKeepEndS = strictSplitKeepEnd

{-# INLINE splitKeepFrontS #-}
splitKeepFrontS :: S.ByteString    -- ^ Pattern to split on
                -> S.ByteString    -- ^ String to split
                -> [S.ByteString]  -- ^ List of fragments
splitKeepFrontS = strictSplitKeepFront

{-# INLINE splitDropS #-}
splitDropS :: S.ByteString    -- ^ Pattern to split on
           -> S.ByteString    -- ^ String to split
           -> [S.ByteString]  -- ^ List of fragments
splitDropS = strictSplitDrop

------------------------------------------------------------------------------
--                             Search Functions                             --
------------------------------------------------------------------------------

strictSearcher :: Bool -> S.ByteString -> S.ByteString -> [Int]
strictSearcher _ !pat
    | S.null pat = enumFromTo 0 . S.length
    | S.length pat == 1 = let !w = S.head pat in S.elemIndices w
strictSearcher !overlap pat = searcher
  where
    {-# INLINE patAt #-}
    patAt :: Int -> Word8
    patAt !i = unsafeIndex pat i

    !patLen = S.length pat
    !patEnd = patLen - 1
    !maxLen = maxBound - patLen
    !occT   = occurs pat        -- for bad-character-shift
    !suffT  = suffShifts pat    -- for good-suffix-shift
    !skip   = if overlap then unsafeAt suffT 0 else patLen
    -- shift after a complete match
    !kept   = patLen - skip     -- length of known prefix after full match
    !pe     = patAt patEnd      -- last pattern byte for fast comparison

    {-# INLINE occ #-}
    occ !w = unsafeAt occT (fromIntegral w)

    {-# INLINE suff #-}
    suff !i = unsafeAt suffT i

    searcher str
        | maxLen < strLen
            = error "Overflow in BoyerMoore.strictSearcher"
        | maxDiff < 0   = []
        | otherwise     = checkEnd patEnd
          where
            !strLen  = S.length str
            !strEnd  = strLen - 1
            !maxDiff = strLen - patLen

            {-# INLINE strAt #-}
            strAt !i = unsafeIndex str i

            -- After a full match, we know how long a prefix of the pattern
            -- still matches. Do not re-compare the prefix to prevent O(m*n)
            -- behaviour for periodic patterns.
            afterMatch !diff !patI =
              case strAt (diff + patI) of
                !c  | c == patAt patI ->
                      if patI == kept
                        then diff : let !diff' = diff + skip
                                    in if maxDiff < diff'
                                        then []
                                        else afterMatch diff' patEnd
                        else afterMatch diff (patI - 1)
                    | patI == patEnd  ->
                            checkEnd (diff + 2*patEnd + occ c)
                    | otherwise       ->
                            let {-# INLINE badShift #-}
                                badShift = patI + occ c
                                {-# INLINE goodShift #-}
                                goodShift = suff patI
                                !diff' = diff + max badShift goodShift
                            in if maxDiff < diff'
                                then []
                                else checkEnd (diff + patEnd)

            -- While comparing the last byte of the pattern, the bad-
            -- character-shift is always at least as large as the good-
            -- suffix-shift. Eliminating the unnecessary memory reads and
            -- comparison speeds things up noticeably.
            checkEnd !sI  -- index in string to compare to last of pattern
                | strEnd < sI   = []
                | otherwise     =
                  case strAt sI of
                    !c  | c == pe   -> findMatch (sI - patEnd) (patEnd - 1)
                        | otherwise -> checkEnd (sI + patEnd + occ c)

            -- Once the last byte has matched, we enter the full matcher
            -- diff is the offset of the window, patI the index of the
            -- pattern byte to compare next.
            findMatch !diff !patI =
                case strAt (diff + patI) of
                    !c  | c == patAt patI ->
                            if patI == 0    -- full match, report
                                then diff : let !diff' = diff + skip
                                            in if maxDiff < diff'
                                                then []
                                                else
                                                  if skip == patLen
                                                    then
                                                      checkEnd (diff' + patEnd)
                                                    else
                                                      afterMatch diff' patEnd
                                else findMatch diff (patI - 1)
                        | otherwise       ->
                            let !diff' = diff + max (patI + occ c) (suff patI)
                            in if maxDiff < diff'
                                then []
                                else checkEnd (diff' + patEnd)

------------------------------------------------------------------------------
--                            Breaking Functions                            --
------------------------------------------------------------------------------

strictBreak :: S.ByteString -> S.ByteString -> (S.ByteString, S.ByteString)
strictBreak pat
    | S.null pat    = \str -> (S.empty, str)
    | otherwise     = breaker
      where
        searcher = strictSearcher False pat
        breaker str = case searcher str of
                        []      -> (str, S.empty)
                        (i:_)   -> S.splitAt i str

------------------------------------------------------------------------------
--                            Splitting Functions                           --
------------------------------------------------------------------------------

strictSplitKeepFront :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitKeepFront pat
  | S.null pat  = const (repeat S.empty)
strictSplitKeepFront pat = splitter
  where
    !patLen = S.length pat
    searcher = strictSearcher False pat
    splitter str
        | S.null str    = []
        | otherwise     =
          case searcher str of
            []            -> [str]
            (i:_)
              | i == 0    -> psplitter str
              | otherwise -> S.take i str : psplitter (S.drop i str)
    psplitter !str
        | S.null str    = []
        | otherwise     =
          case searcher (S.drop patLen str) of
            []      -> [str]
            (i:_)   -> S.take (i + patLen) str :
                        psplitter (S.drop (i + patLen) str)

strictSplitKeepEnd :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitKeepEnd pat
  | S.null pat  = const (repeat S.empty)
strictSplitKeepEnd pat = splitter
  where
    !patLen = S.length pat
    searcher = strictSearcher False pat
    splitter str
        | S.null str    = []
        | otherwise     =
          case searcher str of
            [] -> [str]
            (i:_) -> S.take (i + patLen) str :
                        splitter (S.drop (i + patLen) str)

strictSplitDrop :: S.ByteString -> S.ByteString -> [S.ByteString]
strictSplitDrop pat
    | S.null pat    = const (repeat S.empty)
strictSplitDrop pat = splitter'
  where
    !patLen = S.length pat
    searcher = strictSearcher False pat
    splitter' str
        | S.null str    = []
        | otherwise     = splitter str
    splitter str
        | S.null str    = [S.empty]
        | otherwise     =
          case searcher str of
            []            -> [str]
            (i:_) -> S.take i str : splitter (S.drop (i + patLen) str)

------------------------------------------------------------------------------
--                            Replacing Functions                           --
------------------------------------------------------------------------------

-- replacing loop for strict ByteStrings, called only for
-- non-empty patterns and substitutions
strictRepl :: S.ByteString -> ([S.ByteString] -> [S.ByteString])
            -> S.ByteString -> [S.ByteString]
strictRepl pat  = repl
  where
    !patLen = S.length pat
    searcher = strictSearcher False pat
    repl sub = replacer
      where
        replacer str
          | S.null str    = []
          | otherwise     =
            case searcher str of
              []            -> [str]
              (i:_)
                | i == 0    -> sub $ replacer (S.drop patLen str)
                | otherwise ->
                  S.take i str : sub (replacer (S.drop (i + patLen) str))