File: KarpRabin.hs

package info (click to toggle)
haskell-stringsearch 0.3.6.6-13
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 284 kB
  • sloc: haskell: 1,915; makefile: 2
file content (251 lines) | stat: -rw-r--r-- 10,401 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
{-# LANGUAGE BangPatterns #-}
-- |
-- Module         : Data.ByteString.Lazy.Search.KarpRabin
-- Copyright      : (c) 2010 Daniel Fischer
-- Licence        : BSD3
-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability      : Provisional
-- Portability    : non-portable (BangPatterns)
--
-- Simultaneous search for multiple patterns in a lazy 'L.ByteString'
-- using the Karp-Rabin algorithm.
--
-- A description of the algorithm for a single pattern can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>.
module Data.ByteString.Lazy.Search.KarpRabin ( -- * Overview
                                               -- $overview

                                               -- ** Caution
                                               -- $caution

                                               -- * Function
                                               indicesOfAny
                                             ) where

import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)

import qualified Data.IntMap as IM

import Data.Array
import Data.Array.Base (unsafeAt)

import Data.Word (Word8)
import Data.Int (Int64)
import Data.Bits
import Data.List (foldl')

-- $overview
--
-- The Karp-Rabin algorithm works by calculating a hash of the pattern and
-- comparing that hash with the hash of a slice of the target string with
-- the same length as the pattern. If the hashes are equal, the slice of the
-- target is compared to the pattern character by character (since the hash
-- function generally isn't injective).
--
-- For a single pattern, this tends to be more efficient than the na&#239;ve
-- algorithm, but it cannot compete with algorithms like
-- Knuth-Morris-Pratt or Boyer-Moore.
--
-- However, the algorithm can be generalised to search for multiple patterns
-- simultaneously. If the shortest pattern has length @k@, hash the prefix of
-- length @k@ of all patterns and compare the hash of the target's slices of
-- length @k@ to them. If there's a match, check whether the slice is part
-- of an occurrence of the corresponding pattern.
--
-- With a hash-function that
--
--   * allows to compute the hash of one slice in constant time from the hash
--     of the previous slice, the new and the dropped character, and
--
--   * produces few spurious matches,
--
-- searching for occurrences of any of @n@ patterns has a best-case complexity
-- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is
-- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is
-- not much worse than the best case.
--
-- The functions in this module store the hashes of the patterns in an
-- 'IM.IntMap', so the lookup is /O/(@log n@). Re-hashing is done in constant
-- time and spurious matches of the hashes /should be/ sufficiently rare.
-- The maximal length of the prefixes to be hashed is 32.

-- $caution
--
-- Unfortunately, the constant factors are high, so these functions are slow.
-- Unless the number of patterns to search for is high (larger than 50 at
-- least), repeated search for single patterns using Boyer-Moore or DFA and
-- manual merging of the indices is faster. /Much/ faster for less than 40
-- or so patterns.
--
-- 'indicesOfAny' has the advantage over multiple single-pattern searches that
-- it doesn't hold on to large parts of the string (which is likely to happen
-- for multiple searches), however, so in contrast to the strict version, it
-- may be useful for relatively few patterns already.
--
-- Nevertheless, this module seems more of an interesting curiosity than
-- anything else.

-- | @'indicesOfAny'@ finds all occurrences of any of several non-empty strict
--   patterns in a lazy target string. If no non-empty patterns are given,
--   the result is an empty list. Otherwise the result list contains
--   the pairs of all indices where any of the (non-empty) patterns start
--   and the list of all patterns starting at that index, the patterns being
--   represented by their (zero-based) position in the pattern list.
--   Empty patterns are filtered out before processing begins.
{-# INLINE indicesOfAny #-}
indicesOfAny :: [S.ByteString]  -- ^ List of non-empty patterns
             -> L.ByteString    -- ^ String to search
             -> [(Int64,[Int])]   -- ^ List of matches
indicesOfAny pats
    | null nepats   = const []
    | otherwise     = lazyMatcher nepats . L.toChunks
      where
        nepats = filter (not . S.null) pats

------------------------------------------------------------------------------
--                                 Workers                                 --
------------------------------------------------------------------------------

{-# INLINE rehash1 #-}
rehash1 :: Int -> Int -> Word8 -> Word8 -> Int
rehash1 out h o n =
    (h `shiftL` 1 - (fromIntegral o `shiftL` out)) + fromIntegral n

{-# INLINE rehash2 #-}
rehash2 :: Int -> Int -> Word8 -> Word8 -> Int
rehash2 out h o n =
    (h `shiftL` 2 - (fromIntegral o `shiftL` out)) + fromIntegral n

{-# INLINE rehash3 #-}
rehash3 :: Int -> Int -> Word8 -> Word8 -> Int
rehash3 out h o n =
    (h `shiftL` 3 - (fromIntegral o `shiftL` out)) + fromIntegral n

{-# INLINE rehash4 #-}
rehash4 :: Int -> Int -> Word8 -> Word8 -> Int
rehash4 out h o n =
    (h `shiftL` 4 - (fromIntegral o `shiftL` out)) + fromIntegral n

lazyMatcher :: [S.ByteString] -> [S.ByteString] -> [(Int64,[Int])]
lazyMatcher pats = search 0 hLen S.empty
  where
    !hLen = minimum (32 : map S.length pats)
    !shDi = case 32 `quot` hLen of
              q | q < 4 -> q
                | otherwise -> 4
    !outS = shDi*hLen
    !patNum = length pats
    !patArr = listArray (0, patNum - 1) pats
    {-# INLINE rehash #-}
    rehash :: Int -> Word8 -> Word8 -> Int
    rehash = case shDi of
                1 -> rehash1 hLen
                2 -> rehash2 outS
                3 -> rehash3 outS
                _ -> rehash4 outS
    hash :: S.ByteString -> Int
    hash = S.foldl' (\h w -> (h `shiftL` shDi) + fromIntegral w) 0 . S.take hLen
    !hashMap =
        foldl' (\mp (h,i) -> IM.insertWith (flip (++)) h [i] mp) IM.empty $
                                zip (map hash pats) [0 :: Int .. ]
    search _ _ _ [] = []
    search !h !rm !prev (!str : rest)
      | strLen < rm =
          let !h' = S.foldl' (\o w -> (o `shiftL` 1) + fromIntegral w) h str
              !prev' = S.append prev str
          in search h' (rm - strLen) prev' rest
      | otherwise =
          let !h' = S.foldl' (\o w -> (o `shiftL` 1) + fromIntegral w) h
                                                (S.take rm str)
          in if S.null prev
                then noPast 0 rest str h'
                else past 0 rest prev 0 str rm h'
        where
          !strLen = S.length str

    noPast !prior rest !str hsh = go hsh 0
      where
        !strLen = S.length str
        !maxIdx = strLen - hLen
        {-# INLINE strAt #-}
        strAt !i = unsafeIndex str i
        go !h sI =
          case IM.lookup h hashMap of
            Nothing ->
              if sI == maxIdx
                then case rest of
                        [] -> []
                        (nxt : more) ->
                          let !h' = rehash h (strAt sI) (unsafeIndex nxt 0)
                              !prior' = prior + fromIntegral strLen
                              !prev = S.drop (sI + 1) str
                          in if hLen == 1
                                then noPast prior' more nxt h'
                                else past prior' more prev 0 nxt 1 h'
                else go (rehash h (strAt sI) (strAt (sI + hLen))) (sI + 1)
            Just ps ->
              let !rst = S.drop sI str
                  !rLen = strLen - sI
                  {-# INLINE hd #-}
                  hd = strAt sI
                  {-# INLINE more #-}
                  more =
                    if sI == maxIdx
                      then case rest of
                            [] -> []
                            (nxt : fut) ->
                              let !h' = rehash h hd (unsafeIndex nxt 0)
                                  !prior' = prior + fromIntegral strLen
                              in if hLen == 1
                                    then noPast prior' fut nxt h'
                                    else past prior' fut rst 1 nxt 1 h'
                      else go (rehash h hd (strAt (sI + hLen))) (sI + 1)
                  okay bs
                    | rLen < S.length bs = S.isPrefixOf rst bs &&
                            checkFut (S.drop rLen bs) rest
                    | otherwise = S.isPrefixOf bs rst
              in case filter (okay . (patArr `unsafeAt`)) ps of
                    [] -> more
                    qs -> seq (length qs) $
                            (prior + fromIntegral sI,qs) : more

    past !prior rest !prev !pI !str !sI !hsh
      | strLen < 4040 =
        let !prior' = prior - 1 + fromIntegral (sI - hLen)
            !curr   = S.append (S.drop pI prev) str
        in noPast prior' rest curr hsh
      | otherwise = go hsh pI sI
        where
          !strLen = S.length str
          {-# INLINE strAt #-}
          strAt !i = unsafeIndex str i
          {-# INLINE prevAt #-}
          prevAt !i = unsafeIndex prev i
          go !h !p !s
            | s == hLen = noPast prior rest str h
            | otherwise =
              case IM.lookup h hashMap of
                Nothing ->
                  let {-# INLINE h' #-}
                      h' = rehash h (prevAt p) (strAt s)
                  in go h' (p + 1) (s + 1)
                Just ps ->
                  let !prst = S.drop p prev
                      {-# INLINE more #-}
                      more = go (rehash h (prevAt p) (strAt s)) (p + 1) (s + 1)
                      okay bs = checkFut bs (prst : str : rest)
                  in case filter (okay . (unsafeAt patArr)) ps of
                        [] -> more
                        qs -> seq (length qs) $
                                (prior + fromIntegral (s - hLen), qs) : more

{-# INLINE checkFut #-}
checkFut :: S.ByteString -> [S.ByteString] -> Bool
checkFut _ [] = False
checkFut !bs (!h : t)
    | hLen < S.length bs = S.isPrefixOf h bs && checkFut (S.drop hLen bs) t
    | otherwise = S.isPrefixOf bs h
      where
        !hLen = S.length h