File: Metrics.hs

package info (click to toggle)
haskell-text-metrics 0.3.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 112 kB
  • sloc: haskell: 409; makefile: 7
file content (365 lines) | stat: -rw-r--r-- 12,710 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MultiWayIf #-}

-- |
-- Module      :  Data.Text.Metrics
-- Copyright   :  © 2016–present Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- The module provides efficient implementations of various strings metric
-- algorithms. It works with strict 'Text' values.
--
-- __Note__: before version /0.3.0/ the package used C implementations of
-- the algorithms under the hood. Beginning from version /0.3.0/, the
-- implementations are written in Haskell while staying almost as fast, see:
--
-- <https://markkarpov.com/post/migrating-text-metrics.html>
module Data.Text.Metrics
  ( -- * Levenshtein variants
    levenshtein,
    levenshteinNorm,
    damerauLevenshtein,
    damerauLevenshteinNorm,

    -- * Treating inputs like sets
    overlap,
    jaccard,

    -- * Other
    hamming,
    jaro,
    jaroWinkler,
  )
where

import Control.Monad
import Control.Monad.ST
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import Data.Ratio
import Data.Text
import qualified Data.Text as T
import qualified Data.Text.Internal as T
import qualified Data.Text.Unsafe as TU
import qualified Data.Vector.Unboxed.Mutable as VUM
import GHC.Exts (inline)

----------------------------------------------------------------------------
-- Levenshtein variants

-- | Return the Levenshtein distance between two 'Text' values. The
-- Levenshtein distance between two strings is the minimal number of
-- operations necessary to transform one string into another. For the
-- Levenshtein distance allowed operations are: deletion, insertion, and
-- substitution.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.
levenshtein :: Text -> Text -> Int
levenshtein a b = fst (levenshtein_ a b)

-- | Return the normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = norm levenshtein_

-- | An internal helper, returns the Levenshtein distance as the first
-- element of the tuple and max length of the two inputs as the second
-- element of the tuple.
levenshtein_ :: Text -> Text -> (Int, Int)
levenshtein_ a b
  | T.null a = (lenb, lenm)
  | T.null b = (lena, lenm)
  | otherwise = runST $ do
      let v_len = lenb + 1
      v <- VUM.unsafeNew (v_len * 2)
      let gov !i =
            when (i < v_len) $ do
              VUM.unsafeWrite v i i
              gov (i + 1)
          goi !i !na !v0 !v1 = do
            let !(TU.Iter ai da) = TU.iter a na
                goj !j !nb =
                  when (j < lenb) $ do
                    let !(TU.Iter bj db) = TU.iter b nb
                        cost = if ai == bj then 0 else 1
                    x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
                    y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
                    z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
                    VUM.unsafeWrite v (v1 + j + 1) (min x (min y z))
                    goj (j + 1) (nb + db)
            when (i < lena) $ do
              VUM.unsafeWrite v v1 (i + 1)
              goj 0 0
              goi (i + 1) (na + da) v1 v0
      gov 0
      goi 0 0 0 v_len
      ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len)
      return (ld, lenm)
  where
    lena = T.length a
    lenb = T.length b
    lenm = max lena lenb
{-# INLINE levenshtein_ #-}

-- | Return the Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein a b = fst (damerauLevenshtein_ a b)

-- | Return the normalized Damerau-Levenshtein distance between two 'Text'
-- values. 0 signifies no similarity between the strings, while 1 means
-- exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = norm damerauLevenshtein_

-- | An internal helper, returns the Damerau-Levenshtein distance as the
-- first element of the tuple and max length of the two inputs as the second
-- element of the tuple.
damerauLevenshtein_ :: Text -> Text -> (Int, Int)
damerauLevenshtein_ a b
  | T.null a = (lenb, lenm)
  | T.null b = (lena, lenm)
  | otherwise = runST $ do
      let v_len = lenb + 1
      v <- VUM.unsafeNew (v_len * 3)
      let gov !i =
            when (i < v_len) $ do
              VUM.unsafeWrite v i i
              gov (i + 1)
          goi !i !na !ai_1 !v0 !v1 !v2 = do
            let !(TU.Iter ai da) = TU.iter a na
                goj !j !nb !bj_1 =
                  when (j < lenb) $ do
                    let !(TU.Iter bj db) = TU.iter b nb
                        cost = if ai == bj then 0 else 1
                    x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
                    y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
                    z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
                    let g = min x (min y z)
                    val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1)
                    VUM.unsafeWrite v (v1 + j + 1) $
                      if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g
                        then val
                        else g
                    goj (j + 1) (nb + db) bj
            when (i < lena) $ do
              VUM.unsafeWrite v v1 (i + 1)
              goj 0 0 'a'
              goi (i + 1) (na + da) ai v1 v2 v0
      gov 0
      goi 0 0 'a' 0 v_len (v_len * 2)
      ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len)
      return (ld, lenm)
  where
    lena = T.length a
    lenb = T.length b
    lenm = max lena lenb
{-# INLINE damerauLevenshtein_ #-}

----------------------------------------------------------------------------
-- Treating inputs like sets

-- | Return the overlap coefficient for two 'Text' values. Returned value is
-- in the range from 0 (no similarity) to 1 (exact match). Return 1 if both
-- 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
--
-- @since 0.3.0
overlap :: Text -> Text -> Ratio Int
overlap a b =
  if d == 0
    then 1 % 1
    else intersectionSize (mkTextMap a) (mkTextMap b) % d
  where
    d = min (T.length a) (T.length b)

-- | Return the Jaccard similarity coefficient for two 'Text' values.
-- Returned value is in the range from 0 (no similarity) to 1 (exact match).
-- Return 1 if both
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
--
-- @since 0.3.0
jaccard :: Text -> Text -> Ratio Int
jaccard a b =
  if d == 0
    then 1 % 1
    else intersectionSize ma mb % d
  where
    ma = mkTextMap a
    mb = mkTextMap b
    d = unionSize ma mb

-- | Make a map from 'Char' to 'Int' representing how many times the 'Char'
-- appears in the input 'Text'.
mkTextMap :: Text -> Map Char Int
mkTextMap = T.foldl' f M.empty
  where
    f m ch = M.insertWith (+) ch 1 m
{-# INLINE mkTextMap #-}

-- | Return intersection size between two 'Text'-maps.
intersectionSize :: Map Char Int -> Map Char Int -> Int
intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b)
{-# INLINE intersectionSize #-}

-- | Return union size between two 'Text'-maps.
unionSize :: Map Char Int -> Map Char Int -> Int
unionSize a b = M.foldl' (+) 0 (M.unionWith max a b)
{-# INLINE unionSize #-}

----------------------------------------------------------------------------
-- Other

-- | /O(n)/ Return the Hamming distance between two 'Text' values. Hamming
-- distance is defined as the number of positions at which the corresponding
-- symbols are different. The input 'Text' values should be of equal length
-- or 'Nothing' will be returned.
--
-- See also: <https://en.wikipedia.org/wiki/Hamming_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Maybe'
-- 'Data.Numeric.Natural'@.
hamming :: Text -> Text -> Maybe Int
hamming a@(T.Text _ _ len) b =
  if T.length a == T.length b
    then Just (go 0 0 0)
    else Nothing
  where
    go !na !nb !r =
      let !(TU.Iter cha da) = TU.iter a na
          !(TU.Iter chb db) = TU.iter b nb
       in if
              | na == len -> r
              | cha /= chb -> go (na + da) (nb + db) (r + 1)
              | otherwise -> go (na + da) (nb + db) r

-- | Return the Jaro distance between two 'Text' values. Returned value is
-- in the range from 0 (no similarity) to 1 (exact match).
--
-- While the algorithm is pretty clear for artificial examples (like those
-- from the linked Wikipedia article), for /arbitrary/ strings, it may be
-- hard to decide which of two strings should be considered as one having
-- “reference” order of characters (order of matching characters in an
-- essential part of the definition of the algorithm). This makes us
-- consider the first string the “reference” string (with correct order of
-- characters). Thus generally,
--
-- > jaro a b ≠ jaro b a
--
-- This asymmetry can be found in all implementations of the algorithm on
-- the internet, AFAIK.
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
jaro :: Text -> Text -> Ratio Int
jaro a b =
  if T.null a || T.null b
    then 0 % 1
    else runST $ do
      let lena = T.length a
          lenb = T.length b
          d =
            if lena >= 2 && lenb >= 2
              then max lena lenb `quot` 2 - 1
              else 0
      v <- VUM.replicate lenb (0 :: Int)
      r <- VUM.replicate 3 (0 :: Int) -- tj, m, t
      let goi !i !na !fromb = do
            let !(TU.Iter ai da) = TU.iter a na
                (from, fromb') =
                  if i >= d
                    then (i - d, fromb + TU.iter_ b fromb)
                    else (0, 0)
                to = min (i + d + 1) lenb
                goj !j !nb =
                  when (j < to) $ do
                    let !(TU.Iter bj db) = TU.iter b nb
                    used <- (== 1) <$> VUM.unsafeRead v j
                    if not used && ai == bj
                      then do
                        tj <- VUM.unsafeRead r 0
                        if j < tj
                          then VUM.unsafeModify r (+ 1) 2
                          else VUM.unsafeWrite r 0 j
                        VUM.unsafeWrite v j 1
                        VUM.unsafeModify r (+ 1) 1
                      else goj (j + 1) (nb + db)
            when (i < lena) $ do
              goj from fromb
              goi (i + 1) (na + da) fromb'
      goi 0 0 0
      m <- VUM.unsafeRead r 1
      t <- VUM.unsafeRead r 2
      return $
        if m == 0
          then 0 % 1
          else
            ( (m % lena)
                + (m % lenb)
                + ((m - t) % m)
            )
              / 3

-- | Return the Jaro-Winkler distance between two 'Text' values. Returned
-- value is in range from 0 (no similarity) to 1 (exact match).
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
jaroWinkler :: Text -> Text -> Ratio Int
jaroWinkler a b = dj + (1 % 10) * l * (1 - dj)
  where
    dj = inline (jaro a b)
    l = fromIntegral (min 4 (commonPrefix a b))

-- | Return the length of the common prefix two 'Text' values have.
commonPrefix :: Text -> Text -> Int
commonPrefix a b = case T.commonPrefixes a b of
  Nothing -> 0
  Just (pref, _, _) -> T.length pref
{-# INLINE commonPrefix #-}

----------------------------------------------------------------------------
-- Helpers

norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm f a b =
  let (r, l) = f a b
   in if r == 0
        then 1 % 1
        else 1 % 1 - r % l
{-# INLINE norm #-}