File: levenshtein_distance.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (102 lines) | stat: -rw-r--r-- 4,319 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/levenshtein_distance.h"

#include <stddef.h>

#include <algorithm>
#include <numeric>
#include <optional>
#include <string_view>
#include <vector>

namespace base {

namespace {

// Computes the Levenshtein distance between `a` and `b`: the minimum number of
// single-character insert, remove and replace operations needed to convert one
// string into the other.
//
// If `max_distance` holds a value and the true distance is larger than it, the
// function returns `*max_distance + 1` instead of the exact distance. Without a
// value the exact distance is always returned.
//
// The table used by the algorithm has `2 * k + 1` entries, where `k` is the
// effective limit (never larger than the longer input), and the main loop
// performs `a.size() * (2 * k + 1)` steps after `a` has been made the shorter
// string.
template <typename CharT>
size_t LevenshteinDistanceImpl(std::basic_string_view<CharT> a,
                               std::basic_string_view<CharT> b,
                               std::optional<size_t> max_distance) {
  // The distance is symmetric, so make `a` the shorter string.
  if (a.size() > b.size()) {
    a.swap(b);
  }

  // max(a.size(), b.size()) == b.size() steps always suffice, so a limit above
  // b.size() can never be exceeded and is equivalent to b.size(). Clamping the
  // limit keeps `k + 1` and `2 * k + 1` below from wrapping around for very
  // large limits (e.g. SIZE_MAX) and keeps `dp` sized by the input instead of
  // by the caller-provided limit.
  const size_t k = std::min(max_distance.value_or(b.size()), b.size());
  // If the string's lengths differ by more than `k`, so does their
  // Levenshtein distance. Written as a subtraction (safe because
  // a.size() <= b.size()) so that the comparison cannot overflow.
  if (b.size() - a.size() > k) {
    return k + 1;
  }
  // The classical Levenshtein distance DP defines dp[i][j] as the minimum
  // number of insert, remove and replace operation to convert a[:i] to b[:j].
  // To make this more efficient, one can define dp[i][d] as the distance of
  // a[:i] and b[:i + d]. Intuitively, d represents the delta between j and i in
  // the former dp. Since the Levenshtein distance is restricted by `k`, abs(d)
  // can be bounded by `k`. Since dp[i][d] only depends on values from dp[i-1],
  // it is not necessary to store the entire 2D table. Instead, this code just
  // stores the d-dimension, which represents "the distance with the current
  // prefix of the string, for a given delta d". Since d is between `-k` and
  // `k`, the implementation shifts the d-index by `k`, bringing it in range
  // [0, `2*k`].

  // The algorithm only cares if the Levenshtein distance is at most `k`. Thus,
  // any unreachable states and states in which the distance is certainly larger
  // than `k` can be set to any value larger than `k`, without affecting the
  // result.
  const size_t kInfinity = k + 1;
  const size_t max_d = 2 * k;
  std::vector<size_t> dp(max_d + 1, kInfinity);
  // Initially, `dp[d]` represents the Levenshtein distance of the empty prefix
  // of `a` and the first j = d - k characters of `b`. Their distance is j,
  // since j single-character edits (insertions into the empty prefix) are
  // required. States with d < k are not reachable, since that corresponds to a
  // negative index into `b`.
  // The offset is cast to the iterator's own difference type (not `long`,
  // which may be narrower than size_t), and the sequence is generated in
  // size_t rather than int.
  std::iota(
      dp.begin() + static_cast<std::vector<size_t>::difference_type>(k),
      dp.end(), size_t{0});
  for (size_t i = 0; i < a.size(); i++) {
    // Right now, `dp` represents the Levenshtein distance when considering the
    // first `i` characters (up to index `i-1`) of `a`. After the next loop,
    // `dp` will represent the Levenshtein distance when considering the first
    // `i+1` characters.
    for (size_t d = 0; d <= max_d; d++) {
      if (i + d < k || i + d >= b.size() + k) {
        // `j = i + d - k` is out of range of `b`. Since j == -1 corresponds to
        // the empty prefix of `b`, the distance is i + 1 in this case.
        dp[d] = i + d + 1 == k ? i + 1 : kInfinity;
        continue;
      }
      const size_t j = i + d - k;
      // If `a[i] == b[j]` the Levenshtein distance for `d` remained the same.
      if (a[i] != b[j]) {
        // (i, j) -> (i-1, j-1), `d` stays the same.
        const size_t replace = dp[d];
        // (i, j) -> (i-1, j), `d` increases by 1.
        // If the distance between `i` and `j` becomes larger than `k`, their
        // distance is at least `k + 1`. Same in the `insert` case.
        const size_t remove = d != max_d ? dp[d + 1] : kInfinity;
        // (i, j) -> (i, j-1), `d` decreases by 1. Since `i` stays the same,
        // this is intentionally using the dp value updated in the previous
        // iteration.
        const size_t insert = d != 0 ? dp[d - 1] : kInfinity;
        dp[d] = 1 + std::min({replace, remove, insert});
      }
    }
  }
  // The answer lives at delta d = b.size() - a.size(), shifted by `k`. The
  // early return above guarantees that this delta is at most `k`, so the index
  // is within [k, 2*k]. Anything above `k` is reported as `k + 1`.
  return std::min(dp[k + (b.size() - a.size())], kInfinity);
}

}  // namespace

// Levenshtein distance between two `char` strings. Forwards `a`, `b` and
// `max_distance` unchanged to the templated implementation in the anonymous
// namespace and returns its result.
size_t LevenshteinDistance(std::string_view a,
                           std::string_view b,
                           std::optional<size_t> max_distance) {
  const size_t distance = LevenshteinDistanceImpl(a, b, max_distance);
  return distance;
}
// Levenshtein distance between two `char16_t` strings. Forwards `a`, `b` and
// `max_distance` unchanged to the templated implementation in the anonymous
// namespace and returns its result.
size_t LevenshteinDistance(std::u16string_view a,
                           std::u16string_view b,
                           std::optional<size_t> max_distance) {
  const size_t distance = LevenshteinDistanceImpl(a, b, max_distance);
  return distance;
}

}  // namespace base