File: levenshtein.cpp

package info (click to toggle)
martchus-cpp-utilities 5.33.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,396 kB
  • sloc: cpp: 12,679; awk: 18; ansic: 12; makefile: 10
file content (134 lines) | stat: -rw-r--r-- 5,739 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include "./levenshtein.h"

#include "./math.h"
#include "./multiarray.h"

#include <limits>

using namespace std;

namespace CppUtilities {

/// \cond

/// \brief The DistanceArray is a 2D array which is allocated either on the stack or the heap.
using DistanceArray = MultiArray<size_t, NoneOwningMultiArray, size_t, size_t>;

/// \brief Initializes the distance array.
///
/// For size1=5 and size2=4 it would look like this:
/// ```
///  9 9 9 9 9 9
///  9 0 1 2 3 4
///  9 1 0 0 0 0
///  9 2 0 0 0 0
///  9 3 0 0 0 0
///  9 4 0 0 0 0
///  9 5 0 0 0 0
/// ```
void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2)
{
    const auto maxDistance(size1 + size2);
    // ignore warning about null pointer dereference for now (which is *likely* not correct)
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnull-dereference"
#endif
    distanceArray.at(0, 0) = maxDistance;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
    for (size_t i = 0; i <= size1; ++i) {
        distanceArray.at(i + 1, 1) = i;
        distanceArray.at(i + 1, 0) = maxDistance;
    }
    for (size_t i = 0; i <= size2; ++i) {
        distanceArray.at(1, i + 1) = i;
        distanceArray.at(0, i + 1) = maxDistance;
    }
}

/// \brief Performs the actual Damerau–Levenshtein algorithm.
/// \sa computeDamerauLevenshteinDistance() for more details on the algorithm.
size_t performDamerauLevenshteinAlgorithm(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    size_t dist1[std::numeric_limits<unsigned char>::max() + 1] = { 0 };
    for (size_t index1 = 1; index1 <= size1; ++index1) {
        size_t dist2 = 0;
        for (size_t index2 = 1; index2 <= size2; ++index2) {
            const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1);
            const size_t transposition1(dist1[static_cast<unsigned char>(str2[index2 - 1])]);
            const size_t transposition2(dist2);
            if (!substitution) {
                dist2 = index2;
            }
            // clang-format off
            distanceArray.at(index1 + 1, index2 + 1) = CppUtilities::min(
                distanceArray.at(index1, index2) + substitution,                                                                     // substitution
                distanceArray.at(index1 + 1, index2) + 1,                                                                            // insertion
                distanceArray.at(index1, index2 + 1) + 1,                                                                            // deletion
                distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition
            );
            // clang-format on
        }
        dist1[static_cast<int>(str1[index1 - 1])] = index1;
    }
    return distanceArray.at(size1 + 1, size2 + 1);
}

/// \brief Allocates the distance array on the heap and performs the Damerau–Levenshtein algorithm.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    std::vector<size_t> buffer(distanceArray.totalSize());
    distanceArray.buffer() = buffer.data();
    initDistanceArray(distanceArray, size1, size2);
    return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
}

/// \brief Allocates the distance array on the stack and performs the Damerau–Levenshtein algorithm.
/// \remarks The totalSize() of \a distanceArray mustn't exceed 128 byte.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnStack(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    size_t buffer[128] = { 0 };
    distanceArray.buffer() = buffer;
    initDistanceArray(distanceArray, size1, size2);
    return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
}

/// \endcond

/*!
 * \brief Computes Damerau–Levenshtein distance with adjacent transpositions.
 *
 * \returns Returns the number of editing steps required to turn \a str1 into \a str2.
 * The following operations are considered as editing steps:
 * - substitution: replace one character with another character
 * - insertion: insert one character at any position
 * - deletion: delete one character at any position
 * - transposition: swap any pair of adjacent characters
 *
 * \remarks
 * - Computing "Optimal string alignment distance" is a *different* thing.
 * - The algorithm operates on byte-level. So characters requiring more than one byte in
 *   the used character encoding (e.g. UTF-8 encoded German umlauts) are counted as multiple
 *   characters (e.g. substitution of those umlauts with non-umlauts requires 2 editing steps).
 * - The memory consumption of this algorithm is considerably. The required memory increases
 *   with the product of \a size1 and \a size2. Pass only short words to this function!
 */
std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // allocate distance array
    auto distanceArray(makeNoneOwningMultiArray<std::size_t>(size1 + 2, size2 + 2));
    if (distanceArray.totalSize() <= 128) {
        return performDamerauLevenshteinAlgorithmAllocatingOnStack(distanceArray, str1, size1, str2, size2);
    } else {
        return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distanceArray, str1, size1, str2, size2);
    }
}

} // namespace CppUtilities