File: damlev.h

package info (click to toggle)
recoll 1.43.13-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,956 kB
  • sloc: cpp: 104,864; python: 9,923; xml: 7,324; ansic: 6,447; sh: 1,252; perl: 166; makefile: 73
file content (97 lines) | stat: -rw-r--r-- 3,124 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* Copyright (C) 2022 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

// Damerau-Levenshtein distance between two strings.
// Implements the algorithm from:
// https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance
//    "Distance with adjacent transpositions"
//
// The function is usable with regular std::string or any class implementing operator[] and size(),
// such as some kind of int array to be used after a conversion to utf-32 (the algorithm will NOT
// work with UTF-8 because of the variable length multi-char8 characters).

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

namespace MedocUtils {

// Two-dimensional int array with configurable lower bounds.
//
// The cell (x, y) is valid for xs <= x < xs + w and ys <= y < ys + h, where w, h, xs and ys are
// the constructor arguments. Accesses are not bounds-checked.
//
// Storage is a std::vector, so the object is safely copyable/movable (deep copy) and all cells
// start out as 0.
class Mat2 {
public:
    // w, h: width and height (number of distinct x and y values). Non-positive values yield an
    //       empty matrix.
    // xs, ys: lowest valid x and y indices (may be negative).
    Mat2(int w, int h, int xs = 0, int ys = 0)
        : m_w(w), m_xs(xs), m_ys(ys),
          m_ds((w > 0 && h > 0)
               ? static_cast<std::size_t>(w) * static_cast<std::size_t>(h) : 0) {
    }

    // Access the cell at column x, row y (both expressed relative to the configured lower bounds).
    int& operator()(int x, int y) {
        // Compute the flat offset in size_t: a plain int product can overflow for big matrices.
        return m_ds[static_cast<std::size_t>(y - m_ys) * static_cast<std::size_t>(m_w)
                    + static_cast<std::size_t>(x - m_xs)];
    }

private:
    int m_w, m_xs, m_ys;
    // Row-major cell storage: one row of m_w ints for each y value.
    std::vector<int> m_ds;
};

// Compute the Damerau-Levenshtein distance ("distance with adjacent transpositions") between
// str1 and str2.
// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Algorithm
//
// T must implement size() and operator[], with elements which can be compared with == and
// converted to int (e.g. std::string, or some utf-32 container).
// Returns the number of insertions, deletions, substitutions and adjacent transpositions
// computed by the algorithm.
template<class T> int DLDistance(const T& str1, const T& str2)
{
    // This replaces an array of the size of the alphabet, initialized to 0: for each element
    // value, the (1-based) index of the last row of str1 where it was seen. Absent means 0.
    std::map<int, int> da;
    const int size1 = static_cast<int>(str1.size());
    const int size2 = static_cast<int>(str2.size());

    // The matrix has an extra row and column at index -1, holding a sentinel maximum distance.
    Mat2 d(size1 + 2, size2 + 2, -1, -1);
    const int maxdist = size1 + size2;
    d(-1, -1) = maxdist;
    for (int x = 0; x <= size1; x++) {
        d(x, -1) = maxdist;
        d(x, 0) = x;
    }
    for (int y = 0; y <= size2; y++) {
        d(-1, y) = maxdist;
        d(0, y) = y;
    }
    // The strings in the algo are 1-indexed, so we adjust accesses (e.g. a[x-1])
    for (int x = 1; x <= size1; x++) {
        // Last column in the current row where the two strings had equal elements.
        int db = 0;
        for (int y = 1; y <= size2; y++) {
            // Single map lookup: reuse the iterator instead of searching again with operator[].
            const auto it = da.find(str2[y-1]);
            const int k = (it == da.end()) ? 0 : it->second;
            const int l = db;
            int cost = 1;
            if (str1[x-1] == str2[y-1]) {
                cost = 0;
                db = y;
            }
            d(x, y) = std::min({
                    d(x-1, y-1) + cost,                      // substitution (or match)
                    d(x, y-1) + 1,                           // insertion
                    d(x-1, y) + 1,                           // deletion
                    d(k-1, l-1) + (x-k-1) + 1 + (y-l-1)});   // transposition
        }
        da[str1[x-1]] = x;
    }
    return d(size1, size2);
}

} // namespace MedocUtils