File: editdistance.m

package info (click to toggle)
octave-strings 1.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 336 kB
  • sloc: cpp: 143; makefile: 136; sh: 3
file content (247 lines) | stat: -rw-r--r-- 7,738 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
## Copyright (C) 2006 Muthiah Annamalai <muthiah.annamalai@uta.edu>
## Copyright (C) 2014 Max Görner <max.goerner@tu-dresden.de>
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2})
## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights})
## @deftypefnx {Function File} {[@var{dist}, @var{L}] =} editdistance (@var{str1}, @var{str2}, @var{weights}, @var{modus})
## Compute the Levenshtein edit distance between the two strings.
##
## The optional argument @var{weights} specifies weights for the deletion,
## matched, and insertion operations; by default it is set to +1, 0, +1
## respectively, so that a least editdistance means a closer match between the
## two strings. This function implements the Levenshtein edit distance as
## presented in Wikipedia article, accessed Nov 2006. Also the levenshtein edit
## distance of a string with the empty string is defined to be its length.
##
## For the special case that there are no weights given and the array L is not
## requested, an algorithm of Berghel and Roach, which improves an algorithm 
## introduced by Ukkonen in 1985, will be applied. This algorithm is
## significantly faster most of the times. Its main strength lies in cases with
## small edit distances, where huge speedups and memory savings are suspectible.
## The time (and space) complexity is O(((dist^2 - (n - m)^2)/2) + dist), where
## n and m denote the length of both strings.
## 
## The optional argument @var{modus} specifies the algorithm to be used. For
## @var{modus} = 0, Berghel and Roach's algorithm will be used whenever
## possible. For @var{modus} = 1, the classic algorithm by Fisher and Wagner
## will be used. If @var{L} is omitted, and @var{modus} = 1, a variant of Fisher
## and Wagner's algorithm using only a linear amount of memory with respect to
## the input length, but O(m*n) runtime, will be used. Again, n and m denote the
## length of both strings.
##
## The default return value @var{dist} is the edit distance, and
## the other return value @var{L} is the distance matrix.
##
## @example
## @group
## editdistance ('marry', 'marie') 
##   @result{}  2
## @end group
## @end example
##
## @end deftypefn

function [dist, L] = editdistance (str1, str2, weights = [1 0 1], modus = 0)
  #Checking correct call of the function.
  if (nargin < 2 || nargin > 4)
    print_usage ();
  endif
  if (nargin >= 3 && numel (weights) != 3)
    error ("editdistance: WEIGTHS must be a 3 element vector.")
  endif 
  if (!isvector(str1) || !isvector(str2))
    error ("editdistance: Both strings must be a vector.")
  endif

  #Using the approach of Berghel and Roach, if possible.
  if (modus == 0 &&
      nargout < 2 && 
      (weights(1) == weights(3) && weights(2) == 0)
     )
    dist = weights(1)*editdistance_berghel_roach (str1,str2);
    return;
  endif

  saveMemory = nargout < 2;

  L1 = numel (str1) + 1;
  L2 = numel (str2) + 1;

  #Checking whether str1 or str2 are empty strings. If so, the determination of the minimal edit distance is trivial.
  if (L1 == 1)
    dist = L2-1;
    return;
  elseif (L2 == 1)
    dist = L1-1;
    return;
  endif

  if (saveMemory)
    L = zeros (2,L2);
  else
    L  = zeros (L1,L2);
  endif
  
  %Setting weights
  g = weights(1); %insertion
  m = weights(2); %match
  d = weights(3); %deletion

  if (not (saveMemory))
    L(:,1) = [0:L1-1]'*g;
  endif
  L(1,:) = [0:L2-1]*g;

  for idx = 2:L1;
    if (saveMemory)
      L(2, 1) = idx-1;
    endif
    for idy = 2:L2
      if (str1(idx-1) == str2(idy-1))
        score = m;
      else
        score = d;
      endif
      if (saveMemory)
        x = 2;
      else
        x = idx;
      endif
      m1 = L(x-1,idy-1) + score;
      m2 = L(x-1,idy) + g;
      m3 = L(x,idy-1) + g;
      L(x,idy) = min ([m1, m2, m3]);
    endfor
    if (saveMemory)
      L(1, :) = L(2, :);
    endif
  endfor

  if (saveMemory)
    x = 2;
  else
    x = L1;
  endif

  dist = L(x,L2);
endfunction

function dist = editdistance_berghel_roach (a,b)
  #Variables named according to Berghel and Roach 1996 (except s, which is called dist here)
  
  if (!isvector(a) || !isvector(b))
    print_usage();
  endif

  if (numel (a) > numel (b))
    ans = a;
    a = b;
    b = ans;
  endif
  m = numel (a); n = numel (b);
  if (m == 0)
    dist = n;
    return;
  endif

  MIN_K = -m-1; #The diagonal with the lowest number is -m
  MIN_P = -1; #minimum p that has to be cached is 0
  FKP = sparse (m+n+2,n+1);
  FKP_cached = sparse (m+n+2,n+1);

  p = n-m;
  do
    inc = p;
    for temp_p = 0:p-1
      if (abs (n-m - inc) <= temp_p)
        get_f ((n-m) - inc, temp_p);
      endif
      if (abs(n-m + inc) <= temp_p)
        get_f(n-m+inc,temp_p);
      endif
      inc--;
    endfor
    get_f (n-m,p);
    p++;
  until (get_f (n-m,p-1) == m);

  dist = p-1;

  function f = get_f (k,p)
    if (p == abs (k)-1)
      if (k < 0)
        f = abs (k) - 1;
      else
        f = -1;
      endif
      return;
    endif
    if (p < abs (k)-1)
      f = -inf;
      return;
    endif
    
    if (FKP_cached(k-MIN_K,p-MIN_P) == 1)
      f = FKP(k-MIN_K,p-MIN_P);
      return;
    endif

    c1 = get_f (k,p-1) + 1;
    c2 = get_f (k-1,p-1);
    c3 = get_f (k+1,p-1) + 1;
    t = max ([c1 c2 c3]);
    #if (a([t, t+1]) == b([k+t+1, k+t]) t2 = t+1; endif #taking adjacent transpositions into account
    while (t < m && t+k < n && a(t+1) == b(t+1+k))
      t += 1;
    endwhile
    if (t > m || t+k > n)
      f = FKP(k-MIN_K,p-MIN_P) = NaN;
    else
      f = FKP(k-MIN_K,p-MIN_P) = t;
    endif
    FKP_cached(k-MIN_K,p-MIN_P) = 1;
  endfunction
endfunction

%!test
%! l = 50;
%! n = 20;
%! rand('state',31513);
%! abc = 'A':'Z';
%!
%! for it = 1:n
%!   #Generate two Strings
%!   #This kind of generation produces worst case examples for the new algorithm,
%!   #so runtime comparisons won't be informative.
%!   str1 = str2 = abc(randi([1 length(abc)],1,l));
%!   m = randi(l/2);
%!   str1(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m));
%!   m = randi(l/2);
%!   str2(randi([1 l],1,m)) = abc(randi([1 length(abc)],1,m));
%!
%!   d1 = editdistance(str1,str2); #Berghel and Roach
%!   tempDist = editdistance(str2,str1);
%!   assert(d1 == tempDist, "editdistance: Berghel and Roach algorithm is not symmetric.");
%!   [d2 ~] = editdistance(str1,str2); #Fisher and Wagner
%!   [tempDist ~] = editdistance(str2,str1);
%!   assert(d2 == tempDist, "editdistance: Fisher and Wagner algorithm is not symmetric.");
%!   d3 = editdistance(str1,str2,[1 0 1],1); #Fisher and Wagner with linear memory usage
%!   tempDist = editdistance(str2,str1,[1 0 1],1);
%!   assert(d3 == tempDist, "editdistance: Fisher and Wagner algorithm with linear memory usage is not symmetric.");
%!   assert(d1 == d2, "editdistance: The result of the algorithm by Berghel and Roach and that of the algorithm by Fisher and Wagners differ.");
%!   assert(d2 == d3, "editdistance: The results of the algorithm by Fisher and Wagner with and without linear memory usage differ.");
%! endfor