File: leditdist.hpp

package info (click to toggle)
aspell 0.60.8.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 15,336 kB
  • sloc: cpp: 24,378; sh: 12,340; perl: 1,924; ansic: 1,661; makefile: 852; sed: 16
file content (72 lines) | stat: -rw-r--r-- 2,502 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

#ifndef __aspeller_leditdist_hh__
#define __aspeller_leditdist_hh__

#include "weights.hpp"

namespace aspeller {

  // limit_edit_distance finds the shortest edit distance but will
  // stop and return a number at least as large as LARGE_NUM if it has
  // to do more edits than a set limit.
  // Note that this does NOT mean that the score returned is <= limit*w.max
  // as "sub" vs "submarine" will return 6*(cost of insertion) no matter what
  // the limit is.
  // The edit distance is 
  // (cost of swap)(# of swaps) + (cost of deletion)(# of deletions) 
  //   + (cost of insertion)(# of insertions) 
  //   + (cost of substitutions)(# of substitutions)

  // Preconditions:
  //   max(strlen(a), strlen(b))*max(of the edit weights) <= 2^15
  //     if violated than an incorrect result may be returned (which may be negative)
  //     due to overflow of a short integer
  //   (limit+1)*w.min < limit*w.max
  //   limit <= 5 (use edit_distance if limit > 5)
  // where w.min and w.max is the minimum and maximum cost of an edit
  // respectfully.

  // The running time is asymptotically bounded above by 
  // (3^l)*n where l is the limit and n is the maximum of strlen(a),strlen(b)
  // Based on my informal tests, however, the n does not really matter
  // and the running time is more like (3^l).

  // limit_edit_distance, based on my informal tests, turns out to be
  // faster than edit_dist for l < 5.  For l == 5 it is about the 
  // smaller for short strings (<= 5) and less than for longer strings

  // limit2_edit_distance(a,b,w) = limit_edit_distance(a,b,2,w)
  // but is roughly 2/3's faster

  struct EditDist {
    int          score;
    const char * stopped_at;
    EditDist() {}
    EditDist(int s, const char * p) 
      : score(s), stopped_at(p) {}
    operator int () const {return score;}
  };

  static const int LARGE_NUM = 0xFFFFF; 
  // this needs to be SMALLER than INT_MAX since it may be incremented 
  // a few times

  int limit_edit_distance(const char * a, const char * b, int limit,
			  const EditDistanceWeights & w
			  = EditDistanceWeights());
  
  EditDist limit0_edit_distance(const char * a, const char * b,
				const EditDistanceWeights & w 
				= EditDistanceWeights());

  EditDist limit1_edit_distance(const char * a, const char * b,
				const EditDistanceWeights & w 
				= EditDistanceWeights());

  EditDist limit2_edit_distance(const char * a, const char * b,
				const EditDistanceWeights & w 
				= EditDistanceWeights());
  
}

#endif