1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
/*
* fstrcmp - fuzzy string compare library
* Copyright (C) 2009 Peter Miller
*
*
* Derived from gettext 0.17
* Copyright (C) 1988-2006 Free Software Foundation, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* Derived from GNU diff 2.7, analyze.c et al.
*
* The basic idea is to consider two vectors as similar if, when
* transforming the first vector into the second vector through a
* sequence of edits (inserts and deletes of one element each), this
* sequence is short - or equivalently, if the ordered list of elements
* that are untouched by these edits is long. For a good introduction
* to the subject, read about the "Levenshtein distance" in Wikipedia.
*
* The basic algorithm is described in:
* "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
* Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
* see especially section 4.2, which describes the variation used below.
*
* The basic algorithm was independently discovered as described in:
* "Algorithms for Approximate String Matching", E. Ukkonen, Information
* and Control Vol. 64, 1985, pp. 100-118.
*
* Unless the 'find_minimal' flag is set, this code uses the
* TOO_EXPENSIVE heuristic, by Paul Eggert, to limit the cost to
* O(N**1.5 log N) at the price of producing suboptimal output for large
* inputs with many differences.
*/
#include <lib/fstrcmp.h>
#include <lib/ac/string.h>
#include <lib/ac/stdio.h>
#include <lib/ac/stdlib.h>
#include <lib/ac/wchar.h>
#include <limits.h>
#include <lib/minmax.h>
#include <lib/nmalloc.h>
/*
 * Fallback used when no preprocessor macro named uintptr_t is defined.
 * NOTE(review): #ifndef only sees macros; if the platform provides
 * uintptr_t as a typedef, this still defines the macro on top of it.
 * Presumably required by <lib/diffseq.h> -- confirm.
 */
#ifndef uintptr_t
# define uintptr_t unsigned long
#endif
/*
 * Compile-time parameters for the code pulled in by the
 * <lib/diffseq.h> include that follows these definitions.
 */
/* Type of one element of the vectors being compared. */
#define ELEMENT wchar_t
/* Two elements match when they compare equal with ==. */
#define EQUAL(x,y) ((x) == (y))
/*
 * Signed offset type; also the element type of the work buffer
 * allocated in fwcscmpi().
 */
#define OFFSET_T ssize_t
#define OFFSET OFFSET_T
/*
 * Additional members for struct context: running counts of edits,
 * zeroed by fwcscmpi() before the comparison and read back afterwards.
 */
#define EXTRA_CONTEXT_FIELDS \
/* The number of elements inserted or deleted. */ \
size_t xvec_edit_count; \
size_t yvec_edit_count;
/*
 * Edit hooks: each simply bumps the corresponding counter; the offset
 * argument is not used.  Presumably expanded by the diffseq code once
 * per deleted / inserted element -- confirm against <lib/diffseq.h>.
 */
#define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++
#define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++
/*
 * We don't define USE_HEURISTIC, since the situation it addresses is
 * unlikely to arise in typical uses of fstrcmp().
 */
#include <lib/diffseq.h>
/*
 * Scratch memory shared by successive calls to fwcscmpi().
 *
 * Because fstrcmp is typically called multiple times, attempt to
 * minimize the number of memory allocations performed.  Thus, let a
 * call reuse the memory already allocated by the previous call, if
 * it is sufficient.
 *
 * This isn't thread safe: both variables are unsynchronized
 * file-scope state.
 */
/* Work area that fwcscmpi() carves into its fdiag and bdiag vectors. */
static OFFSET_T *buffer;
/*
 * Size for which buffer was last requested; fwcscmpi() compares it
 * against the required fdiag length to decide whether to reallocate.
 */
static size_t bufmax;
/*
 * fwcscmpi - fuzzy comparison of two wide-character strings, with an
 * integer result.
 *
 * The two strings are handed to compareseq(), which counts (through
 * the NOTE_DELETE / NOTE_INSERT hooks) how many elements must be
 * deleted from wcs1 and inserted from wcs2 to turn one into the other.
 * The similarity is the share of elements left untouched, scaled so
 * that a perfect match yields FSTRCMPI_IDENTICAL.
 *
 * Parameters:
 *   wcs1, wcs2  NUL-terminated wide strings.  Neither may be NULL;
 *               both are passed straight to wcslen().
 *
 * Returns:
 *   FSTRCMPI_IDENTICAL  when both strings are empty;
 *   0                   when exactly one string is empty;
 *   otherwise           FSTRCMPI_IDENTICAL * (untouched elements) /
 *                       (size1 + size2), rounded to nearest;
 *   FSTRCMP_ERROR       when the work buffer cannot be allocated.
 *
 * Not thread safe: the work buffer is file-scope state reused across
 * calls.
 */
int
fwcscmpi(const wchar_t *wcs1, const wchar_t *wcs2)
{
    struct context ctxt;
    size_t i;
    size_t fdiag_len;
    size_t size1;
    size_t size2;
    size_t numerator;
    size_t denominator;

    /*
     * set the info for each string.
     */
    ctxt.xvec = wcs1;
    ctxt.yvec = wcs2;

    /*
     * short-circuit obvious comparisons
     *
     * Two empty strings are identical, so they get the same score the
     * general formula below gives any other identical pair.
     */
    size1 = wcslen(wcs1);
    size2 = wcslen(wcs2);
    if (size1 == 0 && size2 == 0)
        return FSTRCMPI_IDENTICAL;
    if (size1 == 0 || size2 == 0)
        return 0;

    /*
     * Set TOO_EXPENSIVE to be approximate square root of input size,
     * bounded below by 256.
     *
     * The counter is a size_t so that a large combined length cannot
     * be truncated into a negative value, for which the shift loop
     * would not be guaranteed to terminate.
     */
    ctxt.too_expensive = 1;
    for (i = size1 + size2; i != 0; i >>= 2)
        ctxt.too_expensive <<= 1;
    if (ctxt.too_expensive < 256)
        ctxt.too_expensive = 256;

    /*
     * Allocate memory for fdiag and bdiag.
     */
    fdiag_len = size1 + size2 + 3;
    if (fdiag_len > bufmax)
    {
        /* Need more memory: at least double, to amortize regrowth. */
        bufmax = 2 * bufmax;
        if (fdiag_len > bufmax)
            bufmax = fdiag_len;

        /*
         * Calling realloc() would be a waste: buffer's contents do not
         * need to be preserved.  free(NULL) is a no-op, so no guard.
         */
        free(buffer);
        buffer = (OFFSET_T *)fstrcmp_nmalloc(bufmax, 2 * sizeof(OFFSET_T));
        if (!buffer)
        {
            /*
             * The old buffer is gone and there is no new one; forget
             * the recorded size so the next call allocates again
             * instead of indexing off a NULL pointer.
             */
            bufmax = 0;
            return FSTRCMP_ERROR;
        }
    }

    /*
     * fdiag and bdiag are each fdiag_len entries long and are offset
     * by size2 + 1 into the buffer.
     */
    ctxt.fdiag = buffer + size2 + 1;
    ctxt.bdiag = ctxt.fdiag + fdiag_len;

    /*
     * Now do the main comparison algorithm
     */
    ctxt.xvec_edit_count = 0;
    ctxt.yvec_edit_count = 0;
    compareseq(0, size1, 0, size2, 0, &ctxt);

    /*
     * The result is
     *
     *     ((number of chars in common) / (average length of the strings)).
     *
     * This is admittedly biased towards finding that the strings are
     * similar, however it does produce meaningful results.
     *
     * Adding denominator / 2 before dividing rounds to nearest.
     */
    numerator = size1 + size2 - ctxt.yvec_edit_count - ctxt.xvec_edit_count;
    denominator = size1 + size2;
    return (int)((FSTRCMPI_IDENTICAL * numerator + denominator / 2)
        / denominator);
}
|