File: cython_levenshtein.pyx

package info (click to toggle)
py-stringmatching 0.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,956 kB
  • sloc: python: 3,979; makefile: 174; sh: 7
file content (56 lines) | stat: -rw-r--r-- 1,396 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# cython: boundscheck=False

from __future__ import division
import cython
import numpy as np
cimport numpy as np
from py_stringmatching.similarity_measure.cython.cython_utils import int_min_three
from numpy import int32
from numpy cimport int32_t

DTYPE = int
ctypedef np.int_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)



def levenshtein(unicode string1, unicode string2):

    cdef int len_str1 = len(string1)
    cdef int len_str2 = len(string2)

    cdef int ins_cost = 1
    cdef int del_cost = 1
    cdef int sub_cost = 1
    cdef int trans_cost = 1

    cdef int i = 0
    cdef int j = 0

    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    cdef int[:,:] d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.int32)

    for i from 0 <= i < (len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    for j from 0 <= j < (len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    cdef unsigned char lchar = 0
    cdef unsigned char rchar = 0

    for i from 0 <= i < (len_str1):
        lchar = string1[i]
        for j from 0 <= j < (len_str2):
            rchar = string2[j]

            d_mat[i+1,j+1] = int_min_three(d_mat[i + 1, j] + ins_cost, d_mat[i, j + 1] + del_cost, d_mat[i, j]
                                        + (sub_cost if lchar != rchar else 0))
    return d_mat[len_str1, len_str2]