# Jaro-Winkler similarity measure.
from py_stringmatching import utils
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
SequenceSimilarityMeasure
from py_stringmatching.similarity_measure.cython.cython_jaro_winkler import jaro_winkler
class JaroWinkler(SequenceSimilarityMeasure):
    """Computes Jaro-Winkler measure.

    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score, but share a prefix and thus are likely to match.

    Args:
        prefix_weight (float): Weight to give to the prefix (defaults to 0.1).

    Attributes:
        prefix_weight (float): An attribute to store the prefix weight.
    """

    def __init__(self, prefix_weight=0.1):
        # The weight is stored as given (no validation is performed here) and
        # is handed unchanged to the Cython routine on every score request.
        self.prefix_weight = prefix_weight
        super(JaroWinkler, self).__init__()

    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        The inputs are first checked for None, then converted to unicode and
        checked to be strings. If the emptiness check reports an empty input,
        the score is 0.0; otherwise the computation is delegated to the
        Cython ``jaro_winkler`` routine using the current ``prefix_weight``.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        # the string-type check runs on the converted values
        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0; a float literal is used so
        # that every return path yields the documented float type.
        if utils.sim_check_for_empty(string1, string2):
            return 0.0

        return jaro_winkler(string1, string2, self.prefix_weight)

    def get_sim_score(self, string1, string2):
        """Computes the normalized Jaro-Winkler similarity score between two strings.

        Simply calls get_raw_score and returns its result unchanged.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Jaro-Winkler similarity (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_sim_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_sim_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_sim_score('DIXON', 'DICKSONX')
            0.8133333333333332
        """
        return self.get_raw_score(string1, string2)

    def get_prefix_weight(self):
        """Get prefix weight.

        Returns:
            prefix weight (float).
        """
        return self.prefix_weight

    def set_prefix_weight(self, prefix_weight):
        """Set prefix weight.

        Args:
            prefix_weight (float): Weight to give to the prefix.

        Returns:
            True (bool), unconditionally, after the attribute is updated.
        """
        self.prefix_weight = prefix_weight
        return True
|