File: partial_ratio.py

package info (click to toggle)
py-stringmatching 0.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,956 kB
  • sloc: python: 3,979; makefile: 174; sh: 7
file content (126 lines) | stat: -rw-r--r-- 4,371 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""Fuzzy Wuzzy Partial Ratio Similarity Measure"""

from difflib import SequenceMatcher
from py_stringmatching import utils
from py_stringmatching.similarity_measure.sequence_similarity_measure import \
                                                    SequenceSimilarityMeasure


class PartialRatio(SequenceSimilarityMeasure):
    """Computes the Fuzzy Wuzzy partial ratio similarity between two strings.

    Fuzzy Wuzzy partial ratio raw score is a measure of the strings similarity as an int in the
    range [0, 100]. Given two strings X and Y, let the shorter string (X) be of length m.
    It finds the fuzzy wuzzy ratio similarity measure between the shorter string and
    substrings of length (at most) m of the longer string, and returns the maximum of
    those similarity measures. As in Fuzzy Wuzzy, only the substrings that line up with a
    matching block reported by difflib's SequenceMatcher are scored, not every possible
    substring. Fuzzy Wuzzy partial ratio sim score is a float in the range [0, 1]
    and is obtained by dividing the raw score by 100.

    Note:
    In the case where either of strings X or Y are empty, we define the Fuzzy Wuzzy partial
    ratio similarity score to be 0.
    """
    def __init__(self):
        pass

    def get_raw_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Partial Ratio measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialRatio()
            >>> s.get_raw_score('Robert Rupert', 'Rupert')
            100
            >>> s.get_raw_score('Sue', 'sue')
            67
            >>> s.get_raw_score('example', 'samples')
            71

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        # string1 must be the shorter of the two strings; swap if it is not
        if len(string1) > len(string2):
            string1, string2 = string2, string1

        window_length = len(string1)
        matcher = SequenceMatcher(None, string1, string2)

        best_ratio = 0.0
        seen_starts = set()
        # get_matching_blocks() always ends with a zero-size sentinel block,
        # so at least one window is scored.
        for block in matcher.get_matching_blocks():
            # block is (index in string1, index in string2, size). Slide
            # string1 along string2 so that the matching block lines up,
            # clamping the window start at the beginning of string2.
            start = max(block[1] - block[0], 0)

            # Several blocks can map to the same window; score it only once.
            if start in seen_starts:
                continue
            seen_starts.add(start)

            # Near the end of string2 the slice is shorter than string1.
            window = string2[start:start + window_length]
            ratio = SequenceMatcher(None, string1, window).ratio()

            # Treat a (near-)perfect window as an exact match and stop early.
            if ratio > .995:
                return 100
            best_ratio = max(best_ratio, ratio)

        return int(round(100 * best_ratio))

    def get_sim_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy partial ratio similarity score between two strings.
        This score is in the range [0,1].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Partial Ratio measure similarity score (float) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialRatio()
            >>> s.get_sim_score('Robert Rupert', 'Rupert')
            1.0
            >>> s.get_sim_score('Sue', 'sue')
            0.67
            >>> s.get_sim_score('example', 'samples')
            0.71

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0 (as a float, to honour the
        # documented return type)
        if utils.sim_check_for_empty(string1, string2):
            return 0.0

        # divide by a float so the result is a float on Python 2 as well
        return self.get_raw_score(string1, string2) / 100.0