File: tversky_index.py

package info (click to toggle)
py-stringmatching 0.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,956 kB
  • sloc: python: 3,979; makefile: 174; sh: 7
file content (145 lines) | stat: -rw-r--r-- 4,647 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""Tversky index similarity measure"""

from py_stringmatching import utils
from py_stringmatching.similarity_measure.token_similarity_measure import \
                                                    TokenSimilarityMeasure


class TverskyIndex(TokenSimilarityMeasure):
    """Tversky index similarity measure class.

    The two weights are checked with ``utils.sim_check_tversky_parameters``
    both at construction time and whenever they are changed through
    :meth:`set_alpha` / :meth:`set_beta`.

    Parameters:
        alpha, beta (float): Tversky index parameters (defaults to 0.5).
    """
    def __init__(self, alpha=0.5, beta=0.5):
        # validate alpha and beta before storing them
        utils.sim_check_tversky_parameters(alpha, beta)

        self.alpha = alpha
        self.beta = beta
        super(TverskyIndex, self).__init__()

    def get_raw_score(self, set1, set2):
        """
        Computes the Tversky index similarity between two sets.

        The Tversky index is an asymmetric similarity measure on sets that compares a variant to a prototype. The
        Tversky index can be seen as a generalization of Dice's coefficient and Tanimoto coefficient.

        For sets X and Y the Tversky index is a number between 0 and 1 given by:
        :math:`tversky_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
        where :math:`\\alpha, \\beta \\geq 0`.

        When the two sets share no element the score is 0.0; this also covers
        the case alpha == beta == 0, where the formula above would be 0/0.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Tversky index similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> tvi = TverskyIndex()
            >>> tvi.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(0.5, 0.5)
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(beta=0.5)
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0.0 (a float, like every
        # other return path of this method)
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        common = float(len(set1 & set2))

        # Disjoint inputs have zero similarity. Returning here also avoids a
        # ZeroDivisionError when alpha == beta == 0, where the denominator
        # below would be 0 + 0 + 0.
        if common == 0:
            return 0.0

        only_in_first = len(set1 - set2)
        only_in_second = len(set2 - set1)

        # common > 0 here, so for non-negative alpha and beta the
        # denominator is at least `common` and cannot be zero.
        return common / (common
                         + self.alpha * only_in_first
                         + self.beta * only_in_second)

    def get_sim_score(self, set1, set2):
        """
        Computes the normalized Tversky index similarity between two sets.

        This simply delegates to :meth:`get_raw_score`.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Normalized Tversky index similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> tvi = TverskyIndex()
            >>> tvi.get_sim_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
            >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(0.5, 0.5)
            >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(beta=0.5)
            >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
            0.5

        """
        return self.get_raw_score(set1, set2)

    def get_alpha(self):
        """
        Get alpha

        Returns:
            alpha (float)
        """
        return self.alpha

    def get_beta(self):
        """
        Get beta

        Returns:
            beta (float)
        """
        return self.beta

    def set_alpha(self, alpha):
        """
        Set alpha

        The new value is checked together with the current beta using the
        same validation the constructor applies; if that check raises,
        alpha is left unchanged.

        Args:
            alpha (float): Tversky index parameter

        Returns:
            True once the value has been stored.
        """
        # validate before assigning so a rejected value never lands on self
        utils.sim_check_tversky_parameters(alpha, self.beta)
        self.alpha = alpha
        return True

    def set_beta(self, beta):
        """
        Set beta

        The new value is checked together with the current alpha using the
        same validation the constructor applies; if that check raises,
        beta is left unchanged.

        Args:
            beta (float): Tversky index parameter

        Returns:
            True once the value has been stored.
        """
        # validate before assigning so a rejected value never lands on self
        utils.sim_check_tversky_parameters(self.alpha, beta)
        self.beta = beta
        return True