File: tfidf.py

Package: py-stringmatching 0.4.3-1
from __future__ import division
from math import log, sqrt
import collections

from py_stringmatching import utils
from py_stringmatching.similarity_measure.token_similarity_measure import \
                                                    TokenSimilarityMeasure


class TfIdf(TokenSimilarityMeasure):
    """Computes TF/IDF measure.

    This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to
    find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure
    is that two strings are similar if they share distinguishing terms. See the string matching chapter in the book "Principles of Data Integration".
    
    Args:
        corpus_list (list of lists): The corpus that will be used to compute TF and IDF values.
            This corpus is a list of strings, where each string has been tokenized into a list
            of tokens (that is, a bag of tokens). The default is set to None. In this case, when
            we call this TF/IDF measure on two input strings (using get_raw_score or
            get_sim_score), the corpus is taken to be the list of those two strings.
        dampen (boolean): Flag to indicate whether 'log' should be used in TF and IDF formulas
            (defaults to True).

    Attributes:
        dampen (boolean): An attribute to store the dampen flag.
    """

    def __init__(self, corpus_list=None, dampen=True):
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else (
                                                         len(self.__corpus_list))
        self.dampen = dampen        
        super(TfIdf, self).__init__()

    def get_raw_score(self, bag1, bag2):
        """Computes the raw TF/IDF score between two lists.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            TF/IDF score between the input lists (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:
            
            >>> # here the corpus is a list of three strings that 
            >>> # have been tokenized into three lists of tokens
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c'])
            0.7071067811865475
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.25298221281347033
            >>> tfidf = TfIdf(dampen=False)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.7071067811865475
            >>> tfidf = TfIdf()
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
        """
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the strings match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0.0

        # term frequency for input strings
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
         
        # find unique elements in the input lists and their document frequency 
        local_df = {}
        for element in tf_x:
            local_df[element] = local_df.get(element, 0) + 1
        for element in tf_y:
            local_df[element] = local_df.get(element, 0) + 1

        # if a corpus is not provided, treat the two input bags as the corpus
        curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
                                   (self.__document_frequency, self.__corpus_size))

        idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = (0.0, 0.0, 0.0, 
                                                      0.0, 0.0, 0.0)

        # tfidf calculation
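        # Each iteration below contributes one token's component to the TF/IDF
        # vectors of the two bags and to the running sums of their cosine similarity:
        #     idf(t) = N / df(t)                                  (N = corpus size)
        #     v(t)   = tf(t) * idf(t)                             when dampen is False
        #     v(t)   = log(tf(t) + 1) * log(idf(t))               when dampen is True
        #     score  = sum(v_x * v_y) / (sqrt(sum(v_x^2)) * sqrt(sum(v_y^2)))
        # Tokens that never appear in the corpus (unknown df) are skipped.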
        for element in local_df.keys():
            df_element = curr_df.get(element)
            if df_element is None:
                continue
            idf_element = corpus_size * 1.0 / df_element
            v_x = 0
            if element in tf_x:
                v_x = (log(idf_element) * log(tf_x[element] + 1) if self.dampen
                       else idf_element * tf_x[element])
            v_y = 0
            if element in tf_y:
                v_y = (log(idf_element) * log(tf_y[element] + 1) if self.dampen
                       else idf_element * tf_y[element])
            v_x_y += v_x * v_y
            v_x_2 += v_x * v_x
            v_y_2 += v_y * v_y

        return 0.0 if v_x_y == 0 else v_x_y / (sqrt(v_x_2) * sqrt(v_y_2))

    def get_sim_score(self, bag1, bag2):
        """Computes the normalized TF/IDF similarity score between two lists. Simply call get_raw_score.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            Normalized TF/IDF similarity score between the input lists (float).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:

            >>> # here the corpus is a list of three strings that 
            >>> # have been tokenized into three lists of tokens
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['b', 'c'])
            0.7071067811865475
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False)
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'c'])
            0.25298221281347033
            >>> tfidf = TfIdf(dampen=False)
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.7071067811865475
            >>> tfidf = TfIdf()
            >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a'])
            0.0            
        """
        return self.get_raw_score(bag1, bag2)

    def get_dampen(self):
        """Get dampen flag.

        Returns:
            dampen flag (boolean).
        """
        return self.dampen

    def get_corpus_list(self):
        """Get corpus list.

        Returns:
            corpus list (list of lists).
        """
        return self.__corpus_list

    def set_dampen(self, dampen):
        """Set dampen flag.

        Args:
            dampen (boolean): Flag to indicate whether 'log' should be applied to TF and IDF formulas.
        """
        self.dampen = dampen
        return True

    def set_corpus_list(self, corpus_list):
        """Set corpus list.

        Args:
            corpus_list (list of lists): Corpus list.
        """
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if self.__corpus_list is None else (
                                                         len(self.__corpus_list))
        return True

    def __compute_document_frequency(self):
        if self.__corpus_list is not None:
            for document in self.__corpus_list:
                for element in set(document):
                    self.__document_frequency[element] = ( 
                        self.__document_frequency.get(element, 0) + 1)
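

# Minimal usage sketch (an illustrative addition, not part of the upstream module):
# it reruns two of the doctest examples above, once with the default dampening and
# once with dampen=False over a slightly larger corpus.
if __name__ == '__main__':
    # Corpus of three pre-tokenized documents; dampen defaults to True.
    measure = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
    print(measure.get_raw_score(['a', 'b', 'a'], ['b', 'c']))  # 0.7071067811865475

    # Four-document corpus with dampening turned off.
    measure = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], dampen=False)
    print(measure.get_sim_score(['a', 'b', 'a'], ['a', 'c']))  # 0.25298221281347033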