File: qgram_tokenizer.py

package info (click to toggle)
py-stringmatching 0.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,956 kB
  • sloc: python: 3,979; makefile: 174; sh: 7
file content (236 lines) | stat: -rw-r--r-- 8,314 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
from six import string_types
from six.moves import xrange

from py_stringmatching import utils
from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer


class QgramTokenizer(DefinitionTokenizer):
    """Returns tokens that are sequences of q consecutive characters.

    A qgram of an input string s is a substring t (of s) which is a sequence
    of q consecutive characters. Qgrams are also known as ngrams or kgrams.

    Args:
        qval (int): A value for q, that is, the qgram's length (defaults to 2).
        return_set (boolean): A flag to indicate whether to return a set of
                              tokens or a bag of tokens (defaults to False).
        padding (boolean): A flag to indicate whether a prefix and a suffix
                           should be added to the input string (defaults to
                           True).
        prefix_pad (str): A character (that is, a string of length 1 in
                          Python) that should be replicated (qval-1) times
                          and prepended to the input string, if padding was
                          set to True (defaults to '#').
        suffix_pad (str): A character (that is, a string of length 1 in
                          Python) that should be replicated (qval-1) times
                          and appended to the input string, if padding was
                          set to True (defaults to '$').

    Attributes:
        qval (int): An attribute to store the q value.
        return_set (boolean): An attribute to store the flag return_set.
        padding (boolean): An attribute to store the padding flag.
        prefix_pad (str): An attribute to store the prefix string that should
                          be used for padding.
        suffix_pad (str): An attribute to store the suffix string that should
                          be used for padding.

    Raises:
        AssertionError: If qval is less than 1, if padding is not a boolean,
            or if prefix_pad / suffix_pad is not a string of length 1.
    """

    def __init__(self, qval=2,
                 padding=True, prefix_pad='#', suffix_pad='$',
                 return_set=False):
        if qval < 1:
            raise AssertionError("qval cannot be less than 1")
        self.qval = qval

        if not isinstance(padding, bool):
            raise AssertionError('padding is expected to be boolean type')
        self.padding = padding

        # Both pads are type-checked before either is length-checked, so that
        # len() is never called on a non-string argument.
        if not isinstance(prefix_pad, string_types):
            raise AssertionError('prefix_pad is expected to be of type string')
        if not isinstance(suffix_pad, string_types):
            raise AssertionError('suffix_pad is expected to be of type string')
        if len(prefix_pad) != 1:
            raise AssertionError("prefix_pad should have length equal to 1")
        if len(suffix_pad) != 1:
            raise AssertionError("suffix_pad should have length equal to 1")

        self.prefix_pad = prefix_pad
        self.suffix_pad = suffix_pad

        super(QgramTokenizer, self).__init__(return_set)

    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        If the (possibly padded) input is shorter than qval, no qgram can be
        formed and an empty list is returned.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on
            whether return_set flag is True or False.

        Raises:
            TypeError : If the input is not a string

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg2_nopad = QgramTokenizer(padding=False)
            >>> qg2_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg2_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg2_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
            >>> qg3_nopad = QgramTokenizer(qval=3, padding=False)
            >>> qg3_nopad.tokenize('ab')
            []

        """
        # Input validation is delegated to the shared tokenizer utilities.
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qval = self.qval

        # If the padding flag is set to true, add q-1 "prefix_pad" characters
        # in front of the input string and add q-1 "suffix_pad" characters at
        # the end of the input string.
        if self.padding:
            pad_len = qval - 1
            input_string = (self.prefix_pad * pad_len + input_string
                            + self.suffix_pad * pad_len)

        # Too short to contain even a single qgram.
        if len(input_string) < qval:
            return []

        # Slide a window of width qval over the string. Every slice is exactly
        # qval (>= 1) characters long, so no empty tokens can be produced and
        # no post-filtering is required.
        qgram_list = [input_string[i:i + qval]
                      for i in xrange(len(input_string) - qval + 1)]

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)

        return qgram_list

    def get_qval(self):
        """Gets the value of the qval attribute, which is the length of qgrams.

        Returns:
            The value of the qval attribute.
        """
        return self.qval

    def set_qval(self, qval):
        """Sets the value of the qval attribute.

        Args:
            qval (int): A value for q (the length of qgrams).

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError : If qval is less than 1.
        """
        if qval < 1:
            raise AssertionError("qval cannot be less than 1")
        self.qval = qval
        return True

    def get_padding(self):
        """
        Gets the value of the padding flag. This flag determines whether the
        padding should be done for the input strings or not.

        Returns:
            The Boolean value of the padding flag.

        """
        return self.padding

    def set_padding(self, padding):
        """
        Sets the value of the padding flag.

        Args:
            padding (boolean): Flag to indicate whether padding should be
                done or not.

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the padding is not of type boolean

        """
        if not isinstance(padding, bool):
            raise AssertionError('padding is expected to be boolean type')
        self.padding = padding
        return True

    def get_prefix_pad(self):
        """
        Gets the value of the prefix pad.

        Returns:
            The prefix pad string.

        """
        return self.prefix_pad

    def set_prefix_pad(self, prefix_pad):
        """
        Sets the value of the prefix pad string.

        Args:
            prefix_pad (str): Single character that is replicated (qval-1)
                times and prepended to the input string before tokenization
                when padding is enabled.

        Returns:
            The Boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the prefix_pad is not of type string.
            AssertionError: If the length of prefix_pad is not one.

        """
        if not isinstance(prefix_pad, string_types):
            raise AssertionError('prefix_pad is expected to be of type string')
        if len(prefix_pad) != 1:
            raise AssertionError("prefix_pad should have length equal to 1")
        self.prefix_pad = prefix_pad
        return True

    def get_suffix_pad(self):
        """
        Gets the value of the suffix pad.

        Returns:
            The suffix pad string.

        """
        return self.suffix_pad

    def set_suffix_pad(self, suffix_pad):
        """
        Sets the value of the suffix pad string.

        Args:
            suffix_pad (str): Single character that is replicated (qval-1)
                times and appended to the input string before tokenization
                when padding is enabled.

        Returns:
            The boolean value of True is returned if the update was
            successful.

        Raises:
            AssertionError: If the suffix_pad is not of type string.
            AssertionError: If the length of suffix_pad is not one.

        """
        if not isinstance(suffix_pad, string_types):
            raise AssertionError('suffix_pad is expected to be of type string')
        if len(suffix_pad) != 1:
            raise AssertionError("suffix_pad should have length equal to 1")
        self.suffix_pad = suffix_pad
        return True