File: whitespace_tokenizer.py

Package: py-stringmatching 0.4.3-1 (Debian bookworm)
from py_stringmatching import utils
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer


class WhitespaceTokenizer(DelimiterTokenizer):
    """Segments the input string using whitespaces then returns the segments as tokens. 
    
    Currently using the split function in Python, so whitespace character refers to 
    the actual whitespace character as well as the tab and newline characters. 

    Args:
        return_set (boolean): A flag to indicate whether to return a set of
                              tokens instead of a bag of tokens (defaults to False).
                              
    Attributes:
        return_set (boolean): An attribute to store the flag return_set. 
    """
    
    def __init__(self, return_set=False):
        # Delegate to DelimiterTokenizer with the standard whitespace delimiters.
        super(WhitespaceTokenizer, self).__init__([' ', '\t', '\n'], return_set)

    def tokenize(self, input_string):
        """Tokenizes input string based on white space.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list of tokens, which is a set of tokens if return_set is True and a bag of tokens otherwise.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> ws_tok = WhitespaceTokenizer() 
            >>> ws_tok.tokenize('data science')
            ['data', 'science']
            >>> ws_tok.tokenize('data        science')
            ['data', 'science']
            >>> ws_tok.tokenize('data\tscience')
            ['data', 'science']
            >>> ws_tok = WhitespaceTokenizer(return_set=True) 
            >>> ws_tok.tokenize('data   science data integration')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        # Split on runs of whitespace (spaces, tabs, newlines) and drop any
        # empty strings from the result.
        token_list = list(filter(None, input_string.split()))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list

    def set_delim_set(self, delim_set):
        raise AttributeError('Delimiters cannot be set for WhitespaceTokenizer')
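
# --- Illustrative usage sketch (not part of the original py_stringmatching
# source): a minimal, hedged example of how WhitespaceTokenizer might be
# exercised, assuming the class above is importable as shown.
if __name__ == '__main__':
    ws_tok = WhitespaceTokenizer()
    # Runs of spaces, tabs, and newlines all collapse into single token boundaries.
    print(ws_tok.tokenize('data \t science\nintegration'))  # ['data', 'science', 'integration']

    # With return_set=True, duplicate tokens are dropped from the output.
    ws_set_tok = WhitespaceTokenizer(return_set=True)
    print(ws_set_tok.tokenize('data science data integration'))  # ['data', 'science', 'integration']

    # Delimiters are fixed for this tokenizer; attempting to change them raises.
    try:
        ws_tok.set_delim_set({','})
    except AttributeError as exc:
        print(exc)  # Delimiters cannot be set for WhitespaceTokenizer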