File: test_tokenizer.py

Package: python-lunr 0.8.0-1
import re

import pytest

from lunr.tokenizer import Tokenizer


class TestTokenizer:
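    """Tests for lunr.tokenizer.Tokenizer: splitting, normalisation, token metadata and custom separators."""
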
    def test_splitting_into_tokens(self):
        tokenizer = Tokenizer("foo bar baz")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]

    def test_run_downcases_tokens(self):
        tokenizer = Tokenizer("foo BAR BAZ")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]

    def test_array_of_strings(self):
        tokenizer = Tokenizer(["foo", "bar", "baz"])
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]

    def test_none_is_converted_to_empty_string(self):
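        # None items are coerced to empty-string tokens rather than being dropped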
        tokenizer = Tokenizer(["foo", None, "baz"])
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "", "baz"]

    def test_multiple_whitespace_is_stripped(self):
        tokenizer = Tokenizer("   foo    bar   baz  ")
        tokens = [str(token) for token in tokenizer]

        assert tokens == ["foo", "bar", "baz"]

    def test_handling_null_like_arguments(self):
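        # a None document produces an empty token list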
        assert len(Tokenizer(None)) == 0

    def test_converting_a_number_to_tokens(self):
        tokens = [str(token) for token in Tokenizer(41)]
        assert tokens == ["41"]

    def test_converting_a_boolean_to_tokens(self):
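        # non-string input is stringified and downcased like any other token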
        tokens = [str(token) for token in Tokenizer(False)]
        assert tokens == ["false"]

    def test_converting_an_object_to_tokens(self):
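        # arbitrary objects are converted with str() and the result is tokenized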
        class Subject:
            def __str__(self):
                return "custom object"

        tokens = [str(token) for token in Tokenizer(Subject())]
        assert tokens == ["custom", "object"]

    def test_splits_strings_with_hyphens(self):
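        # hyphens act as separators by default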
        tokens = [str(token) for token in Tokenizer("foo-bar")]
        assert tokens == ["foo", "bar"]

    def test_splits_strings_with_hyphens_and_spaces(self):
        tokens = [str(token) for token in Tokenizer("foo - bar")]
        assert tokens == ["foo", "bar"]

    def test_tracking_the_token_index(self):
        tokens = Tokenizer("foo bar")
        assert tokens[0].metadata["index"] == 0
        assert tokens[1].metadata["index"] == 1

    def test_tracking_the_token_position(self):
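        # position metadata is a [start offset, length] pair into the source string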
        tokens = Tokenizer("foo bar")
        assert tokens[0].metadata["position"] == [0, 3]
        assert tokens[1].metadata["position"] == [4, 3]

    def test_providing_additional_metadata(self):
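        # metadata passed to the tokenizer is attached to every token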
        tokens = Tokenizer("foo bar", {"hurp": "durp"})
        assert tokens[0].metadata["hurp"] == "durp"
        assert tokens[1].metadata["hurp"] == "durp"

    @pytest.mark.parametrize("separator", [re.compile(r"[_\-]+"), lambda c: c in "_-"])
    def test_providing_separator(self, separator):
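        # the separator may be a compiled regex or a per-character predicate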
        tokens = [str(token) for token in Tokenizer("foo_bar-baz", separator=separator)]
        assert tokens == ["foo", "bar", "baz"]

    def test_tracking_token_position_with_left_hand_whitespace(self):
        tokens = Tokenizer(" foo bar")
        assert tokens[0].metadata["position"] == [1, 3]
        assert tokens[1].metadata["position"] == [5, 3]

    def test_tracking_token_position_with_right_hand_whitespace(self):
        tokens = Tokenizer("foo bar ")
        assert tokens[0].metadata["position"] == [0, 3]
        assert tokens[1].metadata["position"] == [4, 3]