1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
|
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT
"""Unit tests for unicode-segmentation-rs"""
import unicode_segmentation_rs
class TestGraphemes:
    """Tests for grapheme cluster segmentation."""

    def test_simple_ascii(self):
        """Plain ASCII text splits into one grapheme per character."""
        text = "Hello"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        assert result == ["H", "e", "l", "l", "o"]

    def test_emoji_zwj_sequence(self):
        """A family emoji joined with ZWJ is a single extended grapheme."""
        # Man + ZWJ + woman + ZWJ + girl + ZWJ + boy. The sequence is spelled
        # with escapes because the U+200D joiners are invisible and easily
        # lost by editors or copy/paste; without them the text is four
        # independent emoji and the assertion below cannot hold.
        text = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        assert result == [text]

    def test_combining_characters(self):
        """Devanagari combining marks attach to their base characters."""
        text = "नमस्ते"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        # Fewer graphemes than code points means at least one combining
        # character was merged into a cluster.
        assert len(result) < len(text)

    def test_grapheme_indices(self):
        """grapheme_indices returns (offset, grapheme) pairs."""
        text = "Hello"
        result = unicode_segmentation_rs.grapheme_indices(text, is_extended=True)
        expected = [(0, "H"), (1, "e"), (2, "l"), (3, "l"), (4, "o")]
        assert result == expected

    def test_empty_string(self):
        """An empty input yields an empty list."""
        result = unicode_segmentation_rs.graphemes("", is_extended=True)
        assert result == []
class TestWordSegmentation:
    """Word-boundary and word-extraction behaviour."""

    def test_split_word_bounds_simple(self):
        """Two words separated by a space produce three segments."""
        segments = unicode_segmentation_rs.split_word_bounds("Hello world")
        assert segments == ["Hello", " ", "world"]

    def test_split_word_bounds_punctuation(self):
        """Punctuation marks come back as their own segments."""
        segments = unicode_segmentation_rs.split_word_bounds("Hello, world!")
        assert segments == ["Hello", ",", " ", "world", "!"]

    def test_unicode_words(self):
        """unicode_words drops punctuation and whitespace segments."""
        words = unicode_segmentation_rs.unicode_words("Hello, world!")
        assert words == ["Hello", "world"]

    def test_split_word_bound_indices(self):
        """Each segment is paired with its starting offset."""
        pairs = unicode_segmentation_rs.split_word_bound_indices("Hello world")
        assert pairs == [(0, "Hello"), (5, " "), (6, "world")]

    def test_empty_string(self):
        """No words are found in an empty string."""
        assert unicode_segmentation_rs.unicode_words("") == []

    def test_multilingual(self):
        """Mixed Latin/CJK input yields at least one word."""
        words = unicode_segmentation_rs.unicode_words("Hello世界")
        # Only checks that mixed scripts are handled at all, not the exact
        # split.
        assert len(words) > 0
class TestSentenceSegmentation:
    """Sentence-boundary behaviour of unicode_sentences."""

    def test_simple_sentences(self):
        """Two sentences are split; the first keeps its trailing space."""
        sentences = unicode_segmentation_rs.unicode_sentences(
            "Hello world. How are you?"
        )
        assert len(sentences) == 2
        assert sentences[0] == "Hello world. "
        assert sentences[1] == "How are you?"

    def test_multiple_sentences(self):
        """Full stop, exclamation mark and question mark all end sentences."""
        sentences = unicode_segmentation_rs.unicode_sentences(
            "First. Second! Third?"
        )
        assert len(sentences) == 3

    def test_abbreviations(self):
        """Text with an abbreviation still produces at least one sentence."""
        sentences = unicode_segmentation_rs.unicode_sentences(
            "Dr. Smith went home."
        )
        # Deliberately loose: only requires a non-empty result.
        assert len(sentences) >= 1

    def test_empty_string(self):
        """An empty input yields no sentences."""
        assert unicode_segmentation_rs.unicode_sentences("") == []

    def test_arabic(self):
        """Arabic text with two terminators yields two sentences."""
        sentences = unicode_segmentation_rs.unicode_sentences(
            "مرحبا بك. كيف حالك؟"
        )
        assert len(sentences) == 2

    def test_japanese(self):
        """Japanese text with two terminators yields two sentences."""
        sentences = unicode_segmentation_rs.unicode_sentences(
            "こんにちは。お元気ですか?"
        )
        assert len(sentences) == 2
class TestDisplayWidth:
    """Tests for display width calculation."""

    def test_ascii_width(self):
        """ASCII text occupies one column per character."""
        assert unicode_segmentation_rs.text_width("Hello") == 5
        assert unicode_segmentation_rs.text_width("a") == 1

    def test_cjk_width(self):
        """CJK ideographs occupy two columns each."""
        # Chinese characters are typically 2 columns wide
        assert unicode_segmentation_rs.text_width("世界") == 4
        assert unicode_segmentation_rs.text_width("世") == 2

    def test_mixed_width(self):
        """Narrow and wide characters add up within one string."""
        text = "Hello 世界"
        width = unicode_segmentation_rs.text_width(text)
        # "Hello" = 5, space = 1, "世界" = 4
        assert width == 10

    def test_empty_string(self):
        """An empty string has zero width."""
        assert unicode_segmentation_rs.text_width("") == 0

    def test_text_width_ascii(self):
        """Letters, digits and the space character are one column wide."""
        assert unicode_segmentation_rs.text_width("a") == 1
        assert unicode_segmentation_rs.text_width("A") == 1
        assert unicode_segmentation_rs.text_width("1") == 1
        assert unicode_segmentation_rs.text_width(" ") == 1

    def test_text_width(self):
        """Individual wide characters (Han, Hiragana) are two columns wide."""
        assert unicode_segmentation_rs.text_width("世") == 2
        assert unicode_segmentation_rs.text_width("界") == 2
        assert unicode_segmentation_rs.text_width("あ") == 2

    def test_text_width_control(self):
        """Tab, line feed and carriage return each count as one column."""
        # text_width is expected to return an integer width of 1 for these
        # control characters, not None.
        assert unicode_segmentation_rs.text_width("\t") == 1
        assert unicode_segmentation_rs.text_width("\n") == 1
        assert unicode_segmentation_rs.text_width("\r") == 1

    def test_text_width_mode(self):
        """Narrow, wide and control characters in one sanity check."""
        # No mode argument is passed, so this only covers the default
        # behaviour of text_width.
        assert unicode_segmentation_rs.text_width("a") == 1
        assert unicode_segmentation_rs.text_width("世") == 2
        assert unicode_segmentation_rs.text_width("\t") == 1
class TestEdgeCases:
    """Tests for edge cases and special scenarios."""

    def test_only_whitespace(self):
        """A run of spaces has graphemes and a word-bound segment, no words."""
        # Build the run explicitly so the number of spaces is unambiguous and
        # cannot be altered by whitespace-collapsing tools.
        text = " " * 3
        # Every space is its own grapheme cluster.
        assert unicode_segmentation_rs.graphemes(text, is_extended=True) == [
            " ",
            " ",
            " ",
        ]
        # The whole run is expected back as a single segment; since the
        # segments partition the input, that segment must equal the input.
        assert unicode_segmentation_rs.split_word_bounds(text) == [text]
        assert unicode_segmentation_rs.unicode_words(text) == []

    def test_only_punctuation(self):
        """Punctuation is segmented per mark but contains no words."""
        text = "!!!"
        assert unicode_segmentation_rs.unicode_words(text) == []
        assert unicode_segmentation_rs.split_word_bounds(text) == ["!", "!", "!"]

    def test_newlines(self):
        """A newline separates words."""
        text = "Hello\nWorld"
        result = unicode_segmentation_rs.unicode_words(text)
        assert "Hello" in result
        assert "World" in result

    def test_tabs(self):
        """A tab separates words."""
        text = "Hello\tWorld"
        result = unicode_segmentation_rs.unicode_words(text)
        assert "Hello" in result
        assert "World" in result

    def test_multiple_spaces(self):
        """Several consecutive spaces between words produce no empty words."""
        # Explicit repetition guarantees the input really contains more than
        # one space between the two words.
        text = "Hello" + " " * 3 + "World"
        words = unicode_segmentation_rs.unicode_words(text)
        assert words == ["Hello", "World"]
class TestPerformance:
    """Sanity checks on larger inputs (no timing is measured)."""

    def test_large_text_graphemes(self):
        """10000 ASCII characters yield 10000 graphemes."""
        clusters = unicode_segmentation_rs.graphemes("a" * 10000, is_extended=True)
        assert len(clusters) == 10000

    def test_large_text_words(self):
        """1000 space-separated words are all found."""
        joined = " ".join(["word"] * 1000)
        assert len(unicode_segmentation_rs.unicode_words(joined)) == 1000

    def test_large_text_width(self):
        """10000 ASCII characters are 10000 columns wide."""
        assert unicode_segmentation_rs.text_width("a" * 10000) == 10000
|