File: test_mess_detection.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (48 lines) | stat: -rw-r--r-- 2,404 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from __future__ import annotations

import pytest

from charset_normalizer.md import mess_ratio


@pytest.mark.parametrize(
    "content, min_expected_ratio, max_expected_ratio",
    [
        (
            "典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。",
            0.0,
            0.0,
        ),
        ("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0),
        ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0),
        ("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5),
        ("´Á¥½³ø§i --  ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0),
        (
            "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli",
            0.1,
            0.5,
        ),
        (
            "<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>",
            0.01,
            0.5,
        ),
        (
            """ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† Ø§Ų„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…Ø§ ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊ؈) ŲˆØ§Ų„ØŪØ§ØŠŲ…""",
            0.8,
            3.0,
        ),
        ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
        (
            """hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""",
            0.5,
            2.0,
        ),
    ],
)
def test_mess_detection(content, min_expected_ratio, max_expected_ratio):
    """Check that the mess ratio computed for ``content`` is within bounds.

    ``mess_ratio`` is called on ``content`` with ``maximum_threshold=1.0`` and
    the returned value must satisfy
    ``min_expected_ratio <= value <= max_expected_ratio`` (both ends inclusive).
    """
    failure_note = (
        "The mess detection ratio calculated for given content is not well adjusted!"
    )

    # NOTE(review): some parametrized upper bounds exceed 1.0, so the returned
    # ratio is presumably not clamped to maximum_threshold — confirm against
    # the mess_ratio implementation.
    observed_ratio = mess_ratio(content, maximum_threshold=1.0)

    assert min_expected_ratio <= observed_ratio <= max_expected_ratio, failure_note