File: test_detect_legacy.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (60 lines) | stat: -rw-r--r-- 2,270 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from __future__ import annotations

import unittest

from charset_normalizer.legacy import detect


class TestDetectLegacy(unittest.TestCase):
    """Checks on the dict returned by ``charset_normalizer.legacy.detect``.

    Each test feeds ``detect`` an encoded byte string and inspects the
    ``encoding``, ``language`` and/or ``confidence`` entries of the result.
    """

    def test_detect_dict_keys(self):
        """The result exposes ``encoding``, ``language`` and ``confidence``."""
        r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"))

        for key in ("encoding", "language", "confidence"):
            with self.subTest(f"{key} key present"):
                # Membership is tested on the mapping itself; a keys() view
                # adds nothing here.
                self.assertIn(key, r)

    def test_detect_dict_value_type(self):
        """``encoding``/``language`` are ``str`` and ``confidence`` is ``float``."""
        # encode() without an argument uses Python's default, UTF-8.
        r = detect("我没有埋怨,磋砣的只是一些时间。".encode())

        expected_types = (
            ("encoding", str),
            ("language", str),
            ("confidence", float),
        )
        for key, expected in expected_types:
            with self.subTest(f"{key} instance of {expected.__name__}"):
                self.assertIsInstance(r[key], expected)

    def test_detect_dict_value(self):
        """A UTF-32 encoded payload is reported as ``"UTF-32"``."""
        r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32"))

        with self.subTest("encoding is equal to utf_32"):
            self.assertEqual(r["encoding"], "UTF-32")

    def test_utf8_sig_not_striped(self):
        """A payload encoded with ``utf-8-sig`` is reported as ``"UTF-8-SIG"``."""
        r = detect("Hello World".encode("utf-8-sig"))

        with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
            self.assertEqual(r["encoding"], "UTF-8-SIG")

    def test_small_payload_confidence_altered(self):
        """Confidence on short payloads depends on the encoding family.

        The expectations below are exactly what the subtests assert: a short
        UTF-16 payload yields 1.0, a short cp932 payload yields less than
        1.0, and a longer cp932 payload yields 1.0 again.
        """
        # assertEqual/assertLess are used instead of assertTrue(<comparison>)
        # so a failure reports the actual confidence value.
        with self.subTest("Unicode should yield 1. confidence even on small bytes string"):
            r = detect("#表 10-1 クラスタ設定".encode("utf_16"))

            self.assertEqual(r["confidence"], 1.0)

        with self.subTest("ShiftJis should not yield 1. confidence on small bytes string"):
            r = detect("#表 10-1 クラスタ設定".encode("cp932"))

            self.assertLess(r["confidence"], 1.0)

        with self.subTest("ShiftJis should yield 1. confidence on sufficient bytes string"):
            r = detect("#表 10-1 クラスタ設定 … リソース同居制約".encode("cp932"))

            self.assertEqual(r["confidence"], 1.0)