File: coverage.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (105 lines) | stat: -rw-r--r-- 2,940 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from __future__ import annotations

import argparse
from glob import glob
from os import sep
from os.path import isdir
from sys import argv

from charset_normalizer import __version__, from_path
from charset_normalizer.utils import iana_name


def calc_equivalence(content: bytes, cp_a: str, cp_b: str) -> float:
    """Return the fraction of characters that decode identically under two codepages.

    :param content: Raw payload to decode under both encodings.
    :param cp_a: First codepage name (must be able to decode ``content``).
    :param cp_b: Second codepage name (must be able to decode ``content``).
    :return: Equivalence ratio in [0.0, 1.0]; 1.0 means both decodings agree
        on every compared character.
    :raises UnicodeDecodeError: If ``content`` cannot be decoded with either codepage.
    """
    str_a = content.decode(cp_a)
    str_b = content.decode(cp_b)

    # Empty payload: nothing can differ, so the decodings are trivially
    # equivalent (the original implementation divided by zero here).
    if not str_a:
        return 1.0

    character_count = len(str_a)
    # NOTE: zip() truncates to the shorter decoding — characters beyond the
    # shorter string's length are not counted as differences.
    diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))

    return 1.0 - (diff_character_count / character_count)


def cli_coverage(arguments: list[str]) -> int:
    """Run the embedded detection-success coverage check against ./char-dataset.

    Walks every file under the char-dataset tree, asks charset-normalizer to
    guess its encoding, and compares the guess to the expected encoding (the
    file's parent directory name).

    :param arguments: Command-line arguments, typically ``sys.argv[1:]``.
    :return: 0 when the measured coverage meets the ``--coverage`` threshold,
        1 otherwise (including when the dataset is missing or empty).
    """
    parser = argparse.ArgumentParser(
        description="Embedded detection success coverage script checker for Charset-Normalizer"
    )

    parser.add_argument(
        "-p",
        "--with-preemptive",
        action="store_true",
        default=False,
        dest="preemptive",
        help="Enable the preemptive scan behaviour during coverage check",
    )
    parser.add_argument(
        "-c",
        "--coverage",
        action="store",
        default=90,
        type=int,
        dest="coverage",
        help="Define the minimum acceptable coverage to succeed",
    )

    args = parser.parse_args(arguments)

    if not isdir("./char-dataset"):
        print(
            "This script requires https://github.com/Ousret/char-dataset to be cloned on package root directory"
        )
        # Return the status code instead of exit(): keeps the function usable
        # as a library call; the __main__ guard still produces exit code 1.
        return 1

    print(f"> using charset-normalizer {__version__}")

    success_count = 0
    total_count = 0

    for tbt_path in sorted(glob("./char-dataset/**/*.*")):
        # The expected encoding is encoded in the parent directory name.
        expected_encoding = tbt_path.split(sep)[-2]
        total_count += 1

        results = from_path(tbt_path, preemptive_behaviour=args.preemptive)

        # "None" directory means "should not be decodable": no result is a pass.
        if expected_encoding == "None" and len(results) == 0:
            print(f"✅✅ '{tbt_path}'")
            success_count += 1
            continue

        if len(results) == 0:
            print(f"⚡⚡ '{tbt_path}' (nothing)")
            continue

        result = results.best()

        # Accept either the raw directory name or its IANA-normalized form.
        if (
            expected_encoding in result.could_be_from_charset
            or iana_name(expected_encoding) in result.could_be_from_charset
        ):
            print(f"✅✅ '{tbt_path}'")
            success_count += 1
            continue

        # Wrong charset guess, but near-identical decoded text still counts.
        calc_eq = calc_equivalence(result.raw, expected_encoding, result.encoding)

        if calc_eq >= 0.98:
            success_count += 1
            print(
                f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100.0, 3)} %)"
            )
            continue

        print(f"⚡ '{tbt_path}' (got '{result.encoding}')")

    # Guard against an empty (or glob-mismatched) dataset tree.
    if total_count == 0:
        print("No file found under ./char-dataset; coverage cannot be computed.")
        return 1

    # Round the PERCENTAGE, not the ratio: the original
    # round(ratio, 2) * 100.0 quantized coverage to whole-percent steps and
    # produced float artifacts (e.g. 83.99999999999999) that distorted the
    # threshold comparison below.
    success_ratio = round(success_count / total_count * 100.0, 2)

    print(
        f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
    )

    return 0 if success_ratio >= args.coverage else 1


if __name__ == "__main__":
    # Script entry point: the process exit status mirrors the coverage result
    # (SystemExit is exactly what the exit() builtin raises).
    raise SystemExit(cli_coverage(argv[1:]))