File: performance.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (154 lines) | stat: -rw-r--r-- 5,588 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from __future__ import annotations

import argparse
from glob import glob
from math import ceil
from os.path import isdir
from statistics import mean, stdev
from sys import argv
from time import perf_counter_ns

from chardet import detect as chardet_detect

from charset_normalizer import detect


def calc_percentile(data, percentile):
    """Return the value located at the given percentile of ``data``.

    The values are sorted and the rank ``len(data) * percentile / 100`` is
    computed. A fractional rank is rounded up (nearest-rank selection); a
    whole rank selects the element right after it, except at the 100th
    percentile where the largest value is returned.

    Args:
        data: Iterable of mutually comparable values.
        percentile: Requested percentile, between 0 and 100 inclusive.

    Returns:
        The element of ``data`` found at the requested percentile.

    Raises:
        ValueError: If ``percentile`` is outside the 0-100 range.
        IndexError: If ``data`` is empty.
    """
    if not 0 <= percentile <= 100:
        raise ValueError(f"percentile must be between 0 and 100, got {percentile!r}")

    sorted_data = sorted(data)
    n = len(sorted_data)
    if n == 0:
        raise IndexError("cannot compute a percentile of empty data")

    p = n * percentile / 100
    index = int(p) if p.is_integer() else int(ceil(p)) - 1

    # p equals n at the 100th percentile, which would index one past the end;
    # clamp so the largest value is returned instead.
    return sorted_data[min(index, n - 1)]


def _speedup(reference_ms, candidate_ms):
    """Return ``reference_ms / candidate_ms`` rounded to two decimals, flooring the divisor at 1ms."""
    # Delays are rounded to whole milliseconds, so a very fast candidate can
    # round down to 0; assume 1ms in that case instead of dividing by zero.
    return round(reference_ms / max(candidate_ms, 1), 2)


def performance_compare(arguments):
    """Benchmark charset_normalizer against chardet over the char-dataset files.

    Every file matched by ``./char-dataset/**/*.*`` is read as bytes (repeated
    ``--size-increase`` times), passed to both detectors and timed with
    ``perf_counter_ns``. Per-file timings, the ten slowest charset_normalizer
    runs, summary statistics and a side-by-side comparison are printed.

    Args:
        arguments: Command-line arguments (without the program name) parsed
            with ``argparse``; supports ``-s/--size-increase``.

    Returns:
        ``0`` when chardet is slower than charset_normalizer on both the
        average and the 99th percentile (in whole milliseconds), ``1``
        otherwise, including when no dataset file was found.

    Raises:
        SystemExit: With code ``1`` when ``./char-dataset`` is not a
            directory; ``argparse`` also raises it on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Performance CI/CD check for Charset-Normalizer"
    )

    parser.add_argument(
        "-s",
        "--size-increase",
        action="store",
        default=1,
        type=int,
        dest="size_coeff",
        help="Apply artificial size increase to challenge the detection mechanism further",
    )

    args = parser.parse_args(arguments)

    if not isdir("./char-dataset"):
        print(
            "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
        )
        # Raise SystemExit directly: the builtin exit() is injected by the
        # site module and is not guaranteed to exist (e.g. ``python -S``).
        raise SystemExit(1)

    file_list = sorted(glob("./char-dataset/**/*.*"))
    total_files = len(file_list)

    if not file_list:
        # Without samples the statistics below cannot be computed.
        print("No file found in ./char-dataset, nothing to measure")
        return 1

    chardet_results = []
    charset_normalizer_results = []

    for idx, tbt_path in enumerate(file_list):
        with open(tbt_path, "rb") as fp:
            content = fp.read() * args.size_coeff

        before = perf_counter_ns()
        chardet_detect(content)
        chardet_time = round((perf_counter_ns() - before) / 1000000000, 5)
        chardet_results.append(chardet_time)

        before = perf_counter_ns()
        detect(content)
        charset_normalizer_time = round((perf_counter_ns() - before) / 1000000000, 5)
        charset_normalizer_results.append(charset_normalizer_time)

        # A timing that rounds to 0 would break the ratio below; substitute a
        # tiny non-zero duration for the per-file report only (the recorded
        # result above keeps the measured value).
        charset_normalizer_time = charset_normalizer_time or 0.000005
        cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100
        print(
            f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f}  "
            f"CN:{charset_normalizer_time:.5f}  {cn_faster:.1f} %"
        )

    # Print the top 10 rows with the slowest execution time
    print(
        f"\n{'-' * 102}\nTop 10 rows with the slowest execution time of charset_normalizer:\n"
    )
    sorted_results = sorted(
        enumerate(charset_normalizer_results), key=lambda x: x[1], reverse=True
    )
    for idx, elapsed in sorted_results[:10]:
        tbt_path = file_list[idx]
        print(f"{idx + 1:>3}/{total_files} {tbt_path:<82}  CN:{elapsed:.5f}")

    # Print charset normalizer statistics
    min_time = min(charset_normalizer_results)
    max_time = max(charset_normalizer_results)
    # stdev() requires at least two samples; a single file has no spread.
    stdev_time = stdev(charset_normalizer_results) if total_files > 1 else 0.0
    mean_time = mean(charset_normalizer_results)
    # Coefficient of variation; undefined for a zero mean, reported as 0.
    cv = (stdev_time / mean_time) * 100 if mean_time else 0.0
    print(f"\n{'-' * 102}\nCharset Normalizer statistics:\n")
    print(f"Minimum Execution Time: {min_time:.5f} seconds")
    print(f"Maximum Execution Time: {max_time:.5f} seconds")
    print(f"Mean Execution Time: {mean_time:.5f} seconds")
    print(f"Standard Deviation: {stdev_time:.5f} seconds")
    print(f"Coefficient of Variation (CV): {cv:.1f} %")

    # Print comparison statistics for chardet and charset normalizer
    # (all delays below are converted to whole milliseconds)
    chardet_avg_delay = round(mean(chardet_results) * 1000)
    chardet_99p = round(calc_percentile(chardet_results, 99) * 1000)
    chardet_95p = round(calc_percentile(chardet_results, 95) * 1000)
    chardet_50p = round(calc_percentile(chardet_results, 50) * 1000)

    charset_normalizer_avg_delay = round(mean(charset_normalizer_results) * 1000)
    charset_normalizer_99p = round(
        calc_percentile(charset_normalizer_results, 99) * 1000
    )
    charset_normalizer_95p = round(
        calc_percentile(charset_normalizer_results, 95) * 1000
    )
    charset_normalizer_50p = round(
        calc_percentile(charset_normalizer_results, 50) * 1000
    )

    # mypyc can offer performance ~1ms in the 50p. When eq to 0 assume 1 due to imprecise nature of this test.
    if charset_normalizer_50p == 0:
        charset_normalizer_50p = 1

    print(f"\n{'-' * 102}\nCharset Normalizer vs Chardet statistics:\n")

    print("------------------------------")
    print("--> Chardet Conclusions")
    print("   --> Avg: " + str(chardet_avg_delay) + "ms")
    print("   --> 99th: " + str(chardet_99p) + "ms")
    print("   --> 95th: " + str(chardet_95p) + "ms")
    print("   --> 50th: " + str(chardet_50p) + "ms")

    print("------------------------------")
    print("--> Charset-Normalizer Conclusions")
    print("   --> Avg: " + str(charset_normalizer_avg_delay) + "ms")
    print("   --> 99th: " + str(charset_normalizer_99p) + "ms")
    print("   --> 95th: " + str(charset_normalizer_95p) + "ms")
    print("   --> 50th: " + str(charset_normalizer_50p) + "ms")

    print("------------------------------")
    print("--> Charset-Normalizer / Chardet: Performance Сomparison")
    print(
        "   --> Avg: x"
        + str(_speedup(chardet_avg_delay, charset_normalizer_avg_delay))
    )
    print("   --> 99th: x" + str(_speedup(chardet_99p, charset_normalizer_99p)))
    print("   --> 95th: x" + str(_speedup(chardet_95p, charset_normalizer_95p)))
    print("   --> 50th: x" + str(_speedup(chardet_50p, charset_normalizer_50p)))

    # The check passes (0) only when chardet is strictly slower on both the
    # average and the 99th percentile.
    return (
        0
        if chardet_avg_delay > charset_normalizer_avg_delay
        and chardet_99p > charset_normalizer_99p
        else 1
    )


if __name__ == "__main__":
    # Propagate the benchmark verdict returned by performance_compare as the
    # process exit status. SystemExit is raised directly because the builtin
    # ``exit`` is installed by the ``site`` module for interactive use and is
    # missing when that module is not loaded (e.g. ``python -S``).
    raise SystemExit(performance_compare(argv[1:]))