File: bench.py

package info (click to toggle)
python-acora 2.5-0.1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 2,084 kB
  • sloc: python: 996; makefile: 55; sh: 13
file content (149 lines) | stat: -rw-r--r-- 5,592 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Benchmark comparison between acora search and re.findall()
"""

# Implementations that compare_search() benchmarks: "pa" selects the acora
# matcher built with PyAcora, "ca" the matcher returned by
# AcoraBuilder.build() without an explicit engine, and "re" a compiled
# regular expression.
COMPARED_IMPLEMENTATIONS = ["pa", "ca", "re"]
# Passed as ``number=`` to timeit's Timer.repeat(): how many times the search
# is called within each timing run.
REPEAT_COUNT = 5

import re
import sys
import timeit
from time import time

from itertools import combinations
from functools import partial

from acora import AcoraBuilder, BytesAcora, UnicodeAcora, PyAcora


def prepare_benchmark_data():
    """Build the text to search and the keyword list used by the benchmark.

    The text is a fixed ASCII sample concatenated in lower case, original
    case and upper case, and that triple is repeated 1000 times.

    Returns:
        A ``(search_string, all_keywords)`` tuple; ``all_keywords`` is a list.
        On Python 2 both are converted with ``unicode()``.
    """
    fragments = (
        'bdfdaskdjfhaslkdhfsadhfklashdflabcasdabcdJAKHDBVDFLNFCBLSADHFCALKSJ',
        'jklhcnajskbhfasjhancfksjdfhbvaliuradefhzcbdegnashdgfbcjaabesdhgkfcnash',
        'fdkhbdegxcbgjsvdhabcabcfcgbnxahsdbgfbcakjsdhgnfcxsababcmdabe',
    )
    sample = ''.join(fragments)
    # Mix all three casings so that case-sensitive and case-insensitive
    # searches see different match counts.
    mixed_case = sample.lower() + sample + sample.upper()
    search_string = mixed_case * 1000

    all_keywords = [
        # overlapping prefixes / repetitions
        'ab', 'abc', 'abcd', 'abcabc', 'ababc', 'ABBBC', 'ABCABC',
        'bdfd', 'ade', 'abe', 'bdeg', 'fklash',
        'gnfcxsababcmdabe', 'SADHFCAL',
        # keywords that never occur in the text
        'notthere', 'not-to-be-found', 'not-to-be-found-either',
    ]

    if sys.version_info < (3,):
        # Python 2: plain literals are byte strings, so convert explicitly.
        all_keywords = [unicode(kw) for kw in all_keywords]
        search_string = unicode(search_string)

    return search_string, all_keywords


def compare_search(s, filename, ignore_case, *keywords):
    """Time keyword search over ``s`` with every enabled implementation.

    For each name listed in ``COMPARED_IMPLEMENTATIONS`` a matcher is built
    ('pa': ``AcoraBuilder.build(acora=PyAcora)``, 'ca': ``AcoraBuilder.build()``,
    're': a compiled alternation of the keywords), the setup and search
    timings are printed, and the search results are returned.

    Args:
        s: the string to search.
        filename: path of a file to additionally search with
            ``filefindall()``; a false value skips the file benchmarks.
        ignore_case: passed to ``AcoraBuilder`` and, when true, selects
            ``re.I`` for the regular expression.
        *keywords: the keywords to search for. At least one is required
            because the first keyword is inspected to pick the join
            separator and the printed string type.

    Returns:
        A 5-tuple ``(pa findall, ca findall, pa filefindall, ca filefindall,
        re findall)``. An entry is ``None`` only when that search was not
        run; a search that found nothing yields its (empty) result so that
        callers can still verify it.
    """
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    # Every setup time needs a default, otherwise the summary line below
    # fails with a NameError as soon as one implementation is disabled.
    setup_pya = setup_ca = setup_re = 0
    builder = None

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'): # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t

    if builder is not None:
        is_unicode = builder.for_unicode
    else:
        # No acora builder was created (only 're' is enabled): derive the
        # label from the keyword type instead.
        is_unicode = not isinstance(keywords[0], bytes)
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
            'in' if ignore_case else '',
            'unicode' if is_unicode else 'bytes',
            setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    # Conditional expressions instead of "x and y or None": the latter turns
    # an empty result list into None, which hides it from verification.
    return (
        py_acora.findall(s) if run_pa else None,
        c_acora.findall(s) if run_ca else None,
        py_acora.filefindall(filename) if (run_pa and filename) else None,
        c_acora.filefindall(filename) if (run_ca and filename) else None,
        regexp.findall(s) if run_re else None,
        )


def run_benchmark(search_string, all_keywords):
    """Run compare_search() for every non-empty combination of keywords.

    Combinations are processed from the largest (all keywords) down to
    single keywords. Each combination is searched case-sensitively and
    case-insensitively in the text, then case-sensitively in its ASCII
    encoded form (both in memory and from a temporary file). On Python 2
    the encoded form is also searched case-insensitively. The acora results
    are checked with assert_equal().

    Args:
        search_string: the text to search; must be encodable as ASCII.
        all_keywords: sequence of text keywords; must be encodable as ASCII.

    Raises:
        AssertionError: propagated from assert_equal() when a result count
            does not match the expected number of occurrences.
    """
    import tempfile

    search_string_lower = search_string.lower()
    bytes_search_string = search_string.encode('ASCII')
    bytes_search_string_lower = search_string_lower.encode('ASCII')

    # The context manager closes the temporary file even when a check fails
    # part-way through, instead of leaving cleanup to garbage collection.
    with tempfile.NamedTemporaryFile() as temp_text_file:
        temp_text_file.write(bytes_search_string)
        # Make sure the data is on disk before it is re-read by file name.
        temp_text_file.flush()

        filename = temp_text_file.name

        for i in range(len(all_keywords), 0, -1):
            for keywords in combinations(all_keywords, i):
                print('##Keywords(%d): %s' % (len(keywords), ' '.join(sorted(keywords))))
                keywords_lower = [kw.lower() for kw in keywords]

                results = compare_search(search_string, None, False, *keywords)
                for result in results[:2]:
                    assert_equal(results, result, search_string, keywords)

                results = compare_search(search_string, None, True, *keywords)
                for result in results[:2]:
                    assert_equal(results, result, search_string_lower, keywords_lower)

                bytes_keywords = [keyword.encode('ASCII') for keyword in keywords]

                results = compare_search(bytes_search_string, filename, False, *bytes_keywords)
                for result in results[:4]:
                    assert_equal(results, result, bytes_search_string, bytes_keywords)

                if sys.version_info[0] < 3:
                    bytes_keywords_lower = [keyword.encode('ASCII') for keyword in keywords_lower]
                    # case-insensitive search in byte strings is not supported in Py3
                    results = compare_search(bytes_search_string, filename, True, *bytes_keywords)
                    for result in results[:4]:
                        assert_equal(results, result, bytes_search_string_lower, bytes_keywords_lower)


def assert_equal(results, result, search_string, keywords):
    """Check that ``result`` has one entry per keyword occurrence.

    The expected number of matches is the sum of ``search_string.count()``
    over all keywords. A ``result`` of ``None`` is accepted without any
    check. ``results`` (the complete tuple of results) is only used to list
    the length of every entry in the failure message.

    Raises:
        AssertionError: if ``len(result)`` differs from the expected count.
    """
    if result is not None:
        found = len(result)
        expected = sum(search_string.count(keyword) for keyword in keywords)
        assert found == expected, \
            "EXPECTED: %d, got %s, %s" % (
                expected,
                found,
                [None if res is None else len(res) for res in results])


if __name__ == '__main__':
    # Script entry point: generate the benchmark input, then run every
    # keyword combination against it.
    benchmark_text, benchmark_keywords = prepare_benchmark_data()
    run_benchmark(benchmark_text, benchmark_keywords)