1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
"""
Benchmark comparison between acora search and re.findall()
"""
# Implementations exercised by compare_search(): "pa" builds the matcher with
# acora=PyAcora, "ca" uses the builder's default build(), and "re" uses a
# compiled regular expression from the re module.
COMPARED_IMPLEMENTATIONS = ["pa", "ca", "re"]
# Passed as ``number`` to timeit's Timer.repeat(), i.e. how many calls are made
# per timing measurement.
REPEAT_COUNT = 5
import re
import sys
import timeit
from time import time
from itertools import combinations
from functools import partial
from acora import AcoraBuilder, BytesAcora, UnicodeAcora, PyAcora
def prepare_benchmark_data():
    """Build the text to search and the keywords to search for.

    The text is a fixed mixed-case seed concatenated in lower-case, original
    and upper-case form, and that unit is repeated 1000 times.

    Returns:
        A ``(search_string, all_keywords)`` tuple. On Python 2 both the text
        and every keyword are converted to ``unicode``.
    """
    seed = ''.join((
        'bdfdaskdjfhaslkdhfsadhfklashdflabcasdabcdJAKHDBVDFLNFCBLSADHFCALKSJ',
        'jklhcnajskbhfasjhancfksjdfhbvaliuradefhzcbdegnashdgfbcjaabesdhgkfcnash',
        'fdkhbdegxcbgjsvdhabcabcfcgbnxahsdbgfbcakjsdhgnfcxsababcmdabe',
    ))
    unit = seed.lower() + seed + seed.upper()
    search_string = unit * 1000

    all_keywords = [
        'ab', 'abc', 'abcd', 'abcabc', 'ababc', 'ABBBC', 'ABCABC',
        'bdfd', 'ade', 'abe', 'bdeg', 'fklash',
        'gnfcxsababcmdabe', 'SADHFCAL',
        'notthere', 'not-to-be-found', 'not-to-be-found-either',
    ]

    if sys.version_info[0] < 3:
        # Python 2 string literals are byte strings; promote them to unicode.
        all_keywords = [unicode(keyword) for keyword in all_keywords]
        search_string = unicode(search_string)

    return search_string, all_keywords
def compare_search(s, filename, ignore_case, *keywords):
    """Time every enabled search implementation on ``s`` and return the matches.

    Which implementations run is controlled by ``COMPARED_IMPLEMENTATIONS``
    ("pa", "ca", "re"). Setup times and the best of the repeated timings are
    printed to stdout.

    Args:
        s: The text to search (unicode or bytes, matching the keywords).
        filename: Path of a file to search with ``filefindall``; a falsy
            value skips the file-based runs.
        ignore_case: Passed to ``AcoraBuilder`` and, for the regex, selects
            ``re.I``.
        *keywords: The keywords to search for; at least one is required
            because the first one is inspected to detect the string type.

    Returns:
        A 5-tuple ``(pa_string, ca_string, pa_file, ca_file, re_string)``.
        Each entry is the implementation's result, or ``None`` when that run
        was disabled, skipped, or produced an empty result.
    """
    # All setup times default to 0 so the summary line can always be printed,
    # whichever implementations are disabled.
    setup_pya = setup_ca = setup_re = 0
    builder = None
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t

    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t

    if run_re:
        t = time()
        # Escape the keywords so they are matched literally, like acora does.
        escaped = map(re.escape, keywords)
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(escaped)
        else:
            kw_regexp = '|'.encode('ASCII').join(escaped)
        regexp = re.compile(kw_regexp, re.I if ignore_case else 0)
        setup_re = time() - t

    # Without a builder (only "re" enabled) derive the label from the keywords.
    if builder is not None:
        is_unicode = builder.for_unicode
    else:
        is_unicode = not isinstance(keywords[0], bytes)

    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
        'in' if ignore_case else '',
        'unicode' if is_unicode else 'bytes',
        setup_pya, setup_ca, setup_re))

    def best_time(search, argument):
        # Fastest of the repeated measurements, REPEAT_COUNT calls each.
        timer = timeit.Timer(partial(search, argument))
        return min(timer.repeat(number=REPEAT_COUNT))

    if run_pa:
        print("TIME(paS): %.3f" % best_time(py_acora.findall, s))
    if run_ca:
        print("TIME(caS): %.3f" % best_time(c_acora.findall, s))
    if filename:
        if run_pa:
            print("TIME(paF): %.3f" % best_time(py_acora.filefindall, filename))
        if run_ca:
            print("TIME(caF): %.3f" % best_time(c_acora.filefindall, filename))
    if run_re:
        print("TIME(reS): %.3f" % best_time(regexp.findall, s))

    # An empty result is deliberately reported as None, as before.
    return (
        (py_acora.findall(s) or None) if run_pa else None,
        (c_acora.findall(s) or None) if run_ca else None,
        (py_acora.filefindall(filename) or None) if run_pa and filename else None,
        (c_acora.filefindall(filename) or None) if run_ca and filename else None,
        (regexp.findall(s) or None) if run_re else None,
    )
def run_benchmark(search_string, all_keywords):
    """Run compare_search() for every non-empty combination of keywords.

    Combinations are tried from the largest (all keywords) down to single
    keywords. For each one the unicode text is searched case-sensitively and
    case-insensitively, then the ASCII-encoded bytes text is searched, both
    in memory and from a temporary file. The number of matches is checked
    with assert_equal().

    Args:
        search_string: The unicode text to search; must be ASCII-encodable.
        all_keywords: The unicode keywords to draw combinations from; each
            must be ASCII-encodable.

    Raises:
        AssertionError: If an implementation's match count is not the
            expected one.
    """
    import tempfile

    search_string_lower = search_string.lower()
    bytes_search_string = search_string.encode('ASCII')
    bytes_search_string_lower = search_string_lower.encode('ASCII')

    # The context manager closes (and thereby removes) the temporary file even
    # when a comparison fails; it stays open while being searched by name.
    with tempfile.NamedTemporaryFile() as temp_text_file:
        temp_text_file.write(bytes_search_string)
        temp_text_file.flush()
        filename = temp_text_file.name

        for i in range(len(all_keywords), 0, -1):
            for keywords in combinations(all_keywords, i):
                print('##Keywords(%d): %s' % (
                    len(keywords), ' '.join(sorted(keywords))))
                keywords_lower = [kw.lower() for kw in keywords]

                # Unicode text, case-sensitive.
                results = compare_search(search_string, None, False, *keywords)
                for result in results[:2]:
                    assert_equal(results, result, search_string, keywords)

                # Unicode text, case-insensitive: compare with lowered data.
                results = compare_search(search_string, None, True, *keywords)
                for result in results[:2]:
                    assert_equal(results, result,
                                 search_string_lower, keywords_lower)

                # Bytes text, case-sensitive, in memory and from the file.
                keywords = [keyword.encode('ASCII') for keyword in keywords]
                results = compare_search(
                    bytes_search_string, filename, False, *keywords)
                for result in results[:4]:
                    assert_equal(results, result, bytes_search_string, keywords)

                if sys.version_info[0] < 3:
                    keywords_lower = [
                        keyword.encode('ASCII') for keyword in keywords_lower]
                    # case-insensitive search in byte strings is not supported in Py3
                    results = compare_search(
                        bytes_search_string, filename, True, *keywords)
                    for result in results[:4]:
                        assert_equal(results, result,
                                     bytes_search_string_lower, keywords_lower)
def assert_equal(results, result, search_string, keywords):
    """Assert that ``result`` holds the expected number of matches.

    The expected number is the sum of ``search_string.count(keyword)`` over
    all keywords. A ``result`` of ``None`` is skipped without checking.

    Args:
        results: All results of the comparison run; only used to report the
            length of each one in the failure message.
        result: The single result to check, or ``None``.
        search_string: The text that was searched.
        keywords: The keywords that were searched for.

    Raises:
        AssertionError: If ``len(result)`` differs from the expected count.
    """
    if result is None:
        return

    def expected_count():
        return sum(search_string.count(keyword) for keyword in keywords)

    def result_sizes():
        return [None if res is None else len(res) for res in results]

    # The message is only built when the check fails.
    assert len(result) == expected_count(), \
        "EXPECTED: %d, got %s, %s" % (
            expected_count(), len(result), result_sizes())
if __name__ == '__main__':
    # Script entry point: build the benchmark data, then run all comparisons.
    benchmark_text, benchmark_keywords = prepare_benchmark_data()
    run_benchmark(benchmark_text, benchmark_keywords)
|