File: benchmarks.py

package info (click to toggle)
thefuzz 0.22.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 696 kB
  • sloc: python: 744; sh: 115; makefile: 8
file content (110 lines) | stat: -rw-r--r-- 4,016 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from timeit import timeit
import math
import csv

# Repetition count for the cheap single-call benchmarks; the scorer
# benchmarks further down run a hundredth of this.
iterations = 100000


# Load the sample titles once. The context manager closes the file handle as
# soon as the rows are read, and newline='' lets the csv module do its own
# line-ending handling, as its documentation asks for.
with open('data/titledata.csv', newline='') as title_file:
    reader = csv.DictReader(title_file, delimiter='|')
    titles = [i['custom_title'] for i in reader]
# One newline-separated string; it is spliced into a timed statement later.
title_blob = '\n'.join(titles)


# Variations on one event title; the scorer benchmarks compare each of these
# against the fixed query 'cirque du soleil'.
cirque_strings = [
    "cirque du soleil - zarkana - las vegas",
    "cirque du soleil ",
    "cirque du soleil las vegas",
    "zarkana las vegas",
    "las vegas cirque du soleil at the bellagio",
    "zarakana - cirque du soleil - bellagio",
]

# A candidate list with empty and None entries mixed in; two entries repeat
# titles from cirque_strings.
choices = [
    "",
    "new york yankees vs boston red sox",
    "",
    cirque_strings[5],
    None,
    cirque_strings[2],
    None,
]

# Text with punctuation, accents and non-ASCII code points. The doubled
# backslash keeps a literal backslash in front of the apostrophe, so the value
# stays a valid escape when it is pasted between single quotes in a statement.
mixed_strings = [
    "Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
    "C\\'est la vie",
    "Ça va?",
    "Cães danados",
    "\xacCamarões assados",
    "a\xac\u1234\u20ac\U00008000",
]

# Setup code shared by every timeit call; the trailing "; " allows further
# setup statements to be appended.
common_setup = "from thefuzz import fuzz, utils; "


def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
    """
    Time ``stmt`` with ``timeit`` and print the total and average run time.

    The average is scaled to one of s, ms, us or ns so that the printed value
    normally falls between 1 and 1000. Averages of 1000 s or more are reported
    in seconds and averages below 1 ns in nanoseconds.

    :param stmt: statement to time, passed through to ``timeit``.
    :param setup: setup code, passed through to ``timeit``.
    :param number: how many times to run ``stmt``. It is truncated with
        ``int``, so a float such as ``iterations / 100`` is accepted.
    :raises ValueError: if ``number`` truncates to less than 1, because an
        average over zero runs is undefined.
    """
    units = ["s", "ms", "us", "ns"]
    runs = int(number)
    if runs < 1:
        raise ValueError("number must be at least 1, got %r" % (number,))

    duration = timeit(stmt, setup, number=runs)
    # Average over the count that was actually executed, not the raw argument.
    avg_duration = duration / float(runs)

    if avg_duration > 0:
        # log10(x) / 3 is the base-1000 logarithm; log10 avoids the rounding
        # error of math.log(x, 1000) at exact powers of 1000.
        thousands = int(math.floor(math.log10(avg_duration) / 3))
    else:
        # A zero reading has no logarithm; report it in seconds.
        thousands = 0
    # Keep the exponent inside the units table: a positive exponent would
    # index from the end of the list (printing "ns" for a very slow run) and
    # one below -3 would run past it.
    thousands = max(-(len(units) - 1), min(thousands, 0))

    print("Total time: {:f}s. Average run: {:.3f}{}.".format(
        duration, avg_duration * (1000 ** -thousands), units[-thousands]))


# Time utils.full_process on every sample string. Each value is spliced into
# the statement source with %s, so a None entry is benchmarked as the text
# 'None' rather than as the None object.
for s in mixed_strings + cirque_strings + choices:
    print('Test full_process for: "%s"' % s)
    statement = "utils.full_process(u'%s')" % s
    print_result_from_timeit(statement, common_setup, number=iterations)

# Time each core scorer against every cirque string, one scorer at a time.
# The outer loop walks the scorers so the output order is: all ratio runs,
# then all partial_ratio runs, then all WRatio runs.
for scorer_name in ('ratio', 'partial_ratio', 'WRatio'):
    for s in cirque_strings:
        print('Test fuzz.%s for string: "%s"' % (scorer_name, s))
        print('-------------------------------')
        statement = "fuzz.%s(u'cirque du soleil', u'%s')" % (scorer_name, s)
        print_result_from_timeit(statement, common_setup,
                                 number=iterations / 100)

# Benchmark process.extract over 5000 random 30-character choices, once per
# scorer. The setup code seeds the random module before building the list, so
# both runs are timed against the same choices.
extract_query = 'cirque du soleil'
extract_setup = (
    common_setup
    + " from thefuzz import process; import string,random; random.seed(18);"
    " choices = [''.join(random.choice(string.ascii_uppercase + string.digits)"
    " for _ in range(30)) for s in range(5000)]"
)

for scorer_name in ('QRatio', 'WRatio'):
    # Report the query that is actually passed to process.extract, not
    # whatever value an earlier loop happened to leave in `s`.
    print('Test process.extract(scorer =  fuzz.%s) for string: "%s"'
          % (scorer_name, extract_query))
    print('-------------------------------')
    print_result_from_timeit(
        "process.extract(u'%s', choices, scorer =  fuzz.%s)"
        % (extract_query, scorer_name),
        extract_setup,
        number=10)


# Real-world case: sort every title loaded from the CSV by its fuzz.ratio
# score against a single query.

s = 'New York Yankees'

# The timed statement is generated as source text. %r embeds the blob and the
# query as valid Python literals, so a title containing a backslash or quote
# characters can neither break the generated code nor be altered by escape
# processing. Stripping and splitting the blob is part of the statement and is
# therefore included in the measured time.
test = ''.join([
    'import functools\n',
    'title_blob = %r\n' % title_blob,
    'title_blob = title_blob.strip()\n',
    'titles = title_blob.split("\\n")\n',
    'prepared_ratio = functools.partial(fuzz.ratio, %r)\n' % s,
    'titles.sort(key=prepared_ratio)\n',
])

print('Real world ratio(): "%s"' % s)
print('-------------------------------')
print_result_from_timeit(test, common_setup, number=100)