File: benchmark3.py

package info (click to toggle)
python-pyahocorasick 1.4.1-2.1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 768 kB
  • sloc: ansic: 4,552; python: 2,823; sh: 312; makefile: 242
file content (117 lines) | stat: -rw-r--r-- 2,599 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from random import choice, randint, seed
from sys import stdout

try:
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    from time import perf_counter as clock
except ImportError:
    # Interpreters without perf_counter() still provide the legacy timer.
    from time import clock

import ahocorasick


def write(str):
    """Write *str* to standard output without a newline, then flush.

    The flush makes a partial line (such as a progress label) visible
    immediately, before the rest of the line is printed.
    """
    out = stdout
    out.write(str)
    out.flush()


def writeln(str):
    """Write *str* followed by a newline to standard output, then flush.

    The flush mirrors write(): without it, a line can stay in the stream
    buffer when standard output is not line-buffered (for example when it
    is redirected to a file or a pipe), so a finished result would only
    show up once something else flushes the stream.
    """
    stdout.write(str)
    stdout.write('\n')
    stdout.flush()


class ElapsedTime:
    """Context manager that prints a label and how long its body took.

    On entry the label is written left-aligned in a 40-character column
    followed by ": "; on exit the elapsed time in seconds is written with
    three decimals and a trailing " s", completing the line.
    """

    def __init__(self, msg):
        """Remember *msg*, the label printed when the block is entered."""
        self.msg = msg

    def __enter__(self):
        """Print the label, record the start time and return this timer.

        Returning ``self`` allows ``with ElapsedTime(...) as timer`` so the
        caller can read ``timer.get_time()`` once the block has finished.
        """
        # Wrap the label in a 1-tuple so a label that is itself a tuple is
        # formatted as a single value instead of being unpacked by "%".
        write("%-40s: " % (self.msg,))
        self.start = clock()
        return self

    def __exit__(self, a1, a2, a3):
        """Record the stop time and print the elapsed seconds.

        a1, a2 and a3 are the exception type, value and traceback passed
        in by the ``with`` statement.  They are ignored and nothing truthy
        is returned, so an exception raised inside the block still
        propagates after the time has been printed.
        """
        self.stop = clock()
        writeln("%0.3f s" % self.get_time())

    def get_time(self):
        """Return ``stop - start``, the duration of the last timed block.

        Both attributes are only set by __enter__/__exit__, so this is
        meaningful after the ``with`` block has been left.
        """
        return self.stop - self.start


class Test:
    """Benchmark of ``ahocorasick.Automaton`` on randomly generated words.

    The benchmark consists of five phases: generating the data, adding the
    words to an automaton, building the automaton, looking words up and
    scanning an input string.  run() executes them in that order and
    prints the time spent in each.
    """

    def __init__(self, max_word_length, count):
        """Prepare an empty benchmark and reset the random generator.

        max_word_length -- inclusive upper bound on the length of generated
                           words (the lower bound is fixed at 3)
        count           -- number of words to generate; also the length of
                           the input string used by search()
        """
        self.min_word_length = 3
        self.max_word_length = max_word_length
        self.count = count
        self.words = []
        self.inexisting = []
        self.input = ""

        self.automaton = None
        seed(0) # make sure that tests will be repeatable

    def init_data(self):
        """Generate the word lists and the input string.

        Fills ``words`` and ``inexisting`` with ``count`` random
        alphanumeric words each and sets ``input`` to a random string of
        ``count`` characters.  Any previously generated data is replaced,
        so calling this method again does not grow the lists.

        NOTE: ``inexisting`` words come from the same alphabet and length
        range as ``words`` and are not checked against them, so some of
        them may coincide with a stored word.
        """
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

        def random_word(length):
            return ''.join(choice(chars) for _ in range(length))

        # Build into fresh lists: appending to the existing ones would
        # double the data on a second call.
        words = []
        inexisting = []
        for _ in range(self.count):
            # The order of the random calls is kept as is (length, word,
            # length, word) because the data produced for a given seed
            # depends on it.
            length = randint(self.min_word_length, self.max_word_length)
            words.append(random_word(length))

            length = randint(self.min_word_length, self.max_word_length)
            inexisting.append(random_word(length))

        self.words = words
        self.inexisting = inexisting
        self.input = random_word(self.count)

        assert len(self.words) == len(self.inexisting)

    def add_words(self):
        """Create a new automaton and add every word, using it as its own value."""
        self.automaton = ahocorasick.Automaton()
        A = self.automaton
        for word in self.words:
            A.add_word(word, word)

    def build(self):
        """Call ``make_automaton()`` on the automaton created by add_words()."""
        self.automaton.make_automaton()

    def lookup(self):
        """Look up every generated word through ``Automaton.get``.

        Each word from ``words`` is fetched without a default; the word at
        the same position in ``inexisting`` is fetched with the default
        "unknown".
        """
        A = self.automaton
        for word, missing in zip(self.words, self.inexisting):
            A.get(word)
            A.get(missing, "unknown")

    def search(self):
        """Iterate over ``automaton.iter(input)`` and return the item count.

        The items themselves are discarded; only their number is returned
        so that callers can report or sanity-check it.
        """
        A = self.automaton
        n = 0
        for _ in A.iter(self.input):
            n += 1
        return n

    def run(self):
        """Run all phases in order, printing the elapsed time of each."""
        with ElapsedTime("Generating data (%d words)" % self.count):
            self.init_data()

        with ElapsedTime("Add words"):
            self.add_words()

        with ElapsedTime("Building automaton"):
            self.build()

        with ElapsedTime("Look up"):
            self.lookup()

        with ElapsedTime("Search"):
            self.search()

def main():
    """Run the benchmark with one million words of at most 32 characters."""
    benchmark = Test(max_word_length=32, count=1000000)
    benchmark.run()


if __name__ == '__main__':
    main()