1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
|
#!/usr/bin/env python
# This script runs a simple benchmark of the python stemmer interface.
import timeit
# Corpus files to benchmark against; the paths are relative to the current
# working directory.
datafiles = ('sampledata/englishvoc.txt', 'sampledata/puttydoc.txt',)

# Not referenced anywhere in this script; kept so the module's globals are
# unchanged.
words_lst = [None]

# Values passed as the second argument to Stemmer.Stemmer() (reported as
# "cacheSize" in the output).
_CACHE_SIZES = (0, 1, 10000, 30000)

# Number of stemWords() calls per timing sample.
_ITERATION_COUNTS = (1, 2, 3, 10)

# Number of timing samples taken for each configuration; the minimum is
# reported.
_REPEATS = 5

# Source code that timeit runs once before each timing sample.  The path is
# substituted with %r so that quotes or backslashes in it still yield a valid
# string literal, and the file is read inside a ``with`` block so the handle is
# closed before the timed statement runs.
_SETUP_TEMPLATE = r"""
import Stemmer
stemmer = Stemmer.Stemmer('en', %d)
words = []
with open(%r) as source:
    for line in source:
        words.extend(line.split())
"""


def read_words(path):
    """Return every whitespace-separated token of the file at *path*.

    Tokens are returned in file order.  The file is closed before returning.
    """
    with open(path) as source:
        return [word for line in source for word in line.split()]


def build_setup(cache_size, datafile):
    """Return the timeit setup source for one benchmark configuration.

    The generated code creates an English stemmer with *cache_size* as its
    second constructor argument and loads the words of *datafile* into a
    list named ``words``.
    """
    return _SETUP_TEMPLATE % (cache_size, datafile)


def main():
    """Time stemmer.stemWords() for every corpus / cache size / iteration count.

    Prints one line per configuration containing the corpus path, its word
    count, the cache size, the iteration count and the best per-call time.
    """
    for datafile in datafiles:
        # Only the word count is needed here; the timed code loads its own
        # copy of the words inside the setup snippet.
        word_count = len(read_words(datafile))

        for cache_size in _CACHE_SIZES:
            timer = timeit.Timer(setup=build_setup(cache_size, datafile),
                                 stmt='stemmer.stemWords(words)')
            for iters in _ITERATION_COUNTS:
                # repeat() returns total time per sample; divide by the
                # iteration count to get the time of a single call.
                per_call = [total / iters
                            for total in timer.repeat(_REPEATS, iters)]
                print("'%s':words=%d,cacheSize=%d,iters=%d,mintime=%f" %
                      (datafile, word_count, cache_size, iters,
                       min(per_call)))


if __name__ == '__main__':
    main()
|