File: benchmark.py

package info (click to toggle)
pystemmer 3.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,092 kB
  • sloc: python: 309; sh: 26; makefile: 14
file content (27 lines) | stat: -rwxr-xr-x 860 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/env python

# This script runs a simple benchmark of the python stemmer interface.

import timeit

datafiles = ('sampledata/englishvoc.txt', 'sampledata/puttydoc.txt',)
# Not referenced anywhere in this script; kept so the module-level names
# stay unchanged.
words_lst = [None]

# For every sample file and every cache size, time stemWords() over the whole
# word list and print the best per-call time for several iteration counts.
for datafile in datafiles:
    # The words are loaded here only so the word count can be reported; the
    # timed statement uses the list built by the timeit setup code below.
    words = []
    # Context manager closes the file deterministically instead of leaking
    # the handle until garbage collection.
    with open(datafile) as infile:
        for line in infile:
            words.extend(line.split())
    for cache_size in (0, 1, 10000, 30000):
        # timeit runs the setup once per repetition, so each repetition gets
        # a freshly constructed stemmer and word list.  The path is inserted
        # with %r so it is always a valid, correctly escaped string literal
        # in the generated source.
        setup = r"""
import Stemmer
stemmer = Stemmer.Stemmer('en', %d)
words = []
with open(%r) as infile:
    for line in infile:
        words.extend(line.split())
""" % (cache_size, datafile)
        t = timeit.Timer(setup=setup,
                         stmt='stemmer.stemWords(words)')
        for iters in (1, 2, 3, 10):
            # repeat() returns the total time for `iters` executions of the
            # statement in each of the 5 repetitions; divide to get the
            # per-call time and report the minimum.
            times = [elapsed / iters for elapsed in t.repeat(5, iters)]
            print("'%s':words=%d,cacheSize=%d,iters=%d,mintime=%f" %
                  (datafile, len(words), cache_size, iters, min(times)))