File: benchmark.py

package info (click to toggle)

pystemmer 3.0.0-1

links: PTS, VCS
area: main
in suites: forky, sid
size: 1,092 kB
sloc: python: 309; sh: 26; makefile: 14

file content (27 lines) | stat: -rwxr-xr-x 860 bytes

parent folder | download | duplicates (3)

#!/usr/bin/env python

# This script runs a simple benchmark of the Python stemmer (PyStemmer) interface.

import timeit

# Sample texts to benchmark against; the paths are relative to the current
# working directory.
datafiles = ('sampledata/englishvoc.txt', 'sampledata/puttydoc.txt',)
# Not referenced anywhere else in this script.
words_lst = [None]

for datafile in datafiles:
    # The words are loaded here only so that the word count can be reported;
    # the timed statement works on its own list built by the setup snippet.
    words = []
    with open(datafile) as infile:
        for line in infile:
            words.extend(line.split())

    for cache_size in (0, 1, 10000, 30000):
        # The setup snippet is compiled and run by timeit, so it has to build
        # the stemmer and the word list itself. %r embeds the path as a valid
        # Python string literal even if it contains quotes or backslashes.
        setup = r"""
import Stemmer
stemmer = Stemmer.Stemmer('en', %d)
words = []
with open(%r) as infile:
    for line in infile:
        words.extend(line.split())
""" % (cache_size, datafile)
        t = timeit.Timer(setup=setup,
                         stmt='stemmer.stemWords(words)')
        for iters in (1, 2, 3, 10):
            # Each repeat() entry is the total time for `iters` executions;
            # divide to get the time per stemWords() call, then report the
            # best of the 5 repetitions.
            times = [elapsed / iters for elapsed in t.repeat(5, iters)]
            print("'%s':words=%d,cacheSize=%d,iters=%d,mintime=%f" %
                  (datafile, len(words), cache_size, iters, min(times)))