1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
|
#!/usr/bin/env python
'''
This script creates a Hyperscan benchmarking corpus database from a supplied
group of Project Gutenberg texts.
'''
import sys, getopt, os.path
import gutenberg.acquire, gutenberg.cleanup, gutenberg.query
from CorpusBuilder import CorpusBuilder
# Stream bookkeeping shared by every addBlocks() call, so that a single stream
# may span the boundary between two texts:
#   stream_id    -- ID passed to builder.add_chunk() for the current stream
#   stream_bytes -- sum of len(chunk) added to the current stream so far
stream_id = 0
stream_bytes = 0


def addBlocks(builder, block_size, stream_size, text_id, text):
    '''
    Split text into consecutive chunks of at most block_size items and add
    each one to builder via builder.add_chunk(stream_id, chunk).

    The module-level stream_id / stream_bytes are updated as chunks are added:
    once the running total for the current stream reaches stream_size, the
    stream ID is incremented and the total is reset to zero. This state is
    not reset between calls.

    builder     -- object providing add_chunk(stream_id, chunk)
    block_size  -- maximum length of each chunk; must be positive
    stream_size -- threshold of accumulated chunk lengths at which a new
                   stream is started
    text_id     -- identifier of the text, used only in progress messages
    text        -- sliceable sequence to be chunked
                   (NOTE(review): sizes are measured with len(), so for a
                   unicode string they are characters rather than bytes)

    Raises ValueError if block_size is not positive.
    '''
    global stream_id
    global stream_bytes

    # A zero or negative block size would never advance past the end of the
    # text, so the loop below would spin forever; fail loudly instead.
    if block_size <= 0:
        raise ValueError("block_size must be positive, got %r" % (block_size,))

    text_len = len(text)
    print("text %s len %s" % (text_id, text_len))

    num_blocks = 0
    offset = 0
    while offset < text_len:
        # Slicing clamps at the end of the sequence, so the final chunk is
        # simply shorter when the text is not a multiple of block_size.
        chunk = text[offset:offset + block_size]
        builder.add_chunk(stream_id, chunk)
        offset += block_size
        num_blocks += 1

        stream_bytes += len(chunk)
        if stream_bytes >= stream_size:
            # Current stream is full: subsequent chunks go to a new stream.
            stream_id += 1
            stream_bytes = 0

    print("Text %s : added %s blocks of %s bytes."
          % (text_id, num_blocks, block_size))
def buildCorpus(outFN, block_size, stream_size, text_ids):
    '''
    Build a corpus database from the given Project Gutenberg text IDs.

    Each entry of text_ids is converted with int(), loaded with
    gutenberg.acquire.load_etext, passed through
    gutenberg.cleanup.strip_headers, stripped of surrounding whitespace and
    handed to addBlocks() for chunking into the builder. When all texts have
    been added, builder.finish() is called and a total is printed.

    outFN       -- argument for the CorpusBuilder constructor
                   (presumably the output file name; confirm against
                   CorpusBuilder)
    block_size  -- forwarded unchanged to addBlocks()
    stream_size -- forwarded unchanged to addBlocks()
    text_ids    -- sequence of values accepted by int()

    If text_ids is empty, a message is written to stderr and the process
    exits with status 1.

    Stream assignment is tracked by the module-level stream_id / stream_bytes
    that addBlocks() maintains; that state is not reset here.
    '''
    if not text_ids:
        sys.stderr.write("Must provide at least one input ID\n")
        # Missing input is an error, so report a non-zero exit status.
        sys.exit(1)

    builder = CorpusBuilder(outFN)
    total_bytes = 0

    for text_id in text_ids:
        text_id = int(text_id)
        text = gutenberg.acquire.load_etext(text_id)
        text = gutenberg.cleanup.strip_headers(text).strip()
        addBlocks(builder, block_size, stream_size, text_id, text)
        # NOTE(review): len() counts characters if text is a unicode string,
        # so "bytes" below may be approximate -- confirm what load_etext
        # returns.
        total_bytes += len(text)

    builder.finish()
    print("Total: %s bytes." % (total_bytes,))
def usage(exeName):
    '''
    Write the command-line synopsis for exeName to stderr and terminate the
    process with exit status -1.
    '''
    template = "Usage: %s -o <output file> -b <block size> -s <max stream size> <gutenberg text id>..."
    sys.stderr.write(template % exeName + "\n")
    sys.exit(-1)
if __name__ == '__main__':
    # Command-line entry point: parse -o/-b/-s plus the positional text IDs,
    # then build the corpus. Any problem with the arguments ends in usage(),
    # which prints the synopsis and exits.
    exe_name = os.path.basename(sys.argv[0])

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'o:b:s:')
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: say what was wrong
        # before showing the synopsis instead of dumping a traceback.
        sys.stderr.write("%s\n" % err)
        usage(exe_name)
    opts = dict(opts)

    # All three options are mandatory.
    for key in ('-o', '-b', '-s'):
        if key not in opts:
            usage(exe_name)

    try:
        block_size = int(opts['-b'])
        stream_size = int(opts['-s'])
    except ValueError:
        # Non-numeric block or stream size.
        usage(exe_name)

    # addBlocks() cannot make progress with a non-positive block size.
    if block_size <= 0:
        usage(exe_name)

    buildCorpus(opts['-o'], block_size, stream_size, args)
|