File: gutenbergCorpus.py

package info (click to toggle)
hyperscan 5.4.2-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 12,304 kB
  • sloc: cpp: 143,324; ansic: 41,041; python: 621; sh: 32; makefile: 12
file content (68 lines) | stat: -rwxr-xr-x 1,967 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python

'''
This script creates a Hyperscan benchmarking corpus database from a supplied
group of Project Gutenberg texts.
'''

import sys, getopt, os.path
import gutenberg.acquire, gutenberg.cleanup, gutenberg.query
from CorpusBuilder import CorpusBuilder

# Stream bookkeeping shared by every addBlocks() call, so that a stream keeps
# filling across consecutive texts instead of restarting for each one.
stream_id = 0      # ID of the stream currently being filled
stream_bytes = 0   # amount of data already added to that stream

def addBlocks(builder, block_size, stream_size, text_id, text):
    '''
    Split text into blocks and add each one to builder.

    builder:     object providing add_chunk(stream_id, chunk).
    block_size:  maximum length of one block, as measured by len(); must be
                 positive.
    stream_size: once the current stream has received at least this much
                 data, later blocks go to a new stream ID.
    text_id:     identifier of the text, used only in progress messages.
    text:        the text to split; the final block may be shorter than
                 block_size.

    Updates the module-level stream_id / stream_bytes counters.
    Raises ValueError if block_size is not positive.
    '''
    global stream_id
    global stream_bytes

    # A zero or negative block size would never advance through the text.
    if block_size <= 0:
        raise ValueError("block_size must be positive, got %r" % (block_size,))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("text %s len %s" % (text_id, len(text)))

    num_blocks = 0
    for start in range(0, len(text), block_size):
        # Slicing clamps at the end of the text, so no explicit min() needed.
        chunk = text[start:start + block_size]
        builder.add_chunk(stream_id, chunk)
        num_blocks += 1
        stream_bytes += len(chunk)
        if stream_bytes >= stream_size:
            # Current stream is full: following blocks start the next one.
            stream_id += 1
            stream_bytes = 0

    print("Text %s : added %s blocks of %s bytes."
          % (text_id, num_blocks, block_size))

def buildCorpus(outFN, block_size, stream_size, text_ids):
    '''
    Build a corpus in outFN from the given Project Gutenberg texts.

    outFN:       value passed to CorpusBuilder to name the output.
    block_size:  block length forwarded to addBlocks().
    stream_size: stream length threshold forwarded to addBlocks().
    text_ids:    Gutenberg text IDs; each is converted with int().

    Each text is loaded, stripped of its Gutenberg headers and surrounding
    whitespace, and added block by block. Exits the process with a non-zero
    status if text_ids is empty.
    '''
    if not text_ids:
        sys.stderr.write("Must provide at least one input ID\n")
        # Missing input is a failure; report it as such to the caller.
        sys.exit(1)

    builder = CorpusBuilder(outFN)

    total_bytes = 0
    for text_id in text_ids:
        text_id = int(text_id)
        text = gutenberg.acquire.load_etext(text_id)
        text = gutenberg.cleanup.strip_headers(text).strip()
        addBlocks(builder, block_size, stream_size, text_id, text)
        total_bytes += len(text)

    builder.finish()

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Total: %s bytes." % total_bytes)

def usage(exeName):
    '''
    Write the command-line synopsis to stderr and exit with status -1.

    exeName: program name substituted into the synopsis.
    '''
    errmsg = "Usage: %s -o <output file> -b <block size> -s <max stream size> <gutenberg text id>..."
    # sys.stderr.write works on both Python 2 and 3, unlike "print >>".
    sys.stderr.write((errmsg % exeName) + "\n")
    sys.exit(-1)

if __name__ == '__main__':
    # Script entry point: parse -o/-b/-s plus the text IDs, then build.
    exeName = os.path.basename(sys.argv[0])

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'o:b:s:')
    except getopt.GetoptError as err:
        # Unknown option or missing argument: explain, then show the synopsis.
        sys.stderr.write("%s\n" % err)
        usage(exeName)
    opts = dict(opts)

    # All three options are mandatory; usage() exits the process.
    requiredKeys = [ '-o', '-b', '-s' ]
    for k in requiredKeys:
        if k not in opts:
            usage(exeName)

    try:
        block_size = int(opts['-b'])
        stream_size = int(opts['-s'])
    except ValueError:
        # Non-numeric size: show the synopsis rather than a traceback.
        usage(exeName)

    buildCorpus(opts['-o'], block_size, stream_size, args)