File: linebasedCorpus.py

package info (click to toggle)
vectorscan 5.4.11-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 12,864 kB
  • sloc: cpp: 150,782; ansic: 40,288; python: 620; sh: 32; makefile: 14
file content (53 lines) | stat: -rwxr-xr-x 1,338 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python

'''
Simple script to take a file full of lines of text and push them into a
Hyperscan benchmarking corpus database, one block per line.
'''

import sys, getopt, os.path
from CorpusBuilder import CorpusBuilder

def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db with
    name @outFN.

    Every line has its trailing whitespace (including the line terminator)
    stripped and is added as one chunk to a single stream with id 0.

    Exits the process with status -1 if @inFN does not exist, and with
    status 0 if it contains no lines (checked before the CorpusBuilder for
    @outFN is constructed).
    '''

    if not os.path.exists(inFN):
        # Report the missing *input* path, not the output db name.
        print("Input file '%s' does not exist. Exiting." % inFN, file=sys.stderr)
        sys.exit(-1)

    # Read everything up front so the emptiness check runs before the builder
    # is created; the context manager closes the input file promptly.
    with open(inFN) as inFile:
        lines = inFile.readlines()

    if not lines:
        print("Input file contained no lines. Exiting.", file=sys.stderr)
        sys.exit(0)

    builder = CorpusBuilder(outFN)

    # write a single stream to contain everything
    streamId = 0

    for line in lines:
        builder.add_chunk(streamId, line.rstrip())

    builder.finish()

def usage(exeName):
    '''
    Print the command-line synopsis for @exeName to stderr and exit the
    process with status -1.
    '''
    print("Usage: %s -i <input file> -o <output file>" % exeName,
          file=sys.stderr)
    sys.exit(-1)

if __name__ == '__main__':
    # Script entry point: parse -i/-o from the command line and build the
    # corpus. '-c <value>' is still accepted by the option string but its
    # value is not used here.
    exeName = os.path.basename(sys.argv[0])

    # getopt raises GetoptError for an unknown option or a missing option
    # argument; report it with the usage text instead of a traceback.
    try:
        opts, _ = getopt.getopt(sys.argv[1:], 'i:o:c:')
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage(exeName)
    args = dict(opts)

    # Both the input and the output file names are mandatory.
    requiredKeys = [ '-i', '-o' ]
    for k in requiredKeys:
        if k not in args:
            usage(exeName)

    fnArgs = tuple([args[k] for k in requiredKeys])
    lineCorpus(*fnArgs)