File: linebasedCorpus.py

package info (click to toggle)

vectorscan 5.4.11-2

links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 12,864 kB
sloc: cpp: 150,782; ansic: 40,288; python: 620; sh: 32; makefile: 14

file content (53 lines) | stat: -rwxr-xr-x 1,338 bytes

parent folder | download | duplicates (2)

#!/usr/bin/env python

'''
Simple script to take a file full of lines of text and push them into a
Hyperscan benchmarking corpus database, one block per line.
'''

import sys, getopt, os.path
from CorpusBuilder import CorpusBuilder

def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db with
    name @outFN.

    Every line is added, in file order, as one chunk of stream 0, with its
    trailing whitespace (including the line terminator) stripped.

    Exits the process with status -1, after a message on stderr, if @inFN
    does not exist, and with status 0 if it contains no lines. In both cases
    the CorpusBuilder for @outFN is never constructed.
    '''

    if not os.path.exists(inFN):
        # Report the file that is actually missing: the input, not the output.
        print("Input file '%s' does not exist. Exiting." % inFN, file=sys.stderr)
        sys.exit(-1)

    # Use a context manager so the input handle is closed as soon as the
    # lines have been read, rather than whenever it gets garbage collected.
    with open(inFN) as inFile:
        lines = inFile.readlines()

    if not lines:
        print("Input file contained no lines. Exiting.", file=sys.stderr)
        sys.exit(0)

    builder = CorpusBuilder(outFN)

    # write a single stream to contain everything
    streamId = 0

    for l in lines:
        builder.add_chunk(streamId, l.rstrip())

    builder.finish()

def usage(exeName):
    '''
    Print the command-line synopsis for @exeName to stderr, then exit the
    process with status -1.
    '''
    synopsis = "Usage: %s -i <input file> -o <output file>" % exeName
    print(synopsis, file=sys.stderr)
    sys.exit(-1)

if __name__ == '__main__':
    # Script entry point: parse -i/-o from the command line and build the
    # corpus. Any parsing problem ends in the usage message, not a traceback.
    exeName = os.path.basename(sys.argv[0])

    # '-c <value>' is still accepted so existing invocations keep working,
    # but its value is not used by this script.
    try:
        args = getopt.getopt(sys.argv[1:], 'i:o:c:')
    except getopt.GetoptError as err:
        # Unknown option or option missing its value: say what was wrong,
        # then show the usage text (usage() exits with status -1).
        print(err, file=sys.stderr)
        usage(exeName)
    # getopt returns (options, leftover positionals); only options are used.
    args = dict(args[0])

    requiredKeys = [ '-i', '-o' ]
    for k in requiredKeys:
        if k not in args:
            usage(exeName)

    # requiredKeys order matches lineCorpus(inFN, outFN).
    fnArgs = tuple([args[k] for k in requiredKeys])
    lineCorpus(*fnArgs)