1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
|
####
#
# Pylet - Python Language Engineering Toolkit
# Written by Sander Canisius <S.V.M.Canisius@uvt.nl>
#
from itertools import takewhile, ifilter, imap
def uttDelimiterTest(line):
    """Return True if *line* starts with the "<utt>" marker."""
    return line.startswith("<utt>")


def emptyLineDelimiterTest(line):
    """Return True if *line* is empty or consists only of whitespace."""
    return not line.strip()


class filterFn:
    """Stateful line predicate that collapses runs of delimiter lines.

    Every non-delimiter line is accepted.  A delimiter line is accepted
    only when the line seen just before it was not a delimiter, so
    delimiters at the very start and repeated delimiters are rejected.
    """

    def __init__(self, delimiterTest):
        # Callable deciding whether a line is a delimiter.
        self.delimiterTest = delimiterTest
        # True when the previously seen line was not a delimiter.
        self.keep = False

    def __call__(self, line):
        """Return whether *line* should be kept, and update the state."""
        isDelimiter = self.delimiterTest(line)
        # A delimiter inherits the current state; anything else passes.
        accepted = self.keep if isDelimiter else True
        self.keep = not isDelimiter
        return accepted


def sentenceIterator(stream, delimiterTest=emptyLineDelimiterTest):
    """Yield the sentences found in *stream*, one list per sentence.

    A sentence is a maximal run of consecutive lines for which
    *delimiterTest* is false.  Each such line is split on whitespace, so
    a sentence is a list of token lists.  Delimiter lines only separate
    sentences: leading, trailing and repeated delimiters never produce
    empty sentences, and a final sentence that is not followed by a
    delimiter is still yielded.

    stream        -- iterable of lines (strings)
    delimiterTest -- callable returning a true value for delimiter lines
    """
    sentence = []
    for line in stream:
        if delimiterTest(line):
            # Only close a sentence that has content; this is what makes
            # extra delimiter lines harmless.
            if sentence:
                yield sentence
                sentence = []
        else:
            sentence.append(line.split())

    # The stream may end without a closing delimiter.
    if sentence:
        yield sentence


def nonDelimitedSentenceIterator(stream, sentenceStartTest):
    """Yield sentences from a stream that has no delimiter lines.

    Every line of *stream* is split on whitespace.  A new sentence
    begins at each line whose token list satisfies *sentenceStartTest*;
    that line becomes the first element of the new sentence.

    stream            -- iterable of lines (strings)
    sentenceStartTest -- callable taking the token list of a line and
                         returning a true value if it starts a sentence

    Each yielded sentence is a non-empty list of token lists.
    """
    sentence = []
    for rawLine in stream:
        tokens = rawLine.split()
        # The test is evaluated for every line; the sentence check merely
        # prevents yielding an empty sentence before the first line.
        if sentenceStartTest(tokens) and sentence:
            yield sentence
            sentence = []
        sentence.append(tokens)

    # Flush whatever was collected after the last sentence start.
    if sentence:
        yield sentence


def makeWindow(tokens, focusIndex, leftSize, rightSize, labelFunction,
               emptySlotLabel="__"):
    """Return the labels in a window around tokens[focusIndex].

    The window covers up to *leftSize* tokens before the focus token,
    the focus token itself and up to *rightSize* tokens after it.
    *labelFunction* is applied to every token inside the window.
    Positions that fall before the start or past the end of *tokens*
    are filled with *emptySlotLabel*, so for a focusIndex inside
    *tokens* the result has leftSize + 1 + rightSize elements.

    tokens         -- sequence supporting len() and slicing
    focusIndex     -- index of the focus token
    leftSize       -- number of context slots left of the focus
    rightSize      -- number of context slots right of the focus
    labelFunction  -- callable mapping a token to its label
    emptySlotLabel -- filler for slots outside the sequence
    """
    start = max(0, focusIndex - leftSize)
    end = min(len(tokens), focusIndex + rightSize + 1)

    leftPadding = max(0, leftSize - focusIndex) * [emptySlotLabel]
    rightPadding = \
        max(0, focusIndex + rightSize - len(tokens) + 1) * [emptySlotLabel]

    # Build a real list: the result of map() cannot be concatenated with
    # lists on interpreters where it returns an iterator.
    labels = [labelFunction(token) for token in tokens[start:end]]

    return leftPadding + labels + rightPadding
|