File: sentences.py

package info (click to toggle)
frog 0.12.15-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 2,096 kB
  • sloc: sh: 11,167; cpp: 4,661; python: 765; makefile: 32
file content (64 lines) | stat: -rw-r--r-- 1,686 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
####
#
# Pylet - Python Language Engineering Toolkit
# Written by Sander Canisius <S.V.M.Canisius@uvt.nl>
#

from itertools import takewhile, ifilter, imap


def uttDelimiterTest(line):
	"""Return True when `line` begins with the "<utt>" marker."""
	return line.startswith("<utt>")


def emptyLineDelimiterTest(line):
	"""Return True when `line` is empty or holds only whitespace."""
	return not line.strip()


class filterFn:
	"""Stateful line predicate that collapses runs of delimiter lines.

	Intended as the predicate of a filter over a stream of lines. Lines
	that are not delimiters are always accepted. A delimiter line is
	accepted only when the line seen immediately before it was not a
	delimiter, so leading delimiters are dropped and every run of
	consecutive delimiters is reduced to its first line.
	"""

	def __init__(self, delimiterTest):
		# Callable returning a true value for lines that act as delimiters.
		self.delimiterTest = delimiterTest
		# True while the most recently seen line was not a delimiter.
		self.keep = False

	def __call__(self, line):
		"""Return True if `line` should remain in the filtered stream."""
		if not self.delimiterTest(line):
			self.keep = True
			return True

		# The first delimiter after some content passes; repeats do not.
		accepted = self.keep
		self.keep = False
		return accepted


def sentenceIterator(stream, delimiterTest=emptyLineDelimiterTest):
	"""Yield the sentences found in a delimiter-separated stream of lines.

	Each sentence is a non-empty list with one entry per line; an entry is
	the list of whitespace-separated fields of that line (line.split()).
	Delimiter lines only mark sentence boundaries and never appear in the
	output. Leading, trailing and repeated delimiter lines are skipped, so
	no empty sentence is ever produced.

	stream -- iterable of lines (strings).
	delimiterTest -- callable returning a true value for delimiter lines;
	blank lines are the delimiters by default.
	"""
	# One pass with an explicit accumulator. This does not depend on map()
	# returning a list (a lazy map object is always true, which would keep
	# a `while sentence:` loop running forever) and needs no separate
	# stateful filter to squeeze out repeated delimiter lines.
	sentence = []
	for line in stream:
		if not delimiterTest(line):
			sentence.append(line.split())
		elif sentence:
			# The first delimiter after some content closes the sentence;
			# further delimiters find the accumulator empty and are ignored.
			yield sentence
			sentence = []

	# The final sentence does not have to be followed by a delimiter.
	if sentence:
		yield sentence


def nonDelimitedSentenceIterator(stream, sentenceStartTest):
	"""Yield sentences from a stream that has no delimiter lines.

	Every line is split on whitespace and sentenceStartTest is called with
	the resulting list of fields. A true result means that line opens a new
	sentence: the lines collected so far (if any) are yielded first, and
	the opening line becomes the first entry of the next sentence.

	stream -- iterable of lines (strings).
	sentenceStartTest -- callable taking the list of fields of one line.

	Each yielded sentence is a non-empty list of field lists.
	"""
	sentence = []
	for line in stream:
		# Split directly rather than through imap(str.split, ...):
		# itertools.imap exists only in Python 2, and the unbound str.split
		# rejects string types other than str.
		fields = line.split()
		if sentenceStartTest(fields) and sentence:
			yield sentence
			sentence = []

		sentence.append(fields)

	# Flush whatever was collected after the last sentence start.
	if sentence:
		yield sentence


def makeWindow(tokens, focusIndex, leftSize, rightSize, labelFunction,
		emptySlotLabel="__"):
	"""Return the labels in a fixed-size window around tokens[focusIndex].

	The window spans leftSize positions before the focus, the focus itself
	and rightSize positions after it. Positions that fall inside `tokens`
	contribute labelFunction(token); positions before the first or after
	the last token contribute emptySlotLabel. For a focusIndex that is a
	valid index of `tokens` the result therefore always has
	leftSize + 1 + rightSize entries.

	tokens -- sequence supporting len() and slicing.
	focusIndex -- index of the token the window is centred on.
	leftSize, rightSize -- number of context positions on either side.
	labelFunction -- callable applied to each token inside the window.
	emptySlotLabel -- filler used for positions outside `tokens`.
	"""
	start = focusIndex - leftSize
	stop = focusIndex + rightSize + 1

	leftPadding = max(0, -start) * [emptySlotLabel]
	# A list comprehension keeps the concatenation below valid even where
	# map() returns an iterator instead of a list.
	labels = [labelFunction(token)
			for token in tokens[max(0, start):min(len(tokens), stop)]]
	rightPadding = max(0, stop - len(tokens)) * [emptySlotLabel]

	return leftPadding + labels + rightPadding