#-*- coding:utf-8 -*-
###############################################################
# PyNLPl - Read tagger data
# by Maarten van Gompel (proycon)
# http://ilk.uvt.nl/~mvgompel
# Induction for Linguistic Knowledge Research Group
# Universiteit van Tilburg
#
# Licensed under GPLv3
#
#
###############################################################
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
import io
class Taggerdata(object):
    """Reader/writer for simple tab-separated tagger data files.

    Each data line holds ``word<TAB>lemma<TAB>pos``.  Sentences are
    delimited either by blank lines, or — when the very first line of
    the file is ``#0`` — by ``#<index>`` marker lines (indexed mode).
    The literal field value ``NONE`` is mapped to ``None`` on reading
    and back to ``NONE`` on writing.
    """

    def __init__(self, filename, encoding='utf-8', mode='r'):
        """Open *filename* for reading (mode ``'r'``) or writing (``'w'``).

        Raises ValueError for any other mode.
        """
        self.filename = filename
        self.encoding = encoding
        # BUGFIX: was an assert, which is silently stripped under `python -O`;
        # invalid input must raise unconditionally.
        if mode != 'r' and mode != 'w':
            raise ValueError("mode must be 'r' or 'w', got: " + repr(mode))
        self.mode = mode
        self.reset()
        self.firstiter = True   # True until the first line has been inspected
        self.indexed = False    # set from the first line: '#0' marks indexed format
        self.writeindex = 0     # sentence index emitted by write()

    def __iter__(self):
        """Yield sentences as ``(words, lemmas, postags)`` list triples."""
        words = []
        lemmas = []
        postags = []
        for line in self.f:
            line = line.strip()
            if self.firstiter:
                # The file is in indexed format iff its very first line is '#0'
                self.indexed = (line == "#0")
                self.firstiter = False
            if not line and not self.indexed:
                # Blank line terminates a sentence in non-indexed mode
                yield (words, lemmas, postags)
                words = []
                lemmas = []
                postags = []
            elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit():
                # A '#<n>' marker starts a new sentence; the initial '#0'
                # precedes the first sentence, so nothing is yielded for it.
                if line != "#0":
                    yield (words, lemmas, postags)
                    words = []
                    lemmas = []
                    postags = []
            elif line:
                try:
                    word, lemma, pos = line.split("\t")
                except ValueError:
                    # BUGFIX: was a bare except; only a field-count mismatch
                    # in the unpacking above is expected. Malformed lines are
                    # treated as fully missing.
                    word = lemma = pos = "NONE"
                if word == "NONE": word = None
                if lemma == "NONE": lemma = None
                if pos == "NONE": pos = None
                words.append(word)
                lemmas.append(lemma)
                postags.append(pos)
        if words:
            # Final sentence without a trailing separator
            yield (words, lemmas, postags)

    def next(self):
        """Return the next sentence as ``(words, lemmas, postags)``.

        Raises StopIteration when the file is exhausted.
        """
        words = []
        lemmas = []
        postags = []
        while True:
            try:
                # BUGFIX: was self.f.next(), a Python 2-only method that
                # raises AttributeError on Python 3; the next() builtin
                # works on both.
                line = next(self.f).strip()
            except StopIteration:
                if words:
                    return (words, lemmas, postags)
                else:
                    raise
            if self.firstiter:
                # The file is in indexed format iff its very first line is '#0'
                self.indexed = (line == "#0")
                self.firstiter = False
            if not line and not self.indexed:
                # Blank line terminates a sentence in non-indexed mode
                return (words, lemmas, postags)
            elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit():
                # '#<n>' marker ends the current sentence (the initial '#0'
                # precedes the first sentence, so it yields nothing).
                if line != "#0":
                    return (words, lemmas, postags)
            elif line:
                try:
                    word, lemma, pos = line.split("\t")
                except ValueError:
                    # BUGFIX: was a bare except (see __iter__)
                    word = lemma = pos = "NONE"
                if word == "NONE": word = None
                if lemma == "NONE": lemma = None
                if pos == "NONE": pos = None
                words.append(word)
                lemmas.append(lemma)
                postags.append(pos)

    # Python 3 iterator-protocol alias for the legacy next() method
    __next__ = next

    def align(self, referencewords, datatuple):
        """Align the reference sentence with the tagged data.

        Returns a list parallel to *referencewords* of
        ``(word, lemma, pos, index, multiword)`` tuples, with
        ``(None, None, None, None, False)`` where no match was found.
        Matching is case-insensitive on the reference side and prefers
        the target word closest in position to the reference word.
        """
        targetwords = []
        for i, (word, lemma, postag) in enumerate(zip(datatuple[0], datatuple[1], datatuple[2])):
            if word:
                subwords = word.split("_")
                for w in subwords:  # split multiword expressions
                    targetwords.append((w, lemma, postag, i, len(subwords) > 1))  # word, lemma, pos, index, multiword?
        referencewords = [w.lower() for w in referencewords]
        alignment = []
        for i, referenceword in enumerate(referencewords):
            found = False
            best = 0
            distance = 999999
            for j, (targetword, lemma, pos, index, multiword) in enumerate(targetwords):
                # prefer the occurrence whose position is nearest to i
                if referenceword == targetword and abs(i - j) < distance:
                    found = True
                    best = j
                    distance = abs(i - j)
            if found:
                alignment.append(targetwords[best])
            else:
                alignment.append((None, None, None, None, False))  # no alignment found
        return alignment

    def reset(self):
        """(Re)open the underlying file in the configured mode/encoding."""
        self.f = io.open(self.filename, self.mode, encoding=self.encoding)

    def write(self, sentence):
        """Write one sentence (an iterable of (word, lemma, pos) triples),
        preceded by its '#<index>' marker line. Falsy fields are written
        as the placeholder string 'NONE'."""
        self.f.write("#" + str(self.writeindex) + "\n")
        for word, lemma, pos in sentence:
            if not word: word = "NONE"
            if not lemma: lemma = "NONE"
            if not pos: pos = "NONE"
            self.f.write(word + "\t" + lemma + "\t" + pos + "\n")
        self.writeindex += 1

    def close(self):
        """Close the underlying file."""
        self.f.close()