# -*- coding: utf-8 -*-
###############################################################
# PyNLPl - DutchSemCor
# by Maarten van Gompel (proycon)
# http://ilk.uvt.nl/~mvgompel
# Induction for Linguistic Knowledge Research Group
# Universiteit van Tilburg
#
# Licensed under GPLv3
#
# Modified by Ruben Izquierdo
# We also need to store the TIMBL distance to the nearest neighbor
#
# Collection of formats for the DutchSemCor project
#
###############################################################
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

from pynlpl.common import u
import sys
if sys.version < '3':
    from codecs import getwriter
    stderr = getwriter('utf-8')(sys.stderr)
    stdout = getwriter('utf-8')(sys.stdout)
else:
    stderr = sys.stderr
    stdout = sys.stdout
from pynlpl.formats.timbl import TimblOutput
from pynlpl.statistics import Distribution
import io

class WSDSystemOutput(object):
    def __init__(self, filename=None):
        self.data = {}
        self.distances = {}
        self.maxDistance = 1
        if filename:
            self.load(filename)

    def append(self, word_id, senses, distance=0):
        #Commented out by Ruben: some IDs are repeated in all SoNaR test files
        #assert (not word_id in self.data)
        if isinstance(senses, Distribution):
            #store as a list rather than a one-shot generator, so the data survives multiple iterations (#TODO: something may still be off in Distribution?)
            self.data[word_id] = [ (x, y) for x, y in senses ]
            self.distances[word_id] = distance
            if distance > self.maxDistance:
                self.maxDistance = distance
            return
        else:
            assert isinstance(senses, list) and len(senses) >= 1
            self.distances[word_id] = distance
            if distance > self.maxDistance:
                self.maxDistance = distance
            if len(senses[0]) == 1:
                #not a (sense_id, confidence) tuple! compute equal confidence for all elements automatically:
                confidence = 1 / float(len(senses))
                self.data[word_id] = [ (x, confidence) for x in senses ]
            else:
                fulldistr = True
                for sense, confidence in senses:
                    if confidence is None:
                        fulldistr = False
                        break
                if fulldistr:
                    self.data[word_id] = Distribution(senses)
                else:
                    self.data[word_id] = senses
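
    # Illustrative example (word ID and senses are hypothetical):
    # append("w.1", ["sense:a", "sense:b"]) stores equal confidences,
    # [("sense:a", 0.5), ("sense:b", 0.5)], while
    # append("w.1", [("sense:a", 0.8), ("sense:b", 0.2)]) stores a full
    # Distribution over the senses.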

    def getMaxDistance(self):
        return self.maxDistance

    def __iter__(self):
        for word_id, senses in self.data.items():
            yield word_id, senses, self.distances[word_id]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, word_id):
        """Returns the sense distribution for the given word_id"""
        return self.data[word_id]

    def load(self, filename):
        f = io.open(filename, 'r', encoding='utf-8')
        for line in f:
            fields = line.strip().split(" ")
            word_id = fields[0]
            if len(fields[1:]) == 1:
                #only one sense, no confidence expressed:
                self.append(word_id, [(fields[1], None)])
            else:
                senses = []
                distance = -1
                for i in range(1, len(fields), 2):
                    if i + 1 == len(fields):
                        #the last field is the distance
                        if fields[i][:4] == '+vdi':  #support for the previous wsdout format
                            distance = float(fields[i][4:])
                        else:
                            distance = float(fields[i])
                    elif fields[i+1] == '?':
                        senses.append((fields[i], None))
                    else:
                        #convert the confidence to a number so a proper Distribution can be built
                        senses.append((fields[i], float(fields[i+1])))
                self.append(word_id, senses, distance)
        f.close()
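
    # The wsdout format read above has one instance per line: a word ID,
    # then (sense, confidence) pairs, with the Timbl distance as an optional
    # trailing field. An illustrative (hypothetical) line:
    #
    #   example.word.1 sense:a 0.75 sense:b 0.25 0.1734
    #
    # Older files prefixed the distance with '+vdi', which load() still accepts.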

    def save(self, filename):
        f = io.open(filename, 'w', encoding='utf-8')
        for word_id, senses, distance in self:
            f.write(word_id)
            for sense, confidence in senses:
                if confidence is None:
                    confidence = "?"
                f.write(" " + str(sense) + " " + str(confidence))
            if word_id in self.distances:
                f.write(' ' + str(self.distances[word_id]))
            f.write("\n")
        f.close()

    def out(self, filename):
        #note: filename is unused, output is written to stdout
        for word_id, senses, distance in self:
            print(word_id, distance, end="")
            for sense, confidence in senses:
                if confidence is None:
                    confidence = "?"
                print(" " + sense + " " + str(confidence), end="")
            print()

    def senses(self, bestonly=False):
        """Returns a list of all predicted senses"""
        l = []
        for word_id, senses, distance in self:
            for sense, confidence in senses:
                if not sense in l:
                    l.append(sense)
                if bestonly:
                    break
        return l

    def loadfromtimbl(self, filename):
        timbloutput = TimblOutput(io.open(filename, 'r', encoding='utf-8'))
        for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput):
            if distance is not None:
                #distance = '+vdi' + str(distance)
                distance = float(distance)
            if len(features) == 0:
                print("WARNING: Empty feature vector in " + filename + " (line " + str(i+1) + "), skipping!", file=stderr)
                continue
            word_id = features[0]  #note: the word ID is assumed to be the first feature!
            if distribution:
                self.append(word_id, distribution, distance)
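
    # Note: the Timbl output parsed above is assumed to have been produced
    # with verbosity options that include the class distribution and the
    # distance (e.g. +v db+di); the actual line parsing is delegated to
    # pynlpl.formats.timbl.TimblOutput.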

    def fromTimblToWsdout(self, fileTimbl, fileWsdout):
        """Convert Timbl output directly to a wsdout file, without building the in-memory structure"""
        timbloutput = TimblOutput(io.open(fileTimbl, 'r', encoding='utf-8'))
        wsdoutfile = io.open(fileWsdout, 'w', encoding='utf-8')
        for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput):
            if len(features) == 0:
                print("WARNING: Empty feature vector in " + fileTimbl + " (line " + str(i+1) + "), skipping!", file=stderr)
                continue
            word_id = features[0]  #note: the word ID is assumed to be the first feature!
            if distribution:
                wsdoutfile.write(word_id + ' ')
                for sense, confidence in distribution:
                    if confidence is None:
                        confidence = '?'
                    wsdoutfile.write(sense + ' ' + str(confidence) + ' ')
                wsdoutfile.write(str(distance) + '\n')
        wsdoutfile.close()
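
# A minimal usage sketch for WSDSystemOutput (file names, word IDs and senses
# are hypothetical):
#
#     output = WSDSystemOutput()
#     output.append("example.word.1", [("sense:a", 0.8), ("sense:b", 0.2)], 0.5)
#     output.save("system.wsdout")
#     reloaded = WSDSystemOutput("system.wsdout")
#     for word_id, senses, distance in reloaded:
#         print(word_id, list(senses), distance)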

class DataSet(object):
    """Represents a test set or training set"""

    def __init__(self, filename):
        self.sense = {}        #word_id => (sense_id, lemma, pos)
        self.targetwords = {}  #(lemma, pos) => [sense_id]
        f = io.open(filename, 'r', encoding='utf-8')
        for line in f:
            if len(line) > 0 and line[0] != '#':
                fields = line.strip('\n').split('\t')
                word_id = fields[0]
                sense_id = fields[1]
                lemma = fields[2]
                pos = fields[3]
                self.sense[word_id] = (sense_id, lemma, pos)
                if not (lemma, pos) in self.targetwords:
                    self.targetwords[(lemma, pos)] = []
                if not sense_id in self.targetwords[(lemma, pos)]:
                    self.targetwords[(lemma, pos)].append(sense_id)
        f.close()

    def __getitem__(self, word_id):
        return self.sense[self._sanitize(word_id)]

    def getsense(self, word_id):
        return self.sense[self._sanitize(word_id)][0]

    def getlemma(self, word_id):
        return self.sense[self._sanitize(word_id)][1]

    def getpos(self, word_id):
        return self.sense[self._sanitize(word_id)][2]

    def _sanitize(self, word_id):
        return u(word_id)

    def __contains__(self, word_id):
        return self._sanitize(word_id) in self.sense

    def __iter__(self):
        for word_id, (sense, lemma, pos) in self.sense.items():
            yield (word_id, sense, lemma, pos)

    def senses(self, lemma, pos):
        return self.targetwords[(lemma, pos)]
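
# A minimal usage sketch for DataSet, assuming a tab-separated file with the
# columns word_id, sense_id, lemma, pos (file name and IDs are hypothetical):
#
#     dataset = DataSet("testset.tsv")
#     if "example.word.1" in dataset:
#         sense_id, lemma, pos = dataset["example.word.1"]
#         print(dataset.senses(lemma, pos))  #all senses attested for this lemma/pos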