1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
|
###############################################################
# PyNLPl - Frog Client - Version 1.4.1
# by Maarten van Gompel (proycon)
# http://ilk.uvt.nl/~mvgompel
# Induction for Linguistic Knowledge Research Group
# Universiteit van Tilburg
#
# Derived from code by Rogier Kraf
#
# Licensed under GPLv3
#
# This is a Python library for on-the-fly communication with
# a Frog/Tadpole Server. Allowing on-the-fly lemmatisation and
# PoS-tagging. It is recommended to pass your data on a
# sentence-by-sentence basis to FrogClient.process()
#
###############################################################
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pynlpl.common import u
import socket
class FrogClient:
def __init__(self,host="localhost",port=12345, server_encoding="utf-8", returnall=False, timeout=120.0):
"""Create a client connecting to a Frog or Tadpole server."""
self.BUFSIZE = 4096
self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
self.socket.settimeout(timeout)
self.socket.connect( (host,int(port)) )
self.server_encoding = server_encoding
self.returnall = returnall
def process(self,input_data, source_encoding="utf-8", return_unicode = True, oldfrog=False):
"""Receives input_data in the form of a str or unicode object, passes this to the server, with proper consideration for the encodings, and returns the Frog output as a list of tuples: (word,pos,lemma,morphology), each of these is a proper unicode object unless return_unicode is set to False, in which case raw strings will be returned. Return_unicode is no longer optional, it is fixed to True, parameter is still there only for backwards-compatibility."""
if isinstance(input_data, list) or isinstance(input_data, tuple):
input_data = " ".join(input_data)
input_data = u(input_data, source_encoding) #decode (or preferably do this in an earlier stage)
input_data = input_data.strip(' \t\n')
s = input_data.encode(self.server_encoding) +b'\r\n'
if not oldfrog: s += b'EOT\r\n'
self.socket.sendall(s) #send to socket in desired encoding
output = []
done = False
while not done:
data = b""
while not data.endswith(b'\n'):
moredata = self.socket.recv(self.BUFSIZE)
if not moredata: break
data += moredata
data = u(data,self.server_encoding)
for line in data.strip(' \t\r\n').split('\n'):
if line == "READY":
done = True
break
elif line:
line = line.split('\t') #split on tab
if len(line) > 4 and line[0].isdigit(): #first column is token number
if line[0] == '1' and output:
if self.returnall:
output.append( (None,None,None,None, None,None,None, None) )
else:
output.append( (None,None,None,None) )
fields = line[1:]
parse1=parse2=ner=chunk=""
word,lemma,morph,pos = fields[0:4]
if len(fields) > 5:
ner = fields[5]
if len(fields) > 6:
chunk = fields[6]
if len(fields) >= 8:
parse1 = fields[7]
parse2 = fields[8]
if len(fields) < 5:
raise Exception("Can't process response line from Frog: ", repr(line), " got unexpected number of fields ", str(len(fields) + 1))
if self.returnall:
output.append( (word,lemma,morph,pos,ner,chunk,parse1,parse2) )
else:
output.append( (word,lemma,morph,pos) )
return output
def process_aligned(self,input_data, source_encoding="utf-8", return_unicode = True):
output = self.process(input_data, source_encoding, return_unicode)
outputwords = [ x[0] for x in output ]
inputwords = input_data.strip(' \t\n').split(' ')
alignment = self.align(inputwords, outputwords)
for i, _ in enumerate(inputwords):
targetindex = alignment[i]
if targetindex == None:
if self.returnall:
yield (None,None,None,None,None,None,None,None)
else:
yield (None,None,None,None)
else:
yield output[targetindex]
def align(self,inputwords, outputwords):
"""For each inputword, provides the index of the outputword"""
alignment = []
cursor = 0
for inputword in inputwords:
if len(outputwords) > cursor and outputwords[cursor] == inputword:
alignment.append(cursor)
cursor += 1
elif len(outputwords) > cursor+1 and outputwords[cursor+1] == inputword:
alignment.append(cursor+1)
cursor += 2
else:
alignment.append(None)
cursor += 1
return alignment
def __del__(self):
self.socket.close()
|