# File: taggerdata.py
# Package: python-pynlpl 1.2.9-2 (area: main; suites: forky, sid, trixie)
# NOTE: page-extraction artifacts (line-number gutter, listing metadata)
# removed; original file content follows below.
#-*- coding:utf-8 -*-

###############################################################
#  PyNLPl - Read tagger data
#       by Maarten van Gompel (proycon)
#       http://ilk.uvt.nl/~mvgompel
#       Induction for Linguistic Knowledge Research Group
#       Universiteit van Tilburg
#       
#       Licensed under GPLv3
#
#
###############################################################

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import    

import io

class Taggerdata(object):
    """Reader/writer for simple tab-separated tagger data files.

    File format: one token per line as ``word<TAB>lemma<TAB>pos``.
    Sentences are delimited either by blank lines or -- in the *indexed*
    format, detected by the file starting with a ``#0`` line -- by
    ``#<n>`` marker lines. The literal field value ``NONE`` is read back
    as None, and None fields are written out as ``NONE``.
    """

    def __init__(self, filename, encoding='utf-8', mode='r'):
        """Open ``filename`` for reading ('r') or writing ('w')."""
        # Explicit ValueError instead of assert: asserts are stripped
        # under -O and should not be used for argument validation.
        if mode not in ('r', 'w'):
            raise ValueError("mode must be 'r' or 'w', got " + repr(mode))
        self.filename = filename
        self.encoding = encoding
        self.mode = mode
        self.reset()             # opens self.f
        self.firstiter = True    # format (indexed or not) not yet detected
        self.indexed = False     # True once a leading "#0" line is seen
        self.writeindex = 0      # running sentence index used by write()

    def __iter__(self):
        """Yield one ``(words, lemmas, postags)`` triple of lists per sentence."""
        words = []
        lemmas = []
        postags = []
        for line in self.f:
            line = line.strip()
            if self.firstiter:
                # The indexed format is recognised by a leading "#0" line.
                self.indexed = (line == "#0")
                self.firstiter = False
            if not line and not self.indexed:
                # Blank line = sentence boundary (non-indexed format).
                yield (words, lemmas, postags)
                words = []
                lemmas = []
                postags = []
            elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit():
                # "#<n>" marker = sentence boundary (indexed format);
                # "#0" merely opens the first sentence, nothing to flush.
                if line != "#0":
                    yield (words, lemmas, postags)
                    words = []
                    lemmas = []
                    postags = []
            elif line:
                try:
                    word, lemma, pos = line.split("\t")
                except ValueError:
                    # Malformed line (wrong number of tab-separated fields).
                    word = lemma = pos = "NONE"
                if word == "NONE": word = None
                if lemma == "NONE": lemma = None
                if pos == "NONE": pos = None
                words.append(word)
                lemmas.append(lemma)
                postags.append(pos)
        if words:
            # Flush the final sentence if the file did not end on a boundary.
            yield (words, lemmas, postags)

    def next(self):
        """Return the next ``(words, lemmas, postags)`` sentence triple.

        Raises StopIteration when the file is exhausted.
        """
        words = []
        lemmas = []
        postags = []
        while True:
            try:
                # BUGFIX: io file objects have no .next() method on
                # Python 3; the builtin next() works on Python 2 and 3.
                line = next(self.f).strip()
            except StopIteration:
                if words:
                    return (words, lemmas, postags)
                else:
                    raise
            if self.firstiter:
                self.indexed = (line == "#0")
                self.firstiter = False
            if not line and not self.indexed:
                return (words, lemmas, postags)
            elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit():
                if line != "#0":
                    return (words, lemmas, postags)
            elif line:
                try:
                    word, lemma, pos = line.split("\t")
                except ValueError:
                    # Malformed line (wrong number of tab-separated fields).
                    word = lemma = pos = "NONE"
                if word == "NONE": word = None
                if lemma == "NONE": lemma = None
                if pos == "NONE": pos = None
                words.append(word)
                lemmas.append(lemma)
                postags.append(pos)

    # Python 3 iterator-protocol alias (backward-compatible addition).
    __next__ = next

    def align(self, referencewords, datatuple):
        """Align the reference sentence with the tagged data.

        Multiword tokens joined with '_' in the tagged data are split into
        their parts. Returns one ``(word, lemma, pos, index, multiword)``
        tuple per reference word; unmatched words yield
        ``(None, None, None, None, False)``.
        """
        targetwords = []
        for i, (word, lemma, postag) in enumerate(zip(datatuple[0], datatuple[1], datatuple[2])):
            if word:
                subwords = word.split("_")
                for w in subwords:  # split multiword expressions
                    targetwords.append((w, lemma, postag, i, len(subwords) > 1))  # word, lemma, pos, index, multiword?

        # NOTE: reference words are lowercased but target words are not,
        # so matching is case-sensitive on the tagged side (original
        # behaviour, kept as-is).
        referencewords = [w.lower() for w in referencewords]
        alignment = []
        for i, referenceword in enumerate(referencewords):
            found = False
            best = 0
            distance = 999999
            # Pick the positionally closest occurrence of the word.
            for j, (targetword, lemma, pos, index, multiword) in enumerate(targetwords):
                if referenceword == targetword and abs(i - j) < distance:
                    found = True
                    best = j
                    distance = abs(i - j)

            if found:
                alignment.append(targetwords[best])
            else:
                alignment.append((None, None, None, None, False))  # no alignment found

        return alignment

    def reset(self):
        """(Re)open the underlying file with the stored mode and encoding."""
        self.f = io.open(self.filename, self.mode, encoding=self.encoding)

    def write(self, sentence):
        """Write one sentence (iterable of (word, lemma, pos) triples).

        Emits a "#<index>" marker line followed by one tab-separated line
        per token; falsy fields are written as the literal "NONE".
        """
        self.f.write("#" + str(self.writeindex) + "\n")
        for word, lemma, pos in sentence:
            if not word: word = "NONE"
            if not lemma: lemma = "NONE"
            if not pos: pos = "NONE"
            self.f.write(word + "\t" + lemma + "\t" + pos + "\n")
        self.writeindex += 1

    def close(self):
        """Close the underlying file."""
        self.f.close()