1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
|
import unicodedata
# Only the column-index constants are exported by ``from ... import *``.
__all__ = [
    "ID", "FORM", "LEMMA", "CPOSTAG", "POSTAG",
    "FEATS", "HEAD", "DEPREL", "PHEAD", "PDEPREL",
]

# Positional indices (0..9) of the fields of a token record; tokens are
# indexed with these, e.g. token[FORM] or token[ID].
# NOTE(review): the names look like the CoNLL-X column layout -- confirm.
(ID, FORM, LEMMA, CPOSTAG, POSTAG,
 FEATS, HEAD, DEPREL, PHEAD, PDEPREL) = range(10)
def isScoringToken(token):
    """Return True if *token* contains no "Po" (other punctuation) character.

    *token* may be a UTF-8 encoded byte string or an already-decoded text
    string.  Byte strings are decoded as UTF-8 first; text is inspected
    as-is.  A single character whose Unicode general category is "Po"
    makes the whole token non-scoring.  An empty token is scoring.

    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    # Only bytes need decoding; calling .decode() on text fails on Python 3
    # and forces an implicit ASCII encode on Python 2.
    text = token.decode("utf-8") if isinstance(token, bytes) else token
    return not any(unicodedata.category(char) == "Po" for char in text)
def pairIterator(sentence, options):
    """Yield (dependent, head) pairs of distinct tokens from *sentence*.

    Every token is paired with every other token (identity comparison, so
    a token is never paired with itself).  Two settings on *options*
    filter the pairs:

    * ``skipNonScoring`` -- when true, pairs whose dependent's FORM field
      fails ``isScoringToken`` are dropped.
    * ``maxDist`` -- when truthy, pairs whose ID fields differ by more
      than this value are dropped; a falsy value means no limit.
    """
    for dependent in sentence:
        for head in sentence:
            # A token cannot be its own head candidate.
            if dependent is head:
                continue

            if options.skipNonScoring and \
                    not isScoringToken(dependent[FORM]):
                continue

            # ID fields are converted with int() before measuring distance.
            gap = abs(int(dependent[ID]) - int(head[ID]))
            if not options.maxDist or gap <= options.maxDist:
                yield dependent, head
|