1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
#---------------------------------------------------------------
# PyNLPl - SRILM Language Model
# by Maarten van Gompel, ILK, Universiteit van Tilburg
# http://ilk.uvt.nl/~mvgompel
# proycon AT anaproy DOT nl
#
# Adapted from code by Sander Canisius
#
# Licensed under GPLv3
#
#
# This library enables using SRILM as a language model
#
#----------------------------------------------------------------
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
try:
import srilmcc
except ImportError:
import warnings
warnings.warn("srilmcc module is not compiled")
srilmcc = None
from pynlpl.textprocessors import Windower
class SRILMException(Exception):
    """Base class for errors raised by the SRILM wrapper."""

    pass
class SRILM(object):
    """N-gram language model backed by the compiled ``srilmcc`` extension.

    Wraps ``srilmcc.LanguageModel``. Scores returned by ``logscore`` are
    treated throughout this class as base-10 logarithms: every conversion
    to a plain probability is done with ``10 ** score``.
    """

    def __init__(self, filename, n):
        """Load a language model through ``srilmcc``.

        Args:
            filename: Passed unchanged to ``srilmcc.LanguageModel``
                (presumably the path of an SRILM model file).
            n: Order of the n-grams this model scores; also passed to
                ``srilmcc.LanguageModel``.

        Raises:
            SRILMException: If the ``srilmcc`` module could not be imported.
        """
        if not srilmcc:
            raise SRILMException(
                "SRILM is not downloaded and compiled. "
                "Please follow the instructions in makesrilmcc")
        self.model = srilmcc.LanguageModel(filename, n)
        self.n = n

    def scoresentence(self, sentence, unknownwordprob=-12):
        """Return the probability of a whole sentence.

        The sentence is split into n-grams by ``Windower`` (called with
        "<s>" and "</s>", presumably as begin/end-of-sentence markers) and
        the log scores of all n-grams are summed.

        Args:
            sentence: Token sequence handed to ``Windower``.
            unknownwordprob: Log score used for any n-gram that
                ``logscore`` rejects with ``KeyError``.

        Returns:
            ``10 ** total``, where ``total`` is the summed log score.
        """
        score = 0
        for ngram in Windower(sentence, self.n, "<s>", "</s>"):
            try:
                score += self.logscore(ngram)
            except KeyError:
                # At least one word is unknown to the model: fall back to
                # the fixed penalty instead of aborting the sentence.
                score += unknownwordprob
        return 10**score

    def __getitem__(self, ngram):
        """Return the probability of ``ngram`` (``10 ** logscore(ngram)``)."""
        return 10**self.logscore(ngram)

    def __contains__(self, key):
        """Return whether the underlying model reports ``key`` as existing."""
        return self.model.exists(key)

    def logscore(self, ngram):
        """Return the model's log score for a single n-gram.

        Args:
            ngram: Sequence of exactly ``self.n`` words.

        Returns:
            The value of ``self.model.wordProb(*ngram)``.

        Raises:
            ValueError: If ``ngram`` does not contain exactly ``self.n``
                items.
            KeyError: If any word of ``ngram`` is unknown to the model; the
                offending n-gram is attached to the exception.
        """
        if len(ngram) != self.n:
            raise ValueError("Not an " + str(self.n) + "-gram")
        if not all(self.model.exists(word) for word in ngram):
            raise KeyError(ngram)
        # Every word is known: ask the model for the score directly.
        return self.model.wordProb(*ngram)
|