# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
import random
from itertools import chain
from math import log
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ingrams
from api import *

class NgramModel(ModelI):
    """
    An ngram language model that assigns a probability to the next word
    given the preceding context, backing off to lower-order models for
    ngrams not seen in training.
    """
    # add cutoff
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text, a flat sequence of words
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
            returns a C{ConditionalProbDist}
        """
        self._n = n

        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        # count each ngram, splitting it into a context (the first n-1
        # words) and the token it predicts
        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator)
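
    # Illustrative example (not from the original source): with n=3 and
    # training tokens ['the', 'dog', 'barks'], the padded stream
    # '', '', 'the', 'dog', 'barks' yields trigrams such as
    # ('', '', 'the') and ('the', 'dog', 'barks'); the last of these is
    # counted as token 'barks' under the context ('the', 'dog').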

    # Katz Backoff probability
    def prob(self, word, context):
        '''Evaluate the probability of this word in this context.'''
        context = tuple(context)
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        elif self._n > 1:
            # back off to the (n-1)-gram model by dropping the earliest
            # word of the context
            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
            raise RuntimeError("No probability mass assigned to word %s in context %s" % (word, ' '.join(context)))

    def _alpha(self, tokens):
        # back-off weight for unseen continuations of this context
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        if tokens in self:
            return self[tokens].discount()
        else:
            return 1
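
    # Sketch of the back-off computation as read from the code above (this
    # note is not part of the original documentation): for a word unseen in
    # the full context,
    #     P(word | context) = _alpha(context) * P_backoff(word | context[1:])
    # where _beta(context) is the probability mass discounted away from the
    # words observed in that context.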

    def logprob(self, word, context):
        '''Evaluate the negative log probability (base 2) of this word in this context.'''
        return -log(self.prob(word, context), 2)

    # NB, this will always start with the same word since the model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''Generate random text based on the language model.'''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        # condition on at most the last n-1 words, padding on the left
        context = (self._prefix + tuple(context))[-self._n+1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            # unseen context: drop its earliest word and try the
            # lower-order model
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        '''Evaluate the total entropy of a text with respect to the model.
        This is the sum of the negative log probability of each word in
        the text.'''
        e = 0.0
        for i in range(self._n - 1, len(text)):
            # the context is the n-1 words preceding position i
            context = tuple(text[i - self._n + 1 : i])
            token = text[i]
            e += self.logprob(token, context)
        return e
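
    # Note (not in the original source): entropy() is a total over the text;
    # dividing by the number of scored positions, len(text) - (n - 1), gives
    # the per-word cross-entropy, and 2 ** (per-word entropy) gives the
    # corresponding perplexity.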

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
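

# A minimal usage sketch to complement demo() below; the toy corpus and the
# helper name _toy_demo are illustrative assumptions, not part of the
# original module.
def _toy_demo():
    from nltk.probability import LidstoneProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    corpus = ['the', 'dog', 'barks', 'the', 'cat', 'meows',
              'the', 'dog', 'sleeps']
    lm = NgramModel(2, corpus, estimator)
    print lm
    # seen bigrams are scored directly from the conditional distribution;
    # unseen bigrams would be routed through the back-off path in prob()
    print lm.prob('dog', ['the'])
    print lm.prob('cat', ['the'])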

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm
    # print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()