File: ngram.py

# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import random
from itertools import chain
from math import log

from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ingrams

from api import *

class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    # add cutoff
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{FreqDist} and a number of
              bins and returns a C{ProbDist}
        """

        self._n = n
        
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)
        
        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator)
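
    # A minimal usage sketch, mirroring demo() below (assumes the 2009-era
    # NLTK API this file was written against):
    #
    #     from nltk.corpus import brown
    #     from nltk.probability import LidstoneProbDist
    #     est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    #     lm = NgramModel(2, brown.words(categories='news'), est)
    #     lm.prob('jury', ('the',))   # P(jury | the), backed off if unseen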

    # Katz Backoff probability
    def prob(self, word, context):
        '''Evaluate the probability of this word in this context.'''

        context = tuple(context)
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        elif self._n > 1:
            # back off to the (n-1)-gram model, dropping the oldest context
            # word, and scale by the leftover probability mass alpha(context)
            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
            raise RuntimeError("No probability mass assigned to word %s in context %s" % (word, ' '.join(context))) 

    def _alpha(self, tokens):
        # backoff weight: probability mass discounted from this context,
        # normalised by the mass available under the shortened context
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        if tokens in self:
            return self[tokens].discount()
        else:
            return 1
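
    # Note on the backoff weights above (a rough approximation of the Katz
    # normalisation): _beta(c) is the probability mass the estimator
    # discounted away from the n-grams observed in context c (via
    # ProbDist.discount()), or 1 if the context was never seen at all, and
    #
    #     alpha(c) = beta(c) / beta_backoff(c[1:])
    #
    # rescales the lower-order model so that, roughly, the mass it hands to
    # unseen words matches the mass discounted at this order.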

    def logprob(self, word, context):
        '''Evaluate the (negative) log probability of this word in this context.'''
        
        return -log(self.prob(word, context), 2) 
        
    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''Generate random text based on the language model.'''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text
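
    # For example, given a trained model lm (as built in demo() below):
    #     lm.generate(50)                  # start from the empty-string prefix
    #     lm.generate(50, ('in', 'the'))   # seed generation with a context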

    def _generate_one(self, context):
        context = (self._prefix + tuple(context))[-self._n+1:]
#       print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'
    
    def entropy(self, text):
        '''Evaluate the total entropy of a text with respect to the model.
        This is the sum of the negative log (base 2) probability of each
        word in the text, given its preceding context.'''

        e = 0.0
        for i in range(self._n - 1, len(text)):
            # the n-1 words immediately preceding position i
            context = tuple(text[i - self._n + 1 : i])
            token = text[i]
            e += self.logprob(token, context)
        return e
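
    # Dividing the value returned above by the number of scored positions,
    # len(text) - (self._n - 1), gives the per-word cross-entropy in bits,
    # and 2 ** (per-word cross-entropy) is the usual perplexity figure.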

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm
#    print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()
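
# Running this module directly (e.g. "python ngram.py" under Python 2 with
# the matching 2009-era NLTK release) trains a trigram model on the Brown
# news corpus and prints 100 words of generated text, wrapped for readability.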