# File: statistics.py
#
# Package info: python-pynlpl 1.2.9-2
#   - links: PTS, VCS
#   - area: main
#   - in suites: forky, sid, trixie
#   - size: 1,900 kB
#   - sloc: python: 25,677; sh: 73; makefile: 3
# File content: 93 lines | stat: -rwxr-xr-x 3,305 bytes
#!/usr/bin/env python
#-*- coding:utf-8 -*-

#---------------------------------------------------------------
# PyNLPl - Test Units for Statistics and Information Theory
#   by Maarten van Gompel, ILK, Universiteit van Tilburg
#   http://ilk.uvt.nl/~mvgompel
#   proycon AT anaproy DOT nl
#
#   Licensed under GPLv3
#
#----------------------------------------------------------------
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

import sys
import os
import unittest

from pynlpl.statistics import FrequencyList, HiddenMarkovModel
from pynlpl.textprocessors import Windower


# Shared fixture for the frequency-list tests below: two sentences, each
# pre-tokenised by splitting on single spaces (punctuation is its own token).
sentences = [
    "This is a sentence .".split(' '),
    "Moreover , this sentence is a test .".split(' '),
]

class FrequencyListTest(unittest.TestCase):
    """Unigram tests for FrequencyList, fed from the module-level ``sentences``."""

    def _build(self, *args):
        """Return ``FrequencyList(*args)`` with every fixture sentence appended.

        ``args`` are forwarded unchanged to the FrequencyList constructor so
        each test keeps the exact constructor call it needs.
        """
        freqlist = FrequencyList(*args)
        for sentence in sentences:
            freqlist.append(sentence)
        return freqlist

    def test_freqlist_casesens(self):
        """Frequency List (case sensitive)

        With the default constructor, 'This' and 'this' are expected to be
        counted as distinct types.
        """
        f = self._build()
        # One assertion per count so a failure names the offending token
        # and shows the actual value.
        self.assertEqual(f['sentence'], 2)
        self.assertEqual(f['this'], 1)
        self.assertEqual(f['test'], 1)

    def test_freqlist_caseinsens(self):
        """Frequency List (case insensitive)

        Built with ``FrequencyList(None, False)``; lookups are expected to
        ignore case, both for stored tokens and for the queried key ('Test').
        """
        f = self._build(None, False)
        self.assertEqual(f['sentence'], 2)
        self.assertEqual(f['this'], 2)
        self.assertEqual(f['Test'], 1)

    def test_freqlist_tokencount(self):
        """Frequency List (count tokens)

        ``total`` should equal the number of tokens appended (5 + 8).
        """
        f = self._build()
        self.assertEqual(f.total, 13)

    def test_freqlist_typecount(self):
        """Frequency List (count types)

        ``len()`` should equal the number of distinct case-sensitive tokens.
        """
        f = self._build()
        self.assertEqual(len(f), 9)

class BigramFrequencyListTest(unittest.TestCase):
    """Bigram tests for FrequencyList, fed from the module-level ``sentences``."""

    def _build(self, *args):
        """Return ``FrequencyList(*args)`` filled with ``Windower(sentence, 2)`` output.

        ``args`` are forwarded unchanged to the FrequencyList constructor.
        """
        freqlist = FrequencyList(*args)
        for sentence in sentences:
            freqlist.append(Windower(sentence, 2))
        return freqlist

    def test_freqlist_casesens(self):
        """Bigram Frequency List (case sensitive)

        ('is', 'a') occurs in both sentences; ('This', 'is') only in the first.
        """
        f = self._build()
        # One assertion per count so a failure names the offending bigram
        # and shows the actual value.
        self.assertEqual(f[('is', 'a')], 2)
        self.assertEqual(f[('This', 'is')], 1)

    def test_freqlist_caseinsens(self):
        """Bigram Frequency List (case insensitive)

        Built with ``FrequencyList(None, False)``; the lowercase key
        ('this', 'is') is expected to match the first sentence's 'This is'.
        """
        f = self._build(None, False)
        self.assertEqual(f[('is', 'a')], 2)
        self.assertEqual(f[('this', 'is')], 1)

class HMMTest(unittest.TestCase):
    """Viterbi decoding test for HiddenMarkovModel on a small weather model."""

    def test_viterbi(self):
        """Viterbi decode run on Hidden Markov Model

        Sets up a model with initial state 'start' and two further states
        ('rainy', 'sunny'), decodes the observations walk/shop/clean, and
        checks both the returned state path and its probability.
        """
        hmm = HiddenMarkovModel('start')
        hmm.settransitions('start', {'rainy': 0.6, 'sunny': 0.4})
        hmm.settransitions('rainy', {'rainy': 0.7, 'sunny': 0.3})
        hmm.settransitions('sunny', {'rainy': 0.4, 'sunny': 0.6})
        hmm.setemission('rainy', {'walk': 0.1, 'shop': 0.4, 'clean': 0.5})
        hmm.setemission('sunny', {'walk': 0.6, 'shop': 0.3, 'clean': 0.1})
        observations = ['walk', 'shop', 'clean']
        prob, path = hmm.viterbi(observations)
        self.assertEqual(path, ['sunny', 'rainy', 'rainy'])
        # The probability is a product of floats; compare with a tolerance
        # instead of exact equality so rounding in the last bits (e.g. from
        # a different multiplication order) does not fail the test.
        self.assertAlmostEqual(prob, 0.01344)
        
# When executed directly as a script (not imported), hand control to
# unittest's command-line runner.
if __name__ == '__main__':
    unittest.main()