File: predict

package info
onboard 1.4.1-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 31,548 kB
  • sloc: python: 29,215; cpp: 5,965; ansic: 5,735; xml: 1,026; sh: 163; makefile: 39
file content (146 lines) | stat: -rwxr-xr-x 5,731 bytes
#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Copyright © 2009-2010, 2012, 2014 marmuta <marmvta@gmail.com>
#
# This file is part of Onboard.
#
# Onboard is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Onboard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import sys
import itertools
from optparse import OptionParser
from pypredict import *


def main():
    parser = OptionParser(usage="Usage: %prog [options] [history] [completion-prefix]")
    parser.add_option("-m", "--language-model", type="str", dest="language_model",
              help="optional filename of a language model")
    parser.add_option("-d", "--dictionary-test", action="store_true",
              dest="dictionary_test",
              help="use synthethic training data to test dictionary performance")
    parser.add_option("-c", "--case-insensitive", action="store_true",
              dest="case_insensitive",
              help="case insensitive completion")
    parser.add_option("-a", "--accent-insensitive", action="store_true",
              dest="accent_insensitive",
              help="accent insensitive completion")
    options, args = parser.parse_args()

    if options.language_model:
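        # A model file was given: load it and predict from the command-line
        # context (history words followed by a completion prefix).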
        model = CachedDynamicModel()
        with timeit("loading"):
            model.load(options.language_model)

        context = [str(w) for w in args or [""]]
        model.recency_ratio = 0

    elif options.dictionary_test:
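        # Stress-test the dictionary: count up to 0xfff0 (65,520) synthetic
        # unigrams built from 4-letter permutations of a-z.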
        model = CachedDynamicModel()
        with timeit("creating synthetic training data"):
            s = "".join([chr(c) for c in range(ord("a"), ord("z") + 1)])
            perms = list(itertools.permutations(s, 4))
            for perm in perms[:0xfff0]:
                model.count_ngram(["".join(perm)])
        context = [str(w) for w in args or [""]]
        model.recency_ratio = 0

    else:
        # usage:
        # pypredict/predict dummy [history] prefix
        # e.g. pypredict/predict "" www test ""

        model = CachedDynamicModel()

        # debug: train on a built-in snippet (a Moby-Dick excerpt plus two URLs)
        training_text = """
            No, when I go to sea, I go as a simple sailor, right before the mast,
            plumb down into the forecastle, aloft there to the royal mast-head.
            True, they rather order me about some, and make me jump from spar to
            spar, like a grasshopper in a May meadow. And at first, this sort
            of thing is unpleasant enough. And more than all,
            if just previous to putting your hand into the tar-pot, you have been
            lording it as a country schoolmaster, making the tallest boys stand
            in awe of you. The transition is a keen one, I assure you, from a
            schoolmaster to a sailor, and requires a strong decoction of Seneca and
            the Stoics to enable you to grin and bear it. But even this wears off in
            time.
            www.test.com
            www.gnome.org
            """
        tokens, spans = tokenize_text(training_text)
        #tokens, spans = tokenize_text(u"<s> Mary has a little lamb.")
        #tokens, spans = tokenize_text(u"Mary has a little lamb little.")
        #tokens, spans = tokenize_text(read_corpus("../../moby.txt"))
        #tokens, spans = tokenize_text(read_corpus("/home/user/.gpredict/learned_text.txt"))
        model.learn_tokens(tokens)
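        # debug: dump any learned n-grams that contain "bzr"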
        for ng in model.iter_ngrams():
            if "bzr" in ng[0]:
                print(ng)
        context = [str(w) for w in args or [""]]
        # fixme, remove <unk> <unk> <weird word> ngrams
        #context = [u"xxxxx", u""]
        #context = [u"import", u"pypredict", u""]
        model.recency_ratio = 1

    counts, totals = model.get_counts()
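    # Summary per n-gram order: distinct n-grams (types) vs. total occurrences.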
    for i, (types, occurrences) in enumerate(zip(counts, totals)):
        sys.stdout.write("%d-grams: types %10d, occurrences %10d\n" %
                         (i + 1, types, occurrences))

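    # Time a prediction limited to the top 50 candidates, then an unlimited
    # one with the user-selected matching options.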
    with timeit("predict (50)"):
        choices = model.predictp(context, 50)

    print(options.case_insensitive, options.accent_insensitive)
    with timeit("predict (all)"):

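        # Combine the prediction flags into a bitmask; INCLUDE_CONTROL_WORDS
        # presumably keeps control tokens such as <s> in the candidate list.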
        _options = model.INCLUDE_CONTROL_WORDS
        if options.case_insensitive:
            _options |= model.CASE_INSENSITIVE
        if options.accent_insensitive:
            _options |= model.ACCENT_INSENSITIVE

        choices = model.predictp(context=context, limit=-1, options=_options)

    print_choices(model, context, choices)
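    # memory_size() appears to return per-component sizes; print them and
    # their sum.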
    print(model.memory_size(), sum(model.memory_size()))


def print_choices(model, context, choices):
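    """Show each candidate's probability and its per-order n-gram counts."""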
    n   = min(model.order, len(context))
    history = context[-n:-1]
    prefix  = context[-1]

    print()
    print("history:", history, "prefix '%s' " % prefix)

    psum = 0
    counts = []
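    # For each candidate, record the count of the full n-gram and of all its
    # shorter (backoff) suffixes, padding short contexts with "".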
    for x in choices:
        ngram = history + [x[0]]
        psum += x[1]
        padding = max(model.order-len(context),0)
        ng = [""]*padding + ngram
        counts.append([model.get_ngram_count(ng[i:]) for i in range(model.order)])

    print("Probability sum %f for %d results" % (psum,len(choices)))   # ought to be 1.0 for the whole vocabulary
    print("Words with zero probability: ", sum(1 for x in choices if x[1] == 0))
    for i,x in enumerate(choices[:20]):
        print("%10f " % x[1] + "".join("%8d " % c for c in counts[i]) + "'%s'" % x[0])

if __name__ == '__main__':
    main()