File: probs.go

package info (click to toggle)
golang-github-ulikunitz-xz 0.5.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 11,496 kB
  • sloc: makefile: 7; sh: 5
file content (185 lines) | stat: -rw-r--r-- 4,754 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// Copyright 2014-2022 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package randtxt supports the generation of random text using a
// trigram model for the English language.
package randtxt

import (
	"math"
	"math/rand"
	"sort"
)

// ngram stores an entry from the language model.
type ngram struct {
	s   string
	lgP float64
	lgQ float64
}

// ngrams represents a slice of ngram values and is used to represent a
// language model.
type ngrams []ngram

func (s ngrams) Len() int           { return len(s) }
func (s ngrams) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s ngrams) Less(i, j int) bool { return s[i].s < s[j].s }

// Sorts the language model in the sequence of their ngrams.
func (s ngrams) Sort() { sort.Sort(s) }

// Search is looking for an ngram or the position where it would be
// inserted.
func (s ngrams) Search(g string) int {
	return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}

// prob represents a string, usually an ngram, and a probability value.
type prob struct {
	s string
	p float64
}

// probs is a slice of prob values that can be sorted and searched.
type probs []prob

func (s probs) Len() int           { return len(s) }
func (s probs) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s probs) Less(i, j int) bool { return s[i].s < s[j].s }

// SortByNgram sorts the probs slice by ngram, field s.
func (s probs) SortByNgram() { sort.Sort(s) }

// SortsByProb sorts the probs slice by probability, field p.
func (s probs) SortByProb() { sort.Sort(byProb{s}) }

// SearchNgram searches for an ngram or the position where it would be
// inserted.
func (s probs) SearchNgram(g string) int {
	return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}

// SearchProb searches ngrams for a specific probability or where it
// would be inserted.
func (s probs) SearchProb(p float64) int {
	return sort.Search(len(s), func(k int) bool { return s[k].p >= p })
}

// byProb is used to sort probs slice by probability, field p.
type byProb struct {
	probs
}

func (s byProb) Less(i, j int) bool {
	return s.probs[i].p < s.probs[j].p
}

// cdf can be used to setup a cumulative distribution function
// represented by a probs slice. We should have returned an actual
// function.
func cdf(n int, p func(i int) prob) probs {
	prs := make(probs, n)
	sum := 0.0
	for i := range prs {
		pr := p(i)
		sum += pr.p
		prs[i] = pr
	}
	q := 1.0 / sum
	x := 0.0
	for i, pr := range prs {
		x += pr.p * q
		if x > 1.0 {
			x = 1.0
		}
		prs[i].p = x
	}
	if !sort.IsSorted(byProb{prs}) {
		panic("cdf not sorted")
	}
	return prs
}

// pCDFOfLM converts a language model into a cumulative distribution
// function represented by probs.
func pCDFOfLM(lm ngrams) probs {
	return cdf(len(lm), func(i int) prob {
		return prob{lm[i].s, math.Exp2(lm[i].lgP)}
	})
}

// cCDF converts a ngrams slice into a cumulative distribution function
// using the conditional probability lgQ.
func cCDF(s ngrams) probs {
	return cdf(len(s), func(i int) prob {
		return prob{s[i].s, math.Exp2(s[i].lgQ)}
	})
}

// comap contains a map of conditional distribution function for the
// last character.
type comap map[string]probs

// comapOfLM converts a language model in a map of conditional
// distribution functions.
func comapOfLM(lm ngrams) comap {
	if !sort.IsSorted(lm) {
		panic("lm is not sorted")
	}
	m := make(comap, 26*26)
	for i := 0; i < len(lm); {
		j := i
		g := lm[i].s
		g2 := g[:2]
		z := g2 + "Z"
		i = lm.Search(z)
		if i >= len(lm) || lm[i].s != z {
			panic("unexpected search result")
		}
		i++
		m[g2] = cCDF(lm[j:i])
	}
	return m
}

// trigram returns the trigram with prefix g2 using a probability value
// in the range [0.0,1.0).
func (c comap) trigram(g2 string, p float64) string {
	prs := c[g2]
	i := prs.SearchProb(p)
	return prs[i].s
}

var (
	// CDF for normal probabilities
	pcdf = pCDFOfLM(englm3)
	// map of two letter conditionals
	cmap = comapOfLM(englm3)
)

// Reader generates a stream of text of uppercase letters with trigrams
// distributed according to a language model of the English language.
type Reader struct {
	rnd *rand.Rand
	g3  string
}

// NewReader creates a new reader. The argument src must create a uniformly
// distributed stream of random values.
func NewReader(src rand.Source) *Reader {
	rnd := rand.New(src)
	i := pcdf.SearchProb(rnd.Float64())
	return &Reader{rnd, pcdf[i].s}
}

// Read reads random text. The Read function will always return len(p)
// bytes and will never return an error.
func (r *Reader) Read(p []byte) (n int, err error) {
	for i := range p {
		r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
		p[i] = r.g3[2]
	}
	return len(p), nil
}