1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
|
// Copyright 2014-2022 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package randtxt supports the generation of random text using a
// trigram model for the English language.
package randtxt
import (
"math"
"math/rand"
"sort"
)
// ngram stores an entry from the language model.
type ngram struct {
s string
lgP float64
lgQ float64
}
// ngrams represents a slice of ngram values and is used to represent a
// language model.
type ngrams []ngram
func (s ngrams) Len() int { return len(s) }
func (s ngrams) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s ngrams) Less(i, j int) bool { return s[i].s < s[j].s }
// Sorts the language model in the sequence of their ngrams.
func (s ngrams) Sort() { sort.Sort(s) }
// Search is looking for an ngram or the position where it would be
// inserted.
func (s ngrams) Search(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// prob represents a string, usually an ngram, and a probability value.
type prob struct {
s string
p float64
}
// probs is a slice of prob values that can be sorted and searched.
type probs []prob
func (s probs) Len() int { return len(s) }
func (s probs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s probs) Less(i, j int) bool { return s[i].s < s[j].s }
// SortByNgram sorts the probs slice by ngram, field s.
func (s probs) SortByNgram() { sort.Sort(s) }
// SortsByProb sorts the probs slice by probability, field p.
func (s probs) SortByProb() { sort.Sort(byProb{s}) }
// SearchNgram searches for an ngram or the position where it would be
// inserted.
func (s probs) SearchNgram(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// SearchProb searches ngrams for a specific probability or where it
// would be inserted.
func (s probs) SearchProb(p float64) int {
return sort.Search(len(s), func(k int) bool { return s[k].p >= p })
}
// byProb is used to sort probs slice by probability, field p.
type byProb struct {
probs
}
func (s byProb) Less(i, j int) bool {
return s.probs[i].p < s.probs[j].p
}
// cdf can be used to setup a cumulative distribution function
// represented by a probs slice. We should have returned an actual
// function.
func cdf(n int, p func(i int) prob) probs {
prs := make(probs, n)
sum := 0.0
for i := range prs {
pr := p(i)
sum += pr.p
prs[i] = pr
}
q := 1.0 / sum
x := 0.0
for i, pr := range prs {
x += pr.p * q
if x > 1.0 {
x = 1.0
}
prs[i].p = x
}
if !sort.IsSorted(byProb{prs}) {
panic("cdf not sorted")
}
return prs
}
// pCDFOfLM converts a language model into a cumulative distribution
// function represented by probs.
func pCDFOfLM(lm ngrams) probs {
return cdf(len(lm), func(i int) prob {
return prob{lm[i].s, math.Exp2(lm[i].lgP)}
})
}
// cCDF converts a ngrams slice into a cumulative distribution function
// using the conditional probability lgQ.
func cCDF(s ngrams) probs {
return cdf(len(s), func(i int) prob {
return prob{s[i].s, math.Exp2(s[i].lgQ)}
})
}
// comap contains a map of conditional distribution function for the
// last character.
type comap map[string]probs
// comapOfLM converts a language model in a map of conditional
// distribution functions.
func comapOfLM(lm ngrams) comap {
if !sort.IsSorted(lm) {
panic("lm is not sorted")
}
m := make(comap, 26*26)
for i := 0; i < len(lm); {
j := i
g := lm[i].s
g2 := g[:2]
z := g2 + "Z"
i = lm.Search(z)
if i >= len(lm) || lm[i].s != z {
panic("unexpected search result")
}
i++
m[g2] = cCDF(lm[j:i])
}
return m
}
// trigram returns the trigram with prefix g2 using a probability value
// in the range [0.0,1.0).
func (c comap) trigram(g2 string, p float64) string {
prs := c[g2]
i := prs.SearchProb(p)
return prs[i].s
}
var (
// CDF for normal probabilities
pcdf = pCDFOfLM(englm3)
// map of two letter conditionals
cmap = comapOfLM(englm3)
)
// Reader generates a stream of text of uppercase letters with trigrams
// distributed according to a language model of the English language.
type Reader struct {
rnd *rand.Rand
g3 string
}
// NewReader creates a new reader. The argument src must create a uniformly
// distributed stream of random values.
func NewReader(src rand.Source) *Reader {
rnd := rand.New(src)
i := pcdf.SearchProb(rnd.Float64())
return &Reader{rnd, pcdf[i].s}
}
// Read reads random text. The Read function will always return len(p)
// bytes and will never return an error.
func (r *Reader) Read(p []byte) (n int, err error) {
for i := range p {
r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
p[i] = r.g3[2]
}
return len(p), nil
}
|