File: probs.go

package info (click to toggle)
golang-github-ulikunitz-xz 0.5.15-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 11,496 kB
sloc: makefile: 7; sh: 5
file content (185 lines) | stat: -rw-r--r-- 4,754 bytes
// Copyright 2014-2022 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package randtxt supports the generation of random text using a
// trigram model for the English language.
package randtxt

import (
	"math"
	"math/rand"
	"sort"
)

// ngram is one entry of the language model: the n-gram text s together
// with two base-2 logarithms. The model code converts lgP and lgQ back
// into probabilities with math.Exp2; lgQ is the value used for the
// conditional distributions.
type ngram struct {
	s        string
	lgP, lgQ float64
}

// ngrams is a list of ngram entries and acts as the representation of a
// language model. It implements sort.Interface, ordering the entries by
// their n-gram text.
type ngrams []ngram

// Len returns the number of entries in the model.
func (lm ngrams) Len() int {
	return len(lm)
}

// Swap exchanges the entries at positions i and j.
func (lm ngrams) Swap(i, j int) {
	lm[j], lm[i] = lm[i], lm[j]
}

// Less reports whether the n-gram at position i sorts before the n-gram
// at position j.
func (lm ngrams) Less(i, j int) bool {
	return lm[j].s > lm[i].s
}

// Sort arranges the language model in ascending order of the n-gram
// strings.
func (lm ngrams) Sort() {
	sort.Sort(lm)
}

// Search returns the index of the first entry whose n-gram is not
// smaller than g: the position of g if it is present, otherwise the
// position where it would have to be inserted. The model must be sorted.
func (lm ngrams) Search(g string) int {
	atOrAfter := func(k int) bool { return g <= lm[k].s }
	return sort.Search(len(lm), atOrAfter)
}

// prob pairs a string, usually an ngram, with a probability value.
type prob struct {
	s string
	p float64
}

// probs is a list of prob values. Its own sort.Interface methods order
// the entries by string; the byProb wrapper supplies the ordering by
// probability.
type probs []prob

// Len returns the number of entries.
func (ps probs) Len() int {
	return len(ps)
}

// Swap exchanges the entries at positions i and j.
func (ps probs) Swap(i, j int) {
	ps[j], ps[i] = ps[i], ps[j]
}

// Less reports whether the string of entry i sorts before the string of
// entry j.
func (ps probs) Less(i, j int) bool {
	return ps[j].s > ps[i].s
}

// SortByNgram sorts the slice in ascending order of the ngram, field s.
func (ps probs) SortByNgram() {
	sort.Sort(ps)
}

// SortByProb sorts the slice in ascending order of the probability,
// field p.
func (ps probs) SortByProb() {
	sort.Sort(byProb{probs: ps})
}

// SearchNgram returns the index of the first entry whose string is not
// smaller than g: the position of g if present, otherwise the position
// where it would be inserted. The slice must be sorted by ngram.
func (ps probs) SearchNgram(g string) int {
	atOrAfter := func(k int) bool { return g <= ps[k].s }
	return sort.Search(len(ps), atOrAfter)
}

// SearchProb returns the index of the first entry whose probability is
// at least p, or len(ps) if there is none. The slice must be sorted by
// probability.
func (ps probs) SearchProb(p float64) int {
	reached := func(k int) bool { return p <= ps[k].p }
	return sort.Search(len(ps), reached)
}

// byProb wraps probs so that sorting orders by probability, field p.
// Len and Swap are promoted from the embedded probs; only Less is
// replaced.
type byProb struct {
	probs
}

// Less reports whether the probability of entry i is smaller than the
// probability of entry j.
func (b byProb) Less(i, j int) bool {
	return b.probs[j].p > b.probs[i].p
}

// cdf sets up a cumulative distribution function represented by a probs
// slice. The callback p is called once for every index in [0,n) and
// returns the string and the weight of that entry. The weights are
// divided by their sum, so they do not have to add up to one. Entry i of
// the result carries the string of p(i) and the normalized weight
// accumulated up to and including i, capped at 1.0.
//
// The final entry is pinned to exactly 1.0. Rounding in the running sum
// can otherwise leave it marginally below 1.0, and a SearchProb call
// with an argument above that value would return len(result), an index
// that callers cannot use.
//
// cdf panics with "cdf not sorted" if the accumulated values are not in
// ascending order. (Returning an actual function would arguably have
// been the cleaner interface.)
func cdf(n int, p func(i int) prob) probs {
	prs := make(probs, n)
	sum := 0.0
	for i := range prs {
		pr := p(i)
		sum += pr.p
		prs[i] = pr
	}
	q := 1.0 / sum
	x := 0.0
	for i, pr := range prs {
		x += pr.p * q
		if x > 1.0 {
			x = 1.0
		}
		prs[i].p = x
	}
	// All values are capped at 1.0 above, so forcing the last one to 1.0
	// keeps the sequence ascending while closing any rounding gap.
	if len(prs) > 0 {
		prs[len(prs)-1].p = 1.0
	}
	if !sort.IsSorted(byProb{prs}) {
		panic("cdf not sorted")
	}
	return prs
}

// pCDFOfLM turns the language model lm into a cumulative distribution
// function represented by probs. Each ngram enters with the weight
// 2^lgP; cdf takes care of normalizing and accumulating the weights.
func pCDFOfLM(lm ngrams) probs {
	weightAt := func(i int) prob {
		g := lm[i]
		return prob{s: g.s, p: math.Exp2(g.lgP)}
	}
	return cdf(len(lm), weightAt)
}

// cCDF turns the ngrams slice s into a cumulative distribution function
// based on the conditional probabilities: each ngram enters with the
// weight 2^lgQ, and cdf normalizes and accumulates the weights.
func cCDF(s ngrams) probs {
	weightAt := func(i int) prob {
		g := s[i]
		return prob{s: g.s, p: math.Exp2(g.lgQ)}
	}
	return cdf(len(s), weightAt)
}

// comap maps a two-character prefix to the conditional distribution of
// the ngrams that start with this prefix, i.e. the distribution of the
// last character given the first two. Each value is a cumulative
// distribution function represented by a probs slice.
type comap map[string]probs

// comapOfLM splits a language model into conditional distribution
// functions, one per two-character prefix.
//
// The model must be sorted; otherwise the function panics with "lm is
// not sorted". It walks the model group by group: a group starts at the
// current entry and extends up to and including the entry consisting of
// the same two-character prefix followed by "Z". If that closing entry
// is missing, the function panics with "unexpected search result".
func comapOfLM(lm ngrams) comap {
	if !sort.IsSorted(lm) {
		panic("lm is not sorted")
	}
	const letters = 26
	groups := make(comap, letters*letters)
	start := 0
	for start < len(lm) {
		prefix := lm[start].s[:2]
		closing := prefix + "Z"
		end := lm.Search(closing)
		if end >= len(lm) || lm[end].s != closing {
			panic("unexpected search result")
		}
		// Step past the closing entry so that end is an exclusive bound
		// and at the same time the start of the next group.
		end++
		groups[prefix] = cCDF(lm[start:end])
		start = end
	}
	return groups
}

// trigram returns the trigram with prefix g2 using a probability value
// in the range [0.0,1.0). It selects the first entry of the conditional
// distribution stored for g2 whose cumulative probability is at least p.
//
// If p lies above the last cumulative value, which can happen when
// rounding leaves the distribution ending marginally below 1.0, the last
// trigram is returned instead of indexing past the end of the slice.
// trigram panics if no distribution is stored for g2.
func (c comap) trigram(g2 string, p float64) string {
	prs := c[g2]
	if len(prs) == 0 {
		panic("no trigrams for prefix " + g2)
	}
	i := prs.SearchProb(p)
	if i >= len(prs) {
		// p is beyond the end of the distribution; fall back to the
		// final entry.
		i = len(prs) - 1
	}
	return prs[i].s
}

// Both tables are derived from englm3 when the package is initialized.
// englm3 is declared outside this file; the builder functions used here
// panic if the model violates their expectations.
var (
	// pcdf is the cumulative distribution function built by pCDFOfLM
	// from the lgP values of the model. NewReader uses it to pick the
	// initial trigram.
	pcdf = pCDFOfLM(englm3)
	// cmap maps two-letter prefixes to conditional distribution
	// functions, built by comapOfLM. Reader.Read uses it to pick each
	// following trigram.
	cmap = comapOfLM(englm3)
)

// Reader generates a stream of text of uppercase letters with trigrams
// distributed according to a language model of the English language.
type Reader struct {
	// rnd delivers the random values that drive the selection.
	rnd *rand.Rand
	// g3 is the most recently selected trigram; its last two characters
	// form the prefix for the next selection.
	g3 string
}

// NewReader creates a new reader. The argument src must create a uniformly
// distributed stream of random values.
//
// The initial trigram is drawn from pcdf. If the random value lies above
// the last cumulative value of pcdf, which can happen when rounding
// leaves the distribution ending marginally below 1.0, the last trigram
// is used instead of indexing past the end of the slice.
func NewReader(src rand.Source) *Reader {
	rnd := rand.New(src)
	i := pcdf.SearchProb(rnd.Float64())
	if i >= len(pcdf) {
		i = len(pcdf) - 1
	}
	return &Reader{rnd: rnd, g3: pcdf[i].s}
}

// Read reads random text. The Read function will always return len(p)
// bytes and will never return an error.
//
// For every byte it selects a new trigram whose prefix is formed by the
// last two characters of the previous trigram and stores the third
// character of the new trigram in p.
func (r *Reader) Read(p []byte) (n int, err error) {
	for i := range p {
		r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
		p[i] = r.g3[2]
	}
	return len(p), nil
}