File: main.go

package info (click to toggle)
golang-github-greatroar-blobloom 0.7.1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 252 kB
  • sloc: sh: 10; makefile: 3
file content (112 lines) | stat: -rw-r--r-- 2,721 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// Copyright 2020-2021 the Blobloom authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This package implements a toy interactive spell checker.
//
// It reads a dictionary from /usr/share/dict/words and a text from standard
// input. It then reports any misspelled words on standard output.
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"hash/maphash"
	"log"
	"os"
	"unicode"

	"github.com/greatroar/blobloom"
)

func main() {
	dict := loadDictionary()

	sc := bufio.NewScanner(os.Stdin)
	sc.Split(bufio.ScanWords)

	for sc.Scan() {
		word := normalize(sc.Bytes())
		if !dict.has(word) {
			fmt.Printf(">>> %s\n", word)
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}

// A Bloom filter with a randomized hash function.
type bloomfilter struct {
	*blobloom.Filter
	maphash.Seed
}

func newBloomfilter(capacity uint64, fprate float64) *bloomfilter {
	cfg := blobloom.Config{Capacity: capacity, FPRate: .001}
	return &bloomfilter{
		Filter: blobloom.NewOptimized(cfg),
		Seed:   maphash.MakeSeed(),
	}
}

func (f *bloomfilter) add(key []byte)      { f.Filter.Add(f.hash(key)) }
func (f *bloomfilter) has(key []byte) bool { return f.Filter.Has(f.hash(key)) }

func (f *bloomfilter) hash(key []byte) uint64 {
	var h maphash.Hash
	h.SetSeed(f.Seed)
	_, _ = h.Write(key)
	return h.Sum64()
}

func normalize(word []byte) []byte {
	word = bytes.TrimFunc(word, unicode.IsPunct)
	word = bytes.ToLower(word)
	return word
}

// To estimate the number of keys without scanning the file twice, we need
// an estimate of the average length of a word. This comes close for English.
const avgWordLength = 10

func loadDictionary() *bloomfilter {
	f, err := os.Open("/usr/share/dict/words")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	filesize := uint64(info.Size())

	dict := newBloomfilter(filesize/avgWordLength, .001)

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		dict.add(normalize(sc.Bytes()))
		filesize-- // Subtract newline, for fairness.
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	log.Printf("dictionary loaded: %dkiB on disk, %dkiB in memory",
		filesize/1024, dict.NumBits()/(8*1024))

	return dict
}