File: tokenize_test.go

package tokenize

import (
	"encoding/json"
	"fmt"
	"path/filepath"
	"testing"

	"github.com/jdkato/prose/internal/util"
	"github.com/stretchr/testify/assert"
)

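// testdata is the shared fixture directory one level above this package.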
var testdata = filepath.Join("..", "testdata")

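// getWordData loads the Treebank input sentences together with the expected
// token lists stored in the given testdata file.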
func getWordData(file string) ([]string, [][]string) {
	in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json"))
	out := util.ReadDataFile(filepath.Join(testdata, file))

	input := []string{}
	output := [][]string{}

	util.CheckError(json.Unmarshal(in, &input))
	util.CheckError(json.Unmarshal(out, &output))

	return input, output
}

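// getWordBenchData loads only the Treebank input sentences, for use in benchmarks.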
func getWordBenchData() []string {
	in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json"))
	input := []string{}
	util.CheckError(json.Unmarshal(in, &input))
	return input
}

func ExampleNewWordBoundaryTokenizer() {
	t := NewWordBoundaryTokenizer()
	fmt.Println(t.Tokenize("They'll save and invest more."))
	// Output: [They'll save and invest more]
}

func ExampleNewWordPunctTokenizer() {
	t := NewWordPunctTokenizer()
	fmt.Println(t.Tokenize("They'll save and invest more."))
	// Output: [They ' ll save and invest more .]
}

func ExampleNewTreebankWordTokenizer() {
	t := NewTreebankWordTokenizer()
	fmt.Println(t.Tokenize("They'll save and invest more."))
	// Output: [They 'll save and invest more .]
}

func ExampleNewBlanklineTokenizer() {
	t := NewBlanklineTokenizer()
	fmt.Println(t.Tokenize("They'll save and invest more.\n\nThanks!"))
	// Output: [They'll save and invest more. Thanks!]
}

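// TestTextToWords checks the high-level TextToWords helper against a
// hand-tokenized two-sentence passage.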
func TestTextToWords(t *testing.T) {
	text := "Vale is a natural language linter that supports plain text, markup (Markdown, reStructuredText, AsciiDoc, and HTML), and source code comments. Vale doesn't attempt to offer a one-size-fits-all collection of rules—instead, it strives to make customization as easy as possible."
	expected := []string{
		"Vale", "is", "a", "natural", "language", "linter", "that", "supports",
		"plain", "text", ",", "markup", "(", "Markdown", ",", "reStructuredText",
		",", "AsciiDoc", ",", "and", "HTML", ")", ",", "and", "source", "code",
		"comments", ".", "Vale", "does", "n't", "attempt", "to", "offer", "a",
		"one-size-fits-all", "collection", "of", "rules—instead", ",", "it",
		"strives", "to", "make", "customization", "as", "easy", "as", "possible",
		"."}
	assert.Equal(t, expected, TextToWords(text))
}