1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
package tokenize
import (
"encoding/json"
"fmt"
"path/filepath"
"testing"
"github.com/jdkato/prose/internal/util"
)
// goldenRule is one case decoded from a golden_rules_<lang>.json fixture.
// Name labels the case in failure logs, Input is the text handed to the
// segmenter, and Output lists the sentences it is expected to produce.
type goldenRule struct {
	Name, Input string
	Output      []string
}
// TestPragmaticRulesEn runs the "en" golden rules through testLang.
func TestPragmaticRulesEn(t *testing.T) {
	testLang("en", t)
}
// TestPragmaticRulesFr runs the "fr" golden rules through testLang.
func TestPragmaticRulesFr(t *testing.T) {
	testLang("fr", t)
}
// TestPragmaticRulesEs runs the "es" golden rules through testLang.
func TestPragmaticRulesEs(t *testing.T) {
	testLang("es", t)
}
// BenchmarkPragmaticRulesEn benchmarks the "en" golden-rule inputs via
// benchmarkLang.
func BenchmarkPragmaticRulesEn(b *testing.B) {
	benchmarkLang("en", b)
}
// benchmarkLang measures how long the pragmatic segmenter for lang takes to
// tokenize every input in that language's golden-rule fixture.
//
// The fixture golden_rules_<lang>.json is located under the package-level
// testdata path, decoded into goldenRule values, and the segmenter is
// constructed, all before the timer is reset. Only the Tokenize calls
// therefore contribute to the reported time per iteration. Setup errors are
// handed to util.CheckError.
func benchmarkLang(lang string, b *testing.B) {
	var tests []goldenRule

	f := fmt.Sprintf("golden_rules_%s.json", lang)
	cases := util.ReadDataFile(filepath.Join(testdata, f))

	tok, err := NewPragmaticSegmenter(lang)
	util.CheckError(err)
	util.CheckError(json.Unmarshal(cases, &tests))

	// Exclude fixture loading, JSON decoding and segmenter construction
	// from the measurement.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for _, test := range tests {
			tok.Tokenize(test.Input)
		}
	}
}
func testLang(lang string, t *testing.T) {
tests := make([]goldenRule, 0)
f := fmt.Sprintf("golden_rules_%s.json", lang)
cases := util.ReadDataFile(filepath.Join(testdata, f))
tok, err := NewPragmaticSegmenter(lang)
util.CheckError(err)
util.CheckError(json.Unmarshal(cases, &tests))
for _, test := range tests {
compare(t, test.Name, test.Input, test.Output, tok)
}
}
// compare tokenizes actualText with tok and checks the result against
// expected. It reports whether the two match exactly.
//
// A length mismatch, or the first sentence that differs, is reported through
// t.Errorf together with the case name given in test; compare then returns
// false without examining any remaining sentences.
func compare(t *testing.T, test, actualText string, expected []string, tok *PragmaticSegmenter) bool {
	got := tok.Tokenize(actualText)

	if have, want := len(got), len(expected); have != want {
		t.Log(test)
		t.Logf("Actual: %v\n", got)
		t.Errorf("Actual: %d, Expected: %d\n", have, want)
		t.Log("===")
		return false
	}

	for i, sentence := range got {
		if sentence == expected[i] {
			continue
		}
		t.Log(test)
		t.Errorf("Actual: [%s] Expected: [%s]\n", sentence, expected[i])
		t.Log("===")
		return false
	}
	return true
}
|