File: tokenize.go

package info (click to toggle)
golang-github-jdkato-prose 1.1.0%2Bgit20171031.e27abfd-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 12,848 kB
  • sloc: python: 115; makefile: 55; sh: 21
file content (27 lines) | stat: -rw-r--r-- 846 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
Package tokenize implements functions to split strings into slices of substrings.
*/
package tokenize

// ProseTokenizer is the interface implemented by an object that takes a string
// and returns a slice of substrings.
//
// Implementations in this package operate at different granularities: sentence
// tokenizers return one element per sentence, word tokenizers one per word.
type ProseTokenizer interface {
	// Tokenize splits text into substrings. The original text can be
	// reconstructed from the tokens only if the implementation preserves
	// whitespace, which is not required by this interface.
	Tokenize(text string) []string
}

// TextToWords converts the string text into a slice of words.
//
// It does so by tokenizing text into sentences (using a port of NLTK's punkt
// tokenizer; see https://github.com/neurosnap/sentences) and then tokenizing
// the sentences into words via TreebankWordTokenizer.
func TextToWords(text string) []string {
	// First pass: sentence segmentation via the punkt port.
	sentences := NewPunktSentenceTokenizer().Tokenize(text)

	// Second pass: flatten each sentence into its constituent word tokens.
	wordTokenizer := NewTreebankWordTokenizer()
	words := []string{}
	for _, sentence := range sentences {
		words = append(words, wordTokenizer.Tokenize(sentence)...)
	}
	return words
}