File: tokenize.go

package info (click to toggle)
golang-github-jdkato-prose 1.1.0%2Bgit20171031.e27abfd-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 12,848 kB
  • sloc: python: 115; makefile: 55; sh: 21
file content (27 lines) | stat: -rw-r--r-- 846 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
Package tokenize implements functions to split strings into slices of substrings.
*/
package tokenize

// ProseTokenizer is the interface implemented by an object that takes a string
// and returns a slice of substrings.
//
// Implementations in this package operate at different granularities: sentence
// tokenizers return one element per sentence, word tokenizers one per word.
type ProseTokenizer interface {
	// Tokenize splits text into substrings. The original text can be
	// reconstructed from the tokens only if the implementation preserves
	// whitespace, which is not required by this interface.
	Tokenize(text string) []string
}

// TextToWords converts the string text into a slice of words.
//
// It does so by tokenizing text into sentences (using a port of NLTK's punkt
// tokenizer; see https://github.com/neurosnap/sentences) and then tokenizing
// the sentences into words via TreebankWordTokenizer.
func TextToWords(text string) []string {
	// First pass: sentence segmentation via the punkt port.
	sentences := NewPunktSentenceTokenizer().Tokenize(text)

	// Second pass: flatten each sentence into its constituent word tokens.
	wordTokenizer := NewTreebankWordTokenizer()
	words := []string{}
	for _, sentence := range sentences {
		words = append(words, wordTokenizer.Tokenize(sentence)...)
	}
	return words
}