1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
|
// Package chunk implements functions for finding useful chunks in text previously tagged with parts of speech.
//
package chunk
import (
"regexp"
"github.com/jdkato/prose/tag"
)
// quadsString encodes the tags of tagged as a single string in which every
// token occupies exactly 4 bytes: tags shorter than 4 bytes are right-padded
// with '_' and longer tags are truncated to their first 4 bytes. Because the
// width is fixed, a byte offset into the result divided by 4 is the index of
// the corresponding token in tagged.
//
// An empty (or nil) tagged yields the empty string.
func quadsString(tagged []tag.Token) string {
	const (
		width  = 4      // bytes emitted per token
		filler = "____" // source of padding; exactly width bytes long
	)

	// The output length is known up front, so allocate once instead of
	// growing a string with += on every token.
	quads := make([]byte, 0, width*len(tagged))
	for _, tok := range tagged {
		pos := tok.Tag
		if len(pos) >= width {
			// Exactly 4 needs no padding; longer than 4 is truncated.
			quads = append(quads, pos[:width]...)
			continue
		}
		// Shorter tags (including the empty tag, which should not exist)
		// are padded: the tail of filler supplies width-len(pos) underscores.
		quads = append(quads, pos...)
		quads = append(quads, filler[len(pos):]...)
	}
	return string(quads)
}
// TreebankNamedEntities matches proper names, excluding prior adjectives,
// possibly including numbers and a linkage by preposition or subordinating
// conjunctions (for example "Bank of England").
//
// The pattern is written against the fixed-width encoding produced by
// quadsString, where each tag occupies 4 characters: "CD__" and "IN__" are
// the tags CD and IN padded with underscores, and "NNP." is NNP followed by
// any single character, so it accepts the padded "NNP_" as well as a
// four-letter tag beginning with NNP (such as NNPS).
var TreebankNamedEntities = regexp.MustCompile(
// Head, repeated one or more times: optional leading CD tags, at least one
// NNP tag, then any mix of CD and NNP tags.
`((CD__)*(NNP.)+(CD__|NNP.)*)+` +
// Tail, repeated zero or more times: optional IN tags, optional CD tags, at
// least one NNP tag, then any mix of CD and NNP tags.
`((IN__)*(CD__)*(NNP.)+(CD__|NNP.)*)*`)
// Chunk returns the text of every run of tokens in tagged whose tag sequence
// matches rx. The words of each run are joined with single spaces. When
// nothing matches, the result is an empty, non-nil slice.
//
// This is a convenience wrapper around Locate; call Locate directly if you
// need access to the in-text locations of each chunk.
func Chunk(tagged []tag.Token, rx *regexp.Regexp) []string {
	chunks := []string{}
	for _, span := range Locate(tagged, rx) {
		// span holds the half-open token range [span[0], span[1]).
		window := tagged[span[0]:span[1]]
		var text string
		for i := range window {
			if i > 0 {
				text += " "
			}
			text += window[i].Text
		}
		chunks = append(chunks, text)
	}
	return chunks
}
// Locate finds the chunks of interest according to the regexp.
//
// Each element of the result is a two-element slice holding the start and end
// token indices of one match, suitable for slicing as tagged[start:end]. The
// result is whatever FindAllStringIndex returns for no matches when nothing is
// found.
//
// Locate calls rx.Longest, which switches rx itself to leftmost-longest
// matching; the caller's regexp stays in that mode afterwards.
//
// NOTE(review): the division below assumes every match begins and ends on a
// 4-byte tag boundary of the quadsString encoding — confirm for custom
// patterns.
func Locate(tagged []tag.Token, rx *regexp.Regexp) [][]int {
	// Prefer the longest possible tag sequence at each position.
	rx.Longest()

	spans := rx.FindAllStringIndex(quadsString(tagged), -1)
	for _, span := range spans {
		// Each token occupies 4 bytes in the encoded string, so byte
		// offsets are 4x the token indices. span aliases the element of
		// spans, so the update is made in place.
		for k := range span {
			span[k] /= 4
		}
	}
	return spans
}
|