/*
Package summarize implements utilities for computing readability scores, usage statistics, and TL;DR summaries of text.
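
A short usage sketch (the sample text and printed values below are
illustrative, not guaranteed output):

	d := summarize.NewDocument("First paragraph.\n\nSecond paragraph.")
	a := d.Assess()
	fmt.Println(a.ReadingEase) // Flesch reading-ease score
	fmt.Println(d.Summary(1))  // highest-ranked paragraph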
*/
package summarize

import (
	"sort"
	"strings"
	"unicode"

	"github.com/jdkato/prose/internal/util"
	"github.com/jdkato/prose/tokenize"
	"github.com/montanaflynn/stats"
)

// A Word represents a single word in a Document.
type Word struct {
	Text      string // the actual text
	Syllables int    // the number of syllables
}

// A Sentence represents a single sentence in a Document.
type Sentence struct {
	Text      string // the actual text
	Length    int    // the number of words
	Words     []Word // the words in this sentence
	Paragraph int    // the zero-based index of the containing paragraph
}

// A RankedParagraph is a paragraph ranked by its number of keywords.
type RankedParagraph struct {
	Sentences []Sentence
	Position  int // the zero-based position within a Document
	Rank      int // the keyword score, weighted by paragraph length
}

// A Document represents a collection of text to be analyzed.
//
// A Document's calculations depend on its word and sentence tokenizers. You
// can use the defaults by invoking NewDocument, choose another implementation
// from the tokenize package, or use your own (as long as it implements the
// ProseTokenizer interface). For example,
//
//	d := Document{Content: ..., WordTokenizer: ..., SentenceTokenizer: ...}
//	d.Initialize()
type Document struct {
	Content           string         // the actual text
	NumCharacters     float64        // the number of letter and digit characters
	NumComplexWords   float64        // the number of polysyllabic words without common suffixes
	NumParagraphs     float64        // the number of paragraphs
	NumPolysylWords   float64        // the number of words with > 2 syllables
	NumSentences      float64        // the number of sentences
	NumSyllables      float64        // the number of syllables
	NumWords          float64        // the number of words
	Sentences         []Sentence     // the Document's sentences
	WordFrequency     map[string]int // [word]frequency
	SentenceTokenizer tokenize.ProseTokenizer
	WordTokenizer     tokenize.ProseTokenizer
}

// An Assessment provides comprehensive access to a Document's metrics.
type Assessment struct {
	// assessments returning an estimated grade level
	AutomatedReadability float64
	ColemanLiau          float64
	FleschKincaid        float64
	GunningFog           float64
	SMOG                 float64

	// mean & standard deviation of the above estimated grade levels
	MeanGradeLevel   float64
	StdDevGradeLevel float64

	// assessments returning non-grade numerical scores
	DaleChall   float64
	ReadingEase float64
}

// NewDocument is a Document constructor that takes a string as an argument. It
// then calculates the data necessary for computing readability and usage
// statistics.
//
// This is a convenience wrapper around the Document initialization process
// that defaults to using a WordBoundaryTokenizer and a PunktSentenceTokenizer
// as its word and sentence tokenizers, respectively.
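//
// For example (a minimal sketch; exact counts depend on the tokenizers):
//
//	d := NewDocument("This is the first sentence. This is the second.")
//	fmt.Println(d.NumSentences) // expect 2 with the default sentence tokenizer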
func NewDocument(text string) *Document {
	wTok := tokenize.NewWordBoundaryTokenizer()
	sTok := tokenize.NewPunktSentenceTokenizer()
	doc := Document{Content: text, WordTokenizer: wTok, SentenceTokenizer: sTok}
	doc.Initialize()
	return &doc
}

// Initialize calculates the data necessary for computing readability and usage
// statistics.
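//
// Paragraphs are split on blank lines ("\n\n"). A sketch of the
// custom-tokenizer path described in the Document comment:
//
//	d := Document{
//		Content:           "First paragraph.\n\nSecond paragraph.",
//		WordTokenizer:     tokenize.NewWordBoundaryTokenizer(),
//		SentenceTokenizer: tokenize.NewPunktSentenceTokenizer(),
//	}
//	d.Initialize() // d.NumParagraphs is now 2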
func (d *Document) Initialize() {
	d.WordFrequency = make(map[string]int)
	for i, paragraph := range strings.Split(d.Content, "\n\n") {
		for _, s := range d.SentenceTokenizer.Tokenize(paragraph) {
			wordCount := d.NumWords
			d.NumSentences++
			words := []Word{}
			for _, word := range d.WordTokenizer.Tokenize(s) {
				word = strings.TrimSpace(word)
				if len(word) == 0 {
					continue
				}
				d.NumCharacters += countChars(word)
				if _, found := d.WordFrequency[word]; found {
					d.WordFrequency[word]++
				} else {
					d.WordFrequency[word] = 1
				}
				syllables := Syllables(word)
				words = append(words, Word{Text: word, Syllables: syllables})
				d.NumSyllables += float64(syllables)
				if syllables > 2 {
					d.NumPolysylWords++
				}
				if isComplex(word, syllables) {
					d.NumComplexWords++
				}
				d.NumWords++
			}
			d.Sentences = append(d.Sentences, Sentence{
				Text:      strings.TrimSpace(s),
				Length:    int(d.NumWords - wordCount),
				Words:     words,
				Paragraph: i})
		}
		d.NumParagraphs++
	}
}

// Assess returns an Assessment for the Document d.
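//
// The five grade-level scores are also averaged; a brief sketch (the
// printed values are placeholders, not real output):
//
//	a := doc.Assess()
//	fmt.Println(a.MeanGradeLevel, a.StdDevGradeLevel)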
func (d *Document) Assess() *Assessment {
	a := Assessment{
		FleschKincaid: d.FleschKincaid(), ReadingEase: d.FleschReadingEase(),
		GunningFog: d.GunningFog(), SMOG: d.SMOG(), DaleChall: d.DaleChall(),
		AutomatedReadability: d.AutomatedReadability(), ColemanLiau: d.ColemanLiau()}

	gradeScores := []float64{
		a.FleschKincaid, a.AutomatedReadability, a.GunningFog, a.SMOG,
		a.ColemanLiau}

	mean, merr := stats.Mean(gradeScores)
	stdDev, serr := stats.StandardDeviation(gradeScores)
	if merr != nil || serr != nil {
		a.MeanGradeLevel = 0.0
		a.StdDevGradeLevel = 0.0
	} else {
		a.MeanGradeLevel = mean
		a.StdDevGradeLevel = stdDev
	}

	return &a
}

// Summary returns a Document's n highest ranked paragraphs according to
// keyword frequency.
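//
// For example, to retrieve only the single most keyword-dense paragraph
// (a sketch; the ranking depends on the Document's keyword scores):
//
//	top := doc.Summary(1)
//	for _, s := range top[0].Sentences {
//		fmt.Println(s.Text)
//	}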
func (d *Document) Summary(n int) []RankedParagraph {
	rankings := []RankedParagraph{}
	scores := d.Keywords()
	for i := 0; i < int(d.NumParagraphs); i++ {
		p := RankedParagraph{Position: i}
		rank := 0
		size := 0
		for _, s := range d.Sentences {
			if s.Paragraph == i {
				size += s.Length
				for _, w := range s.Words {
					if score, found := scores[w.Text]; found {
						rank += score
					}
				}
				p.Sentences = append(p.Sentences, s)
			}
		}
		// Favor longer paragraphs, as they tend to be more informational.
		p.Rank = rank * size
		rankings = append(rankings, p)
	}

	// Sort by ranking:
	sort.Sort(byRank(rankings))

	// Take the top-n paragraphs:
	size := len(rankings)
	if size > n {
		rankings = rankings[size-n:]
	}

	// Sort by chronological position:
	sort.Sort(byIndex(rankings))
	return rankings
}

// byRank sorts paragraphs by rank, in ascending order.
type byRank []RankedParagraph

func (s byRank) Len() int           { return len(s) }
func (s byRank) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s byRank) Less(i, j int) bool { return s[i].Rank < s[j].Rank }

// byIndex sorts paragraphs by their position within a Document.
type byIndex []RankedParagraph

func (s byIndex) Len() int           { return len(s) }
func (s byIndex) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s byIndex) Less(i, j int) bool { return s[i].Position < s[j].Position }

// isComplex reports whether word is a complex word: one with more than two
// syllables after discounting the common suffixes "es", "ed", and "ing".
func isComplex(word string, syllables int) bool {
	if util.HasAnySuffix(word, []string{"es", "ed", "ing"}) {
		syllables--
	}
	return syllables > 2
}

// countChars returns the number of letter and digit characters in word.
func countChars(word string) float64 {
	count := 0
	for _, c := range word {
		if unicode.IsLetter(c) || unicode.IsNumber(c) {
			count++
		}
	}
	return float64(count)
}