1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
// Copyright (c) 2015 Shawn Goertzen
// Copyright (c) 2017 Mikael Berthe
//
// This code mostly comes from github.com/sgoertzen/html2text,
// with some specific but intrusive changes for Mastodon HTML messages.
// For example, links are not displayed for hashtags and mentions,
// and links alone are displayed for the other cases.
//
// Licensed under the MIT license.
// Please see the LICENSE file in this directory.
package html2text
import (
"bytes"
"errors"
"golang.org/x/net/html"
"strings"
)
// breakers is the set of HTML element names that end a line of output:
// once such an element has been rendered, process strips trailing spaces
// and appends a newline (unless the output already ends with one).
var breakers = map[string]bool{
	"br":  true,
	"div": true,
	"li":  true,
	"p":   true,
	"tr":  true,
}
// Textify converts the HTML document in body to plain text.
//
// The markup is parsed, rendered through process, and the result is
// returned with leading and trailing whitespace removed. A non-nil error
// is returned only when the HTML parser reports a failure.
func Textify(body string) (string, error) {
	root, parseErr := html.Parse(strings.NewReader(body))
	if parseErr != nil {
		return "", errors.New("unable to parse the html")
	}

	var out bytes.Buffer
	process(root, &out, "")
	return strings.TrimSpace(out.String()), nil
}
// process renders node n and its descendants as text into b.
//
//   - A <head> element is skipped entirely.
//   - An <a> element with at least one child is delegated to anchor, which
//     receives class; its children are not visited here.
//   - A text node is trimmed of surrounding spaces and tabs, has its
//     non-breaking spaces turned into plain spaces, and is appended with a
//     separating space where the heuristics below call for one.
//
// Children are visited with the class attribute of n when n is a <span>,
// and with an empty class otherwise. After an element listed in breakers,
// trailing spaces are removed and a newline is appended.
func process(n *html.Node, b *bytes.Buffer, class string) {
	isElement := n.Type == html.ElementNode
	descend := true

	switch {
	case isElement && n.Data == "head":
		return

	case isElement && n.Data == "a" && n.FirstChild != nil:
		anchor(n, b, class)
		descend = false

	case n.Type == html.TextNode:
		text := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1)

		// Look at the byte already at the end of the output (zero if none)
		// and at the first byte of the new text to decide on spacing.
		var tail byte
		size := b.Len()
		if size > 0 {
			tail = b.Bytes()[size-1]
		}
		afterSpace := size > 0 && tail == ' '
		beforeSpace := len(text) > 0 && text[0] == ' '

		if afterSpace && beforeSpace {
			// Both sides carry a space: drop one of them.
			b.WriteString(text[1:])
		} else {
			// No separator at the very start, after a newline, or right
			// after a '@' / '#' sigil.
			glued := size == 0 || tail == '\n' || tail == '@' || tail == '#'
			if !glued && !afterSpace && !beforeSpace {
				b.WriteString(" ")
			}
			b.WriteString(text)
		}
	}

	if descend {
		// Only the class of a directly enclosing <span> is handed down.
		childClass := ""
		if isElement && n.Data == "span" {
			for _, a := range n.Attr {
				if a.Key == "class" {
					childClass = a.Val
					break
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			process(child, b, childClass)
		}
	}

	// Line-breaking elements terminate the current line.
	if !isElement || !breakers[n.Data] {
		return
	}
	written := b.Bytes()
	end := len(written)
	if end == 0 || written[end-1] == '\n' {
		return
	}
	// Strip trailing spaces before the newline.
	for end > 0 && written[end-1] == ' ' {
		end--
	}
	b.Truncate(end)
	b.WriteString("\n")
}
// anchor renders the <a> element n into b.
//
// The children of n are first rendered into a scratch buffer. That text is
// written out unchanged when the link looks like a hashtag or a mention:
// when class (as forwarded by process) is "tag" or "h-card", when the
// output already ended with '@', or when the rendered text itself starts
// with '#' or '@'. For every other link the anchor text is dropped and the
// value of the href attribute is written instead; if n has no href
// attribute nothing more is written.
func anchor(n *html.Node, b *bytes.Buffer, class string) {
	// last stays zero when the buffer is empty.
	var last byte
	if bl := b.Len(); bl > 0 {
		last = b.Bytes()[bl-1]
	}

	// Add a leading space unless the output already ends with whitespace or
	// with a '#' / '@' sigil that must stay glued to the anchor text.
	if last != ' ' && last != '\n' && last != '#' && last != '@' {
		b.WriteString(" ")
	}

	var tmpbuf bytes.Buffer
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		process(c, &tmpbuf, class)
	}

	if class == "tag" || class == "h-card" || last == '@' {
		b.Write(tmpbuf.Bytes())
		return
	}

	s := tmpbuf.String()
	if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") {
		b.WriteString(s) // Tag or mention: display content
		return
	}

	// Display the href link. Use the matched attribute's own value: href is
	// not guaranteed to be the first attribute of the element.
	for _, attr := range n.Attr {
		if attr.Key == "href" {
			b.WriteString(attr.Val)
			break
		}
	}
}
|