File: html2text.go

package info (click to toggle)
madonctl 3.0.2%2Bds1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 688 kB
  • sloc: sh: 47; makefile: 4
file content (148 lines) | stat: -rw-r--r-- 3,291 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// Copyright (c) 2015 Shawn Goertzen
// Copyright (c) 2017 Mikael Berthe
//
// This code mostly comes from github.com/sgoertzen/html2text,
// with some specific but intrusive changes for Mastodon HTML messages.
// For example, links are not displayed for hashtags and mentions,
// and links alone are displayed for the other cases.
//
// Licensed under the MIT license.
// Please see the LICENSE file in this directory.

package html2text

import (
	"bytes"
	"errors"
	"golang.org/x/net/html"
	"strings"
)

// breakers is the set of HTML element names after which process ends the
// current output line with a newline.
var breakers = map[string]bool{
	"br":  true,
	"div": true,
	"li":  true,
	"p":   true,
	"tr":  true,
}

// Textify converts the HTML contained in body into plain text.
//
// The parsed document is rendered by process into a buffer, and the result
// is returned with leading and trailing whitespace removed.  A non-nil
// error is returned only when the HTML parser reports a failure.
func Textify(body string) (string, error) {
	root, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return "", errors.New("unable to parse the html")
	}

	var out bytes.Buffer
	process(root, &out, "")

	return strings.TrimSpace(out.String()), nil
}

// process renders node n, and recursively its children, as text into b.
//
// class is the value supplied by the caller for the enclosing element; it
// is only forwarded to anchor when n is an <a> element.  Children of n are
// given the class attribute of n when n is a <span>, and "" otherwise.
//
// Handling per node kind:
//   - <head> elements are skipped entirely;
//   - <a> elements that have children are delegated to anchor;
//   - text nodes are written with whitespace heuristics (see below);
//   - every other node only has its children processed.
//
// After an element listed in breakers, trailing spaces are dropped from b
// and a newline is appended, unless b is empty or already ends with one.
func process(n *html.Node, b *bytes.Buffer, class string) {
	isElement := n.Type == html.ElementNode
	descend := true

	switch {
	case isElement && n.Data == "head":
		return

	case isElement && n.Data == "a" && n.FirstChild != nil:
		// anchor walks the children itself.
		anchor(n, b, class)
		descend = false

	case n.Type == html.TextNode:
		// Strip surrounding blanks/tabs, then turn non-breaking spaces
		// into regular spaces.
		text := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1)

		// tail is the last byte already in the buffer (zero if none).
		var tail byte
		size := b.Len()
		if size > 0 {
			tail = b.Bytes()[size-1]
		}
		afterSpace := tail == ' '
		beforeSpace := strings.HasPrefix(text, " ")

		if afterSpace && beforeSpace {
			// Avoid a double space at the junction.
			b.WriteString(text[1:])
		} else {
			// Insert a separator unless the buffer is empty, one side
			// already provides a space, or the buffer ends with a
			// newline, '@' or '#'.
			needSep := size > 0 && !afterSpace && !beforeSpace &&
				tail != '\n' && tail != '@' && tail != '#'
			if needSep {
				b.WriteString(" ")
			}
			b.WriteString(text)
		}
	}

	if descend {
		// The class handed to children comes only from a <span> parent;
		// the class received by this call is not inherited.
		childClass := ""
		if isElement && n.Data == "span" {
			for _, a := range n.Attr {
				if a.Key == "class" {
					childClass = a.Val
					break
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			process(child, b, childClass)
		}
	}

	// Line break handling: only for breaker elements.
	if !isElement || !breakers[n.Data] {
		return
	}
	size := b.Len()
	if size == 0 || b.Bytes()[size-1] == '\n' {
		return
	}
	// Drop trailing spaces before ending the line.
	b.Truncate(len(bytes.TrimRight(b.Bytes(), " ")))
	b.WriteString("\n")
}

// anchor renders the <a> element n into b.
//
// class is the value forwarded by process (the class attribute of the
// enclosing <span>, or "" when there is none).
//
// A leading space is written first unless b already ends with a space, a
// newline, '#' or '@'.  The children of n are then rendered into a
// temporary buffer, and:
//   - if class is "tag" or "h-card", or b ended with '@' on entry, or the
//     rendered text starts with '#' or '@', the rendered text is written
//     (hashtag or mention: show the content, not the link);
//   - otherwise the value of the href attribute is written, if n has one;
//     the rendered children are discarded.
func anchor(n *html.Node, b *bytes.Buffer, class string) {
	bl := b.Len()
	var last byte
	if bl > 0 {
		last = b.Bytes()[bl-1]
	}

	// Add heading space if needed
	if last != ' ' && last != '\n' && last != '#' && last != '@' {
		b.WriteString(" ")
	}

	// Render the link content separately so it can be inspected before
	// deciding whether to display it or the link target.
	var tmpbuf bytes.Buffer
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		process(c, &tmpbuf, class)
	}

	if class == "tag" || class == "h-card" || last == '@' {
		b.Write(tmpbuf.Bytes())
		return
	}

	s := tmpbuf.String()
	if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") {
		b.WriteString(s) // Tag or mention: display content
		return
	}

	// Display href link.  Use the value of the matching attribute itself:
	// href is not guaranteed to be the first attribute of the element.
	for _, attr := range n.Attr {
		if attr.Key == "href" {
			b.WriteString(attr.Val)
			break
		}
	}
}