File: base.go

package info (click to toggle)
golang-github-johanneskaufmann-html-to-markdown 2.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,080 kB
  • sloc: makefile: 3
file content (135 lines) | stat: -rw-r--r-- 4,610 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package base

import (
	"bytes"
	"strings"

	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"

	"github.com/JohannesKaufmann/dom"
	"github.com/JohannesKaufmann/html-to-markdown/v2/collapse"
	"github.com/JohannesKaufmann/html-to-markdown/v2/internal/domutils"
	"github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils"

	"golang.org/x/net/html"
)

// base is the plugin implementation returned by NewBasePlugin.
// It has no fields; all of its behavior lives in its methods.
type base struct{}

// NewBasePlugin returns the base plugin. Once initialized, the plugin takes
// care of things that are not necessarily related to commonmark,
// like removing nodes, trimming whitespace, collapsing whitespace, ...
func NewBasePlugin() converter.Plugin {
	return &base{}
}

// Name returns the identifier of this plugin, which is always "base".
func (b *base) Name() string {
	return "base"
}
// Init wires the base plugin into conv. It registers the tags that should be
// removed, the pre-render passes (node removal and whitespace collapsing),
// the text transformer, and the post-render passes (trimming and unescaping).
// It always returns nil.
func (b *base) Init(conv *converter.Converter) error {
	// Every tag listed here is registered as TagTypeRemove; preRenderRemove
	// deletes the nodes whose tag type resolves to that value.
	removedTags := []string{
		"#comment",
		"head",
		"script",
		"style",
		"link",
		"meta",

		"iframe",
		"noscript",

		"input",
		"textarea",
	}
	for _, tagName := range removedTags {
		conv.Register.TagType(tagName, converter.TagTypeRemove, converter.PriorityStandard)
	}

	// "tr" is not in the `IsBlockNode` list,
	// but we want to treat it as a block anyway.
	// (The registration below is currently commented out.)
	// conv.Register.TagStrategy("tr", converter.StrategyMarkdownBlock, converter.PriorityStandard)
	// conv.Register.TagType("tr", converter.BlockTagType, converter.PriorityStandard)

	// Removal is registered early. Collapsing is registered late,
	// so that it runs _after_ all the other pre-render functions.
	conv.Register.PreRenderer(b.preRenderRemove, converter.PriorityEarly)
	conv.Register.PreRenderer(b.preRenderCollapse, converter.PriorityLate)

	conv.Register.TextTransformer(b.handleTextTransform, converter.PriorityStandard)

	// NOTE(review): the unescape pass uses a higher priority number than the
	// trim pass, presumably so that it runs afterwards — confirm ordering.
	conv.Register.PostRenderer(b.postRenderTrimContent, converter.PriorityStandard)
	conv.Register.PostRenderer(b.postRenderUnescapeContent, converter.PriorityStandard+20)

	return nil
}

// preRenderRemove walks the tree rooted at doc and calls dom.RemoveNode on
// every node whose tag type, as reported by ctx.GetTagType, is
// converter.TagTypeRemove. The children of such a node are not visited.
// Afterwards adjacent text nodes in doc are merged.
func (b *base) preRenderRemove(ctx converter.Context, doc *html.Node) {
	var finder func(node *html.Node)
	finder = func(node *html.Node) {
		name := dom.NodeName(node)

		// The second return value of GetTagType is discarded: only a tag type
		// that equals TagTypeRemove triggers the removal.
		if tagType, _ := ctx.GetTagType(name); tagType == converter.TagTypeRemove {
			dom.RemoveNode(node)
			return
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			// finder may remove the node it is given, which would interfere
			// with stepping to `child.NextSibling` in this loop.
			// Using `defer` postpones the call until this closure returns,
			// i.e. after the loop has already walked over all the siblings.
			// Deferred calls run in last-in-first-out order, so the children
			// are processed from the last one to the first one.
			// https://gist.github.com/loopthrough/17da0f416054401fec355d338727c46e
			defer finder(child)
		}
	}
	finder(doc)

	// - - - - - - - //

	// After removing elements (see above) it can happen that we have
	// two #text nodes right next to each other. This would cause problems
	// with the collapse so we merge them together.
	domutils.MergeAdjacentTextNodes(doc)
}

// preRenderCollapse runs collapse.Collapse on doc. Whether a node counts as
// a block node is decided by the tag type registered on ctx; if nothing is
// registered for the tag name, dom.NameIsBlockNode decides instead.
func (b *base) preRenderCollapse(ctx converter.Context, doc *html.Node) {
	isBlock := func(n *html.Node) bool {
		name := dom.NodeName(n)

		// A registered tag type takes precedence over the default list.
		if registered, found := ctx.GetTagType(name); found {
			return registered == converter.TagTypeBlock
		}
		return dom.NameIsBlockNode(name)
	}

	collapse.Collapse(doc, &collapse.DomFuncs{IsBlockNode: isBlock})
}

// characterEntityReplacer rewrites the characters "&", "<" and ">" as
// character entities. `html.EscapeString` is not used because
// we care about fewer characters than it handles.
//
// The replacer works in a single pass over the input, so the "&" that is
// part of an inserted entity is not escaped a second time.
var characterEntityReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// handleTextTransform prepares the content of a text node for the output.
// It first replaces "<", ">" and "&" with character entities and then
// passes the result through ctx.EscapeContent.
func (b *base) handleTextTransform(ctx converter.Context, content string) string {
	// TODO: similar to UnEscapers also only escape if necessary.
	//       "<" only if not followed by space
	//       "&" only if character entity
	withEntities := characterEntityReplacer.Replace(content)

	// TODO: reduce conversion between types
	escaped := ctx.EscapeContent([]byte(withEntities))

	return string(escaped)
}

// postRenderTrimContent tidies up the rendered output: it strips the
// whitespace at the beginning & end, and then hands the result to
// textutils.TrimConsecutiveNewlines followed by
// textutils.TrimUnnecessaryHardLineBreaks.
func (b *base) postRenderTrimContent(ctx converter.Context, result []byte) []byte {
	// No leading or trailing whitespace
	trimmed := bytes.TrimSpace(result)

	// No excessive newlines or hard line breaks
	withoutExtraNewlines := textutils.TrimConsecutiveNewlines(trimmed)
	return textutils.TrimUnnecessaryHardLineBreaks(withoutExtraNewlines)
}
// postRenderUnescapeContent returns the rendered output after it has been
// passed through ctx.UnEscapeContent.
func (b *base) postRenderUnescapeContent(ctx converter.Context, result []byte) []byte {
	return ctx.UnEscapeContent(result)
}