File: cmd_convert.go

package info (click to toggle)
golang-github-johanneskaufmann-html-to-markdown 2.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,080 kB
  • sloc: makefile: 3
file content (135 lines) | stat: -rw-r--r-- 3,484 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package cmd

import (
	"bytes"
	"errors"
	"fmt"

	"github.com/JohannesKaufmann/dom"
	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/strikethrough"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/table"
	"github.com/andybalholm/cascadia"
	"golang.org/x/net/html"
)

// overrideValidationError rewrites a commonmark config validation error so
// that it refers to the command line flag instead of the library option.
//
// The error is modified in place: a Key of "StrongDelimiter" is replaced with
// "opt-strong-delimiter" (any other Key is left as is), and KeyWithValue is
// regenerated as `--<key>=<quoted value>`. The same error is returned.
//
// TODO: Maybe OptionFunc should already validate and return an error?
// Then it would be easier to override the Key, since we would have one
// place to assemble the []OptionFunc and could treat the errors directly.
// We would basically invoke it ourselves:
//
//	err := commonmark.WithStrongDelimiter(cli.config.strongDelimiter)(conv)
func overrideValidationError(e *commonmark.ValidateConfigError) error {
	// Translate the library option name into the matching CLI flag name.
	if e.Key == "StrongDelimiter" {
		e.Key = "opt-strong-delimiter"
	}

	e.KeyWithValue = fmt.Sprintf("--%s=%q", e.Key, e.Value)
	return e
}

// includeNodesFromDoc narrows doc down to the nodes matched by the configured
// include selector.
//
// Without an include selector, doc is returned unchanged. Otherwise every
// matching node is detached from doc and appended to a freshly created root
// node, which is returned in place of doc.
//
// A match that is nested inside another match is left where it is: it is
// already part of the output through its ancestor, and pulling it out would
// move its content out of context and behind the ancestor's remaining content.
//
// The returned error is currently always nil.
func (cli *CLI) includeNodesFromDoc(doc *html.Node) (*html.Node, error) {
	if len(cli.config.includeSelector) == 0 {
		return doc, nil
	}
	nodes := cascadia.QueryAll(doc, cli.config.includeSelector)

	root := &html.Node{}

	// alreadyIncluded reports whether n sits somewhere below root, which is
	// the case once one of its ancestors has been moved over.
	alreadyIncluded := func(n *html.Node) bool {
		for p := n.Parent; p != nil; p = p.Parent {
			if p == root {
				return true
			}
		}
		return false
	}

	for _, n := range nodes {
		if alreadyIncluded(n) {
			continue
		}

		// AppendChild requires a detached node, so unlink it from doc first.
		dom.RemoveNode(n)
		root.AppendChild(n)
	}

	return root, nil
}
// excludeNodesFromDoc removes every node matched by the configured exclude
// selector from the tree rooted at doc.
//
// Without an exclude selector this is a no-op. The subtree below a matching
// node is not searched any further, since it is removed together with the
// node.
//
// Matching and removing happen in two separate passes. Selectors can depend
// on the position among siblings (for example `:last-child`), so evaluating
// them while nodes are being removed would make additional nodes match that
// did not match in the original document.
//
// The returned error is currently always nil.
func (cli *CLI) excludeNodesFromDoc(doc *html.Node) error {
	if len(cli.config.excludeSelector) == 0 {
		return nil
	}

	// First pass: collect the matches while the tree is still untouched.
	var matched []*html.Node
	var collect func(node *html.Node)
	collect = func(node *html.Node) {
		if cli.config.excludeSelector.Match(node) {
			matched = append(matched, node)
			return
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			collect(child)
		}
	}
	collect(doc)

	// Second pass: it is now safe to modify the tree.
	for _, node := range matched {
		dom.RemoveNode(node)
	}

	return nil
}
// parseInputWithSelectors parses input as HTML and then applies the include
// and exclude selectors from the CLI config, in that order.
//
// It returns the resulting node tree. A parse failure is wrapped with
// additional context; errors from the selector steps are returned unchanged.
func (cli *CLI) parseInputWithSelectors(input []byte) (*html.Node, error) {
	parsed, err := html.Parse(bytes.NewReader(input))
	if err != nil {
		return nil, fmt.Errorf("error while parsing html: %w", err)
	}

	// Narrow down to the included nodes first, so that the exclusion
	// operates on the tree that will actually be converted.
	selected, err := cli.includeNodesFromDoc(parsed)
	if err != nil {
		return nil, err
	}

	if err := cli.excludeNodesFromDoc(selected); err != nil {
		return nil, err
	}

	return selected, nil
}

// convert turns the HTML in input into markdown according to the CLI config.
//
// The converter always uses the base and commonmark plugins; the
// strikethrough and table plugins are registered only when enabled in the
// config. The input is parsed and filtered by parseInputWithSelectors before
// it is converted with the configured domain.
//
// If the conversion fails with a *commonmark.ValidateConfigError, the error is
// passed through overrideValidationError before being returned. All other
// errors are returned unchanged.
func (cli *CLI) convert(input []byte) ([]byte, error) {
	basePlugin := base.NewBasePlugin()
	commonmarkPlugin := commonmark.NewCommonmarkPlugin(
		commonmark.WithStrongDelimiter(cli.config.strongDelimiter),
	)
	conv := converter.NewConverter(
		converter.WithPlugins(basePlugin, commonmarkPlugin),
	)

	// Optional plugins.
	if cli.config.enablePluginStrikethrough {
		conv.Register.Plugin(strikethrough.NewStrikethroughPlugin())
	}
	if cli.config.enablePluginTable {
		conv.Register.Plugin(table.NewTablePlugin(
			table.WithSkipEmptyRows(cli.config.tableSkipEmptyRows),
			table.WithHeaderPromotion(cli.config.tableHeaderPromotion),
			table.WithSpanCellBehavior(table.SpanCellBehavior(cli.config.tableSpanCellBehavior)),
			table.WithPresentationTables(cli.config.tablePresentationTables),
		))
	}

	doc, err := cli.parseInputWithSelectors(input)
	if err != nil {
		return nil, err
	}

	markdown, err := conv.ConvertNode(doc, converter.WithDomain(cli.config.domain))
	if err != nil {
		// Only validation errors are rewritten to mention the CLI flag;
		// everything else is handed back untouched.
		var cfgErr *commonmark.ValidateConfigError
		if !errors.As(err, &cfgErr) {
			return nil, err
		}
		return nil, overrideValidationError(cfgErr)
	}

	return markdown, nil
}