1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
|
package cmd
import (
"bytes"
"errors"
"fmt"
"github.com/JohannesKaufmann/dom"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/strikethrough"
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/table"
"github.com/andybalholm/cascadia"
"golang.org/x/net/html"
)
// overrideValidationError rewrites a commonmark validation error so that it
// talks about the command-line flag rather than the library option.
//
// The error is modified in place and the same pointer is returned:
//   - a Key of "StrongDelimiter" is replaced with "opt-strong-delimiter";
//     every other Key is left untouched,
//   - KeyWithValue is rebuilt as "--<Key>=<quoted Value>".
//
// TODO: Maybe OptionFunc should already validate and return an error?
// Then it would be easier to override the Key since we would have one
// place to assemble the []OptionFunc and directly treat the errors...
//
// We would basically invoke it ourselves:
// err := commonmark.WithStrongDelimiter(cli.config.strongDelimiter)(conv)
func overrideValidationError(e *commonmark.ValidateConfigError) error {
	// Only the strong-delimiter option currently has a flag name mapping.
	if e.Key == "StrongDelimiter" {
		e.Key = "opt-strong-delimiter"
	}

	e.KeyWithValue = fmt.Sprintf("--%s=%q", e.Key, e.Value)
	return e
}
// includeNodesFromDoc narrows doc down to the nodes matched by the configured
// include selector.
//
// When no include selector is configured, doc is returned unchanged.
// Otherwise every match found by cascadia.QueryAll is passed to
// dom.RemoveNode and then appended, in the order QueryAll reported it, to a
// fresh and otherwise empty html.Node, which is returned as the new root.
// Because each match is re-parented individually, a match nested inside
// another match ends up as a sibling of its former ancestor.
//
// The returned error is always nil.
func (cli *CLI) includeNodesFromDoc(doc *html.Node) (*html.Node, error) {
	selector := cli.config.includeSelector
	if len(selector) == 0 {
		return doc, nil
	}

	matches := cascadia.QueryAll(doc, selector)
	container := &html.Node{}
	for i := range matches {
		match := matches[i]

		// AppendChild panics on a node that still has a parent or siblings,
		// so dom.RemoveNode is called first (presumably unlinking the match
		// from its current position — verify against the dom package).
		dom.RemoveNode(match)
		container.AppendChild(match)
	}

	return container, nil
}
// excludeNodesFromDoc walks the tree rooted at doc and calls dom.RemoveNode
// on every node matched by the configured exclude selector.
//
// When no exclude selector is configured, nothing happens. A matched node is
// removed as a whole: its descendants are not visited. The returned error is
// always nil.
func (cli *CLI) excludeNodesFromDoc(doc *html.Node) error {
	selector := cli.config.excludeSelector
	if len(selector) == 0 {
		return nil
	}

	var prune func(n *html.Node)
	prune = func(n *html.Node) {
		if selector.Match(n) {
			dom.RemoveNode(n)
			return
		}

		// Removing a node while following NextSibling would break the
		// traversal, so take a snapshot of the children before touching any
		// of them.
		var children []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			children = append(children, c)
		}

		// Visit the snapshot from the last child to the first. The order
		// matters for selectors that depend on sibling position, because
		// earlier removals change what later nodes match.
		for i := len(children) - 1; i >= 0; i-- {
			prune(children[i])
		}
	}

	prune(doc)
	return nil
}
// parseInputWithSelectors parses input as HTML and applies the configured
// include and exclude selectors, in that order.
//
// It returns the root node that remains after both steps. A parse failure is
// wrapped with the prefix "error while parsing html"; errors from the
// selector steps are passed through unchanged. On any error the returned
// node is nil.
func (cli *CLI) parseInputWithSelectors(input []byte) (*html.Node, error) {
	parsed, err := html.Parse(bytes.NewReader(input))
	if err != nil {
		return nil, fmt.Errorf("error while parsing html: %w", err)
	}

	// Including may swap the root for a new container node, so the exclude
	// step has to operate on whatever the include step handed back.
	selected, err := cli.includeNodesFromDoc(parsed)
	if err != nil {
		return nil, err
	}

	if err := cli.excludeNodesFromDoc(selected); err != nil {
		return nil, err
	}

	return selected, nil
}
// convert turns the HTML in input into markdown according to the CLI
// configuration.
//
// The converter always gets the base and commonmark plugins (the latter with
// the configured strong delimiter). The strikethrough and table plugins are
// registered only when enabled in the configuration. The input is then parsed
// and filtered by parseInputWithSelectors and converted with the configured
// domain.
//
// If the conversion fails with a *commonmark.ValidateConfigError, that error
// is passed through overrideValidationError so it names the command-line
// flag; any other error is returned unchanged.
func (cli *CLI) convert(input []byte) ([]byte, error) {
	basePlugin := base.NewBasePlugin()
	commonmarkPlugin := commonmark.NewCommonmarkPlugin(
		commonmark.WithStrongDelimiter(cli.config.strongDelimiter),
	)
	conv := converter.NewConverter(
		converter.WithPlugins(basePlugin, commonmarkPlugin),
	)

	// Optional plugins, in the same order they have always been registered.
	if cli.config.enablePluginStrikethrough {
		conv.Register.Plugin(strikethrough.NewStrikethroughPlugin())
	}
	if cli.config.enablePluginTable {
		tablePlugin := table.NewTablePlugin(
			table.WithSkipEmptyRows(cli.config.tableSkipEmptyRows),
			table.WithHeaderPromotion(cli.config.tableHeaderPromotion),
			table.WithSpanCellBehavior(table.SpanCellBehavior(cli.config.tableSpanCellBehavior)),
			table.WithPresentationTables(cli.config.tablePresentationTables),
		)
		conv.Register.Plugin(tablePlugin)
	}

	root, err := cli.parseInputWithSelectors(input)
	if err != nil {
		return nil, err
	}

	markdown, err := conv.ConvertNode(root, converter.WithDomain(cli.config.domain))
	if err == nil {
		return markdown, nil
	}

	// Option validation only surfaces during conversion, so this is the
	// place where the error gets translated into CLI flag terms.
	var validationErr *commonmark.ValidateConfigError
	if errors.As(err, &validationErr) {
		return nil, overrideValidationError(validationErr)
	}
	return nil, err
}
|