File: table.go

package info (click to toggle)
golang-github-johanneskaufmann-html-to-markdown 2.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 2,080 kB
  • sloc: makefile: 3
file content (149 lines) | stat: -rw-r--r-- 4,000 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package table

import (
	"fmt"
	"sync"

	"github.com/JohannesKaufmann/dom"
	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
	"golang.org/x/net/html"
)

// option applies one configuration setting to a tablePlugin. It returns a
// non-nil error when the supplied setting is rejected.
type option func(p *tablePlugin) error

// SpanCellBehavior names a strategy for rendering the cells that are covered
// by a colspan/rowspan. The accepted values are SpanBehaviorEmpty and
// SpanBehaviorMirror.
type SpanCellBehavior string

const (
	// SpanBehaviorEmpty leaves a covered cell blank.
	SpanBehaviorEmpty = SpanCellBehavior("empty")
	// SpanBehaviorMirror repeats the content of the original cell in a covered cell.
	SpanBehaviorMirror = SpanCellBehavior("mirror")
)

// WithSpanCellBehavior selects what is rendered in the cells affected by a
// colspan/rowspan attribute: nothing (SpanBehaviorEmpty) or the same content
// as the original cell (SpanBehaviorMirror).
//
// An empty behavior leaves the plugin's current setting untouched. Any other
// unrecognised value makes the returned option fail with an error.
func WithSpanCellBehavior(behavior SpanCellBehavior) option {
	return func(plugin *tablePlugin) error {
		if behavior == "" {
			// TODO: decide whether the empty string should be accepted at all.
			return nil
		}

		if behavior != SpanBehaviorEmpty && behavior != SpanBehaviorMirror {
			return fmt.Errorf("unknown value %q for span cell behavior", behavior)
		}

		plugin.spanCellBehavior = behavior
		return nil
	}
}

// WithSkipEmptyRows controls whether empty rows are dropped from the output.
// A row counts as empty when all of its cells contain no content or only
// whitespace. Passing true omits such rows; passing false (the default)
// preserves all rows.
func WithSkipEmptyRows(skip bool) option {
	apply := func(plugin *tablePlugin) error {
		plugin.skipEmptyRows = skip
		return nil
	}
	return apply
}

// WithHeaderPromotion controls what happens when a table has no explicit
// header row (e.g. no <th> elements). Passing true turns the first row into a
// header row with separator dashes; passing false (the default) treats all
// rows as regular content.
func WithHeaderPromotion(promote bool) option {
	return option(func(plugin *tablePlugin) error {
		plugin.promoteFirstRowToHeader = promote
		return nil
	})
}

// WithPresentationTables controls the handling of tables marked with
// role="presentation". Passing true converts them like regular tables;
// passing false (the default) skips them, since such tables typically
// represent layout rather than semantic content.
func WithPresentationTables(convert bool) option {
	var apply option = func(plugin *tablePlugin) error {
		plugin.convertPresentationTables = convert
		return nil
	}
	return apply
}

// tablePlugin carries the settings of the table plugin together with an error
// that may have been raised while the options were applied.
type tablePlugin struct {
	// m guards err (see setError and getError). err is the error returned by
	// an option in NewTablePlugin; Init hands it back to its caller.
	m   sync.RWMutex
	err error

	// Settings written by the option functions WithSpanCellBehavior,
	// WithSkipEmptyRows, WithHeaderPromotion and WithPresentationTables,
	// in that order. Their zero values are the defaults.
	spanCellBehavior          SpanCellBehavior
	skipEmptyRows             bool
	promoteFirstRowToHeader   bool
	convertPresentationTables bool
}

// setError stores err on the plugin while holding the write lock.
func (p *tablePlugin) setError(err error) {
	p.m.Lock()
	p.err = err
	p.m.Unlock()
}
// getError returns the stored error (nil if none was set), reading it while
// holding the read lock.
func (p *tablePlugin) getError() error {
	p.m.RLock()
	stored := p.err
	p.m.RUnlock()

	return stored
}

// NewTablePlugin creates the table plugin and applies opts in the order given.
// Application stops at the first option that returns an error; that error is
// stored on the plugin and returned later by Init.
func NewTablePlugin(opts ...option) converter.Plugin {
	p := new(tablePlugin)

	for i := range opts {
		if err := opts[i](p); err != nil {
			p.setError(err)
			break
		}
	}

	return p
}

// Name returns the plugin's name, "table".
func (s *tablePlugin) Name() string {
	const pluginName = "table"
	return pluginName
}

// Init wires the plugin into conv. If an option func raised an error earlier,
// that error is returned and nothing is registered. Otherwise '|' is
// registered through conv.Register.EscapedChar and handleRender is registered
// as a renderer with converter.PriorityStandard.
func (s *tablePlugin) Init(conv *converter.Converter) error {
	optErr := s.getError()
	if optErr != nil {
		// Surface the failure recorded while applying the options.
		return optErr
	}

	register := conv.Register
	register.EscapedChar('|')
	register.Renderer(s.handleRender, converter.PriorityStandard)

	return nil
}

// handleRender dispatches on the node name of n: "table" nodes go to
// renderTable and "tr" nodes go to renderFallbackRow. For every other node it
// returns converter.RenderTryNext.
func (s *tablePlugin) handleRender(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	nodeName := dom.NodeName(n)

	if nodeName == "table" {
		return s.renderTable(ctx, w, n)
	}

	if nodeName == "tr" {
		// Rows are normally consumed while their table is rendered, so this
		// branch is only a fallback that separates the rows through newlines.
		return s.renderFallbackRow(ctx, w, n)
	}

	return converter.RenderTryNext
}

// renderFallbackRow renders the child nodes of n, writing two newlines before
// and two after them, and returns converter.RenderSuccess.
func (s *tablePlugin) renderFallbackRow(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	const rowSeparator = "\n\n"

	w.WriteString(rowSeparator)
	ctx.RenderChildNodes(ctx, w, n)
	w.WriteString(rowSeparator)

	return converter.RenderSuccess
}