1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
|
package table
import (
"fmt"
"sync"
"github.com/JohannesKaufmann/dom"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"golang.org/x/net/html"
)
// option is a functional option that mutates a tablePlugin during construction.
// It returns a non-nil error when the supplied value cannot be applied.
type option func(p *tablePlugin) error
// SpanCellBehavior selects how the cells covered by a colspan/rowspan
// attribute are rendered. The accepted values are the constants below.
type SpanCellBehavior string
const (
// SpanBehaviorEmpty renders an empty cell.
SpanBehaviorEmpty SpanCellBehavior = "empty"
// SpanBehaviorMirror renders the same content as the original cell.
SpanBehaviorMirror SpanCellBehavior = "mirror"
)
// WithSpanCellBehavior selects how the cells covered by a colspan/rowspan
// attribute are rendered: either left empty (SpanBehaviorEmpty) or filled with
// the content of the spanning cell (SpanBehaviorMirror).
//
// An empty string is accepted and leaves the plugin's current setting
// untouched. Any other unrecognised value makes the option return an error.
func WithSpanCellBehavior(behavior SpanCellBehavior) option {
	return func(plugin *tablePlugin) error {
		// Nothing requested: keep whatever is already configured.
		if behavior == "" {
			return nil
		}

		if behavior != SpanBehaviorEmpty && behavior != SpanBehaviorMirror {
			return fmt.Errorf("unknown value %q for span cell behavior", behavior)
		}

		plugin.spanCellBehavior = behavior
		return nil
	}
}
// NewlineBehavior selects how tables whose cells contain newlines are handled.
// The accepted values are the constants below.
type NewlineBehavior string
const (
// NewlineBehaviorSkip skips tables with newlines in cells (default).
NewlineBehaviorSkip NewlineBehavior = "skip"
// NewlineBehaviorPreserve preserves newlines in cells.
NewlineBehaviorPreserve NewlineBehavior = "preserve"
)
// WithNewlineBehavior selects what happens to tables whose cells contain
// newlines. With NewlineBehaviorSkip (default) such tables are skipped; with
// NewlineBehaviorPreserve the newlines are kept inside the cells.
//
// Markdown tables don't support multiline content by default, so this offers a
// workaround for still converting tables that contain newlines.
//
// An empty string is accepted and leaves the plugin's current setting
// untouched. Any other unrecognised value makes the option return an error.
func WithNewlineBehavior(behavior NewlineBehavior) option {
	return func(plugin *tablePlugin) error {
		// Nothing requested: keep whatever is already configured.
		if behavior == "" {
			return nil
		}

		if behavior != NewlineBehaviorSkip && behavior != NewlineBehaviorPreserve {
			return fmt.Errorf("unknown value %q for newline behavior", behavior)
		}

		plugin.newlineBehavior = behavior
		return nil
	}
}
// WithSkipEmptyRows controls whether empty rows are dropped from the output.
// A row counts as empty when every cell has no content or only whitespace.
// Pass true to omit such rows; with false (default) every row is kept.
func WithSkipEmptyRows(skip bool) option {
	apply := func(plugin *tablePlugin) error {
		plugin.skipEmptyRows = skip
		return nil
	}
	return apply
}
// WithHeaderPromotion controls what happens when a table has no explicit header
// row (e.g. no <th> elements). Pass true to turn the first row into a header
// row followed by separator dashes; with false (default) all rows are treated
// as regular content.
func WithHeaderPromotion(promote bool) option {
	apply := func(plugin *tablePlugin) error {
		plugin.promoteFirstRowToHeader = promote
		return nil
	}
	return apply
}
// WithPresentationTables controls whether tables marked with
// role="presentation" are converted to markdown. Pass true to convert them like
// regular tables; with false (default) they are skipped, since they typically
// represent layout rather than semantic content.
func WithPresentationTables(convert bool) option {
	apply := func(plugin *tablePlugin) error {
		plugin.convertPresentationTables = convert
		return nil
	}
	return apply
}
// tablePlugin holds the configuration of the table plugin. Its fields are
// populated by the option funcs passed to NewTablePlugin.
type tablePlugin struct {
// m guards err (see setError / getError).
m sync.RWMutex
// err is the error reported by an option func, if any; Init returns it.
err error
// spanCellBehavior is set by WithSpanCellBehavior; empty when not configured.
spanCellBehavior SpanCellBehavior
// newlineBehavior is set by WithNewlineBehavior; empty when not configured.
newlineBehavior NewlineBehavior
// skipEmptyRows is set by WithSkipEmptyRows.
skipEmptyRows bool
// promoteFirstRowToHeader is set by WithHeaderPromotion.
promoteFirstRowToHeader bool
// convertPresentationTables is set by WithPresentationTables.
convertPresentationTables bool
}
// setError stores err on the plugin while holding the write lock.
func (s *tablePlugin) setError(err error) {
	s.m.Lock()
	s.err = err
	s.m.Unlock()
}
// getError returns the stored error (nil if none was set) while holding the
// read lock.
func (s *tablePlugin) getError() error {
	s.m.RLock()
	stored := s.err
	s.m.RUnlock()

	return stored
}
// NewTablePlugin creates the table plugin and applies opts in the order given.
//
// A nil option is ignored. If an option returns an error, the remaining options
// are not applied and the error is stored on the plugin; Init reports it when
// the plugin is initialised.
func NewTablePlugin(opts ...option) converter.Plugin {
	plugin := &tablePlugin{}
	for _, opt := range opts {
		if opt == nil {
			// Calling a nil function value panics at run time, so a nil
			// option is treated as "nothing to configure".
			continue
		}
		if err := opt(plugin); err != nil {
			plugin.setError(err)
			break
		}
	}
	return plugin
}
// Name returns the identifier of this plugin.
func (s *tablePlugin) Name() string {
	const pluginName = "table"
	return pluginName
}
// Init registers the plugin on conv: it registers '|' as an escaped character
// and adds handleRender as a renderer with standard priority.
//
// If one of the option funcs failed during construction, that error is returned
// and nothing is registered.
func (s *tablePlugin) Init(conv *converter.Converter) error {
	// Surface any error raised from an option func before touching conv.
	optErr := s.getError()
	if optErr != nil {
		return optErr
	}

	conv.Register.EscapedChar('|')
	conv.Register.Renderer(s.handleRender, converter.PriorityStandard)

	return nil
}
// handleRender dispatches on the node name: "table" nodes go to renderTable and
// "tr" nodes go to renderFallbackRow. For every other node it returns
// converter.RenderTryNext.
func (s *tablePlugin) handleRender(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	nodeName := dom.NodeName(n)

	if nodeName == "table" {
		return s.renderTable(ctx, w, n)
	}
	if nodeName == "tr" {
		// Normally a row is handled as part of rendering its "table", so this
		// branch is not reached. It acts as a fallback that separates the rows
		// through newlines.
		return s.renderFallbackRow(ctx, w, n)
	}

	return converter.RenderTryNext
}
// renderFallbackRow renders the child nodes of n surrounded by two newlines on
// each side and reports converter.RenderSuccess.
func (s *tablePlugin) renderFallbackRow(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	const rowSeparator = "\n\n"

	w.WriteString(rowSeparator)
	ctx.RenderChildNodes(ctx, w, n)
	w.WriteString(rowSeparator)

	return converter.RenderSuccess
}
|