1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
package commonmark
import (
"bytes"
"regexp"
"github.com/JohannesKaufmann/dom"
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
"github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils"
"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
"golang.org/x/net/html"
)
// multipleSpacesR matches a run of one or more consecutive space characters.
// renderHeading uses it to collapse such a run into a single space.
//
// TODO: remove regex
var multipleSpacesR = regexp.MustCompile(` +`)
// setextUnderline builds the underline row of a setext-style heading.
//
// The row consists of width copies of a single character: "=" when level
// is 1 and "-" for every other level.
func (c *commonmark) setextUnderline(level int, width int) []byte {
	fill := byte('-')
	if level == 1 {
		fill = '='
	}
	return bytes.Repeat([]byte{fill}, width)
}
// atxPrefix returns the run of "#" characters that opens an ATX heading,
// one "#" per heading level.
func (c *commonmark) atxPrefix(level int) []byte {
	return bytes.Repeat([]byte{'#'}, level)
}
// getHeadingLevel maps a heading tag name ("h1" through "h6") to its
// numeric level. The match is exact and case-sensitive; any other name
// falls back to level 6.
func getHeadingLevel(name string) int {
	const fallback = 6

	if len(name) != 2 || name[0] != 'h' {
		return fallback
	}
	digit := name[1]
	if digit < '1' || digit > '6' {
		return fallback
	}
	return int(digit - '0')
}
// runeCount returns the number of runes in chars, leaving out every rune
// that equals marker.MarkerEscaping.
func runeCount(chars []rune) (count int) {
	for i := range chars {
		if chars[i] != marker.MarkerEscaping {
			count++
		}
	}
	return count
}
// getUnderlineWidth returns the width to use for a setext underline below
// content: the rune count (as reported by runeCount) of the longest line in
// content, but never less than minVal.
func getUnderlineWidth(content []byte, minVal int) int {
	widest := 0

	for _, line := range bytes.Split(content, []byte{'\n'}) {
		// Measure in runes rather than bytes so that multi-byte
		// characters such as ä or ö count as a single column.
		//
		// TODO: optimize function w := utf8.RuneCount(part)
		if n := runeCount([]rune(string(line))); n > widest {
			widest = n
		}
	}

	// A single character would technically be enough for an underline,
	// but one lone dash could easily trigger a heading by accident, so
	// the caller-supplied minimum is enforced.
	if widest >= minVal {
		return widest
	}
	return minVal
}
// escapePoundSignAtEnd makes sure that a "#" at the very end of s is
// preceded by a backslash.
//
// The last bytes of s are expected to be laid out as follows:
//
//	len(s)-1  the "#"
//	len(s)-2  a placeholder byte reserved for the escape character
//	len(s)-3  possibly a "\" when the "#" is already escaped
//
// s is returned unchanged when it is empty, does not end in "#", or already
// has a "\" at len(s)-3. Otherwise the byte at len(s)-2 is overwritten in
// place with "\" and the same slice is returned. When s is just "#" there is
// no byte to overwrite, so a new slice holding `\#` is returned instead.
//
// NOTE(review): the byte at len(s)-2 is assumed to be the escaping
// placeholder and is overwritten without being checked — confirm against
// the step that produces s.
func escapePoundSignAtEnd(s []byte) []byte {
	n := len(s)
	if n == 0 || s[n-1] != '#' {
		// Nothing to escape. Checking the length first avoids an
		// out-of-range index on empty input.
		return s
	}

	if n < 2 {
		// s is exactly "#": there is no placeholder slot in front of it,
		// so the backslash has to be inserted rather than written over
		// an existing byte.
		return []byte{'\\', '#'}
	}

	if n >= 3 && s[n-3] == '\\' {
		// It is already escaped, so there is no work to do.
		return s
	}

	// Force the escaping by overriding the placeholder.
	s[n-2] = '\\'
	return s
}
// renderHeading writes the heading element n to w as markdown.
//
// The child nodes are first rendered into a temporary buffer. If that
// output is empty or consists only of whitespace, nothing is written.
// Otherwise the heading is emitted in one of two forms, each surrounded by
// blank lines:
//
//   - setext (content followed by an underline row), when HeadingStyle is
//     HeadingStyleSetext and the level is 1 or 2;
//   - ATX ("#" prefix, a space, then the content on a single line), in every
//     other case.
//
// The function returns converter.RenderSuccess on every path.
func (c *commonmark) renderHeading(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	// ctx = context.WithValue(ctx, "is_inside_heading", true)
	level := getHeadingLevel(dom.NodeName(n))

	var rendered bytes.Buffer
	ctx.RenderChildNodes(ctx, &rendered, n)

	content := rendered.Bytes()
	if len(bytes.TrimSpace(content)) == 0 {
		return converter.RenderSuccess
	}

	if c.HeadingStyle != HeadingStyleSetext || level >= 3 {
		// ATX heading: the content has to fit on one line, so line
		// breaks become spaces before runs of spaces are collapsed.
		content = bytes.ReplaceAll(content, []byte("\n"), []byte(" "))
		content = bytes.ReplaceAll(content, []byte("\r"), []byte(" "))
		content = multipleSpacesR.ReplaceAll(content, []byte(" "))
		content = bytes.TrimSpace(content)

		// A # sign at the end would be removed otherwise.
		content = escapePoundSignAtEnd(content)

		w.WriteString("\n\n")
		w.Write(c.atxPrefix(level))
		w.WriteRune(' ')
		w.Write(content)
		w.WriteString("\n\n")

		return converter.RenderSuccess
	}

	// Setext heading.
	//
	// Note: `TrimUnnecessaryHardLineBreaks` is deliberately not used here,
	// since `EscapeMultiLine` also takes care of newlines.
	content = textutils.TrimConsecutiveNewlines(content)
	content = textutils.EscapeMultiLine(content)

	underline := c.setextUnderline(level, getUnderlineWidth(content, 3))

	w.WriteString("\n\n")
	w.Write(content)
	w.WriteRune('\n')
	w.Write(underline)
	w.WriteString("\n\n")

	return converter.RenderSuccess
}
|