File: render_code.go

package info (click to toggle)
golang-github-johanneskaufmann-html-to-markdown 2.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,080 kB
  • sloc: makefile: 3
file content (149 lines) | stat: -rw-r--r-- 4,058 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package commonmark

import (
	"bytes"
	"strings"
	"unicode/utf8"

	"github.com/JohannesKaufmann/dom"
	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
	"github.com/JohannesKaufmann/html-to-markdown/v2/internal/textutils"
	"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
	"golang.org/x/net/html"
)

// renderInlineCode writes the text content of n as a markdown inline code span.
//
// The content comes from getCodeWithoutTags; the info string it also returns is
// ignored for inline code. Content that is empty or consists only of whitespace
// is written verbatim between single backticks. All other content is passed
// through textutils.CollapseInlineCodeContent and wrapped in a backtick fence
// that is one character longer than the count reported by
// textutils.CalculateCodeFenceOccurrences (presumably the longest backtick run
// inside the content — confirm against textutils).
//
// It always returns converter.RenderSuccess.
func (c *commonmark) renderInlineCode(_ converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	// TODO: configure delimiter in options?
	fenceChar := '`'

	codeContent, _ := getCodeWithoutTags(n)

	// TODO: debug flag? An empty inline code is expected to have been removed
	// before rendering; nothing is enforced here.
	// TODO: configurable function to decide if inline or block? Content that
	// contains newlines is currently still rendered inline.

	if len(bytes.TrimSpace(codeContent)) == 0 {
		// No stripping occurs if the code span contains _only_ spaces:
		w.WriteRune(fenceChar)
		w.Write(codeContent)
		w.WriteRune(fenceChar)
		return converter.RenderSuccess
	}

	// Newlines in the text aren't great, since this is inline code and not a code block.
	// Newlines will be stripped anyway in the browser, but it won't be recognized as code
	// from the markdown parser when there is more than one newline.
	codeContent = textutils.CollapseInlineCodeContent(codeContent)

	code := string(codeContent)

	maxCount := textutils.CalculateCodeFenceOccurrences(fenceChar, code)
	maxCount++

	fence := strings.Repeat(string(fenceChar), maxCount)

	// A backtick directly next to the fence would be read as part of the fence,
	// so the content has to be separated from it by a space. CommonMark strips
	// that space again only when the content BOTH begins and ends with a space,
	// which is why the padding must be added on both sides even if only one
	// side touches a backtick. One-sided padding would leak into the output.
	if strings.HasPrefix(code, "`") || strings.HasSuffix(code, "`") {
		code = " " + code + " "
	}

	w.WriteString(fence)
	w.WriteString(code)
	w.WriteString(fence)

	return converter.RenderSuccess
}
// renderBlockCode writes the text content of n as a fenced markdown code block.
//
// The content and info string come from getCodeWithoutTags. The fence is
// computed by textutils.CalculateCodeFence from the first rune of
// c.CodeBlockFence and the content. The block is surrounded by blank lines and
// the function always returns converter.RenderSuccess.
func (c *commonmark) renderBlockCode(_ converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus {
	content, language := getCodeWithoutTags(n)

	// Drop exactly one trailing newline; the closing fence is put on its own
	// line explicitly further down.
	content = bytes.TrimSuffix(content, []byte("\n"))

	firstRune, _ := utf8.DecodeRuneInString(c.CodeBlockFence)
	delimiter := textutils.CalculateCodeFence(firstRune, string(content))

	// The content of a code block should stay untouched. Since runs of
	// newlines would otherwise be trimmed, every newline is swapped for a
	// placeholder marker for the time being.
	content = bytes.ReplaceAll(content, []byte("\n"), marker.BytesMarkerCodeBlockNewline)

	// Opening fence line, including the info string.
	w.WriteString("\n\n")
	w.WriteString(delimiter)
	w.WriteString(language)
	w.WriteRune('\n')

	// Body.
	w.Write(content)
	w.WriteRune('\n')

	// Closing fence line.
	w.WriteString(delimiter)
	w.WriteString("\n\n")

	return converter.RenderSuccess
}

// getCodeLanguage derives the code language from the "class" attribute of n.
//
// The attribute value (obtained via dom.GetAttributeOr with an empty fallback)
// is split on whitespace. The first token containing "language-" or "lang-" is
// returned with the first occurrence of each of those markers removed, so
// "language-go" yields "go". An empty string is returned when no token matches.
func getCodeLanguage(n *html.Node) string {
	class := dom.GetAttributeOr(n, "class", "")

	// Class lists in HTML may be separated by any whitespace (tabs, newlines,
	// repeated spaces), not only by a single space. Splitting with Fields keeps
	// neighbouring class names from bleeding into the returned language.
	for _, token := range strings.Fields(class) {
		if !strings.Contains(token, "language-") && !strings.Contains(token, "lang-") {
			continue
		}

		token = strings.Replace(token, "language-", "", 1)
		token = strings.Replace(token, "lang-", "", 1)

		return token
	}

	return ""
}
// getCodeWithoutTags walks the tree rooted at startNode depth-first and returns
// the concatenated text content together with an info string.
//
// Rules applied while walking:
//   - text nodes contribute their data unchanged;
//   - <style>, <script> and <textarea> elements are skipped with their subtree;
//   - <br> and <div> elements contribute a newline before their children;
//   - <code> and <pre> elements are asked for a language via getCodeLanguage
//     until a non-empty one has been found, which becomes the info string.
func getCodeWithoutTags(startNode *html.Node) ([]byte, string) {
	var (
		out      bytes.Buffer
		language string
	)

	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		switch node.Type {
		case html.TextNode:
			out.WriteString(node.Data)
			return

		case html.ElementNode:
			switch node.Data {
			case "code", "pre":
				// TODO: what if multiple elements have an info string?
				if language == "" {
					language = getCodeLanguage(node)
				}
			case "style", "script", "textarea":
				// Nothing below these elements counts as code.
				return
			case "br", "div":
				out.WriteString("\n")
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(startNode)

	return out.Bytes(), language
}