1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
|
package converter
import (
"unicode/utf8"
"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
)
// Per-placeholder decisions recorded while un-escaping.
//
// actionKeep is the zero value, so it is what a freshly allocated decision
// slice holds for every position. actionEscape marks a placeholder that gets
// replaced by a backslash in the output.
const (
	actionKeep = iota
	actionEscape
)
// placeholderByte is the first byte of marker.BytesMarkerEscaping. It is the
// byte that escapeContent writes in front of a character that may need
// escaping, and the byte that unEscapeContent later removes or replaces.
//
// IMPORTANT: Treating the placeholder as a single byte is an assumption made
// only internally, by the code in this file.
// NOTE(review): presumably marker.BytesMarkerEscaping could be longer than one
// byte; confirm against the marker package.
var placeholderByte byte = marker.BytesMarkerEscaping[0]
// escapeContent returns a copy of chars in which every character that
// conv.checkIsEscapedChar reports as a potential markdown character is
// preceded by placeholderByte.
//
// When the escape mode is EscapeModeDisabled the input slice is returned
// unchanged (not copied).
//
// The input is processed one UTF-8 encoded character at a time: a character
// is decoded once, checked once, and all of its bytes are copied together, so
// the placeholder is only ever written in front of a complete character and
// never between the bytes of a multi-byte character. Bytes that are not valid
// UTF-8 are decoded as utf8.RuneError with a width of one byte and are copied
// through individually.
func (conv *Converter) escapeContent(chars []byte) []byte {
	if conv.escapeMode == EscapeModeDisabled {
		return chars
	}

	newChars := make([]byte, 0, len(chars))
	for index := 0; index < len(chars); {
		if chars[index] == '\u0000' {
			// For security reasons, the Unicode character U+0000 must be
			// replaced with the REPLACEMENT CHARACTER (U+FFFD).
			newChars = append(newChars, "\ufffd"...)
			index++
			continue
		}

		// chars[index:] is never empty here, so size is always at least 1
		// and the loop is guaranteed to advance.
		r, size := utf8.DecodeRune(chars[index:])
		if conv.checkIsEscapedChar(r) {
			newChars = append(newChars, placeholderByte)
		}
		newChars = append(newChars, chars[index:index+size]...)
		index += size
	}

	return newChars
}
// unEscapeContent resolves the placeholders found in chars and returns the
// result as a new slice.
//
// For every placeholderByte, the un-escape handlers are asked about the
// position directly after it. If any handler returns a value other than -1
// the placeholder is replaced by a backslash; otherwise the placeholder is
// dropped. A placeholder that is the very last byte is always dropped.
//
// When the escape mode is EscapeModeDisabled the input slice is returned
// unchanged (not copied).
func (conv *Converter) unEscapeContent(chars []byte) []byte {
	if conv.escapeMode == EscapeModeDisabled {
		return chars
	}

	// checkElements returns the first handler result for the given position
	// that is not -1, or -1 if no handler reports a match.
	checkElements := func(index int) int {
		for _, handler := range conv.getUnEscapeHandlers() {
			if skip := handler.Value(chars, index); skip != -1 {
				return skip
			}
		}
		return -1
	}

	// First pass: decide for each placeholder position whether it becomes a
	// backslash. Entries default to actionKeep (the zero value).
	changes := make([]uint8, len(chars))
	for index := 0; index < len(chars); index++ {
		if chars[index] != placeholderByte {
			continue
		}
		if index+1 >= len(chars) {
			// Nothing follows the placeholder, so there is nothing to escape.
			break
		}

		skip := checkElements(index + 1)
		if skip == -1 {
			continue
		}

		changes[index] = actionEscape

		// Jump over the bytes the handler reported. Only ever move forward:
		// a handler returning 0 (or another value below 1) must not stall or
		// rewind the scan, which would otherwise loop forever or index out
		// of range.
		if skip > 1 {
			index += skip - 1
		}
	}

	// Second pass: copy everything except placeholders, emitting a backslash
	// where the first pass decided to escape.
	newChars := make([]byte, 0, len(chars))
	for index, char := range chars {
		if char != placeholderByte {
			newChars = append(newChars, char)
			continue
		}

		// What to do with this placeholder? Should we escape or not?
		if changes[index] == actionEscape {
			newChars = append(newChars, '\\')
		}
	}

	return newChars
}
|