1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package xml // import "miniflux.app/v2/internal/reader/xml"
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"unicode/utf8"
"miniflux.app/v2/internal/reader/encoding"
)
// NewXMLDecoder returns an XML decoder that filters illegal characters.
//
// Everything readable from data (starting at its current position) is loaded
// into memory first, so that the encoding announced by the document can be
// inspected before decoding starts. Documents treated as UTF-8 are filtered
// up front; for any other declared encoding the decoder's CharsetReader
// converts the document to UTF-8 and filters it while decoding.
//
// The signature cannot report a read failure. If data cannot be read
// completely, the returned decoder yields the bytes that were read and then
// fails with the read error instead of a clean EOF.
func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
	// io.Copy is used instead of io.ReadAll: when data implements io.WriterTo
	// (e.g. *bytes.Reader) the buffer is filled in one go instead of being
	// grown chunk by chunk.
	buffer := &bytes.Buffer{}
	_, readErr := io.Copy(buffer, data)
	content := buffer.Bytes()

	isUTF8 := hasUTF8XMLDeclaration(content)
	if isUTF8 {
		// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
		// For now we just expect the invalid characters to be stripped out.
		// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content.
		content = filterValidXMLChars(content)
	}

	// Decode from the buffered copy in both cases: the source does not have to
	// be rewound and read a second time.
	var source io.Reader = bytes.NewReader(content)
	if readErr != nil {
		// A pipe whose writer is closed with an error makes every read of the
		// reader half return that error; chaining it after the buffered bytes
		// reports the failure where the original stream broke.
		pr, pw := io.Pipe()
		_ = pw.CloseWithError(readErr) // documented to always return nil
		source = io.MultiReader(source, pr)
	}

	decoder := xml.NewDecoder(source)
	if !isUTF8 {
		// The XML document will be converted to UTF-8 by encoding.CharsetReader.
		// Invalid characters will be filtered later via decoder.CharsetReader.
		decoder.CharsetReader = charsetReaderFilterInvalidUtf8
	}
	decoder.Entity = xml.HTMLEntity
	decoder.Strict = false
	return decoder
}
// charsetReaderFilterInvalidUtf8 is installed as xml.Decoder.CharsetReader.
// It wraps input with encoding.CharsetReader for the given charset, reads the
// converted stream completely, and returns a reader over those bytes once
// filterValidXMLChars has removed the invalid XML characters.
//
// An error from encoding.CharsetReader is returned unchanged; a failure while
// reading the converted stream is wrapped.
func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader, error) {
	converted, err := encoding.CharsetReader(charset, input)
	if err != nil {
		return nil, err
	}

	content, readErr := io.ReadAll(converted)
	if readErr != nil {
		return nil, fmt.Errorf("xml: unable to read data: %w", readErr)
	}

	return bytes.NewReader(filterValidXMLChars(content)), nil
}
// filterValidXMLChars removes, in place, everything in s that is not a valid
// XML character and returns the shortened slice. The result shares its
// backing array with s, whose contents are overwritten.
//
// Bytes that are not valid UTF-8 are dropped one at a time. A correctly
// encoded U+FFFD (replacement character) is a legal XML character and is
// kept: it is told apart from a decoding failure by its width, because
// utf8.DecodeRune reports failures as (utf8.RuneError, 1).
//
// This function is inspired from bytes.Map.
func filterValidXMLChars(s []byte) []byte {
	var i uint // declaring it as a uint removes a bound check in the loop.
	// j is the write position; it never gets ahead of the read position i.
	var j int

	for i = 0; i < uint(len(s)); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}

		// Only a RuneError of width 1 denotes an invalid byte; a wider one is
		// a genuine U+FFFD present in the input.
		invalidUTF8 := r == utf8.RuneError && wid == 1
		if !invalidUTF8 {
			if r = filterValidXMLChar(r); r >= 0 {
				utf8.EncodeRune(s[j:], r)
				j += wid
			}
		}
		i += uint(wid)
	}
	return s[:j]
}
// filterValidXMLChar returns r unchanged when it is a legal XML character and
// -1 otherwise. The accepted ranges are copied from the encoding/xml package:
// tab, line feed, carriage return, U+0020..U+D7FF, U+E000..U+FFFD and
// U+10000..U+10FFFF.
func filterValidXMLChar(r rune) rune {
	// The cases are ordered by code point, so each one only has to test the
	// upper end of its own range.
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		return r
	case r < 0x20: // negative values and the remaining control characters
		return -1
	case r <= 0xD7FF:
		return r
	case r < 0xE000: // surrogate range
		return -1
	case r <= 0xFFFD:
		return r
	case r < 0x10000: // U+FFFE and U+FFFF
		return -1
	case r <= 0x10FFFF:
		return r
	}
	return -1
}
// getEncoding returns the quoted value that follows the first "encoding=" in
// b, without the quotes, or nil when there is no such occurrence, when it is
// not immediately followed by a single or double quote, or when the closing
// quote is missing. The result is a sub-slice of b, not a copy.
//
// The logic is copied from encoding/xml's procInst and adapted for []byte
// instead of string. As the original notes, this parsing is not exact but
// works for all actual cases.
func getEncoding(b []byte) []byte {
	const marker = "encoding="

	start := bytes.Index(b, []byte(marker))
	if start < 0 {
		return nil
	}

	rest := b[start+len(marker):]
	if len(rest) == 0 {
		return nil
	}

	quote := rest[0]
	if quote != '"' && quote != '\'' {
		return nil
	}

	// The value runs up to the next occurrence of the opening quote character.
	value := rest[1:]
	end := bytes.IndexByte(value, quote)
	if end < 0 {
		return nil
	}
	return value[:end]
}
// hasUTF8XMLDeclaration reports whether data should be treated as UTF-8:
// either the encoding it announces is "utf-8" (compared case-insensitively)
// or it announces no encoding at all.
//
// Only the bytes before the first "?>" are inspected. The XML declaration is
// a processing instruction, so an encoding="..." that merely appears later in
// the document body (an attribute, a code sample in CDATA, ...) must not be
// mistaken for the declared encoding.
func hasUTF8XMLDeclaration(data []byte) bool {
	end := bytes.Index(data, []byte("?>"))
	if end == -1 {
		// No processing instruction is terminated anywhere, so there is no
		// XML declaration and therefore no declared encoding.
		return true
	}

	enc := getEncoding(data[:end])
	return enc == nil || bytes.EqualFold(enc, []byte("utf-8"))
}
|