File: decoder.go

package info (click to toggle)
miniflux 2.2.16-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,188 kB
  • sloc: xml: 4,853; javascript: 1,158; sh: 257; makefile: 161
file content (122 lines) | stat: -rw-r--r-- 3,277 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package xml // import "miniflux.app/v2/internal/reader/xml"

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"unicode/utf8"

	"miniflux.app/v2/internal/reader/encoding"
)

// NewXMLDecoder returns an XML decoder that filters illegal characters.
//
// The input is first buffered in memory so the declared encoding can be
// inspected:
//   - If the document declares UTF-8 (or declares nothing), illegal XML
//     characters are stripped from the buffered bytes and the decoder reads
//     the filtered copy.
//   - Otherwise data is rewound and handed to the decoder with a CharsetReader
//     that converts to UTF-8 and filters illegal characters afterwards.
//
// The returned decoder is non-strict and resolves HTML entities.
// The function has no error return: I/O failures while buffering or rewinding
// are handled by picking the input most likely to let the decoder report the
// problem (see the inline comments).
func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
	// io.Copy is preferred over io.ReadAll(data): when the source implements
	// io.WriterTo, the buffer can be allocated in one go instead of being
	// grown dynamically.
	buffer := &bytes.Buffer{}
	_, copyErr := io.Copy(buffer, data)

	var decoder *xml.Decoder

	// The buffered copy is only trusted when it was read completely; a failed
	// copy leaves a truncated document that must not be decoded silently.
	if copyErr == nil && hasUTF8XMLDeclaration(buffer.Bytes()) {
		// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
		// For now we just expect the invalid characters to be stripped out.

		// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
		filteredBytes := filterValidXMLChars(buffer.Bytes())

		decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
	} else {
		// Let the decoder read the source itself. If buffering failed, a
		// persistent read error then surfaces from the decoder instead of
		// being swallowed here.
		var input io.Reader = data
		if _, err := data.Seek(0, io.SeekStart); err != nil {
			// The source cannot be rewound, so it is positioned at an unknown
			// offset. Decode the bytes that were buffered instead.
			input = bytes.NewReader(buffer.Bytes())
		}
		decoder = xml.NewDecoder(input)

		// The XML document will be converted to UTF-8 by encoding.CharsetReader
		// Invalid characters will be filtered later via decoder.CharsetReader
		decoder.CharsetReader = charsetReaderFilterInvalidUtf8
	}

	decoder.Entity = xml.HTMLEntity
	decoder.Strict = false

	return decoder
}

// charsetReaderFilterInvalidUtf8 has the signature expected by
// xml.Decoder.CharsetReader. It wraps input with encoding.CharsetReader for
// the given charset, reads the whole result into memory, strips the
// characters rejected by filterValidXMLChars, and returns a reader over the
// remaining bytes.
//
// An error from encoding.CharsetReader is returned unchanged; a failure while
// reading the wrapped stream is wrapped with an "xml:" prefix.
func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader, error) {
	converted, err := encoding.CharsetReader(charset, input)
	if err != nil {
		return nil, err
	}

	content, readErr := io.ReadAll(converted)
	if readErr != nil {
		return nil, fmt.Errorf("xml: unable to read data: %w", readErr)
	}

	return bytes.NewReader(filterValidXMLChars(content)), nil
}

// filterValidXMLChars removes, in place, every rune that is not a legal XML
// character according to filterValidXMLChar, as well as every rune that
// decodes to utf8.RuneError (invalid UTF-8 bytes and encoded U+FFFD alike).
//
// The input slice is overwritten and the returned slice shares its backing
// array. The approach is inspired by bytes.Map.
func filterValidXMLChars(s []byte) []byte {
	kept := 0

	// The read position is a uint so the compiler can drop a bounds check
	// inside the loop.
	for pos := uint(0); pos < uint(len(s)); {
		// ASCII fast path: a single byte is the rune itself.
		c, size := rune(s[pos]), 1
		if c >= utf8.RuneSelf {
			c, size = utf8.DecodeRune(s[pos:])
		}
		pos += uint(size)

		if c == utf8.RuneError {
			continue
		}
		if filterValidXMLChar(c) < 0 {
			continue
		}

		// The write position never passes the read position, so unread input
		// is never clobbered.
		utf8.EncodeRune(s[kept:], c)
		kept += size
	}

	return s[:kept]
}

// filterValidXMLChar returns r unchanged when it is a character allowed in an
// XML document, and -1 otherwise. The accepted ranges mirror the check used
// by the encoding/xml package.
func filterValidXMLChar(r rune) rune {
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		// Tab, line feed and carriage return are the only legal control characters.
		return r
	case 0x20 <= r && r <= 0xD7FF:
		return r
	case 0xE000 <= r && r <= 0xFFFD:
		return r
	case 0x10000 <= r && r <= 0x10FFFF:
		return r
	}
	return -1
}

// getEncoding extracts the quoted value that follows the first "encoding="
// found in b, or returns nil when there is none, when the value is not quoted
// with ' or ", or when the closing quote is missing. The returned slice
// aliases b.
//
// Adapted from encoding/xml's procInst to work on []byte instead of string.
// The parsing is deliberately approximate rather than exact.
func getEncoding(b []byte) []byte {
	const marker = "encoding="

	start := bytes.Index(b, []byte(marker))
	if start < 0 {
		return nil
	}

	rest := b[start+len(marker):]
	if len(rest) == 0 {
		return nil
	}

	quote := rest[0]
	if quote != '\'' && quote != '"' {
		return nil
	}

	// The value ends at the next occurrence of the same quote character.
	end := bytes.IndexByte(rest[1:], quote)
	if end < 0 {
		return nil
	}
	return rest[1 : end+1]
}

// hasUTF8XMLDeclaration reports whether data should be treated as UTF-8:
// either getEncoding finds no encoding value at all, or the value it finds is
// "utf-8" (compared case-insensitively).
//
// NOTE(review): getEncoding matches the first "encoding=" anywhere in data,
// not only inside the XML declaration.
func hasUTF8XMLDeclaration(data []byte) bool {
	declared := getEncoding(data)
	if declared == nil {
		return true
	}
	return bytes.EqualFold(declared, []byte("utf-8"))
}