File: encoding.go

package info (click to toggle)
miniflux 2.2.6-2
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 5,640 kB
  • sloc: xml: 4,843; javascript: 1,326; sh: 290; makefile: 179
file content (63 lines) | stat: -rw-r--r-- 2,236 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package encoding // import "miniflux.app/v2/internal/reader/encoding"

import (
	"bytes"
	"fmt"
	"io"
	"unicode/utf8"

	"golang.org/x/net/html/charset"
)

// CharsetReader returns a reader yielding UTF-8 content for a document whose
// XML prolog declares the encoding charsetLabel.
//
// The whole input is read into memory first. When the bytes already form
// valid UTF-8 they are handed back untouched, whatever charsetLabel says:
// the declared encoding is then considered wrong, and decoding again would
// double-encode the document. Otherwise the bytes are decoded from
// charsetLabel to UTF-8.
//
// Several edge cases could exist:
//
// - Feeds with encoding specified only in Content-Type header and not in XML document
// - Feeds with encoding specified in both places
// - Feeds with encoding specified only in XML document and not in HTTP header
// - Feeds with wrong encoding defined and already in UTF-8
//
// An error is returned when the input cannot be read, or when
// charset.NewReaderLabel rejects charsetLabel.
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
	data, readErr := io.ReadAll(input)
	if readErr != nil {
		return nil, fmt.Errorf(`encoding: unable to read input: %w`, readErr)
	}

	// Only content that is not valid UTF-8 needs to be converted from the
	// encoding announced in the XML prolog.
	if !utf8.Valid(data) {
		return charset.NewReaderLabel(charsetLabel, bytes.NewReader(data))
	}

	// Already UTF-8: pass the bytes through unchanged.
	return bytes.NewReader(data), nil
}

// NewCharsetReader returns an io.Reader that delivers the content of r as UTF-8.
//
// The whole input is read into memory first. Content that is already valid
// UTF-8 is returned as is. Anything else is converted by charset.NewReader,
// using contentType (the Content-Type header value) as a hint.
//
// Note that only the first 1024 bytes are used to detect the encoding. If the
// <meta charset> tag is not found in the first 1024 bytes,
// charset.DetermineEncoding returns "windows-1252", resulting in encoding issues.
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
//
// An error is returned when r cannot be read, or when charset.NewReader fails.
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
	content, err := io.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
	}

	source := bytes.NewReader(content)

	if utf8.Valid(content) {
		// Nothing to convert: the document is already UTF-8.
		return source, nil
	} else {
		// Convert to UTF-8, guided by the Content-Type header.
		return charset.NewReader(source, contentType)
	}
}