File: sanitize_string.go

package info (click to toggle)
golang-github-protonmail-gopenpgp-v3 3.3.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,028 kB
  • sloc: sh: 87; makefile: 2
file content (136 lines) | stat: -rw-r--r-- 3,026 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package internal

import (
	"bufio"
	"bytes"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"
)

// SanitizeString returns input with every run of bytes that is not valid
// UTF-8 replaced by a single Unicode replacement character (U+FFFD).
// Input that is already valid UTF-8 is returned unchanged.
func SanitizeString(input string) string {
	const replacement = string(unicode.ReplacementChar)
	return strings.ToValidUTF8(input, replacement)
}

func NewSanitizeReader(r io.Reader) io.Reader {
	sanitizer := &sanitizeReader{r, new(bytes.Buffer), false}
	return newSanitizeUtf8Reader(sanitizer)
}

// sanitizeUtf8Reader is an io.Reader that re-encodes the runes decoded from
// an underlying reader, substituting the Unicode replacement character
// (U+FFFD) for anything that does not decode as valid UTF-8.
type sanitizeUtf8Reader struct {
	// r is the buffered source runes are decoded from.
	r *bufio.Reader
	// reminder holds the tail of a rune that did not fit into the caller's
	// buffer on the previous Read; it is nil when nothing is pending.
	reminder []byte
	// internalBuffer is scratch space used to encode a rune that has to be
	// split across two Read calls; reminder points into it.
	internalBuffer [4]byte
	// lastRuneInvalid is true when the most recently emitted rune was the
	// replacement character, so that following ones can be dropped.
	lastRuneInvalid bool
}

// newSanitizeUtf8Reader returns a sanitizeUtf8Reader that decodes from reader
// through a bufio.Reader.
func newSanitizeUtf8Reader(reader io.Reader) *sanitizeUtf8Reader {
	sr := new(sanitizeUtf8Reader)
	sr.r = bufio.NewReader(reader)
	return sr
}

// Read fills buf with UTF-8 encoded runes decoded from the underlying reader.
//
// Each rune that decodes as U+FFFD is written as a three-byte replacement
// character, and any replacement characters that immediately follow one are
// dropped, so a run of them yields a single U+FFFD. When a rune does not fit
// in the space left in buf, its leading bytes are written and the rest is
// kept for the next call. An error from the underlying reader is returned
// together with the number of bytes written so far.
func (sr *sanitizeUtf8Reader) Read(buf []byte) (int, error) {
	written := 0
	// First hand out the bytes left over from a rune split by the last call.
	if sr.reminder != nil {
		written = copy(buf, sr.reminder)
		if written == len(sr.reminder) {
			sr.reminder = nil
		} else {
			sr.reminder = sr.reminder[written:]
		}
	}
	// Then decode rune by rune until buf is full or the source reports an error.
	for written < len(buf) {
		ch, width, err := sr.r.ReadRune()
		if err != nil {
			return written, err
		}
		invalid := ch == unicode.ReplacementChar
		if invalid && sr.lastRuneInvalid {
			// A replacement character was just emitted; drop this one.
			continue
		}
		sr.lastRuneInvalid = invalid
		if invalid {
			// ReadRune reports width 1 for an invalid byte, but the
			// replacement character takes three bytes once encoded.
			width = 3
		}
		if free := len(buf) - written; width > free {
			// The rune does not fit: encode it into the scratch buffer,
			// emit what fits and keep the tail for the next Read.
			encoded := utf8.EncodeRune(sr.internalBuffer[:], ch)
			n := copy(buf[written:], sr.internalBuffer[:free])
			sr.reminder = sr.internalBuffer[n:encoded]
			written += n
			break
		}
		utf8.EncodeRune(buf[written:], ch)
		written += width
	}
	return written, nil
}

// sanitizeReader is an io.Reader that rewrites every "\r\n" pair coming from
// the wrapped reader into a single "\n". A '\r' that is not followed by '\n'
// is passed through unchanged.
type sanitizeReader struct {
	// r is the wrapped source.
	r io.Reader
	// buffer holds normalized bytes that have not been handed to the caller yet.
	buffer *bytes.Buffer
	// pin records that the last byte consumed from r was a '\r' whose fate
	// (dropped as part of "\r\n", or emitted as-is) depends on the next byte.
	pin bool
}

// resetState clears the pending-'\r' marker.
func (sr *sanitizeReader) resetState() {
	sr.pin = false
}

// Read fills buf with normalized data and returns the number of bytes written.
//
// Bytes left over from a previous call are served first; if buf still has
// room, one Read is issued on the wrapped reader and the result is normalized.
// A '\r' at the very end of a chunk is withheld until the next byte (or EOF)
// is seen, so a call may return (0, nil).
//
// io.EOF is returned only once all buffered output has been delivered. Any
// other error from the wrapped reader is returned together with the bytes
// that were available, normalized, so no data read before the error is lost.
func (sr *sanitizeReader) Read(buf []byte) (int, error) {
	// Serve previously normalized bytes first. The error is ignored on
	// purpose: bytes.Buffer reports io.EOF merely because it is empty.
	internalRead, _ := sr.buffer.Read(buf)
	if internalRead == len(buf) {
		return internalRead, nil
	}
	// There is room left in buf: pull fresh data from the wrapped reader.
	n, err := sr.r.Read(buf[internalRead:])
	// Normalize what was read even if the reader also reported an error;
	// io.Reader allows n > 0 alongside a non-nil error and those bytes must
	// still be processed.
	fresh := buf[internalRead : internalRead+n]
	for i := 0; i < len(fresh); {
		c := fresh[i]
		if sr.pin {
			// The previous byte was '\r'.
			if c == '\n' {
				// "\r\n" collapses to "\n".
				sr.buffer.WriteByte('\n')
				i++
			} else {
				// Lone '\r': emit it and look at c again on the next
				// iteration (it may itself be a '\r').
				sr.buffer.WriteByte('\r')
			}
			sr.resetState()
			continue
		}
		if c == '\r' {
			// Hold the '\r' back until the following byte is known.
			sr.pin = true
			i++
			continue
		}
		sr.buffer.WriteByte(c)
		i++
	}
	if err == io.EOF && sr.pin {
		// The stream ended on a '\r'; nothing can follow it, so emit it.
		sr.resetState()
		sr.buffer.WriteByte('\r')
	}
	// Hand out as much normalized data as fits. Whatever does not fit stays
	// in sr.buffer for the next call.
	finalRead, _ := sr.buffer.Read(buf[internalRead:])
	total := internalRead + finalRead
	switch {
	case err != nil && err != io.EOF:
		// Report the failure along with every byte actually placed in buf,
		// including those taken from the internal buffer.
		return total, err
	case err == io.EOF && sr.buffer.Len() == 0:
		return total, err
	}
	// Either no error occurred, or EOF was seen but output is still pending.
	return total, nil
}