File: unvis.go

package info (click to toggle)
golang-github-vbatts-go-mtree 0.5.4%2Bds-1~exp1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 796 kB
  • sloc: sh: 198; makefile: 80
file content (294 lines) | stat: -rw-r--r-- 6,889 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
/*
 * govis: unicode aware vis(3) encoding implementation
 * Copyright (C) 2017 SUSE LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package govis

import (
	"fmt"
	"strconv"
	"unicode"
)

// unvisParser stores the current state of the token parser.
type unvisParser struct {
	tokens []rune  // input string, converted to runes by newParser
	idx    int     // position in tokens that Peek reads and Next advances
	flag   VisFlag // encoding flags; within this file only VisHTTPStyle is consulted
}

// Next advances the parser by one token. No bounds check happens here; Peek
// and End are the methods that notice when the index has run off the end.
func (p *unvisParser) Next() {
	p.idx += 1
}

// Peek returns the token at the current index without consuming it. Once the
// index has moved past the last token, unicode.ReplacementChar is returned
// together with a non-nil error.
func (p *unvisParser) Peek() (rune, error) {
	if p.idx < len(p.tokens) {
		return p.tokens[p.idx], nil
	}
	return unicode.ReplacementChar, fmt.Errorf("tried to read past end of token list")
}

// End reports whether the index has reached (or passed) the end of the token
// list, i.e. whether there is nothing left for Peek to return.
func (p *unvisParser) End() bool {
	remaining := len(p.tokens) - p.idx
	return remaining <= 0
}

// newParser builds a parser positioned at the first rune of input. The input
// is converted to runes up front and flag is stored on the parser so the
// decoding functions can consult it.
func newParser(input string, flag VisFlag) *unvisParser {
	p := new(unvisParser) // idx starts at its zero value, 0
	p.tokens = []rune(input)
	p.flag = flag
	return p
}

// While a recursive descent parser is overkill for parsing simple escape
// codes, this is IMO much easier to read than the ugly 80s coroutine code used
// by the original unvis(3) parser. Here's the EBNF for an unvis sequence:
//
// <input>           ::= (<rune>)*
// <rune>            ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
// <plain-rune>      ::= any rune
// <escape-sequence> ::= ("x" <escape-hex>) | ("M" <escape-meta>) | ("^" <escape-ctrl>) | <escape-cstyle> | <escape-octal>
// <escape-meta>     ::= ("-" <escape-meta1>) | ("^" <escape-ctrl>)
// <escape-meta1>    ::= any rune
// <escape-ctrl>     ::= "?" | any rune
// <escape-cstyle>   ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" | "s" | "E" | "$" | newline
// <escape-hex>      ::= [0-9a-f] [0-9a-f]
// <escape-octal>    ::= [0-7] ([0-7] ([0-7])?)?

// unvisPlainRune consumes the current rune and returns it unchanged, encoded
// as UTF-8 bytes. An error is returned if there is no rune left to read.
func unvisPlainRune(p *unvisParser) ([]byte, error) {
	ch, err := p.Peek()
	if err != nil {
		// Report the underlying error: when Peek fails, ch is only a
		// placeholder rune and says nothing about what went wrong.
		return nil, fmt.Errorf("plain rune: %s", err)
	}
	p.Next()

	// XXX: Maybe we should not be converting to runes and then back to strings
	//      here. Are we sure that the byte-for-byte representation is the
	//      same? If the bytes change, then using these strings for paths will
	//      break...

	return []byte(string(ch)), nil
}

// unvisEscapeCStyle decodes a single-character C-style escape (the character
// after the backslash) and consumes it. The "hidden" escapes, a literal
// newline and '$', are consumed but produce no output. Any character not
// listed in the switch is an error, and in that case nothing is consumed.
func unvisEscapeCStyle(p *unvisParser) ([]byte, error) {
	ch, err := p.Peek()
	if err != nil {
		return nil, fmt.Errorf("escape cstyle: %s", err)
	}

	output := ""
	switch ch {
	case 'n':
		output = "\n"
	case 'r':
		output = "\r"
	case 'b':
		output = "\b"
	case 'a':
		output = "\x07"
	case 'v':
		output = "\v"
	case 't':
		output = "\t"
	case 'f':
		output = "\f"
	case 's':
		output = " "
	case 'E':
		output = "\x1b"
	case '\n':
		// Hidden newline: consumed, emits nothing.
	case '$':
		// Hidden marker: consumed, emits nothing.
	default:
		// XXX: We should probably allow falling through and return "\" here...
		return nil, fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
	}

	p.Next()
	return []byte(output), nil
}

// unvisEscapeDigits reads a run of digits in the given base and returns the
// resulting value as a single byte.
//
// The number of digits read is bounded by how many times 0xFF can be divided
// by base before reaching zero: three digits for base 8, two for base 16.
// When force is true every one of those digits is mandatory; when it is false
// only the first digit is mandatory and the run simply ends at the first
// missing or unparsable digit (which is left unconsumed). A decoded value
// above unicode.MaxLatin1 is an error.
func unvisEscapeDigits(p *unvisParser, base int, force bool) ([]byte, error) {
	value := 0

	for budget := 0xFF; budget > 0; budget /= base {
		// A failure is fatal on the first digit, or on any digit when forced.
		mandatory := force || budget == 0xFF

		r, err := p.Peek()
		if err != nil {
			if mandatory {
				return nil, fmt.Errorf("escape base %d: %s", base, err)
			}
			break
		}

		d, err := strconv.ParseInt(string(r), base, 8)
		if err != nil {
			if mandatory {
				return nil, fmt.Errorf("escape base %d: could not parse digit: %s", base, err)
			}
			break
		}

		value = value*base + int(d)
		p.Next()
	}

	if value > unicode.MaxLatin1 {
		return nil, fmt.Errorf("escape base %d: code %q outside latin-1 encoding", base, value)
	}

	return []byte{byte(value & 0xFF)}, nil
}

// unvisEscapeCtrl decodes the current rune as a control character, consumes
// it, and returns the result ORed with mask. '?' decodes to 0x7f; any other
// rune keeps only its low five bits. Runes above unicode.MaxLatin1 are
// rejected without being consumed.
func unvisEscapeCtrl(p *unvisParser, mask byte) ([]byte, error) {
	r, err := p.Peek()
	switch {
	case err != nil:
		return nil, fmt.Errorf("escape ctrl: %s", err)
	case r > unicode.MaxLatin1:
		return nil, fmt.Errorf("escape ctrl: code %q outside latin-1 encoding", r)
	}

	var ctrl byte
	if r == '?' {
		ctrl = 0x7f
	} else {
		ctrl = byte(r) & 0x1f
	}

	p.Next()
	return []byte{mask | ctrl}, nil
}

// unvisEscapeMeta decodes the part of a meta escape that follows the "M".
// Two forms are accepted, and both set the high bit (0x80) on the result:
//
//   - "^" followed by a control escape, decoded by unvisEscapeCtrl;
//   - "-" followed by one rune, which must not exceed unicode.MaxLatin1.
//
// Any other character after the "M" is reported as an unknown escape char.
func unvisEscapeMeta(p *unvisParser) ([]byte, error) {
	ch, err := p.Peek()
	if err != nil {
		return nil, fmt.Errorf("escape meta: %s", err)
	}

	const mask = byte(0x80)

	switch ch {
	case '^':
		// The same as "\^..." except we apply a mask.
		p.Next()
		return unvisEscapeCtrl(p, mask)

	case '-':
		p.Next()

		meta, err := p.Peek()
		if err != nil {
			return nil, fmt.Errorf("escape meta1: %s", err)
		}
		if meta > unicode.MaxLatin1 {
			return nil, fmt.Errorf("escape meta1: code %q outside latin-1 encoding", meta)
		}

		// Add mask to character.
		p.Next()
		return []byte{mask | byte(meta)}, nil
	}

	// err is nil on this path, so name the offending character instead.
	return nil, fmt.Errorf("escape meta: unknown escape char: %q", ch)
}

// unvisEscapeSequence decodes whatever follows a backslash. It looks at the
// current rune and dispatches on it:
//
//   - a second backslash yields a literal backslash;
//   - an octal digit starts an octal escape (the digit itself is left for
//     unvisEscapeDigits to read);
//   - "x" starts a hex escape, "^" a control escape, "M" a meta escape;
//   - anything else is handed to unvisEscapeCStyle.
func unvisEscapeSequence(p *unvisParser) ([]byte, error) {
	lead, err := p.Peek()
	if err != nil {
		return nil, fmt.Errorf("escape sequence: %s", err)
	}

	switch {
	case lead >= '0' && lead <= '7':
		return unvisEscapeDigits(p, 8, false)

	case lead == '\\':
		p.Next()
		return []byte("\\"), nil

	case lead == 'x':
		p.Next()
		return unvisEscapeDigits(p, 16, true)

	case lead == '^':
		p.Next()
		return unvisEscapeCtrl(p, 0x00)

	case lead == 'M':
		p.Next()
		return unvisEscapeMeta(p)
	}

	return unvisEscapeCStyle(p)
}

// unvisRune decodes one unit of input: a backslash escape, a "%" hex escape
// (only when the parser's flag has VisHTTPStyle set), or otherwise a plain
// rune that is passed through as-is.
func unvisRune(p *unvisParser) ([]byte, error) {
	lead, err := p.Peek()
	if err != nil {
		return nil, fmt.Errorf("rune: %s", err)
	}

	if lead == '\\' {
		p.Next()
		return unvisEscapeSequence(p)
	}

	// % HEX HEX only applies to HTTPStyle encodings; without the flag a "%"
	// is treated like any other plain rune.
	httpStyle := p.flag&VisHTTPStyle == VisHTTPStyle
	if lead == '%' && httpStyle {
		p.Next()
		return unvisEscapeDigits(p, 16, true)
	}

	return unvisPlainRune(p)
}

// unvis runs the parser to completion, concatenating the bytes produced by
// each unvisRune call. The first decoding error aborts the run and is
// returned with an "input: " prefix.
func unvis(p *unvisParser) (string, error) {
	var decoded []byte
	for {
		if p.End() {
			return string(decoded), nil
		}

		chunk, err := unvisRune(p)
		if err != nil {
			return "", fmt.Errorf("input: %s", err)
		}
		decoded = append(decoded, chunk...)
	}
}

// Unvis decodes a string that was encoded with the given Vis flags and
// returns the un-encoded result. Of the flags, only VisHTTPStyle currently
// influences decoding. An error is returned if the input contains an invalid
// escape sequence, or if any input is left over once decoding stops.
func Unvis(input string, flag VisFlag) (string, error) {
	// TODO: Check all of the VisFlag bits.
	parser := newParser(input, flag)

	decoded, err := unvis(parser)
	switch {
	case err != nil:
		return "", fmt.Errorf("unvis: %s", err)
	case !parser.End():
		return "", fmt.Errorf("unvis: trailing characters at end of input")
	}
	return decoded, nil
}