File: decode.go

package info (click to toggle)
golang-github-dop251-scsu 0.0~git20220106.84ac880-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 120 kB
  • sloc: makefile: 2
file content (326 lines) | stat: -rw-r--r-- 8,154 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
package scsu

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode/utf16"
)

// Reader decodes SCSU-encoded input, pulling it one byte at a time from an
// io.ByteReader.
type Reader struct {
	// scsu is the embedded decoder state; the methods in this file use its
	// promoted window, dynamicOffset and unicodeMode fields.
	scsu
	// brd is the source of encoded bytes.
	brd       io.ByteReader
	// bytesRead counts the bytes successfully read from brd; ReadRune uses
	// it to report how many bytes a rune consumed.
	bytesRead int
}

// ErrIllegalInput is returned when the encoded input cannot be valid SCSU,
// for example when it contains a reserved tag or a reserved window offset.
var ErrIllegalInput = errors.New("illegal input")

// NewReader returns a Reader that decodes SCSU data read from r. The
// embedded decoder state is initialised before the Reader is returned.
func NewReader(r io.ByteReader) *Reader {
	rd := new(Reader)
	rd.brd = r
	rd.init()
	return rd
}

// readByte fetches the next encoded byte from the underlying source and,
// when the read succeeds, counts it in bytesRead. The byte and error from
// the source are returned unchanged.
func (r *Reader) readByte() (byte, error) {
	b, err := r.brd.ReadByte()
	if err != nil {
		return b, err
	}
	r.bytesRead++
	return b, nil
}

// defineWindow (re-)defines dynamic window iWindow from the one-byte offset
// index that follows a define-window tag, and makes that window the active
// one.
//
// A sliding window cannot start at an arbitrary Unicode value, so offset is
// an index that selects among the permitted start positions rather than an
// absolute position:
//
//   - 0 is reserved and rejected;
//   - indexes below gapThreshold select the half-block at index * 0x80,
//     because most scripts start on or near a half-block boundary;
//   - indexes below reservedStart select index * 0x80 + gapOffset, which
//     skips the region between 0x3400 and 0xDFFF (Han, Hangul, surrogates
//     and others) whose poor locality makes a sliding window useless;
//   - indexes below fixedThreshold are reserved and rejected;
//   - the remaining indexes select an entry of the fixedOffset table, used
//     for the few scripts that straddle half-block boundaries.
//
// All windows have the same length (128 code positions).
//
// On failure no window state is modified and the returned error matches
// ErrIllegalInput under errors.Is.
func (r *Reader) defineWindow(iWindow int, offset byte) error {
	switch {
	case offset == 0:
		// 0 is a reserved value
		return ErrIllegalInput
	case offset < gapThreshold:
		r.dynamicOffset[iWindow] = int32(offset) << 7
	case offset < reservedStart:
		r.dynamicOffset[iWindow] = (int32(offset) << 7) + gapOffset
	case offset < fixedThreshold:
		// Reserved range: wrap the sentinel so callers can detect the
		// failure with errors.Is while still seeing the offending byte.
		return fmt.Errorf("%w: reserved window offset %d", ErrIllegalInput, offset)
	default:
		r.dynamicOffset[iWindow] = fixedOffset[offset-fixedThreshold]
	}

	// make the redefined window the active one
	r.window = iWindow
	return nil
}

// defineExtendedWindow (re-)defines a dynamic window so that it lies above
// the first 64K code points, and makes it the active window.
//
// The surrogate area allows access to 2**20 codes beyond the first 64K by
// pairing one of 1024 high surrogates with one of 1024 low surrogates. The
// SDX and UDX tags set a window such that each following byte in the range
// 80 to FF stands for such a pair. The two bytes after the tag (hbyte,
// lbyte) and a later data byte map onto the pair as follows:
//
//	 hbyte         lbyte          data
//	nnnwwwww      zzzzzyyy      1xxxxxxx
//
//	 high-surrogate     low-surrogate
//	110110wwwwwzzzzz   110111yyyxxxxxxx
//
// chOffset holds hbyte and lbyte as one 16-bit value. Its top 3 bits are not
// needed for the window location, so they select which window is redefined,
// reducing the number of command codes required. Its bottom 13 bits, shifted
// left by 7 and added to 0x10000, give the window's starting code point.
func (r *Reader) defineExtendedWindow(chOffset uint16) {
	// top 3 bits: index of the window being redefined
	idx := int(chOffset >> 13)

	// bottom 13 bits: position of the window above the first 64K codes
	r.dynamicOffset[idx] = 0x10000 + (int32(chOffset&0x1FFF) << 7)

	// the redefined window becomes the active one
	r.window = idx
}

// unexpectedEOF maps any error that matches io.EOF (directly or through
// wrapping) to io.ErrUnexpectedEOF. Every other value, including nil, is
// returned unchanged.
func unexpectedEOF(e error) error {
	if !errors.Is(e, io.EOF) {
		return e
	}
	return io.ErrUnexpectedEOF
}

// expandUnicode decodes one item of input while the reader is in Unicode
// mode, where characters are stored as big-endian 16-bit code units and
// certain lead bytes act as tags.
//
// It returns the decoded rune, or -1 with a nil error when the item was a
// tag (UCn, UDn or UDX) that switched the reader back to single-byte mode
// without producing a character. After UQU the next two bytes are returned
// as a character without being interpreted as a tag.
//
// A code unit in the surrogate range must be the high half of a valid
// surrogate pair and be followed by its low half; any other combination is
// rejected with ErrIllegalInput rather than being replaced by U+FFFD.
//
// A read error on the first byte of the item is returned unchanged, so
// io.EOF there marks a clean end of input. Input that ends in the middle of
// an item is reported as io.ErrUnexpectedEOF.
//
// NOTE(review): UTS #6 also reserves the tag byte 0xF2 in Unicode mode; it
// is currently treated as an ordinary lead byte - confirm whether it should
// be rejected.
func (r *Reader) expandUnicode() (rune, error) {
	b, err := r.readByte()
	if err != nil {
		return 0, err
	}

	switch {
	case b >= UC0 && b <= UC7:
		// select an existing dynamic window and leave Unicode mode
		r.window = int(b) - UC0
		r.unicodeMode = false
		return -1, nil

	case b >= UD0 && b <= UD7:
		// define a dynamic window and leave Unicode mode
		offset, err := r.readByte()
		if err != nil {
			return 0, unexpectedEOF(err)
		}
		r.unicodeMode = false
		return -1, r.defineWindow(int(b)-UD0, offset)

	case b == UDX:
		// define an extended window and leave Unicode mode;
		// readUint16 already reports truncation as io.ErrUnexpectedEOF
		chOffset, err := r.readUint16()
		if err != nil {
			return 0, err
		}
		r.defineExtendedWindow(chOffset)
		r.unicodeMode = false
		return -1, nil

	case b == UQU:
		// quoted character: the next two bytes are data, never a tag
		quoted, err := r.readUint16()
		if err != nil {
			return 0, err
		}
		return rune(quoted), nil
	}

	// b is the high byte of an ordinary 16-bit code unit
	lo, err := r.readByte()
	if err != nil {
		return 0, unexpectedEOF(err)
	}
	ch := rune(uint16FromTwoBytes(b, lo))
	if !utf16.IsSurrogate(ch) {
		return ch, nil
	}

	// ch starts a surrogate pair; the second half must follow immediately
	next, err := r.readUint16()
	if err != nil {
		return 0, err
	}
	// DecodeRune yields U+FFFD for anything that is not a high surrogate
	// followed by a low surrogate; a valid pair never decodes to U+FFFD.
	decoded := utf16.DecodeRune(ch, rune(next))
	if decoded == '\uFFFD' {
		return 0, ErrIllegalInput
	}
	return decoded, nil
}

// readUint16 reads two bytes and combines them into a 16-bit value, first
// byte most significant. If either read fails with an error matching io.EOF
// the result is io.ErrUnexpectedEOF; other errors are returned unchanged.
func (r *Reader) readUint16() (uint16, error) {
	var pair [2]byte
	for i := range pair {
		b, err := r.readByte()
		if err != nil {
			return 0, unexpectedEOF(err)
		}
		pair[i] = b
	}
	return uint16FromTwoBytes(pair[0], pair[1]), nil
}

// uint16FromTwoBytes assembles a 16-bit value with hi as the upper eight
// bits and lo as the lower eight bits.
func uint16FromTwoBytes(hi, lo byte) uint16 {
	upper := uint16(hi) << 8
	return upper | uint16(lo)
}

// expandSingleByte decodes input while the reader is in single-byte mode.
//
// Tags that only change decoder state (SDX, SDn, SCn) are consumed and
// decoding continues with the next byte. The function returns as soon as a
// character has been produced, or returns -1 with a nil error once SCU has
// switched the reader to Unicode mode. Srs is rejected with ErrIllegalInput.
//
// A read error at the start of a loop iteration is returned unchanged, so
// io.EOF there marks the end of input. Running out of input while reading a
// tag's argument is reported as io.ErrUnexpectedEOF.
func (r *Reader) expandSingleByte() (rune, error) {
	for {
		b, err := r.readByte()
		if err != nil {
			return 0, err
		}

		switch b {
		case SQ0, SQ1, SQ2, SQ3, SQ4, SQ5, SQ6, SQ7:
			// Quote one character from window pair n; the quoted byte is
			// data and is never interpreted as a tag.
			n := int(b) - SQ0
			quoted, err := r.readByte()
			if err != nil {
				return 0, unexpectedEOF(err)
			}
			if quoted < 0x80 {
				return int32(quoted) + staticOffset[n], nil
			}
			return int32(quoted) - 0x80 + r.dynamicOffset[n], nil

		case SDX:
			// Define a dynamic window as extended, then keep decoding.
			chOffset, err := r.readUint16()
			if err != nil {
				return 0, unexpectedEOF(err)
			}
			r.defineExtendedWindow(chOffset)

		case SD0, SD1, SD2, SD3, SD4, SD5, SD6, SD7:
			// Position a dynamic window, then keep decoding.
			offset, err := r.readByte()
			if err != nil {
				return 0, unexpectedEOF(err)
			}
			if err := r.defineWindow(int(b)-SD0, offset); err != nil {
				return 0, err
			}

		case SC0, SC1, SC2, SC3, SC4, SC5, SC6, SC7:
			// Select a different dynamic window, then keep decoding.
			r.window = int(b) - SC0

		case SCU:
			// Switch to Unicode mode; the caller resumes decoding there.
			r.unicodeMode = true
			return -1, nil

		case SQU:
			// The next two bytes are one Unicode character.
			ch, err := r.readUint16()
			if err != nil {
				return 0, err
			}
			return rune(ch), nil

		case Srs:
			return 0, ErrIllegalInput

		default:
			// Ordinary data byte: the low half maps through static window
			// 0, the high half through the active dynamic window.
			if b < 0x80 {
				return int32(b) + staticOffset[0], nil
			}
			return int32(b) - 0x80 + r.dynamicOffset[r.window], nil
		}
	}
}

// readRune decodes the next character using whichever expander matches the
// reader's current mode. An expander result of -1 means only the mode or
// window changed, so decoding is retried until a character or an error is
// obtained.
func (r *Reader) readRune() (rune, error) {
	for {
		var (
			c   rune
			err error
		)
		switch {
		case r.unicodeMode:
			c, err = r.expandUnicode()
		default:
			c, err = r.expandSingleByte()
		}

		switch {
		case err != nil:
			return 0, err
		case c != -1:
			return c, nil
		}
	}
}

// ReadRune reads a single SCSU encoded Unicode character. It returns the
// rune together with the number of input bytes consumed while decoding it,
// including any tag bytes. If no character is available, err is set.
func (r *Reader) ReadRune() (rune, int, error) {
	before := r.bytesRead
	c, err := r.readRune()
	consumed := r.bytesRead - before
	return c, consumed, err
}

// ReadStringSizeHint is like ReadString, but takes a hint about the expected
// string size. The hint is the size in bytes of the UTF-8 encoded result and
// is used to pre-allocate the output; values of zero or less are ignored.
//
// Reading stops at io.EOF, which is not reported as an error. Any other
// error aborts the read and an empty string is returned with it.
func (r *Reader) ReadStringSizeHint(sizeHint int) (string, error) {
	var out strings.Builder
	if sizeHint > 0 {
		out.Grow(sizeHint)
	}
	for {
		c, err := r.readRune()
		switch {
		case err == nil:
			out.WriteRune(c)
		case errors.Is(err, io.EOF):
			return out.String(), nil
		default:
			return "", err
		}
	}
}

// ReadString decodes all remaining input into a string. It reads from the
// source until that returns io.EOF or another error occurs; io.EOF ends the
// read normally and the returned error is then nil.
func (r *Reader) ReadString() (string, error) {
	const noSizeHint = 0
	return r.ReadStringSizeHint(noSizeHint)
}

// Reset makes r read from rd, zeroes its consumed-byte counter, and re-runs
// the embedded decoder state's reset and init routines so the Reader can be
// reused for a new stream.
func (r *Reader) Reset(rd io.ByteReader) {
	r.brd = rd
	r.bytesRead = 0
	r.reset()
	r.init()
}

// Decode decodes the SCSU-encoded bytes in b and returns the result as a
// string. The length of b is used as the size hint for the output.
func Decode(b []byte) (string, error) {
	rd := NewReader(bytes.NewBuffer(b))
	return rd.ReadStringSizeHint(len(b))
}