File: charmap.go

package info (click to toggle)
golang-github-gdamore-encoding 1.0.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, bullseye-proposed-updates, experimental, sid
  • size: 144 kB
  • sloc: makefile: 2
file content (196 lines) | stat: -rw-r--r-- 5,652 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
// Copyright 2015 Garrett D'Amore
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use file except in compliance with the License.
// You may obtain a copy of the license at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"sync"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/transform"
)

const (
	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
	RuneError = '\uFFFD'

	// RuneSelf is the rune below which UTF-8 and the Unicode values are
	// identical.  Its also the limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution character.
	ASCIISub = '\x1a'
)

// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF8 and that other character set.  It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation.  This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is there is no shift character or anything like that.)  Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches.  The difference was down to a couple of nsec/op, and
// no consistent pattern as to which ran faster.  With the conversion to
// UTF-8 the code takes about 25 nsec/op.  The conversion in the reverse
// direction takes about 100 nsec/op.  (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.
//
type Charmap struct {
	transform.NopResetter
	bytes map[rune]byte
	runes [256][]byte
	once  sync.Once

	// The map between bytes and runes.  To indicate that a specific
	// byte value is invalid for a charcter set, use the rune
	// utf8.RuneError.  Values that are absent from this map will
	// be assumed to have the identity mapping -- that is the default
	// is to assume ISO8859-1, where all 8-bit characters have the same
	// numeric value as their Unicode runes.  (Not to be confused with
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
	//
	// If no values less than RuneSelf are changed (or have non-identity
	// mappings), then the character set is assumed to be an ASCII
	// superset, and certain assumptions and optimizations become
	// available for ASCII bytes.
	Map map[byte]rune

	// The ReplacementChar is the byte value to use for substitution.
	// It should normally be ASCIISub for ASCII encodings.  This may be
	// unset (left to zero) for mappings that are strictly ASCII supersets.
	// In that case ASCIISub will be assumed instead.
	ReplacementChar byte
}

type cmapDecoder struct {
	transform.NopResetter
	runes [256][]byte
}

type cmapEncoder struct {
	transform.NopResetter
	bytes   map[rune]byte
	replace byte
}

// Init initializes internal values of a character map.  This should
// be done early, to minimize the cost of allocation of transforms
// later.  It is not strictly necessary however, as the allocation
// functions will arrange to call it if it has not already been done.
func (c *Charmap) Init() {
	c.once.Do(c.initialize)
}

func (c *Charmap) initialize() {
	c.bytes = make(map[rune]byte)
	ascii := true

	for i := 0; i < 256; i++ {
		r, ok := c.Map[byte(i)]
		if !ok {
			r = rune(i)
		}
		if r < 128 && r != rune(i) {
			ascii = false
		}
		if r != RuneError {
			c.bytes[r] = byte(i)
		}
		utf := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(utf, r)
		c.runes[i] = utf
	}
	if ascii && c.ReplacementChar == '\x00' {
		c.ReplacementChar = ASCIISub
	}
}

// NewDecoder returns a Decoder the converts from the 8-bit
// character set to UTF-8.  Unknown mappings, if any, are mapped
// to '\uFFFD'.
func (c *Charmap) NewDecoder() *encoding.Decoder {
	c.Init()
	return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
}

// NewEncoder returns a Transformer that converts from UTF8 to the
// 8-bit character set.  Unknown mappings are mapped to 0x1A.
func (c *Charmap) NewEncoder() *encoding.Encoder {
	c.Init()
	return &encoding.Encoder{
		Transformer: &cmapEncoder{
			bytes:   c.bytes,
			replace: c.ReplacementChar,
		},
	}
}

func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
	var e error
	var ndst, nsrc int

	for _, c := range src {
		b := d.runes[c]
		l := len(b)

		if ndst+l > len(dst) {
			e = transform.ErrShortDst
			break
		}
		for i := 0; i < l; i++ {
			dst[ndst] = b[i]
			ndst++
		}
		nsrc++
	}
	return ndst, nsrc, e
}

func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
	var e error
	var ndst, nsrc int
	for nsrc < len(src) {
		if ndst >= len(dst) {
			e = transform.ErrShortDst
			break
		}

		r, sz := utf8.DecodeRune(src[nsrc:])
		if r == utf8.RuneError && sz == 1 {
			// If its inconclusive due to insufficient data in
			// in the source, report it
			if !atEOF && !utf8.FullRune(src[nsrc:]) {
				e = transform.ErrShortSrc
				break
			}
		}

		if c, ok := d.bytes[r]; ok {
			dst[ndst] = c
		} else {
			dst[ndst] = d.replace
		}
		nsrc += sz
		ndst++
	}

	return ndst, nsrc, e
}