1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
// Copyright 2024 Garrett D'Amore
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use file except in compliance with the License.
// You may obtain a copy of the license at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package encoding
import (
"sync"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)
const (
// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
RuneError = '\uFFFD'
// RuneSelf is the rune below which UTF-8 and the Unicode values are
// identical. Its also the limit for ASCII.
RuneSelf = 0x80
// ASCIISub is the ASCII substitution character.
ASCIISub = '\x1a'
)
// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF8 and that other character set. It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation. This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is there is no shift character or anything like that.) Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches. The difference was down to a couple of nsec/op, and
// no consistent pattern as to which ran faster. With the conversion to
// UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
// direction takes about 100 nsec/op. (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.
type Charmap struct {
transform.NopResetter
bytes map[rune]byte
runes [256][]byte
once sync.Once
// The map between bytes and runes. To indicate that a specific
// byte value is invalid for a charcter set, use the rune
// utf8.RuneError. Values that are absent from this map will
// be assumed to have the identity mapping -- that is the default
// is to assume ISO8859-1, where all 8-bit characters have the same
// numeric value as their Unicode runes. (Not to be confused with
// the UTF-8 values, which *will* be different for non-ASCII runes.)
//
// If no values less than RuneSelf are changed (or have non-identity
// mappings), then the character set is assumed to be an ASCII
// superset, and certain assumptions and optimizations become
// available for ASCII bytes.
Map map[byte]rune
// The ReplacementChar is the byte value to use for substitution.
// It should normally be ASCIISub for ASCII encodings. This may be
// unset (left to zero) for mappings that are strictly ASCII supersets.
// In that case ASCIISub will be assumed instead.
ReplacementChar byte
}
type cmapDecoder struct {
transform.NopResetter
runes [256][]byte
}
type cmapEncoder struct {
transform.NopResetter
bytes map[rune]byte
replace byte
}
// Init initializes internal values of a character map. This should
// be done early, to minimize the cost of allocation of transforms
// later. It is not strictly necessary however, as the allocation
// functions will arrange to call it if it has not already been done.
func (c *Charmap) Init() {
c.once.Do(c.initialize)
}
func (c *Charmap) initialize() {
c.bytes = make(map[rune]byte)
ascii := true
for i := 0; i < 256; i++ {
r, ok := c.Map[byte(i)]
if !ok {
r = rune(i)
}
if r < 128 && r != rune(i) {
ascii = false
}
if r != RuneError {
c.bytes[r] = byte(i)
}
utf := make([]byte, utf8.RuneLen(r))
utf8.EncodeRune(utf, r)
c.runes[i] = utf
}
if ascii && c.ReplacementChar == '\x00' {
c.ReplacementChar = ASCIISub
}
}
// NewDecoder returns a Decoder the converts from the 8-bit
// character set to UTF-8. Unknown mappings, if any, are mapped
// to '\uFFFD'.
func (c *Charmap) NewDecoder() *encoding.Decoder {
c.Init()
return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
}
// NewEncoder returns a Transformer that converts from UTF8 to the
// 8-bit character set. Unknown mappings are mapped to 0x1A.
func (c *Charmap) NewEncoder() *encoding.Encoder {
c.Init()
return &encoding.Encoder{
Transformer: &cmapEncoder{
bytes: c.bytes,
replace: c.ReplacementChar,
},
}
}
func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
var e error
var ndst, nsrc int
for _, c := range src {
b := d.runes[c]
l := len(b)
if ndst+l > len(dst) {
e = transform.ErrShortDst
break
}
for i := 0; i < l; i++ {
dst[ndst] = b[i]
ndst++
}
nsrc++
}
return ndst, nsrc, e
}
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
var e error
var ndst, nsrc int
for nsrc < len(src) {
if ndst >= len(dst) {
e = transform.ErrShortDst
break
}
r, sz := utf8.DecodeRune(src[nsrc:])
if r == utf8.RuneError && sz == 1 {
// If its inconclusive due to insufficient data in
// in the source, report it
if atEOF && !utf8.FullRune(src[nsrc:]) {
e = transform.ErrShortSrc
break
}
}
if c, ok := d.bytes[r]; ok {
dst[ndst] = c
} else {
dst[ndst] = d.replace
}
nsrc += sz
ndst++
}
return ndst, nsrc, e
}
|