File: charmap.go

package info (click to toggle)
golang-github-gdamore-encoding 1.0.1-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 164 kB
sloc: makefile: 2
file content (195 lines) | stat: -rw-r--r-- 5,648 bytes
// Copyright 2024 Garrett D'Amore
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"sync"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/transform"
)

const (
	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
	RuneError = '\uFFFD'

	// RuneSelf is the rune below which UTF-8 and the Unicode values are
	// identical.  It's also the limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution character.
	ASCIISub = '\x1a'
)

// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF-8 and that other character set.  It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation.  This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is, there is no shift character or anything like that).  Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches.  The difference was down to a couple of nsec/op, with
// no consistent pattern as to which ran faster.  With the conversion to
// UTF-8 the code takes about 25 nsec/op.  The conversion in the reverse
// direction takes about 100 nsec/op.  (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.)
type Charmap struct {
	transform.NopResetter
	// Internal lookup tables, filled in by initialize (run at most once
	// through the once field): bytes maps a rune to the byte that encodes
	// it, and runes holds, for each byte value, the UTF-8 encoding of the
	// rune it decodes to.
	bytes map[rune]byte
	runes [256][]byte
	once  sync.Once

	// The map between bytes and runes.  To indicate that a specific
	// byte value is invalid for a character set, use the rune
	// utf8.RuneError.  Values that are absent from this map will
	// be assumed to have the identity mapping -- that is, the default
	// is to assume ISO8859-1, where all 8-bit characters have the same
	// numeric value as their Unicode runes.  (Not to be confused with
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
	//
	// If no values less than RuneSelf are changed (or have non-identity
	// mappings), then the character set is assumed to be an ASCII
	// superset, and certain assumptions and optimizations become
	// available for ASCII bytes.
	Map map[byte]rune

	// The ReplacementChar is the byte value to use for substitution.
	// It should normally be ASCIISub for ASCII encodings.  This may be
	// unset (left to zero) for mappings that are strictly ASCII supersets.
	// In that case ASCIISub will be assumed instead.
	ReplacementChar byte
}

// cmapDecoder is the transformer wrapped by the decoders that
// Charmap.NewDecoder returns; its Transform method converts bytes of the
// 8-bit character set to UTF-8.
type cmapDecoder struct {
	transform.NopResetter
	// runes holds, for each input byte value, the bytes to emit for it.
	runes [256][]byte
}

// cmapEncoder is the transformer wrapped by the encoders that
// Charmap.NewEncoder returns; its Transform method converts UTF-8 to bytes
// of the 8-bit character set.
type cmapEncoder struct {
	transform.NopResetter
	// bytes maps each encodable rune to its output byte; replace is the
	// byte written for any rune that has no entry in bytes.
	bytes   map[rune]byte
	replace byte
}

// Init builds the internal lookup tables of the character map.  The work
// is guarded by a sync.Once, so only the first call does anything and
// repeated calls are harmless.  Calling it early moves the setup cost
// out of the first NewDecoder or NewEncoder call; it is optional, because
// both of those call Init themselves.
func (c *Charmap) Init() {
	setup := c.initialize
	c.once.Do(setup)
}

// initialize builds the encode and decode tables from c.Map.  It is meant
// to run exactly once, via Init.
//
// For every byte value 0-255 it looks up the rune in c.Map, falling back to
// the identity mapping when the byte has no entry.  The UTF-8 encoding of
// that rune is stored in c.runes for decoding, and, unless the rune is
// RuneError, the rune is recorded in c.bytes for encoding.  When several
// bytes map to the same rune, the highest byte value wins in c.bytes.
//
// If no byte maps to an ASCII rune other than itself and ReplacementChar
// was left at zero, ReplacementChar is set to ASCIISub.
func (c *Charmap) initialize() {
	c.bytes = make(map[rune]byte, 256)
	ascii := true

	for i := 0; i < 256; i++ {
		r, ok := c.Map[byte(i)]
		if !ok {
			r = rune(i)
		}
		// Surrogates, negative values and values above utf8.MaxRune have
		// no UTF-8 encoding: utf8.RuneLen reports -1 for them, which would
		// make the allocation below panic.  Treat them as invalid bytes.
		if !utf8.ValidRune(r) {
			r = RuneError
		}
		// NOTE(review): this tests the mapped rune, not the byte index, so
		// an ASCII byte remapped to a non-ASCII rune is not detected here
		// -- confirm whether that is intended.
		if r < RuneSelf && r != rune(i) {
			ascii = false
		}
		if r != RuneError {
			c.bytes[r] = byte(i)
		}
		utf := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(utf, r)
		c.runes[i] = utf
	}
	if ascii && c.ReplacementChar == '\x00' {
		c.ReplacementChar = ASCIISub
	}
}

// NewDecoder returns a Decoder that converts from the 8-bit character set
// to UTF-8.  Bytes whose mapping is RuneError decode to '\uFFFD'.  The
// character map is initialized first if that has not happened yet.
func (c *Charmap) NewDecoder() *encoding.Decoder {
	c.Init()
	dec := &cmapDecoder{runes: c.runes}
	return &encoding.Decoder{Transformer: dec}
}

// NewEncoder returns an Encoder that converts from UTF-8 to the 8-bit
// character set.  Runes that have no mapping are written as the
// character map's ReplacementChar.  The character map is initialized
// first if that has not happened yet.
func (c *Charmap) NewEncoder() *encoding.Encoder {
	c.Init()
	enc := &cmapEncoder{bytes: c.bytes, replace: c.ReplacementChar}
	return &encoding.Encoder{Transformer: enc}
}

// Transform converts each byte of src into the byte sequence stored for it
// in d.runes and appends that sequence to dst.  It returns the number of
// bytes written to dst and the number of bytes consumed from src.  When the
// sequence for the next byte does not fit in the remaining space of dst,
// it stops before that byte and returns transform.ErrShortDst.  The atEOF
// argument is not used, since every input byte is converted on its own.
func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
	written := 0
	for consumed, b := range src {
		seq := d.runes[b]
		if len(seq) > len(dst)-written {
			return written, consumed, transform.ErrShortDst
		}
		written += copy(dst[written:], seq)
	}
	return written, len(src), nil
}

// Transform converts the UTF-8 text in src to the 8-bit character set,
// writing exactly one byte to dst for every rune decoded from src.  A rune
// with no entry in d.bytes is written as d.replace; each byte of invalid
// UTF-8 decodes to utf8.RuneError and is handled the same way.
//
// It returns the number of bytes written to dst, the number of bytes
// consumed from src, and an error:
//   - transform.ErrShortDst if dst is full while input remains;
//   - transform.ErrShortSrc if src ends in the middle of a UTF-8 sequence
//     and atEOF is false, so the caller can supply the rest of the rune;
//   - nil otherwise.
//
// When atEOF is true no more input can arrive, so the bytes of a truncated
// trailing sequence are converted as invalid input rather than reported
// with ErrShortSrc.
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
	var ndst, nsrc int
	for nsrc < len(src) {
		if ndst >= len(dst) {
			return ndst, nsrc, transform.ErrShortDst
		}

		r, sz := utf8.DecodeRune(src[nsrc:])
		// A (RuneError, 1) result means either a genuinely invalid byte or
		// a rune cut off by the end of the buffer.  Only the latter, and
		// only when more input may follow, is worth waiting for.
		if r == utf8.RuneError && sz == 1 && !atEOF && !utf8.FullRune(src[nsrc:]) {
			return ndst, nsrc, transform.ErrShortSrc
		}

		if c, ok := d.bytes[r]; ok {
			dst[ndst] = c
		} else {
			dst[ndst] = d.replace
		}
		nsrc += sz
		ndst++
	}

	return ndst, nsrc, nil
}