1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
|
// Package unistring contains an implementation of a hybrid ASCII/UTF-16 string.
// For ASCII strings the underlying representation is equivalent to a normal Go string.
// For Unicode strings the underlying representation is UTF-16 as []uint16 with the 0th element set to 0xFEFF.
// unistring.String can represent malformed UTF-16 values (e.g. stand-alone halves of surrogate pairs)
// which cannot be represented in UTF-8.
// At the same time, a unistring.String can be used as a property key just as efficiently as a plain string
// (the leading 0xFEFF ensures there is no clash with an ASCII string), and it can be converted
// to valueString without extra allocations.
package unistring
import (
"reflect"
"unicode/utf16"
"unicode/utf8"
"unsafe"
)
// BOM is the UTF-16 byte order mark. It is stored as element 0 of every UTF-16 backed String,
// which is how such a String is told apart from a plain ASCII one.
const BOM = 0xFEFF
// String is a hybrid string value. It holds either plain ASCII text, stored exactly like a Go string,
// or UTF-16 code units whose first element is 0xFEFF, stored as the raw bytes of a []uint16.
type String string
// Scan reports whether s contains any non-ASCII characters. If it does, Scan returns the contents of s
// encoded as UTF-16 code units preceded by a BOM, suitable for passing to FromUtf16. For a pure ASCII
// string (including the empty string) it returns nil.
func Scan(s string) []uint16 {
	// Skip the leading run of ASCII bytes; each of them is exactly one UTF-16 code unit.
	asciiPrefix := 0
	for asciiPrefix < len(s) && s[asciiPrefix] < utf8.RuneSelf {
		asciiPrefix++
	}
	if asciiPrefix == len(s) {
		return nil
	}

	// Count the code units needed by the remainder: two for runes above 0xFFFF, one otherwise.
	units := asciiPrefix
	for _, r := range s[asciiPrefix:] {
		if r > 0xFFFF {
			units += 2
		} else {
			units++
		}
	}

	// Element 0 is reserved for the BOM, hence the extra slot.
	out := make([]uint16, units+1)
	out[0] = BOM
	pos := 1
	for _, r := range s {
		if r > 0xFFFF {
			hi, lo := utf16.EncodeRune(r)
			out[pos] = uint16(hi)
			out[pos+1] = uint16(lo)
			pos += 2
			continue
		}
		out[pos] = uint16(r)
		pos++
	}
	return out
}
// NewFromString converts a Go string into a String. A pure ASCII input is wrapped as is;
// anything else is re-encoded by Scan and handed to FromUtf16.
func NewFromString(s string) String {
	units := Scan(s)
	if units == nil {
		// Scan found nothing but ASCII, so the bytes can be used unchanged.
		return String(s)
	}
	return FromUtf16(units)
}
// NewFromRunes builds a String from a slice of runes.
//
// If every rune is ASCII the result is the plain string form of s. Otherwise the runes are encoded
// as UTF-16 code units behind a leading BOM and wrapped with FromUtf16:
//   - runes up to 0xFFFF are stored as a single code unit unchanged, so lone surrogate values
//     are preserved as is;
//   - runes above 0xFFFF are encoded with utf16.EncodeRune, which yields a surrogate pair
//     (or a pair of U+FFFD for values that are not valid code points);
//   - negative runes are not code points at all and are stored as a single U+FFFD.
func NewFromRunes(s []rune) String {
	ascii := true
	size := 0
	for _, c := range s {
		// A negative rune must not be mistaken for ASCII: converting it with String(s) would
		// produce the UTF-8 bytes of U+FFFD, which is neither ASCII nor BOM-prefixed UTF-16.
		if c < 0 || c >= utf8.RuneSelf {
			ascii = false
			if c > 0xFFFF {
				// Needs two code units.
				size++
			}
		}
		size++
	}
	if ascii {
		return String(s)
	}

	// One extra slot at the front for the BOM.
	b := make([]uint16, size+1)
	b[0] = BOM
	i := 1
	for _, c := range s {
		switch {
		case c < 0:
			// Plain truncation would silently turn the invalid value into 0xFFFF.
			b[i] = uint16(utf8.RuneError)
		case c <= 0xFFFF:
			b[i] = uint16(c)
		default:
			first, second := utf16.EncodeRune(c)
			b[i] = uint16(first)
			i++
			b[i] = uint16(second)
		}
		i++
	}
	return FromUtf16(b)
}
// FromUtf16 wraps a slice of UTF-16 code units as a String without copying it.
//
// The returned String points directly at the memory of b, so b must not be modified afterwards.
// The function does not inspect the contents of b; placing the BOM in element 0 is left to the caller.
// A nil or empty slice yields the empty String.
func FromUtf16(b []uint16) String {
	if len(b) == 0 {
		// There is no first element to take the address of.
		return ""
	}
	var str string
	hdr := (*reflect.StringHeader)(unsafe.Pointer(&str))
	hdr.Data = uintptr(unsafe.Pointer(&b[0]))
	// The string length is measured in bytes, two per code unit.
	hdr.Len = len(b) * 2
	return String(str)
}
// String returns the contents of s as a regular Go string. A UTF-16 backed value is decoded
// (skipping the leading BOM) with utf16.Decode; any other value is returned as is.
func (s String) String() string {
	units := s.AsUtf16()
	if units == nil {
		return string(s)
	}
	// Element 0 is the BOM and is not part of the text.
	runes := utf16.Decode(units[1:])
	return string(runes)
}
// AsUtf16 returns the UTF-16 code units of s, including the BOM in element 0, or nil when s is
// not a UTF-16 backed value (it is shorter than 4 bytes, has an odd length, or does not start
// with the BOM).
//
// The returned slice is a view of the string's own memory rather than a copy, so it must be
// treated as read-only.
func (s String) AsUtf16() []uint16 {
	// The shortest UTF-16 form is the BOM plus one code unit, and it always has an even byte length.
	if n := len(s); n < 4 || n%2 == 1 {
		return nil
	}

	raw := string(s)
	var units []uint16
	dst := (*reflect.SliceHeader)(unsafe.Pointer(&units))
	src := (*reflect.StringHeader)(unsafe.Pointer(&raw))
	// Point the slice at the string's bytes, then size it in code units (two bytes each).
	dst.Data = src.Data
	count := len(raw) / 2
	dst.Len = count
	dst.Cap = count

	if units[0] != BOM {
		return nil
	}
	return units
}
|