1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
// Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary.
// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
// interface but provides automatic BOM checking and removing as necessary.
package utfbom
import (
"errors"
"io"
)
// Encoding is type alias for detected UTF encoding.
type Encoding int
// Constants to identify detected UTF encodings.
const (
// Unknown encoding, returned when no BOM was detected
Unknown Encoding = iota
// UTF8, BOM bytes: EF BB BF
UTF8
// UTF-16, big-endian, BOM bytes: FE FF
UTF16BigEndian
// UTF-16, little-endian, BOM bytes: FF FE
UTF16LittleEndian
// UTF-32, big-endian, BOM bytes: 00 00 FE FF
UTF32BigEndian
// UTF-32, little-endian, BOM bytes: FF FE 00 00
UTF32LittleEndian
)
const maxConsecutiveEmptyReads = 100
// Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
// It also returns the encoding detected by the BOM.
// If the detected encoding is not needed, you can call the SkipOnly function.
func Skip(rd io.Reader) (*Reader, Encoding) {
// Is it already a Reader?
b, ok := rd.(*Reader)
if ok {
return b, Unknown
}
enc, left, err := detectUtf(rd)
return &Reader{
rd: rd,
buf: left,
err: err,
}, enc
}
// SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
func SkipOnly(rd io.Reader) *Reader {
r, _ := Skip(rd)
return r
}
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
type Reader struct {
rd io.Reader // reader provided by the client
buf []byte // buffered data
err error // last error
}
// Read is an implementation of io.Reader interface.
// The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
func (r *Reader) Read(p []byte) (n int, err error) {
if len(p) == 0 {
return 0, nil
}
if r.buf == nil {
if r.err != nil {
return 0, r.readErr()
}
return r.rd.Read(p)
}
// copy as much as we can
n = copy(p, r.buf)
r.buf = nilIfEmpty(r.buf[n:])
return n, nil
}
func (r *Reader) readErr() error {
err := r.err
r.err = nil
return err
}
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
buf, err = readBOM(rd)
if len(buf) >= 4 {
if isUTF32BigEndianBOM4(buf) {
return UTF32BigEndian, nilIfEmpty(buf[4:]), err
}
if isUTF32LittleEndianBOM4(buf) {
return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
}
}
if len(buf) > 2 && isUTF8BOM3(buf) {
return UTF8, nilIfEmpty(buf[3:]), err
}
if (err != nil && err != io.EOF) || (len(buf) < 2) {
return Unknown, nilIfEmpty(buf), err
}
if isUTF16BigEndianBOM2(buf) {
return UTF16BigEndian, nilIfEmpty(buf[2:]), err
}
if isUTF16LittleEndianBOM2(buf) {
return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
}
return Unknown, nilIfEmpty(buf), err
}
func readBOM(rd io.Reader) (buf []byte, err error) {
const maxBOMSize = 4
var bom [maxBOMSize]byte // used to read BOM
// read as many bytes as possible
for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
if n, err = rd.Read(bom[len(buf):]); n < 0 {
panic(errNegativeRead)
}
if n > 0 {
nEmpty = 0
} else {
nEmpty++
if nEmpty >= maxConsecutiveEmptyReads {
err = io.ErrNoProgress
}
}
}
return
}
func isUTF32BigEndianBOM4(buf []byte) bool {
return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
}
func isUTF32LittleEndianBOM4(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
}
func isUTF8BOM3(buf []byte) bool {
return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
}
func isUTF16BigEndianBOM2(buf []byte) bool {
return buf[0] == 0xFE && buf[1] == 0xFF
}
func isUTF16LittleEndianBOM2(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE
}
func nilIfEmpty(buf []byte) (res []byte) {
if len(buf) > 0 {
res = buf
}
return
}
|