1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
package charset
import (
"fmt"
"unicode/utf8"
)
// init registers this file's translator under the class name "cp932".
// fromCP932 is passed as the second argument and nil as the third.
// NOTE(review): the nil presumably means no translator *to* CP932 is
// provided — confirm against registerClass.
func init() {
registerClass("cp932", fromCP932, nil)
}
// encoding details
// (Traditional) Shift-JIS
//
// 00..1f control characters
// 20 space
// 21..7f JIS X 0201:1976/1997 roman (see notes)
// 80 undefined
// 81..9f lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
// a0 undefined
// a1..df JIS X 0201:1976/1997 katakana
// e0..ea lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
// eb..ff undefined
//
// CP932 (windows-31J)
//
// this encoding scheme extends Shift-JIS in the following way
//
// eb..ec undefined (marked as lead bytes - see notes below)
// ed..ee lead byte of NEC-selected IBM extended characters
// ef undefined (marked as lead byte - see notes below)
// f0..f9 lead byte of User defined GAIJI (see note below)
// fa..fc lead byte of IBM extended characters
// fd..ff undefined
//
//
// Notes
//
// JISX 0201:1976/1997 roman
// this is the same as ASCII but with 0x5c (ASCII code for '\')
// representing the Yen currency symbol '¥' (U+00a5)
// This mapping is contentious: some conversion packages implement it,
// others do not.
// The mapping files from The Unicode Consortium show cp932 mapping
// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
// symbol (¥) and 0x7e ('~') to overline (¯)
//
// CP932 double-byte character codes:
//
// eb-ec, ef, f0-f9:
// Marked as DBCS LEAD BYTEs in the unicode mapping data
// obtained from:
// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
//
// but there are no defined mappings for codes in this range.
// It is not clear whether or not an implementation should
// consume one or two bytes before emitting an error char.
// Geometry of the mapping tables loaded by fromCP932.
const (
// kanaPages and kanaPageSize give the expected size of jisx0201kana.dat
// (pages times runes per page).
kanaPages = 1
kanaPageSize = 63
// kanaChar0 is the first single-byte code that receives a kana mapping.
kanaChar0 = 0xa1
// cp932Pages and cp932PageSize give the expected size of cp932.dat:
// one page per lead byte, one entry per possible trail byte.
cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
cp932PageSize = 189 // 40..fc (including 7f)
// cp932Char0 is subtracted from a trail byte to get its index within a page.
cp932Char0 = 0x40
)
// jisTables holds the lookup tables used to decode CP932 / Shift-JIS input.
type jisTables struct {
// page0 maps each single byte to its rune; -1 marks a DBCS lead byte.
page0 [256]rune
// dbcsoff gives, for each DBCS lead byte, its page number within cp932.
dbcsoff [256]int
// cp932 is the double-byte mapping data, stored as consecutive pages of
// cp932PageSize runes each.
cp932 []rune
}
// translateFromCP932 decodes CP932 / Shift-JIS bytes to UTF-8 using the
// lookup tables it points at.
type translateFromCP932 struct {
tables *jisTables
// scratch is the output buffer; Translate truncates and refills it on
// every call.
scratch []byte
}
// Translate converts the CP932 / Shift-JIS encoded bytes in data to UTF-8.
//
// It returns the number of input bytes consumed, the translated output and
// an error, which is always nil. The returned slice is p.scratch, so it is
// overwritten by the next call to Translate.
//
// Double-byte sequences whose lead or trail byte has no table entry are
// consumed as a pair and emitted as utf8.RuneError. A lead byte that is the
// last byte of data is normally left unconsumed so that the caller can
// present it again together with its trail byte. When eof is true no further
// input can arrive, so the dangling lead byte is consumed and reported as
// utf8.RuneError instead of being silently dropped.
func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
	tables := p.tables
	p.scratch = p.scratch[:0]
	n := 0 // bytes consumed so far; also the index of the next byte to decode
	for n < len(data) {
		b := data[n]
		if r := tables.page0[b]; r != -1 {
			// Single-byte character.
			p.scratch = appendRune(p.scratch, r)
			n++
			continue
		}
		// b is a DBCS lead byte; a trail byte is required.
		if n+1 >= len(data) {
			if eof {
				// Truncated sequence at end of input.
				p.scratch = appendRune(p.scratch, utf8.RuneError)
				n++
			}
			break
		}
		var r rune = utf8.RuneError
		pnum := tables.dbcsoff[b]
		ix := int(data[n+1]) - cp932Char0
		if pnum != -1 && ix >= 0 && ix < cp932PageSize {
			r = tables.cp932[pnum*cp932PageSize+ix]
		}
		p.scratch = appendRune(p.scratch, r)
		n += 2
	}
	return n, p.scratch, nil
}
// cp932Key is the key type fromCP932 passes to cache; the value is true for
// the Shift-JIS variant of the tables and false for CP932.
type cp932Key bool
// fromCP932 returns a Translator that decodes CP932 (windows-31J) input to
// UTF-8. When arg is exactly "shiftjis" it decodes traditional Shift-JIS
// instead: 0x5c maps to the yen sign, 0x7e maps to overline, and the CP932
// extension lead bytes (ed..ee, fa..fc) are not recognised.
//
// The lookup tables are obtained through cache, keyed by cp932Key, so the
// builder function below only runs when cache decides to call it. An error
// is returned if either mapping file cannot be loaded by jisGetMap.
func fromCP932(arg string) (Translator, error) {
	shiftJIS := arg == "shiftjis"
	tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
		tables := new(jisTables)
		kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
		if err != nil {
			return nil, err
		}
		tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
		if err != nil {
			return nil, err
		}

		// Start with every byte undefined: a single byte decodes to
		// utf8.RuneError and has no DBCS page (-1). Without this, bytes
		// that are never assigned below would decode to NUL.
		for i := range tables.page0 {
			tables.page0[i] = utf8.RuneError
			tables.dbcsoff[i] = -1
		}

		// jisx0201kana is mapped into 0xA1..0xDF
		for i := 0; i < kanaPageSize; i++ {
			tables.page0[i+kanaChar0] = kana[i]
		}

		// 00..7f same as ascii in cp932 (inclusive of 0x7f, DEL)
		for i := rune(0); i <= 0x7f; i++ {
			tables.page0[i] = i
		}
		if shiftJIS {
			// shift-jis uses JIS X 0201 for the ASCII range
			// this is the same as ASCII apart from
			// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
			tables.page0['\\'] = '¥'
			tables.page0['~'] = '¯'
		}

		// Pre-calculate DBCS page numbers to mapping file page numbers
		// and mark codes in page0 that are DBCS lead bytes. Pages are
		// numbered consecutively in the order the ranges are marked.
		pnum := 0
		markLead := func(lo, hi int) {
			for b := lo; b <= hi; b++ {
				tables.page0[b] = -1
				tables.dbcsoff[b] = pnum
				pnum++
			}
		}
		markLead(0x81, 0x84)
		markLead(0x87, 0x9f)
		markLead(0xe0, 0xea)
		if !shiftJIS {
			// add in cp932 extensions
			markLead(0xed, 0xee)
			markLead(0xfa, 0xfc)
		}
		return tables, nil
	})
	if err != nil {
		return nil, err
	}
	return &translateFromCP932{tables: tables.(*jisTables)}, nil
}
// jisGetMap loads the mapping file called name via readFile and converts its
// contents to a slice of runes.
//
// The converted data must hold exactly pgsize*npages runes; otherwise an
// error naming the file is returned. Errors from readFile are returned
// unchanged.
func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
	raw, err := readFile(name)
	if err != nil {
		return nil, err
	}
	runes := []rune(string(raw))
	if want := pgsize * npages; len(runes) != want {
		return nil, fmt.Errorf("%q: incorrect length data", name)
	}
	return runes, nil
}
|