/*
 * _codecs_hk.c: Codecs collection for encodings from Hong Kong
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
/*
 * NOTE(review): both macros below are defined before "cjkcodecs.h" is
 * included, so they are presumably consumed by that header -- confirm there.
 */
#define USING_IMPORTED_MAPS
/*
 * Extra per-module state fields: pointers to the Big5 encode and decode maps.
 * CODEC_INIT(big5hkscs) fills them in through IMPORT_MAP, reaching them as
 * members of cjkcodecs_module_state.
 */
#define CJK_MOD_SPECIFIC_STATE \
const encode_map *big5_encmap; \
const decode_map *big5_decmap;
#include "cjkcodecs.h"
#include "mappings_hk.h"
/*
* BIG5HKSCS codec
*/
/*
 * Codec initialisation: import the Big5 encode/decode maps into the module
 * state so the encoder and decoder can fall back on them.
 *
 * Returns 0 when IMPORT_MAP reports success (a zero result) and -1 otherwise.
 */
CODEC_INIT(big5hkscs)
{
    cjkcodecs_module_state *modstate = codec->modstate;

    return (IMPORT_MAP(tw, big5,
                       &modstate->big5_encmap,
                       &modstate->big5_decmap)) ? -1 : 0;
}
/*
 * HKSCS 2004 defines four two-character Unicode sequences that encode to a
 * single big5hkscs code:
 *
 *   U+00CA U+0304 -> 0x8862   (U+00CA alone is mapped to 0x8866)
 *   U+00CA U+030C -> 0x8864
 *   U+00EA U+0304 -> 0x88a3   (U+00EA alone is mapped to 0x88a7)
 *   U+00EA U+030C -> 0x88a5
 *
 * These are handled not by the mapping tables but by hand-written code.
 * The encoder indexes this table with a two-bit value: bit 1 is set when the
 * base character is U+00EA (clear for U+00CA) and bit 0 is set when the
 * combining mark is U+030C (clear for U+0304).
 */
static const DBCHAR big5hkscs_pairenc_table[4] = {
    0x8862,     /* [0] U+00CA U+0304 */
    0x8864,     /* [1] U+00CA U+030C */
    0x88a3,     /* [2] U+00EA U+0304 */
    0x88a5,     /* [3] U+00EA U+030C */
};
/*
 * Encode Unicode text as Big5-HKSCS.
 *
 * Each loop iteration consumes one input character (or a base character plus
 * combining mark, for the four hand-coded pairs) and emits one byte for ASCII
 * or two bytes for everything else.
 *
 * Lookup order for non-ASCII input:
 *   - below U+10000: the big5hkscs_bmp table, then the imported big5 table;
 *   - U+20000..U+2FFFF: the big5hkscs_nonbmp table, keyed by the low 16 bits;
 *   - any other code point has no table and is rejected.
 *
 * Returns 0 once all input is consumed, a positive count (always 1 here) when
 * the current character has no mapping in the tables consulted, and
 * MBERR_TOOFEW when a possible pair prefix is the last available character
 * and MBENC_FLUSH is not set in flags.
 *
 * NOTE(review): REQUIRE_OUTBUF presumably returns early when the output
 * buffer is too small, and NEXT presumably advances the input position and
 * output pointer -- confirm both in cjkcodecs.h.
 */
ENCODER(big5hkscs)
{
    while (*inpos < inlen) {
        Py_UCS4 c = INCHAR1;
        DBCHAR code;
        Py_ssize_t consumed = 1;    /* input characters used by this step */

        if (c < 0x80) {
            /* ASCII is copied through as a single byte. */
            REQUIRE_OUTBUF(1);
            **outbuf = (unsigned char)c;
            NEXT(1, 1);
            continue;
        }

        /* Every remaining character encodes to exactly two bytes. */
        REQUIRE_OUTBUF(2);

        if (c >= 0x30000 || (0x10000 <= c && c < 0x20000)) {
            /* No table covers these code points. */
            return consumed;
        }
        else if (c >= 0x20000) {
            /* U+20000..U+2FFFF: look up by the low 16 bits. */
            if (TRYMAP_ENC(big5hkscs_nonbmp, code, c & 0xffff)) {
                /* mapped; code is set */
            }
            else {
                return consumed;
            }
        }
        else if (TRYMAP_ENC(big5hkscs_bmp, code, c)) {
            if (code == MULTIC) {
                /*
                 * c is U+00CA or U+00EA, which may combine with a following
                 * U+0304 or U+030C into a single code.
                 */
                const int has_next = (inlen - *inpos >= 2);
                const Py_UCS4 follow = has_next ? INCHAR2 : 0;

                if (has_next &&
                    (c & 0xffdf) == 0x00ca &&
                    (follow & 0xfff7) == 0x0304) {
                    /* Bit 1 picks the base character, bit 0 the mark. */
                    code = big5hkscs_pairenc_table[
                        ((c >> 4) | (follow >> 3)) & 3];
                    consumed = 2;
                }
                else if (!has_next && !(flags & MBENC_FLUSH)) {
                    /* Cannot decide yet: the mark may arrive later. */
                    return MBERR_TOOFEW;
                }
                else {
                    /* Stand-alone base character. */
                    code = (c == 0xca) ? 0x8866 : 0x88a7;
                }
            }
        }
        else if (TRYMAP_ENC_ST(big5, code, c)) {
            /* mapped through the imported big5 table; code is set */
        }
        else {
            return 1;
        }

        OUTBYTE1(code >> 8);
        OUTBYTE2(code & 0xFF);
        NEXT(consumed, 2);
    }

    return 0;
}
/*
 * Flatten a (lead byte c1, trail byte c2) pair into a linear index: lead
 * bytes count from 0x87, trail bytes from 0x40, and each lead byte spans
 * 0xfe - 0x40 + 1 (= 191) consecutive trail values.  The decoder uses the
 * result to locate a bit in the big5hkscs_phint_* tables.
 */
#define BH2S(c1, c2) (((c1) - 0x87) * (0xfe - 0x40 + 1) + ((c2) - 0x40))
/*
 * Decode Big5-HKSCS bytes to Unicode.
 *
 * Bytes below 0x80 are emitted as-is.  Any other byte starts a two-byte
 * code, which is resolved in this order:
 *   1. the imported big5 table, unless the lead byte is 0xc7 or 0xc8, or is
 *      0xc6 with a trail byte of 0xa1 or above;
 *   2. the big5hkscs table, where a per-code bit in one of the
 *      big5hkscs_phint_* tables says whether 0x20000 must be OR-ed into the
 *      decoded value;
 *   3. the four hand-coded codes that decode to a base character followed by
 *      a combining mark.
 *
 * Returns 0 once all input is consumed, 1 when a two-byte code matches
 * nothing, and MBERR_INTERNAL when a big5hkscs table hit falls outside every
 * hint-table range.
 *
 * NOTE(review): REQUIRE_INBUF, OUTCHAR and OUTCHAR2 presumably return early
 * on short input or output failure -- confirm in cjkcodecs.h.
 */
DECODER(big5hkscs)
{
    while (inleft > 0) {
        unsigned char c = INBYTE1;
        Py_UCS4 decoded;
        int skip_big5;

        if (c < 0x80) {
            OUTCHAR(c);
            NEXT_IN(1);
            continue;
        }

        REQUIRE_INBUF(2);

        /* These lead/trail combinations bypass the imported big5 table. */
        skip_big5 = (c == 0xc7 || c == 0xc8 ||
                     (c == 0xc6 && INBYTE2 >= 0xa1));
        if (!skip_big5) {
            if (TRYMAP_DEC_ST(big5, decoded, c, INBYTE2)) {
                OUTCHAR(decoded);
                NEXT_IN(2);
                continue;
            }
        }

        if (TRYMAP_DEC(big5hkscs, decoded, c, INBYTE2)) {
            int slot = BH2S(c, INBYTE2);
            const unsigned char *hints;

            assert(0x87 <= c && c <= 0xfe);
            assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe);

            /*
             * Pick the hint table covering this code and rebase the index to
             * that table's first code.  The numeric suffix of each table name
             * equals the BH2S index of the first code in its range.
             */
            if (slot >= BH2S(0x87, 0x40) && slot <= BH2S(0xa0, 0xfe)) {
                hints = big5hkscs_phint_0;
                slot -= BH2S(0x87, 0x40);
            }
            else if (slot >= BH2S(0xc6, 0xa1) && slot <= BH2S(0xc8, 0xfe)) {
                hints = big5hkscs_phint_12130;
                slot -= BH2S(0xc6, 0xa1);
            }
            else if (slot >= BH2S(0xf9, 0xd6) && slot <= BH2S(0xfe, 0xfe)) {
                hints = big5hkscs_phint_21924;
                slot -= BH2S(0xf9, 0xd6);
            }
            else {
                return MBERR_INTERNAL;
            }

            /* One bit per code: set means the value needs 0x20000 OR-ed in. */
            if (hints[slot >> 3] & (1 << (slot & 7))) {
                decoded |= 0x20000;
            }
            OUTCHAR(decoded);
            NEXT_IN(2);
            continue;
        }

        /* Hand-coded codes that decode to base character + combining mark. */
        switch ((c << 8) | INBYTE2) {
        case 0x8862:
            OUTCHAR2(0x00ca, 0x0304);
            break;
        case 0x8864:
            OUTCHAR2(0x00ca, 0x030c);
            break;
        case 0x88a3:
            OUTCHAR2(0x00ea, 0x0304);
            break;
        case 0x88a5:
            OUTCHAR2(0x00ea, 0x030c);
            break;
        default:
            return 1;
        }

        /* Every case above consumed one two-byte code. */
        NEXT_IN(2);
    }

    return 0;
}
/*
 * Mapping tables listed for this module.  The MAPPING_DECONLY entry is the
 * table the decoder reads through TRYMAP_DEC; the two MAPPING_ENCONLY entries
 * are the tables the encoder reads through TRYMAP_ENC.  The argument to
 * BEGIN_MAPPINGS_LIST equals the number of entries that follow.
 */
BEGIN_MAPPINGS_LIST(3)
MAPPING_DECONLY(big5hkscs)
MAPPING_ENCONLY(big5hkscs_bmp)
MAPPING_ENCONLY(big5hkscs_nonbmp)
END_MAPPINGS_LIST
/*
 * Codecs listed for this module: a single entry, big5hkscs.
 * NOTE(review): CODEC_STATELESS_WINIT presumably registers a stateless codec
 * together with its CODEC_INIT hook -- confirm in cjkcodecs.h.
 */
BEGIN_CODECS_LIST(1)
CODEC_STATELESS_WINIT(big5hkscs)
END_CODECS_LIST
/*
 * NOTE(review): presumably expands to the module definition for the "hk"
 * codec module -- confirm in cjkcodecs.h.
 */
I_AM_A_MODULE_FOR(hk)
|