1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
|
/* Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees.
See file COPYING. */
/*
 * This file defines functions for dealing with a synthetic text
 * encoding called UTF-8of16.  It's like UTF-8, but also encodes unpaired
 * surrogates directly, which is what we need for the Windows API.
 */
#include <windows.h>
/*
 * Payload-bit masks for the first byte of a sequence, indexed by the
 * number of continuation bytes that follow it.
 */
static char masks[4] =
  {
    0x7f, /* 0xxxxxxx: single byte */
    0x1f, /* 110xxxxx: one continuation byte follows */
    0x0f, /* 1110xxxx: two continuation bytes follow */
    0x07  /* 11110xxx: three continuation bytes follow */
  };
/*
 * Convert a NUL-terminated UTF-16 string to UTF-8of16.
 *
 * - a high surrogate immediately followed by a low surrogate is
 *   combined into one scalar value and encoded as four bytes
 * - any other surrogate is encoded directly as three bytes
 * - NUL-terminates
 * - if utf_8of16 is NULL, we just compute the size; otherwise it
 *   must have room for the returned size plus the terminating NUL
 * - returns size (sans NUL) needed for UTF-8of16
 */
int
s48_utf_16_to_utf_8of16(LPWSTR utf_16,
                        unsigned char* utf_8of16)
{
  int p = 0, i = 0;

  while (utf_16[i])
    {
      unsigned int c = utf_16[i];
      ++i;

      /*
       * The terminating NUL is outside the low-surrogate range, so a
       * high surrogate at the very end of the input is left unpaired.
       */
      if ((c >= 0xD800) && (c <= 0xDBFF) /* high surrogate */
          && (utf_16[i] >= 0xDC00) && (utf_16[i] <= 0xDFFF)) /* low surrogate */
        {
          /* 0xd7c0 == 0xd800 - (0x10000 >> 10) */
          c = ((c - 0xd7c0) << 10) + (utf_16[i] & 0x3ff);
          ++i;
        }

      if (c <= 0x7f)
        {
          if (utf_8of16)
            utf_8of16[p] = (unsigned char) c;
          ++p;
        }
      else if (c <= 0x7ff)
        {
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 6) + 0xc0);
              utf_8of16[p+1] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 2;
        }
      else if (c <= 0xffff)
        {
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 12) + 0xe0);
              utf_8of16[p+1] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
              utf_8of16[p+2] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 3;
        }
      else
        {
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 18) + 0xf0);
              /* continuation byte: six payload bits under the 10xxxxxx tag */
              utf_8of16[p+1] = (unsigned char) (((c >> 12) & 0x3f) + 0x80);
              utf_8of16[p+2] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
              utf_8of16[p+3] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 4;
        }
    }

  if (utf_8of16)
    utf_8of16[p] = 0;
  return p;
}
/*
* The table, and the associated decoding algorithm, is from
* Richard Gillam: "Unicode Demystified", chapter 14
*/
/*
 * Decoder state table.  The row is the current state, i.e. the number
 * of continuation bytes still expected; the column is the top five
 * bits of the input byte.  An entry is the next state, -1 for a byte
 * that cannot start a sequence, or -2 for a byte that arrives where a
 * continuation byte was required.
 *
 * The entries must be signed wherever plain char is unsigned.
 */
static signed char states[4][32] =
  {
    /* 0: at the start of a sequence */
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1},
    /* 1: one continuation byte to go */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -2, -2, -2},
    /* 2: two continuation bytes to go */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 1, 1, 1, 1, 1, 1, 1, 1, -2, -2, -2, -2, -2, -2, -2, -2},
    /* 3: three continuation bytes to go */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2},
  };

/*
 * Convert a NUL-terminated UTF-8of16 string to UTF-16.
 *
 * - malformed input is replaced by U+FFFD; so are decoded values
 *   above 0x10FFFF and a sequence cut short by the terminating NUL
 * - if errorp is not NULL, *errorp is set to 1 if any such
 *   replacement happened, and to 0 otherwise
 * - NUL-terminates
 * - if utf_16 is NULL, we just compute the size; otherwise it must
 *   have room for the returned size plus the terminating NUL
 * - returns size (sans NUL) needed for UTF-16, in 16-bit units
 */
int
s48_utf_8of16_to_utf_16(const unsigned char* utf_8of16,
                        LPWSTR utf_16,
                        int* errorp)
{
  int p = 0, q = 0, state = 0, error = 0;
  unsigned int scalar_value = 0;
  unsigned mask = 0; /* 0 means the next byte starts a sequence */

  while (utf_8of16[q])
    {
      unsigned char c = utf_8of16[q];
      ++q;
      state = states[state][c >> 3];
      switch (state) {
      case 0:
        /*
         * Last byte of a sequence.  0x7f keeps all of an ASCII byte
         * and, since bit 6 of a continuation byte is clear, exactly
         * the six payload bits of a continuation byte.
         */
        scalar_value += c & 0x7f;
        if (scalar_value > 0x10ffff)
          {
            /* not representable as a surrogate pair */
            scalar_value = 0xfffd;
            error = 1;
          }
        if (scalar_value > 0xffff)
          {
            if (utf_16)
              {
                utf_16[p] = (scalar_value >> 10) + 0xD7C0;
                utf_16[p+1] = (scalar_value & 0x3FF) + 0xDC00;
              }
            p += 2;
          }
        else
          {
            if (utf_16)
              utf_16[p] = scalar_value;
            ++p;
          }
        scalar_value = 0;
        mask = 0;
        break;
      case 1:
      case 2:
      case 3:
        /* lead byte: its mask depends on the sequence length */
        if (mask == 0)
          mask = masks[state];
        scalar_value = (scalar_value + (c & mask)) << 6;
        mask = 0x3f;
        break;
      case -2:
        /* re-read this byte as the start of a new sequence */
        --q;
        /* fall thru */
      case -1:
        if (utf_16)
          utf_16[p] = 0xfffd;
        ++p;
        scalar_value = 0;
        state = 0;
        mask = 0;
        error = 1;
        break;
      }
    }

  /* the input ended in the middle of a multi-byte sequence */
  if (state != 0)
    {
      if (utf_16)
        utf_16[p] = 0xfffd;
      ++p;
      error = 1;
    }

  if (errorp)
    *errorp = error;
  if (utf_16)
    utf_16[p] = 0;
  return p;
}
/*
#include <stdlib.h>
#include <stdio.h>
int
main(void)
{
unsigned int t1[] = { 'A', 'B', 0xd800, 0xd900, 0xdfff, 'C', 'D', 0 };
int size_8 = s48_utf_16_to_utf_8of16(t1, NULL);
printf("size_8 %d\n", size_8);
unsigned char c[1000];
size_8 = s48_utf_16_to_utf_8of16(t1, c);
printf("size_8 %d\n", size_8);
{
int i = 0;
while (i < size_8)
{
printf("%d: %4x\n", i, c[i]);
++i;
}
}
unsigned int u[1000];
int error;
int size_16 = s48_utf_8of16_to_utf_16(c, NULL, &error);
printf("size_16 %d\n", size_16);
size_16 = s48_utf_8of16_to_utf_16(c, u, &error);
printf("size_16 %d\n", size_16);
{
int i = 0;
while (i < size_16)
{
printf("%d: %4x\n", i, u[i]);
++i;
}
}
}
*/
|