1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
|
#include "to_utf8.hpp"
#include <vector>
#include <cassert>
#include <stdexcept>
#include <components/debug/debuglog.hpp>
/* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library
is designed to be extended to support more source encodings later,
which means that we may add support for Russian, Polish and Chinese
files and so on.
The code does not depend on any external library at
runtime. Instead, it uses a pregenerated table made with iconv (see
gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
This is both faster and uses fewer dependencies. The tables would
only need to be regenerated if we are adding support for more input
encodings. As such, there is no need to make the generator code
platform independent.
The library is optimized for the case of pure ASCII input strings,
which is the vast majority of cases at least for the English
version. A test of my version of Morrowind.esm got 130 non-ASCII vs
236195 ASCII strings, or less than 0.06% of strings containing
non-ASCII characters.
To optimize for this, if the first pass of the string does not find
any non-ASCII characters, the entire string is passed along without
any modification.
Most of the non-ASCII strings are books, and are quite large. (The
non-ASCII characters are typically starting and ending quotation
marks.) Within these, almost all the characters are ASCII. For this
purpose, the library is also optimized for mostly-ASCII contents
even in the cases where some conversion is necessary.
*/
// Generated tables
#include "tables_gen.hpp"
using namespace ToUTF8;
/** Create an encoder for the given legacy encoding.

    Picks the pregenerated translation table that matches
    sourceEncoding and pre-sizes the internal output buffer to 50*1024
    elements.

    \param sourceEncoding the legacy encoding to convert from / to.
    \throws std::invalid_argument if sourceEncoding has no translation
            table. The assert alone is not enough: when assertions are
            compiled out the table pointer would be left uninitialized
            and dereferenced by later conversions.
*/
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
    mOutput(50*1024)
{
    switch (sourceEncoding)
    {
        case ToUTF8::WINDOWS_1252:
        {
            translationArray = ToUTF8::windows_1252;
            break;
        }
        case ToUTF8::WINDOWS_1250:
        {
            translationArray = ToUTF8::windows_1250;
            break;
        }
        case ToUTF8::WINDOWS_1251:
        {
            translationArray = ToUTF8::windows_1251;
            break;
        }
        case ToUTF8::CP437:
        {
            translationArray = ToUTF8::cp437;
            break;
        }
        default:
        {
            // Still trap immediately in builds with assertions enabled...
            assert(0);
            // ...and fail in a defined way when they are disabled.
            throw std::invalid_argument("Utf8Encoder: unsupported source encoding");
        }
    }
}
/** Convert a string in the legacy encoding to UTF-8.

    \param input zero-terminated source bytes.
    \param size  index at which input is expected to hold a zero byte;
                 it is only used by the sanity assertion below. The
                 conversion itself stops at the first zero byte found.
    \return a copy of the input when it is pure ASCII, otherwise the
            translated bytes.

    Note: the conversion is written for single-byte source encodings
    whose first 128 values (0-127) are identical to ASCII. There are no
    plans to add more encodings to this module (new content files use
    utf8), so that should not become an issue.
*/
std::string Utf8Encoder::getUtf8(const char* input, size_t size)
{
    // The caller promises that the buffer is terminated at 'size'
    // (earlier terminators inside the data are tolerated).
    assert(input[size] == 0);

    // One pass yields both the converted length and whether any
    // translation is needed at all.
    bool pureAscii;
    const size_t convertedLen = getLength(input, pureAscii);

    // Nothing to translate: hand the bytes back unchanged.
    if (pureAscii)
        return std::string(input, convertedLen);

    // Grow the scratch buffer (this also zero-terminates it).
    resize(convertedLen);

    char* const begin = &mOutput[0];
    char* cursor = begin;

    // Translate byte by byte until the terminator.
    for (const char* src = input; *src; ++src)
        copyFromArray(*src, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - begin) == (int)convertedLen);

    // ...and the terminator written by resize() must still be in place.
    assert(mOutput.size() > convertedLen);
    assert(mOutput[convertedLen] == 0);

    return std::string(begin, convertedLen);
}
/** Convert a UTF-8 string back to the legacy encoding.

    \param input zero-terminated UTF-8 bytes.
    \param size  index at which input is expected to hold a zero byte;
                 it is only used by the sanity assertion below. The
                 conversion itself stops at the first zero byte found.
    \return a copy of the input when it is pure ASCII, otherwise the
            translated bytes.

    TODO: the conversion is written for single-byte legacy encodings
    whose first 128 values (0-127) are identical to ASCII. These
    conditions must be checked again if more encodings are added later.
*/
std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
{
    // The caller promises that the buffer is terminated at 'size'
    // (earlier terminators inside the data are tolerated).
    assert(input[size] == 0);

    // One pass yields both the converted length and whether any
    // translation is needed at all.
    bool pureAscii;
    const size_t convertedLen = getLength2(input, pureAscii);

    // Nothing to translate: hand the bytes back unchanged.
    if (pureAscii)
        return std::string(input, convertedLen);

    // Grow the scratch buffer (this also zero-terminates it).
    resize(convertedLen);

    char* const begin = &mOutput[0];
    char* cursor = begin;

    // copyFromArray2 advances the read position itself, by one whole
    // UTF-8 sequence per call.
    const char* src = input;
    while (*src)
        copyFromArray2(src, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - begin) == (int)convertedLen);

    // ...and the terminator written by resize() must still be in place.
    assert(mOutput.size() > convertedLen);
    assert(mOutput[convertedLen] == 0);

    return std::string(begin, convertedLen);
}
/** Make sure the output vector can hold 'size' bytes plus a
    terminating zero, and write that zero at index 'size'.

    The vector is only ever grown here. When it has to grow, it is
    grown to three times the requested size to reduce the chance of
    another reallocation later.

    \param size number of payload bytes the caller is about to write.
    \throws std::length_error if 'size' cannot be represented by the
            vector (other allocation failures propagate from
            std::vector::resize).
*/
void Utf8Encoder::resize(size_t size)
{
    if (mOutput.size() <= size)
    {
        // size + 1 elements are needed; refuse sizes for which that
        // cannot be satisfied (this also keeps size + 1 from wrapping).
        if (size >= mOutput.max_size())
            throw std::length_error("Utf8Encoder: requested output size is too large");

        // 3*size is not larger than size when size is 0 or when the
        // multiplication wraps around; fall back to the minimum that
        // still makes index 'size' valid.
        const size_t padded = 3*size;
        mOutput.resize(padded > size ? padded : size + 1);
    }

    // And make sure the string is zero terminated
    mOutput[size] = 0;
}
/** Compute how many bytes the zero-terminated 'input' occupies once it
    has been translated with the current translation array. The array
    stores 6 bytes per source character: the first is the length of
    the translated sequence, the next 5 hold the data.

    For optimization reasons the same pass also reports whether the
    input is pure ASCII (every byte <= 127). In that case 'ascii' is
    set to true and the caller can skip the translation entirely;
    otherwise it is set to false.

    \param input zero-terminated source bytes.
    \param ascii output flag, always assigned.
    \return length of the translated string, terminator not included.
*/
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{
    const char* cursor = input;

    // Fast path: skip over the leading ASCII run (almost always the
    // whole string).
    while (*cursor != 0 && static_cast<unsigned char>(*cursor) < 128)
        ++cursor;

    size_t total = cursor - input;

    // Ending up on the terminator means nothing needs translating.
    ascii = (*cursor == 0);

    // Slow path: from the first non-ASCII byte onwards, every byte
    // (ASCII or not) is sized through the lookup table.
    for (; *cursor != 0; ++cursor)
        total += translationArray[static_cast<unsigned char>(*cursor) * 6];

    return total;
}
// Write the translation of the single source character 'ch' to 'out'
// and move 'out' past the bytes written. Values below 128 are copied
// as they are; everything else is looked up in the translation array
// (6 bytes per entry: a length byte followed by the data).
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{
    if (ch >= 128)
    {
        const signed char* const entry = translationArray + ch*6;
        const int count = entry[0];

        // The payload starts right after the length byte.
        for (int i = 1; i <= count; ++i)
            *(out++) = entry[i];
        return;
    }

    // ASCII passes through untouched.
    *(out++) = ch;
}
/** Compute how many bytes the zero-terminated UTF-8 'input' occupies
    after conversion to the legacy encoding, and report whether the
    input is pure ASCII (every byte <= 127).

    From the first non-ASCII byte onwards every byte adds one to the
    total. The lead bytes recognised below then take their sequence's
    surplus back out: a two-byte lead subtracts 1 and 0xe2 (treated as
    a three-byte lead) subtracts 2, so a complete sequence nets out to
    a single output byte. The total is unsigned, so it may wrap around
    for a moment after a 0xe2 lead until the following bytes are
    counted.

    \param input zero-terminated UTF-8 bytes.
    \param ascii output flag, always assigned.
    \return length of the converted string, terminator not included.
*/
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
    const char* cursor = input;

    // Fast path: skip over the leading ASCII run (almost always the
    // whole string).
    while (*cursor != 0 && static_cast<unsigned char>(*cursor) < 128)
        ++cursor;

    size_t total = cursor - input;

    // Ending up on the terminator means nothing needs converting.
    ascii = (*cursor == 0);

    for (; *cursor != 0; ++cursor)
    {
        const unsigned char byte = *cursor;

        total += 1;

        if (byte == 0xe2)
        {
            // Lead byte of a three-byte sequence.
            total -= 2;
        }
        else if ((byte >= 0xc2 && byte <= 0xc6) || byte == 0xcb
                 || (byte >= 0xd0 && byte <= 0xd2))
        {
            // Lead byte of a two-byte sequence:
            // 0xc2-0xc6, 0xcb, 0xd0-0xd2.
            total -= 1;
        }
    }

    return total;
}
/** Convert one UTF-8 sequence starting at 'chp' to a single byte of
    the legacy encoding, write it to 'out', and advance both pointers.

    ASCII bytes and bytes that are not one of the recognised lead bytes
    are copied through unchanged. For a recognised lead byte the
    sequence is searched for in the upper half (128-255) of the
    translation array; the index of the matching entry is the legacy
    byte. If no entry matches, the lead byte itself is written.

    A sequence that is cut short by the end of the string produces no
    output, and 'chp' is left pointing at the terminating zero so the
    caller's loop stops there instead of running past the end of the
    input. This matches getLength2, where such a lead byte and the
    continuation bytes that are present add nothing to the total.

    \param chp read position; the caller must ensure *chp is not the
               terminator on entry.
    \param out write position.
*/
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
{
    unsigned char ch = *(chp++);

    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Sequence length implied by the lead byte. Anything that is not
    // listed here is treated as a single byte.
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    // Not a recognised lead byte: pass it through as it is.
    if (len == 1)
    {
        *(out++) = ch;
        return;
    }

    // Never consume the terminator as a continuation byte.
    if (*chp == 0)
    {
        Log(Debug::Info) << "Truncated UTF-8 sequence after lead byte " << std::hex << (int)ch;
        return;
    }
    unsigned char ch2 = *(chp++);

    unsigned char ch3 = '\0';
    if (len == 3)
    {
        if (*chp == 0)
        {
            Log(Debug::Info) << "Truncated UTF-8 sequence after lead byte " << std::hex << (int)ch;
            return;
        }
        ch3 = *(chp++);
    }

    // Reverse lookup: find the legacy character whose UTF-8 form (the
    // bytes stored after the length byte of each 6-byte entry) equals
    // the sequence just read.
    for (int i = 128; i < 256; i++)
    {
        const unsigned char b1 = translationArray[i*6 + 1];
        const unsigned char b2 = translationArray[i*6 + 2];
        const unsigned char b3 = translationArray[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
            return;
        }
    }

    Log(Debug::Info) << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3;

    *(out++) = ch; // Could not find glyph, just put whatever
}
/** Map an encoding name to the matching FromType value.

    \param encodingName one of "win1250", "win1251" or "win1252".
    \throws std::runtime_error for any other name.
*/
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
    if (encodingName == "win1252")
        return ToUTF8::WINDOWS_1252;

    if (encodingName == "win1251")
        return ToUTF8::WINDOWS_1251;

    if (encodingName == "win1250")
        return ToUTF8::WINDOWS_1250;

    // No known name matched.
    const std::string message =
        "Unknown encoding '" + encodingName + "', see openmw --help for available options.";
    throw std::runtime_error(message);
}
/** Return the human readable message that announces which font
    encoding is in use.

    \param encodingName one of "win1250", "win1251" or "win1252".
    \throws std::runtime_error for any other name.
*/
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{
    struct KnownEncoding
    {
        const char* name;
        const char* message;
    };

    static const KnownEncoding knownEncodings[] =
    {
        { "win1250", "Using Central and Eastern European font encoding." },
        { "win1251", "Using Cyrillic font encoding." },
        { "win1252", "Using default (English) font encoding." }
    };

    const size_t count = sizeof(knownEncodings) / sizeof(knownEncodings[0]);
    for (size_t i = 0; i < count; ++i)
    {
        if (encodingName == knownEncodings[i].name)
            return knownEncodings[i].message;
    }

    // No known name matched.
    const std::string message =
        "Unknown encoding '" + encodingName + "', see openmw --help for available options.";
    throw std::runtime_error(message);
}
|