File: to_utf8.cpp

package info (click to toggle)
openmw 0.47.0-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 23,276 kB
  • sloc: cpp: 249,935; xml: 1,978; sh: 1,327; python: 63; makefile: 26
file content (349 lines) | stat: -rw-r--r-- 10,427 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#include "to_utf8.hpp"

#include <vector>
#include <cassert>
#include <stdexcept>

#include <components/debug/debuglog.hpp>

/* This file contains the code to translate from WINDOWS-1252 (native
   charset used in English version of Morrowind) to UTF-8. The library
   is designed to be extened to support more source encodings later,
   which means that we may add support for Russian, Polish and Chinese
   files and so on.

   The code does not depend on any external library at
   runtime. Instead, it uses a pregenerated table made with iconv (see
   gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.

   This is both faster and uses less dependencies. The tables would
   only need to be regenerated if we are adding support more input
   encodings. As such, there is no need to make the generator code
   platform independent.

   The library is optimized for the case of pure ASCII input strings,
   which is the vast majority of cases at least for the English
   version. A test of my version of Morrowind.esm got 130 non-ASCII vs
   236195 ASCII strings, or less than 0.06% of strings containing
   non-ASCII characters.

   To optmize for this, ff the first pass of the string does not find
   any non-ASCII characters, the entire string is passed along without
   any modification.

   Most of the non-ASCII strings are books, and are quite large. (The
   non-ASCII characters are typically starting and ending quotation
   marks.) Within these, almost all the characters are ASCII. For this
   purpose, the library is also optimized for mostly-ASCII contents
   even in the cases where some conversion is necessary.
 */


// Generated tables
#include "tables_gen.hpp"

using namespace ToUTF8;

/** Create an encoder bound to one legacy single-byte code page.

    The output buffer is pre-sized to 50*1024 elements, and
    translationArray is pointed at the generated lookup table that
    belongs to the requested code page.

    \param sourceEncoding  Code page whose table should be used.
    \throws std::invalid_argument if sourceEncoding is not one of the
            code pages handled below.
 */
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
    mOutput(50*1024)
{
    switch (sourceEncoding)
    {
        case ToUTF8::WINDOWS_1252:
            translationArray = ToUTF8::windows_1252;
            break;

        case ToUTF8::WINDOWS_1250:
            translationArray = ToUTF8::windows_1250;
            break;

        case ToUTF8::WINDOWS_1251:
            translationArray = ToUTF8::windows_1251;
            break;

        case ToUTF8::CP437:
            translationArray = ToUTF8::cp437;
            break;

        default:
            // A bare assert(0) is compiled out when NDEBUG is defined,
            // which would leave translationArray unassigned and make
            // every later table lookup undefined behaviour. Fail loudly
            // in all build types instead.
            throw std::invalid_argument("Utf8Encoder: unsupported source encoding");
    }
}

/** Convert a zero-terminated string from the configured legacy
    encoding to UTF-8.

    \param input  Bytes to convert. Both the length computation and the
                  translation loop stop at the first zero byte.
    \param size   Offset at which input is expected to hold a zero
                  byte; it is only used by the sanity assertion.
    \return The converted text. Input that consists of ASCII only is
            returned as an unmodified copy.
 */
std::string Utf8Encoder::getUtf8(const char* input, size_t size)
{
    // The caller promises a terminator at input[size]. Earlier zero
    // bytes are tolerated by this check.
    assert(input[size] == 0);

    // Everything below assumes a single-byte source encoding whose
    // values 0-127 coincide with ASCII. No further encodings are
    // planned for this module (new content files use utf8), so that
    // restriction is not expected to become a problem.

    // One pass yields both the converted length and the "pure ASCII"
    // verdict.
    bool pureAscii;
    const size_t utf8Length = getLength(input, pureAscii);

    // Nothing to translate: hand back a plain copy.
    if (pureAscii)
        return std::string(input, utf8Length);

    // Reserve room for the result plus its terminator.
    resize(utf8Length);
    char* const begin = &mOutput[0];
    char* cursor = begin;

    // Translate byte by byte; copyFromArray advances the write cursor.
    for (const char* src = input; *src; ++src)
        copyFromArray(*src, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - begin) == static_cast<int>(utf8Length));

    // ...and the terminator placed by resize() must still be in place.
    assert(mOutput.size() > utf8Length);
    assert(mOutput[utf8Length] == 0);

    return std::string(begin, utf8Length);
}

/** Convert a zero-terminated UTF-8 string back to the configured
    legacy single-byte encoding.

    \param input  Bytes to convert. Both the length computation and the
                  translation loop stop at the first zero byte.
    \param size   Offset at which input is expected to hold a zero
                  byte; it is only used by the sanity assertion.
    \return The converted text. Input that consists of ASCII only is
            returned as an unmodified copy.
 */
std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
{
    // The caller promises a terminator at input[size]. Earlier zero
    // bytes are tolerated by this check.
    assert(input[size] == 0);

    // TODO: Everything below assumes a single-byte legacy encoding
    // whose values 0-127 coincide with ASCII. Re-check both conditions
    // if further encodings are ever added.

    // One pass yields both the converted length and the "pure ASCII"
    // verdict.
    bool pureAscii;
    const size_t legacyLength = getLength2(input, pureAscii);

    // Nothing to translate: hand back a plain copy.
    if (pureAscii)
        return std::string(input, legacyLength);

    // Reserve room for the result plus its terminator.
    resize(legacyLength);
    char* const begin = &mOutput[0];
    char* cursor = begin;

    // copyFromArray2 advances both the read and the write position.
    const char* src = input;
    while (*src != 0)
        copyFromArray2(src, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - begin) == static_cast<int>(legacyLength));

    // ...and the terminator placed by resize() must still be in place.
    assert(mOutput.size() > legacyLength);
    assert(mOutput[legacyLength] == 0);

    return std::string(begin, legacyLength);
}

// Make sure the output vector can hold 'size' bytes plus a terminating
// zero directly after them, then write that terminator at index 'size'.
// The vector only ever grows here; it is never shrunk.
void Utf8Encoder::resize(size_t size)
{
    // The terminator needs its own slot, so size + 1 elements is the
    // hard minimum.
    const size_t required = size + 1;

    if (mOutput.size() < required)
    {
        // Add some extra padding to reduce the chance of having to
        // resize again later. Never go below 'required': 3*size is 0
        // for size == 0 and may wrap around for huge values, and in
        // both cases the write below would land outside the vector.
        const size_t padded = 3*size;
        mOutput.resize(padded > required ? padded : required);
    }

    // And make sure the string is zero terminated
    mOutput[size] = 0;
}

/** Compute how many bytes the given zero-terminated string occupies
  once translated through the current translation array. The array
  stores 6 bytes per input character: the first is the length of the
  translation, the remaining 5 hold the translated bytes.

  As an optimization the same pass also reports whether the input is
  pure ASCII (every byte <= 127). In that case 'ascii' is set to true
  and the caller may skip the translation altogether.

  \param input  Zero-terminated bytes to measure.
  \param ascii  Receives true if no byte above 127 was found.
  \return Number of output bytes, not counting a terminator.
 */
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{
    // Fast path: skip the leading ASCII run. This is almost always the
    // whole string.
    const char* cursor = input;
    while (*cursor != 0 && static_cast<unsigned char>(*cursor) < 128)
        ++cursor;

    // Each ASCII byte translates to exactly one output byte.
    size_t total = static_cast<size_t>(cursor - input);

    // Reaching the terminator means there was nothing but ASCII.
    ascii = (*cursor == 0);
    if (ascii)
        return total;

    // Slow path: from the first non-ASCII byte onwards, look up the
    // translated length of every byte in the table.
    for (; *cursor != 0; ++cursor)
        total += translationArray[static_cast<unsigned char>(*cursor) * 6];

    return total;
}

// Write the translation of the single input byte 'ch' to 'out' and
// advance 'out' past the bytes written. Values below 128 are copied
// unchanged; everything else is looked up in translationArray, where
// each 6-byte entry starts with the translation's length followed by
// its bytes.
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{
    // ASCII maps to itself, no table access needed.
    if (ch < 128)
    {
        *out = ch;
        ++out;
        return;
    }

    // 'entry' starts on the length byte; pre-incrementing it steps
    // through the translated bytes that follow.
    const signed char* entry = translationArray + ch*6;
    int remaining = *entry;
    while (remaining-- > 0)
        *(out++) = *(++entry);
}

/** Compute how many bytes the given zero-terminated UTF-8 string
  occupies once converted back to the legacy single-byte encoding.

  The input is walked the same way copyFromArray2 consumes it: a byte
  below 128 or a byte that is not a recognised lead byte yields one
  output byte; a recognised lead byte swallows its one or two
  following bytes and the whole sequence yields one output byte. A
  sequence that is cut off by the terminator yields nothing.

  As an optimization the same pass also reports whether the input is
  pure ASCII (every byte <= 127).

  \param input  Zero-terminated bytes to measure.
  \param ascii  Receives true if no byte above 127 was found.
  \return Number of output bytes, not counting a terminator.
 */
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
    ascii = true;
    size_t len = 0;
    const char* ptr = input;
    unsigned char inp = *ptr;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (inp && inp < 128)
        inp = *(++ptr);
    len += (ptr-input);

    // If we're not at the null terminator at this point, then there
    // were some non-ascii characters to deal with. Go to slow-mode for
    // the rest of the string.
    if (inp)
    {
        ascii = false;
        while (inp)
        {
            // Total number of input bytes in the sequence starting here.
            int seqLen = 1;
            switch (inp)
            {
                case 0xe2: seqLen = 3; break;
                case 0xc2:
                case 0xcb:
                case 0xc4:
                case 0xc6:
                case 0xc3:
                case 0xd0:
                case 0xd1:
                case 0xd2:
                case 0xc5: seqLen = 2; break;
            }

            // Step over the trailing bytes as one unit instead of
            // counting them individually. This keeps the result in step
            // with copyFromArray2 even when a trailing byte happens to
            // be a lead byte itself, and avoids relying on unsigned
            // wrap-around. Never step onto or past the terminator.
            bool complete = true;
            for (int i = 1; i < seqLen; ++i)
            {
                if (ptr[1] == 0)
                {
                    complete = false;
                    break;
                }
                ++ptr;
            }

            // A complete sequence becomes exactly one legacy byte; a
            // truncated one contributes nothing.
            if (complete)
                ++len;

            inp = *(++ptr);
        }
    }
    return len;
}

/** Convert the UTF-8 sequence starting at 'chp' to one byte of the
  legacy encoding, write it to 'out', and advance both pointers.

  Bytes below 128, and bytes that are not one of the recognised lead
  bytes, are copied through unchanged. For a recognised lead byte the
  following one or two bytes are consumed as well, and entries 128-255
  of translationArray are searched for one whose stored UTF-8 bytes
  match; the index of that entry is the legacy byte. If no entry
  matches, a message is logged and the lead byte is written instead.

  A sequence that is cut off by the zero terminator produces no output,
  and 'chp' is left pointing at the terminator so that the caller's
  loop stops there.

  \param chp  Read position; advanced past the consumed bytes.
  \param out  Write position; advanced past the byte written, if any.
 */
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
{
    unsigned char ch = *(chp++);
    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Number of input bytes in the sequence introduced by 'ch'.
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    if (len == 1) // Not a recognised lead byte: pass it through as is
    {
        *(out++) = ch;
        return;
    }

    // Never read onto or past the terminator: consuming it would make
    // the caller's "while (*input)" loop run off the end of the buffer.
    if (*chp == 0)
        return;
    unsigned char ch2 = *(chp++);

    unsigned char ch3 = '\0';
    if (len == 3)
    {
        if (*chp == 0)
            return;
        ch3 = *(chp++);
    }

    // Reverse lookup: find the legacy value whose UTF-8 form (stored at
    // offsets 1..3 of its 6-byte entry) equals the bytes just read.
    for (int i = 128; i < 256; i++)
    {
        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = static_cast<char>(i);
            return;
        }
    }

    Log(Debug::Info) << "Could not find glyph " << std::hex << static_cast<int>(ch) << " " << static_cast<int>(ch2) << " " << static_cast<int>(ch3);

    *(out++) = ch; // Could not find glyph, just put whatever
}

/** Map an encoding name to the matching FromType value.

    \param encodingName  One of "win1250", "win1251" or "win1252".
    \return The corresponding FromType constant.
    \throws std::runtime_error if the name is not one of the above.
 */
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
    if (encodingName == "win1250")
        return ToUTF8::WINDOWS_1250;

    if (encodingName == "win1251")
        return ToUTF8::WINDOWS_1251;

    if (encodingName == "win1252")
        return ToUTF8::WINDOWS_1252;

    // None of the known names matched.
    std::string message("Unknown encoding '");
    message += encodingName;
    message += "', see openmw --help for available options.";
    throw std::runtime_error(message);
}

/** Return the human-readable "Using ... font encoding." message for an
    encoding name.

    \param encodingName  One of "win1250", "win1251" or "win1252".
    \return The message belonging to that encoding.
    \throws std::runtime_error if the name is not one of the above.
 */
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{
    const char* description = 0;

    if (encodingName == "win1250")
        description = "Using Central and Eastern European font encoding.";
    else if (encodingName == "win1251")
        description = "Using Cyrillic font encoding.";
    else if (encodingName == "win1252")
        description = "Using default (English) font encoding.";

    // None of the known names matched.
    if (description == 0)
    {
        std::string message("Unknown encoding '");
        message += encodingName;
        message += "', see openmw --help for available options.";
        throw std::runtime_error(message);
    }

    return description;
}