1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
|
// Common/UTFConvert.h
#ifndef ZIP7_INC_COMMON_UTF_CONVERT_H
#define ZIP7_INC_COMMON_UTF_CONVERT_H
#include "MyBuffer.h"
#include "MyString.h"
/*
  CUtf8Check collects the findings of a UTF-8 check of a byte buffer:
  a set of boolean condition flags plus the largest code point recorded
  in MaxHighPoint. The flags are filled by Check_Buf(), which is
  implemented outside of this header.
*/
struct CUtf8Check
{
  // Byte MaxByte; // in original src stream
  bool NonUtf;
  bool ZeroChar;
  bool SingleSurrogate;
  bool Escape;
  bool Truncated;
  UInt32 MaxHighPoint; // only for points >= 0x80

  // A new object starts in the cleared state.
  CUtf8Check() { Clear(); }

  // Resets all flags to false and MaxHighPoint to 0.
  void Clear()
  {
    // MaxByte = 0;
    MaxHighPoint = 0;
    NonUtf = false;
    ZeroChar = false;
    SingleSurrogate = false;
    Escape = false;
    Truncated = false;
  }

  // Merges the findings of (c) into this object:
  // a flag is set if it is set in either object,
  // and MaxHighPoint becomes the larger of the two values.
  void Update(const CUtf8Check &c)
  {
    NonUtf = (NonUtf || c.NonUtf);
    ZeroChar = (ZeroChar || c.ZeroChar);
    SingleSurrogate = (SingleSurrogate || c.SingleSurrogate);
    Escape = (Escape || c.Escape);
    Truncated = (Truncated || c.Truncated);
    if (c.MaxHighPoint > MaxHighPoint)
      MaxHighPoint = c.MaxHighPoint;
  }

  // Replaces the contents of (s) with a textual list of the flags that are set.
  // "MaxUnicode=" and the MaxHighPoint value are appended only if MaxHighPoint is non-zero.
  void PrintStatus(AString &s) const
  {
    s.Empty();
    // s.Add_OptSpaced("MaxByte=");
    // s.Add_UInt32(MaxByte);
    if (NonUtf)
      s.Add_OptSpaced("non-UTF8");
    if (ZeroChar)
      s.Add_OptSpaced("ZeroChar");
    if (SingleSurrogate)
      s.Add_OptSpaced("SingleSurrogate");
    if (Escape)
      s.Add_OptSpaced("Escape");
    if (Truncated)
      s.Add_OptSpaced("Truncated");
    if (MaxHighPoint == 0)
      return;
    s.Add_OptSpaced("MaxUnicode=");
    s.Add_UInt32(MaxHighPoint);
  }

  // Returns true only if:
  //   - none of (NonUtf, SingleSurrogate, ZeroChar) is set,
  //   - MaxHighPoint is below 0x110000,
  //   - Truncated is not set, or (allowReduced) is true.
  // The Escape flag does not affect the result.
  bool IsOK(bool allowReduced = false) const
  {
    return !(NonUtf || SingleSurrogate || ZeroChar)
        && MaxHighPoint < 0x110000
        && (allowReduced || !Truncated);
  }

  // it checks full buffer as specified in (size) and it doesn't stop on zero char
  void Check_Buf(const char *src, size_t size) throw();

  // Runs Check_Buf() over the s.Len() characters of (s).
  void Check_AString(const AString &s) throw()
  {
    Check_Buf(s.Ptr(), s.Len());
  }
};
/*
Check_UTF8_Buf(src, size, allowReduced):
if (allowReduced == false) - all UTF-8 character sequences must be finished.
if (allowReduced == true) - it allows a truncated last UTF-8 character sequence.
NOTE(review): presumably returns true if the (size) bytes at (src) pass the check;
the implementation is not visible in this header - confirm in UTFConvert.cpp.
*/
bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
// NOTE(review): presumably the same kind of check for the contents of (s);
// the truncation policy it applies is not visible here - confirm in UTFConvert.cpp.
bool CheckUTF8_AString(const AString &s) throw();
// Flag bits for the UTF-8 -> Unicode conversion; the comment that follows
// these definitions describes each flag in detail.
// if set: SINGLE-SURROGATE-8 is not passed through as a valid point
// (ESCAPE or U+fffd is generated for it instead)
#define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0)
// if set: ESCAPE codes are generated for non-UTF-8 (invalid) characters instead of U+fffd
#define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1)
// if set: ESCAPE-16-21 is generated for ESCAPE-8 points
#define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2)
/*
Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
if (flag is NOT set)
{
it processes SINGLE-SURROGATE-8 as valid Unicode point.
it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
Note: some sequences of two SINGLE-SURROGATE-8 points
will generate correct SURROGATE-16-PAIR, and
that SURROGATE-16-PAIR later will be converted to correct
UTF8-SURROGATE-21 point. So we don't restore original
STR-8 sequence in that case.
}
if (flag is set)
{
if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined)
it generates ESCAPE for SINGLE-SURROGATE-8,
if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined)
it generates U+fffd for SINGLE-SURROGATE-8,
}
Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
if (flag is NOT set)
it generates (U+fffd) code for non-UTF-8 (invalid) characters
if (flag is set)
{
It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
}
Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
if (flag is NOT set)
{
it processes ESCAPE-8 points like any other Unicode points.
In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
}
if (flag is set)
{
it generates ESCAPE-16-21 for ESCAPE-8 points
so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
}
Main USE CASES with UTF-8 <-> UTF-16 conversions:
WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
{
set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
}
Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
{
we want to restore original UTF-8-RAW sequence later from that ESCAPE-16.
Set the flags:
Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
}
MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
{
we want to restore correct UTF-8 without any BMP processing:
Set the flags:
Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
}
*/
// UTF-8 -> Unicode conversion functions; the result is written to (dest).
// NOTE(review): (flags) is presumably a combination of the Z7_UTF_FLAG_FROM_UTF8_* bits,
// and the bool result presumably reports whether the source was converted without errors -
// neither is visible in this header; confirm in UTFConvert.cpp.
// zero char is not allowed in (src) buf
bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
// Same conversion with the source given as AString.
bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
// Variant without a (flags) parameter.
// NOTE(review): the flags it applies internally are not visible here - confirm in UTFConvert.cpp.
bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
// Flag bits for the Unicode -> UTF-8 conversion; the comment that follows
// these definitions describes each flag in detail.
// if set: UTF_REPLACEMENT_CHAR (0xfffd) is generated for SINGLE_SURROGATE
#define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8)
// if set: raw 8-bit symbols are extracted from Escape-Plane-16
#define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9)
// the high-escape flag is currently disabled (not defined):
// #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10)
/*
Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
if (flag is NOT set)
{
we extract SINGLE-SURROGATE as normal UTF-8
In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW.
In Linux :
use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE,
if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used.
use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
will generate SINGLE-SURROGATE-UTF-8 here.
}
if (flag is set)
{
we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
it can be used for compatibility mode with WIN32 UTF function
or if we want UTF-8 stream without any errors
}
Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16
if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16
in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
// that flag affects the code only if (wchar_t is 32-bit)
// that mode with high-escape can be disabled now in UTFConvert.cpp
if (flag is NOT set)
it doesn't extract raw 8-bit symbol from High-Escape-Plane
if (flag is set)
it extracts raw 8-bit symbol from High-Escape-Plane
Main use cases:
WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
{
Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR.
So we restore original UTF-16-RAW.
}
Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16.
Note: high escape mode can be ignored now in UTFConvert.cpp
macOS:
the system doesn't support incorrect UTF-8 in file names.
set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
*/
// Global flags variable; it is only declared here and defined elsewhere.
// NOTE(review): presumably holds the Z7_UTF_FLAG_TO_UTF8_* bits used by the conversion
// functions that take no (flags) argument - confirm in UTFConvert.cpp.
extern unsigned g_Unicode_To_UTF8_Flags;
// Unicode -> UTF-8 conversion of (src) into (dest).
// NOTE(review): (flags) is presumably a combination of the Z7_UTF_FLAG_TO_UTF8_* bits.
void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
// Variant without a (flags) parameter.
void ConvertUnicodeToUTF8(const UString &src, AString &dest);
// Variant that writes the result to a CByteBuffer instead of AString.
void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
/*
#ifndef _WIN32
void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
bool UTF32_IsThere_BigPoint(const UString &src);
bool Unicode_IsThere_BmpEscape(const UString &src);
#endif
bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
*/
// Convert_UnicodeEsc16_To_UnicodeEscHigh(s):
// if Z7_WCHART_IS_16BIT is defined, the call expands to nothing (no-op macro);
// otherwise it is a real function that receives (s) by non-const reference.
// NOTE(review): judging by the name, it presumably moves BMP (16-bit) escape codes in (s)
// to the High-Escape-Plane - confirm in UTFConvert.cpp.
#ifdef Z7_WCHART_IS_16BIT
#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
#else
void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
#endif
/*
// #include "../../C/CpuArch.h"
// ---------- Utf16 Little endian functions ----------
// We store 16-bit surrogates even in 32-bit WCHARs in Linux.
// So now we don't use the following code:
#if WCHAR_MAX > 0xffff
// void *p : pointer to src bytes stream
// size_t len : num Utf16 characters : it can include or not include NULL character
inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
{
#if WCHAR_MAX > 0xffff
size_t num_wchars = 0;
for (size_t i = 0; i < len; i++)
{
wchar_t c = GetUi16(p);
p = (const void *)((const Byte *)p + 2);
if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
{
wchar_t c2 = GetUi16(p);
if (c2 >= 0xdc00 && c2 < 0xe000)
{
c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
p = (const void *)((const Byte *)p + 2);
i++;
}
}
num_wchars++;
}
return num_wchars;
#else
UNUSED_VAR(p)
return len;
#endif
}
// #include <stdio.h>
inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
{
for (size_t i = 0; i < len; i++)
{
wchar_t c = GetUi16(p);
p = (const void *)((const Byte *)p + 2);
#if WCHAR_PATH_SEPARATOR != L'/'
if (c == L'/')
c = WCHAR_PATH_SEPARATOR;
#endif
#if WCHAR_MAX > 0xffff
if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
{
wchar_t c2 = GetUi16(p);
if (c2 >= 0xdc00 && c2 < 0xe000)
{
// printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
p = (const void *)((const Byte *)p + 2);
i++;
// printf("%4x\n", (int)c);
}
}
#endif
*dest++ = c;
}
return dest;
}
inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
{
size_t num = 0;
for (;;)
{
wchar_t c = *p++;
if (c == 0)
return num;
num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
}
return num;
}
inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
{
for (;;)
{
wchar_t c = *p++;
if (c == 0)
return dest;
if (c >= 0x10000 && c < 0x110000)
{
SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF)));
dest += 4;
}
else
{
SetUi16(dest, c);
dest += 2;
}
}
}
#endif
*/
#endif
|