1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file provides out-of-line implementation functions for the macros in
* the utfXX.h headers, for logic that would be too long to write as macros.
*/
#include "base/third_party/icu/icu_utf.h"
namespace base_icu {
// source/common/utf_impl.cpp
// Replacement code points returned for ill-formed input in the obsolete
// strict>=0 modes. errorValue() indexes this table with the number of bytes
// that were consumed after the lead byte, so entry k is a code point whose
// own UTF-8 encoding is k+1 bytes long.
// The values equal UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2 and UTF_ERROR_VALUE,
// spelled out here so that the obsolete unicode/utf_old.h is not needed.
static const UChar32 utf8_errorValue[6]={
    0x15,       // encodes as 1 byte
    0x9f,       // encodes as 2 bytes
    0xffff,     // encodes as 3 bytes
    0x10ffff    // encodes as 4 bytes
};
// Picks the value that utf8_nextCharSafeBody() reports for an ill-formed
// sequence.
//   count:  number of bytes consumed after the lead byte; only used when
//           strict>=0, where it selects the utf8_errorValue entry.
//   strict: the caller's error mode.
// Negative modes ignore count: -3 yields U+FFFD, every other negative mode
// yields CBU_SENTINEL.
static UChar32
errorValue(int32_t count, int8_t strict) {
    if(strict<0) {
        if(strict==-3) {
            return 0xfffd;
        }
        return CBU_SENTINEL;
    }
    return utf8_errorValue[count];
}
/*
* Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
*
* The macro names in this comment are the upstream ICU ones. The macros
* actually used in this file carry a CBU prefix (for example CBU_SENTINEL
* where the text below says U_SENTINEL).
*
* U8_NEXT() supports NUL-terminated strings indicated via length<0.
*
* The "strict" parameter controls the error behavior:
* <0 "Safe" behavior of U8_NEXT():
* -1: All illegal byte sequences yield U_SENTINEL=-1.
* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
* Some implementations use this for roundtripping of
* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
* contain unpaired surrogates.
* -3: All illegal byte sequences yield U+FFFD.
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
* All illegal byte sequences yield a positive code point such that this
* result code point would be encoded with the same number of bytes as
* the illegal sequence.
* >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
* Same as the obsolete "safe" behavior, but non-characters are also treated
* like illegal sequences.
*
* Note that a UBool is the same as an int8_t.
* That is why "strict" can carry the negative mode values listed above.
*
* Parameters:
* s       Bytes being decoded. Only read, starting at index *pi.
* pi      In/out index into s. On entry it is one past the lead byte c.
*         On success it is set to one past the last trail byte of the
*         decoded sequence. On error it is set past the bytes that were
*         consumed, i.e. to the first byte that was rejected, or to length
*         if the input ended.
* length  End index of s, or negative for a NUL-terminated string.
* c       The lead byte that the caller has already read.
* strict  Error mode, see the table above.
*
* Returns the decoded code point, or for ill-formed input the value chosen by
* errorValue() for the number of bytes consumed after the lead byte.
*/
UChar32
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
// *pi is one after byte c.
int32_t i=*pi;
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
// Every "++i!=length" below is evaluated only after the previous byte was
// accepted, so when a check fails i marks the first rejected byte (or length).
if(i==length || c>0xf4) {
// end of string, or not a lead byte
// Nothing is consumed; control falls through to the error handling below.
} else if(c>=0xf0) {
// Test for 4-byte sequences first because
// U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
// Keep only the low 3 bits of the lead byte (0xf0..0xf4).
c&=7;
// CBU8_IS_VALID_LEAD4_AND_T1 comes from icu_utf.h and is not visible here;
// presumably it rejects lead/first-trail pairs that are overlong or above
// U+10FFFF -- confirm against the header.
// t2 and t3 have 0x80 subtracted, so "<=0x3f" accepts exactly 0x80..0xbf.
if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
// t1 still has its high marker bits, hence the &0x3f; t2 and t3 do not.
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
// strict: forbid non-characters like U+fffe
// If a non-character is rejected, i is already past the whole sequence,
// so the error path below consumes all of its bytes.
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else if(c>=0xe0) {
// 3-byte lead (0xe0..0xef): keep its low 4 bits.
c&=0xf;
if(strict!=-2) {
uint8_t t1=s[i], t2;
// CBU8_IS_VALID_LEAD3_AND_T1 is also defined in icu_utf.h; presumably it
// rejects overlong forms and surrogates -- confirm against the header.
if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
// This path does not use the LEAD3 macro. It only requires a trail byte
// and rejects the overlong case: with c==0 (lead 0xe0) the first trail
// must be at least 0xa0 (t1>=0x20), otherwise the result would be below
// U+0800.
uint8_t t1=s[i]-0x80, t2;
if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1;
return (c<<12)|(t1<<6)|t2;
}
}
} else if(c>=0xc2) {
// 2-byte lead (0xc2..0xdf) followed by a single trail byte.
uint8_t t1=s[i]-0x80;
if(t1<=0x3f) {
*pi=i+1;
return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
// *pi still holds its entry value here, so i-*pi is the number of bytes
// consumed after the lead byte.
c=errorValue(i-*pi, strict);
*pi=i;
return c;
}
} // namespace base_icu
|