1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
|
// Copyright 2009 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview Collection of utility functions for Unicode character.
*
*/
goog.provide('goog.i18n.uChar');
// Constants for handling Unicode supplementary characters (surrogate pairs).
// They are private to this file and shared by the UTF-16 encoding and decoding
// helpers defined further down.
/**
 * The minimum value for Supplementary code points. Code points from this
 * value up to CODE_POINT_MAX_VALUE_ are the ones that fromCharCode encodes as
 * a surrogate pair (two UTF-16 code units) instead of a single code unit.
 * @type {number}
 * @private
 */
goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ = 0x10000;
/**
 * The highest Unicode code point value (scalar value) according to the Unicode
 * Standard. Used as the inclusive upper bound when validating code points.
 * @type {number}
 * @private
 */
goog.i18n.uChar.CODE_POINT_MAX_VALUE_ = 0x10FFFF;
/**
 * Lead surrogate minimum value (inclusive lower bound of the range of code
 * units that can start a surrogate pair).
 * @type {number}
 * @private
 */
goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
/**
 * Lead surrogate maximum value (inclusive upper bound of the range of code
 * units that can start a surrogate pair).
 * @type {number}
 * @private
 */
goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_ = 0xDBFF;
/**
 * Trail surrogate minimum value (inclusive lower bound of the range of code
 * units that can end a surrogate pair).
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
/**
 * Trail surrogate maximum value (inclusive upper bound of the range of code
 * units that can end a surrogate pair).
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
/**
 * The number of least significant bits of a supplementary code point that in
 * UTF-16 become the least significant bits of the trail surrogate. The rest of
 * the in-use bits of the supplementary code point become the least significant
 * bits of the lead surrogate. This value is used both as a shift distance and
 * to derive the bit-mask that isolates the trail surrogate bits.
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_ = 10;
/**
 * Formats the character at the start of the given string in Unicode U+
 * notation, e.g. 'U+0041' for 'A'. The hexadecimal digits are upper-cased and
 * left-padded with zeros to a minimum of four digits.
 * @param {string} ch The given character.
 * @return {string} The U+ notation of the given character.
 */
goog.i18n.uChar.toHexString = function(ch) {
  var hexDigits = goog.i18n.uChar.toCharCode(ch).toString(16).toUpperCase();
  return 'U+' + goog.i18n.uChar.padString_(hexDigits, 4, '0');
};
/**
 * Left-pads a string by repeatedly prepending the given padding string until
 * the result is at least the requested length. Strings that are already long
 * enough are returned unchanged. If the padding string has more than one
 * character the result may end up longer than the requested length.
 * @param {string} str The given string to be padded.
 * @param {number} length The target size of the string.
 * @param {string} ch The character to be padded with. If it is the empty
 *     string no padding is possible and the input is returned as is.
 * @return {string} The padded string.
 * @private
 */
goog.i18n.uChar.padString_ = function(str, length, ch) {
  // Prepending an empty string never lengthens the result, so without this
  // guard the loop below would never terminate.
  if (ch === '') {
    return str;
  }
  var padded = str;
  while (padded.length < length) {
    padded = ch + padded;
  }
  return padded;
};
/**
 * Returns the Unicode code point of the character that starts the given
 * string. This is a convenience wrapper around getCodePointAround that always
 * looks at index 0.
 * @param {string} ch The given character, which in the case of a supplementary
 *     character is actually a surrogate pair. The remainder of the string is
 *     ignored.
 * @return {number} The Unicode value of the character.
 */
goog.i18n.uChar.toCharCode = function(ch) {
  // Only the character that begins at the first code unit is of interest.
  var firstIndex = 0;
  return goog.i18n.uChar.getCodePointAround(ch, firstIndex);
};
/**
 * Builds the string that represents the given Unicode code point. Code points
 * below the supplementary range map to a single UTF-16 code unit; code points
 * in the supplementary range are encoded as a lead/trail surrogate pair, so
 * the returned string has two code units. If the given value is null,
 * undefined, or not within 0..CODE_POINT_MAX_VALUE_, null is returned.
 * @param {number} code The Unicode value of the character.
 * @return {?string} The character corresponding to the given Unicode value.
 */
goog.i18n.uChar.fromCharCode = function(code) {
  var isValidCodePoint = goog.isDefAndNotNull(code) && code >= 0 &&
      code <= goog.i18n.uChar.CODE_POINT_MAX_VALUE_;
  if (!isValidCodePoint) {
    return null;
  }

  if (!goog.i18n.uChar.isSupplementaryCodePoint(code)) {
    // A single UTF-16 code unit is enough.
    return String.fromCharCode(code);
  }

  var bitCount = goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_;

  // Mask selecting the bitCount (i.e. 10) least significant bits, which end
  // up in the trail surrogate. 1 << 10 = 0x0400. 0x0400 - 1 = 0x03FF.
  var trailMask = (1 << bitCount) - 1;

  // Base value for the lead surrogate. The supplementary offset is shifted
  // down by the same amount as the code point and subtracted here, which
  // avoids a separate subtraction on the code point before shifting it.
  var leadBase = goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ -
      (goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ >> bitCount);

  // The high bits of the code point select the lead surrogate, the low bits
  // select the trail surrogate.
  var leadSurrogate = (code >> bitCount) + leadBase;
  var trailSurrogate =
      (code & trailMask) + goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_;

  // Produce the 2-character long string holding the surrogate pair.
  return String.fromCharCode(leadSurrogate, trailSurrogate);
};
/**
 * Returns the Unicode code point found around the specified index.
 *
 * Three outcomes are possible:
 *
 * 1. The code unit at the index is a leading surrogate and the next code unit
 *    exists and is a trailing surrogate: the supplementary code point encoded
 *    by the pair is returned.
 *
 * 2. The code unit at the index is a trailing surrogate and the previous code
 *    unit exists and is a leading surrogate: the negated supplementary code
 *    point encoded by the pair is returned. The negation lets the caller tell
 *    this case apart from case 1, and thus deduce where the next character
 *    starts and the preceding character ends.
 *
 * 3. Otherwise the code unit at the index is returned as is. In particular, a
 *    leading surrogate that is not followed by a trailing surrogate, or a
 *    trailing surrogate that is not preceded by a leading surrogate, is
 *    returned unchanged.
 *
 * @param {string} string The string.
 * @param {number} index The index from which the code point is to be retrieved.
 * @return {number} The code point at the given index. If the given index is
 *     that of the start (i.e. lead surrogate) of a surrogate pair, returns the
 *     code point encoded by the pair. If the given index is that of the end
 *     (i.e. trail surrogate) of a surrogate pair, returns the negated code
 *     point encoded by the pair.
 */
goog.i18n.uChar.getCodePointAround = function(string, index) {
  var unit = string.charCodeAt(index);

  if (goog.i18n.uChar.isLeadSurrogateCodePoint(unit) &&
      index + 1 < string.length) {
    var next = string.charCodeAt(index + 1);
    if (goog.i18n.uChar.isTrailSurrogateCodePoint(next)) {
      // The index points at the first half of a surrogate pair.
      return /** @type {number} */ (
          goog.i18n.uChar.buildSupplementaryCodePoint(unit, next));
    }
    // Unpaired leading surrogate.
    return unit;
  }

  if (goog.i18n.uChar.isTrailSurrogateCodePoint(unit) && index > 0) {
    var previous = string.charCodeAt(index - 1);
    if (goog.i18n.uChar.isLeadSurrogateCodePoint(previous)) {
      // The index points at the second half of a surrogate pair; the sign
      // flip signals that to the caller.
      return /** @type {number} */ (
          -goog.i18n.uChar.buildSupplementaryCodePoint(previous, unit));
    }
  }

  return unit;
};
/**
 * Reports how many UTF-16 code units (i.e. how many string positions) are
 * needed to represent the specified Unicode code point.
 * @param {number} codePoint
 * @return {number} 2 if codePoint is a supplementary character, 1 otherwise.
 */
goog.i18n.uChar.charCount = function(codePoint) {
  if (goog.i18n.uChar.isSupplementaryCodePoint(codePoint)) {
    // Supplementary characters are stored as a surrogate pair.
    return 2;
  }
  return 1;
};
/**
 * Tests whether the specified Unicode code point lies in the supplementary
 * range, i.e. between SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ and
 * CODE_POINT_MAX_VALUE_, both inclusive.
 * @param {number} codePoint
 * @return {boolean} Whether the given code point is a supplementary character.
 */
goog.i18n.uChar.isSupplementaryCodePoint = function(codePoint) {
  var lowest = goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_;
  var highest = goog.i18n.uChar.CODE_POINT_MAX_VALUE_;
  return codePoint >= lowest && codePoint <= highest;
};
/**
 * Tests whether the given code point lies in the leading surrogate range,
 * i.e. between LEAD_SURROGATE_MIN_VALUE_ and LEAD_SURROGATE_MAX_VALUE_, both
 * inclusive.
 * @param {number} codePoint
 * @return {boolean} Whether the given code point is a leading surrogate
 *     character.
 */
goog.i18n.uChar.isLeadSurrogateCodePoint = function(codePoint) {
  var lowest = goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_;
  var highest = goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_;
  return codePoint >= lowest && codePoint <= highest;
};
/**
 * Tests whether the given code point lies in the trailing surrogate range,
 * i.e. between TRAIL_SURROGATE_MIN_VALUE_ and TRAIL_SURROGATE_MAX_VALUE_, both
 * inclusive.
 * @param {number} codePoint
 * @return {boolean} Whether the given code point is a trailing surrogate
 *     character.
 */
goog.i18n.uChar.isTrailSurrogateCodePoint = function(codePoint) {
  var lowest = goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_;
  var highest = goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_;
  return codePoint >= lowest && codePoint <= highest;
};
/**
 * Decodes a UTF-16 surrogate pair into the supplementary Unicode code point it
 * represents. If lead isn't a leading surrogate code point or trail isn't a
 * trailing surrogate code point, null is returned.
 * @param {number} lead The leading surrogate code point.
 * @param {number} trail The trailing surrogate code point.
 * @return {?number} The supplementary Unicode code point obtained by decoding
 *     the given UTF-16 surrogate pair.
 */
goog.i18n.uChar.buildSupplementaryCodePoint = function(lead, trail) {
  var isSurrogatePair = goog.i18n.uChar.isLeadSurrogateCodePoint(lead) &&
      goog.i18n.uChar.isTrailSurrogateCodePoint(trail);
  if (!isSurrogatePair) {
    return null;
  }

  // The lead surrogate's offset within its range supplies the high bits of
  // the result, so it is shifted past the bits owned by the trail surrogate.
  var highBits = (lead - goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_) <<
      goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_;

  // The trail surrogate's offset within its range supplies the low bits; the
  // start of the supplementary range is added on top of it.
  var lowBitsPlusBase = trail - goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ +
      goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_;

  return highBits + lowBitsPlusBase;
};
|