<?xml version="1.0" encoding="UTF-8"?>
<!--
Documentation for LCL (Lazarus Component Library) and LazUtils (Lazarus
Utilities) are published under the Creative Commons Attribution-ShareAlike 4.0
International public license.
https://creativecommons.org/licenses/by-sa/4.0/legalcode.txt
https://gitlab.com/freepascal.org/lazarus/lazarus/-/blob/main/docs/cc-by-sa-4-0.txt
Copyright (c) 1997-2025, by the Lazarus Development Team.
-->
<fpdoc-descriptions>
<package name="lazutils">
<!--
====================================================================
lazutf16
====================================================================
-->
<module name="lazutf16">
<short>
Contains routines used for UTF-16 character and string operations.
</short>
<descr>
<p>
<file>lazutf16.pas</file> includes string routines which are based on UTF-16
implementations, although it might also include routines for other encodings.
</p>
<p>
A UTF-16 based implementation for LowerCase, for example, is faster for
WideString and UnicodeString than the default UTF-8 implementation.
</p>
<p>
UTF8LowerCaseViaTables is based on a UTF-16 table. The unit also includes
routines for UTF-16 character length, copying, searching, validation, and
conversion to and from Unicode code points.
</p>
<p>
<file>lazutf16.pas</file> is part of the <file>LazUtils</file> package.
</p>
</descr>
<element name="UTF16CharacterLength">
<short>
Gets the length of the UTF-16 character in the specified PWideChar value as a
number of Word values.
</short>
<descr>
<p>
Uses the endian-ness for the platform. UTF16CharacterLength checks the first
Word value in <var>p</var> to determine the return value for the routine. The
return value can be one of the following:
</p>
<dl>
<dt>0</dt>
<dd>
Returned when the value in the <var>p</var> argument is <b>Nil</b>.
</dd>
<dt>1</dt>
<dd>
Returned when the first Word value in the <var>p</var> argument is outside the
range $D800..$DFFF.
</dd>
<dt>2</dt>
<dd>
Returned when the first Word value in the <var>p</var> argument is included in
the range $D800..$DFFF.
</dd>
</dl>
</descr>
</element>
<element name="UTF16CharacterLength.Result">
<short>
Length of the UTF-16 character in the specified value, or 0 when <b>Nil</b>.
</short>
</element>
<element name="UTF16CharacterLength.p">
<short>
PWideChar value examined in the routine.
</short>
</element>
<element name="UTF16Length">
<short>
Gets the length for the specified value in UTF-16 characters.
</short>
<descr>
<p>
<var>UTF16Length</var> is an overloaded <var>PtrInt</var> function used to get
the length for the specified value as a number of Unicode code points. The
overloaded variants allow the string examined in the routine to be specified
as either a <var>UnicodeString</var> or a <var>PWideChar</var> type. The
variant using PWideChar includes the WordCount argument with the number of
Word values in the input.
</p>
<p>
UTF16Length examines the input values to determine the number of code points
for the return value. Each code point can be represented as either 1 or 2 word
values in UTF-16. The UTF16CharacterLength routine is called for the input
until the number of Word values has been examined in the method. The return
value is incremented by 1 for each code point read from the input value.
</p>
</descr>
<seealso>
<link id="UTF16CharacterLength"/>
</seealso>
</element>
<element name="UTF16Length.Result">
<short>
Number of UTF-16 codepoints in the examined value.
</short>
</element>
<element name="UTF16Length.s">
<short>
Unicode string with the values examined in the routine.
</short>
</element>
<element name="UTF16Length.p">
<short>
Pointer to the WideChar values examined in the routine.
</short>
</element>
<element name="UTF16Length.WordCount">
<short>
Number of UTF-16 word values to examine in the Unicode string.
</short>
</element>
<element name="UTF16Copy">
<short>
Copies a number of UTF-16 characters at the given character position in the
specified value.
</short>
<descr/>
<seealso/>
</element>
<element name="UTF16Copy.Result">
<short>UnicodeString with the values copied in the routine.</short>
</element>
<element name="UTF16Copy.s">
<short>UnicodeString with the values examined in the routine.</short>
</element>
<element name="UTF16Copy.StartCharIndex">
<short>
1-based starting character (code point) position in the Unicode string.
</short>
</element>
<element name="UTF16Copy.CharCount">
<short>Number of characters (code points) copied in the routine.</short>
</element>
<element name="UTF16CharStart">
<short>
Gets a pointer to the Unicode character at the ordinal position in P specified
by the CharIndex argument.
</short>
<descr>
<p>
<var>P</var> is the <var>PWideChar</var> value with the content examined in
the routine.
</p>
<p>
<var>Len</var> contains the number of <var>Word</var> values examined in the P
argument.
</p>
<p>
<var>CharIndex</var> specifies the ordinal position in P for the character
pointer in the return value. CharIndex is zero-based and refers to a code
point in P and not the individual Word values.
</p>
<p>
UTF16CharStart calls the <var>UTF16CharacterLength</var> routine to examine
and skip each of the code points in P until the code point at CharIndex is
found. The return value points to the Unicode character at the specified
ordinal position. The return value is Nil for any of the following conditions:
</p>
<ul>
<li>The P argument is <b>Nil</b>.</li>
<li>Len is zero or a negative number.</li>
<li>The code point at CharIndex does not exist in the specified Len.</li>
</ul>
</descr>
<seealso>
<link id="UTF16CharacterLength"/>
</seealso>
</element>
<element name="UTF16CharStart.Result">
<short>
Pointer to the Unicode character (code point) at the specified ordinal
position.
</short>
</element>
<element name="UTF16CharStart.P">
<short>PWideChar value with the values examined in the routine.</short>
</element>
<element name="UTF16CharStart.Len">
<short>Len is the number of Word values in P.</short>
</element>
<element name="UTF16CharStart.CharIndex">
<short>
CharIndex is the position for the desired UnicodeChar (starting at 0).
</short>
</element>
<element name="UTF16Pos">
<short>Pos implemented for UTF-16-encoded values.</short>
<descr>
<p>
<var>UTF16Pos</var> is a <var>PtrInt</var> function used to get the character
index in SearchInText where the value in SearchForText is located. StartPos
allows the search to begin at a specific character (code point).
</p>
<p>
The return value is the 1-based UTF-16 character index where the SearchForText
starts in SearchInText, or 0 when not found.
</p>
</descr>
<seealso/>
</element>
<element name="UTF16Pos.Result">
<short>
Character index where the SearchForText starts in SearchInText, or 0 when not
found.
</short>
</element>
<element name="UTF16Pos.SearchForText">
<short>UTF-16-encoded value to locate in SearchInText.</short>
</element>
<element name="UTF16Pos.SearchInText">
<short>UTF-16-encoded value searched in the routine.</short>
</element>
<element name="UTF16Pos.StartPos">
<short>
Optional starting position (in UTF-16 code points, not in words).
</short>
</element>
<element name="UTF16CharacterToUnicode">
<short>
Converts the ordinal values for the UTF-16 character in p to the equivalent
Unicode code point.
</short>
<descr>
<p>
UTF16CharacterToUnicode converts 16-bit values in p to the equivalent Unicode
value.
</p>
<p>
Unpaired surrogates are invalid in all UTFs. These include any value in the
range $D800..$DBFF not followed by a value in the range $DC00..$DFFF, or any
value in the range $DC00..$DFFF not preceded by a value in the range
$D800..$DBFF.
</p>
<p>
UTF16CharacterToUnicode ensures that ordinal value(s) in the reserved
range(s) are converted to the correct Unicode value. CharLen is updated to
reflect whether the character in p is represented by a single UTF-16 code
point (1), or requires 2 code points for the surrogate pair (2).
It is set to 0 when p contains an invalid UTF-16 code point.
</p>
<p>
The return value contains the Cardinal value for the Unicode code point, or 0
when p contains an invalid UTF-16 code point.
</p>
</descr>
<seealso/>
</element>
<element name="UTF16CharacterToUnicode.Result">
<short>Unicode code point for the values in p.</short>
</element>
<element name="UTF16CharacterToUnicode.p">
<short>UTF-16 code points examined and converted in the routine.</short>
</element>
<element name="UTF16CharacterToUnicode.CharLen">
<short>Number of UTF-16 code points for the converted character.</short>
</element>
<element name="UnicodeToUTF16">
<short>
Converts a Unicode character (code point) to its UTF-16 equivalent as a UnicodeString type.
</short>
<descr>
<p>
Cardinal values below $10000 result in a single WideChar value for the code
point. Cardinal values in the range $D800 - $DFFF are reserved code points
and contain 2 WideChar values in the result to represent the UTF-16 code
point. WideChar values are cast to the UnicodeString type used in the return
value.
</p>
</descr>
<seealso/>
</element>
<element name="UnicodeToUTF16.Result">
<short>
UnicodeString value for the specified UTF-16 code point.
</short>
</element>
<element name="UnicodeToUTF16.u">
<short>
Cardinal value with the UTF-16 code point converted in the routine.
</short>
</element>
<element name="IsUTF16CharValid">
<short>
Returns <b>True</b> if the specified values are a valid UTF-16 character
(codepoint).
</short>
<descr>
<p>
Based on the specification defined by the Unicode consortium, at:
</p>
<p>
<url href="https://unicode.org/faq/utf_bom.html#utf16-7">
https://unicode.org/faq/utf_bom.html#utf16-7
</url>
</p>
<p>
Q: Are there any 16-bit values that are invalid?
</p>
<p>
A: Unpaired surrogates are invalid in UTFs. These include any value in the
range D800 to DBFF not followed by a value in the range DC00 to DFFF, or
any value in the range DC00 to DFFF not preceded by a value in the range
D800 to DBFF. [AF]
</p>
<p>
If ANextChar is set to #0 there is no next character.
</p>
</descr>
<seealso/>
</element>
<element name="IsUTF16CharValid.Result">
<short>
Returns False if AChar is #0 or AChar and ANextChar are unpaired surrogates.
</short>
</element>
<element name="IsUTF16CharValid.AChar">
<short>
First UTF-16 code examined in the method.
</short>
</element>
<element name="IsUTF16CharValid.ANextChar">
<short>
Next UTF-16 code examined in the method.
</short>
</element>
<element name="IsUTF16StringValid">
<short>
Determines if the specified Unicode string contains valid UTF-16 code points.
</short>
<descr>
<p>
Examines the content in AStr for valid UTF-16 characters. Calls IsUTF16CharValid for consecutive code point pairs.
</p>
</descr>
<seealso/>
</element>
<element name="IsUTF16StringValid.Result">
<short>
<b>True</b> if the specified Unicode string contains valid UTF-16 code points.
</short>
</element>
<element name="IsUTF16StringValid.AWideStr">
<short>
Unicode string examined in the routine.
</short>
</element>
<element name="Utf16StringReplace">
<short>
Deprecated. Replaces a pattern in a Unicode string with another Unicode pattern.
</short>
<descr>
<p>
Utf16StringReplace is the same as <var>SysUtils.StringReplace</var> but for
WideStrings and UnicodeStrings. It has been deprecated in LazUtils; use
StringReplace (sysutils) or UnicodeStringReplace (sysutils) instead.
</p>
</descr>
<version>
Deprecated in LazUtils version 4.0; use StringReplace or UnicodeStringReplace
from the SysUtils unit instead.
</version>
<seealso>
<link id="#rtl.sysutils.StringReplace">StringReplace</link>
<link id="#rtl.sysutils.UnicodeStringReplace">UnicodeStringReplace</link>
<link id="#rtl.sysutils.TReplaceFlags">TReplaceFlags</link>
</seealso>
</element>
<element name="Utf16StringReplace.Result">
<short>
Updated value for Unicode string after pattern replacement(s).
</short>
</element>
<element name="Utf16StringReplace.S">
<short>
Unicode string value examined in the routine.
</short>
</element>
<element name="Utf16StringReplace.OldPattern">
<short>
Unicode string with the pattern to locate in S.
</short>
</element>
<element name="Utf16StringReplace.NewPattern">
<short>
Unicode string with the pattern used to replace the old pattern in S.
</short>
</element>
<element name="Utf16StringReplace.Flags">
<short>
Set of replacement flags applied in the routine.
</short>
</element>
<element name="Utf16StringReplace.Count">
<short>
Number of replacements performed in the routine.
</short>
</element>
<element name="UnicodeLowercase">
<short>
Converts the specified Unicode (UTF-16) codepoint to its lowercase equivalent.
</short>
<descr>
<p>
<var>UnicodeLowercase</var> ensures the character (16-bit codepoint) is
converted to lowercase using the case conversion logic for the UTF-16 encoding.
Characters in the following Unicode blocks are handled:
</p>
<dl>
<dt>$0041..$005A</dt>
<dd>Capital letters in the C0 Latin block</dd>
<dt>$00C0..$00DE</dt>
<dd>Capital letters in the C1 Latin-1 Supplement block</dd>
<dt>$0100..$024E</dt>
<dd>Capital letters in the Latin Extended-A block</dd>
<dt>$0386..$03AB</dt>
<dd>Capital letters in the Greek and Coptic block</dd>
<dt>$03D8..$042F</dt>
<dd>Archaic letters in the Greek and Coptic block</dd>
<dt>$0460..$0512</dt>
<dd>Historic letters in the Cyrillic block</dd>
<dt>$0531..$0556</dt>
<dd>Capital letters in the Armenian block</dd>
<dt>$10A0..$10C5</dt>
<dd>Capital letters in the Georgian block</dd>
<dt>$1E00..$1FFC</dt>
<dd>Capital letters in the Latin Extended Additional block</dd>
<dt>$2126..$2183</dt>
<dd>Characters in the Letter-like Symbols block</dd>
<dt>$24B6..$24CF</dt>
<dd>Parenthesized Latin letters in the Enclosed Alphanumerics block</dd>
<dt>$2C00..$2C2E</dt>
<dd>Capital letters in the Glagolitic block (precursor of Cyrillic)</dd>
<dt>$2C60..$2CE2</dt>
<dd>Capital letters in the Latin Extended-C block</dd>
<dt>$FF21..$FF3A</dt>
<dd>ASCII variants in the Halfwidth and Fullwidth Forms block</dd>
</dl>
</descr>
<version>
Modified in LazUtils version 4.0 to dynamically initialize the character
mapping tables once when either UnicodeLowercase or UTF8LowerCaseViaTables is
called.
</version>
<seealso/>
</element>
<element name="UnicodeLowercase.Result">
<short>
Cardinal value with the lowercase equivalent for <var>u</var>, or the value in
<var>u</var> when conversion is not needed.
</short>
</element>
<element name="UnicodeLowercase.u">
<short>
Cardinal value for the Unicode character converted to lowercase in the routine.
</short>
</element>
<element name="UTF8LowerCaseViaTables">
<short>
Converts a UTF-8-encoded string to lowercase UTF-8 values using internal
case tables.
</short>
<descr/>
<version>
Modified in LazUtils version 4.0 to dynamically initialize the character
mapping tables once when either UnicodeLowercase or UTF8LowerCaseViaTables is
called.
</version>
<seealso/>
</element>
<element name="UTF8LowerCaseViaTables.Result">
<short>String with the lowercase UTF-8 values for s.</short>
</element>
<element name="UTF8LowerCaseViaTables.s">
<short>
String with UTF-8 values converted to lowercase UTF-8 in the routine.
</short>
</element>
</module>
<!-- lazutf16 -->
</package>
</fpdoc-descriptions>