1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665
|
\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename libidn2.info
@include version.texi
@documentencoding utf-8
@settitle Libidn2 @value{VERSION}
@finalout
@syncodeindex pg cp
@c %**end of header
@copying
This manual is for Libidn2 (version @value{VERSION}, @value{UPDATED}),
an implementation of IDNA2008/TR46 internationalized domain names.
Copyright @copyright{} 2011--2025 Simon Josefsson
@end copying
@dircategory Software libraries
@direntry
* libidn2: (libidn2). Internationalized domain names (IDNA2008/TR46) processing.
@end direntry
@dircategory Localization
@direntry
* idn2: (libidn2)Invoking idn2. Internationalized Domain Name (IDNA2008/TR46) conversion.
@end direntry
@titlepage
@title Libidn2
@subtitle Internationalized Domain Names (IDNA2008/TR46)
@subtitle Version @value{VERSION}, @value{UPDATED}
@author Simon Josefsson
@page
@vskip 0pt plus 1filll
@insertcopying
@end titlepage
@c Output the table of the contents at the beginning.
@contents
@ifnottex
@node Top
@top Libidn2
@insertcopying
@end ifnottex
@menu
* Introduction:: What is Libidn2?
* Library Functions:: Library functions.
* Converting from libidn:: Demonstrate how to convert from libidn.
* Examples:: Demonstrate how to use the library.
* Invoking idn2:: Command line interface to the library.
* Interface Index::
* Concept Index::
@end menu
@node Introduction
@chapter Introduction
Libidn2 is a free software implementation of IDNA2008, Punycode and
Unicode TR46. Its purpose is to encode and decode internationalized
domain names.
The library is a rewrite of the popular but legacy libidn library, and
is backwards (API) compatible with it. See @ref{Converting from
libidn} for more information.
For technical reference, see:
@itemize
@item IDNA2008 Framework (@uref{https://tools.ietf.org/html/rfc5890})
@item IDNA2008 Protocol (@uref{https://tools.ietf.org/html/rfc5891})
@item IDNA2008 Unicode tables (@uref{https://tools.ietf.org/html/rfc5892})
@item IDNA2008 Bidi rule (@uref{https://tools.ietf.org/html/rfc5893})
@item Punycode (@uref{https://tools.ietf.org/html/rfc3492})
@item Unicode IDNA Compatibility Processing (@uref{http://www.unicode.org/reports/tr46/})
@end itemize
Libidn2 uses GNU libunistring
(@uref{https://www.gnu.org/software/libunistring/}) for Unicode
processing and optionally GNU libiconv
(@uref{https://www.gnu.org/software/libiconv/}) for character set
conversion.
The library is dual-licensed under LGPLv3 or GPLv2, see the file COPYING
for detailed information.
@node Library Functions
@chapter Library Functions
@cindex Library Functions
The interfaces of the Libidn2 library are documented below.
@section Header file @code{idn2.h}
To use the functions documented in this chapter, you need to include
the file @file{idn2.h} like this:
@example
#include <idn2.h>
@end example
@section Core Functions
When you have the data encoded in UTF-8 form the direct interfaces to
the library are as follows.
@include texi/idn2_to_ascii_8z.texi
@include texi/idn2_to_unicode_8z8z.texi
@include texi/idn2_lookup_u8.texi
@include texi/idn2_register_u8.texi
@section Locale Functions
As a convenience, the following functions are provided that will
convert the input from the locale encoding format to UTF-8 and
normalize the string using NFC, and then apply the core functions
described earlier.
@include texi/idn2_to_ascii_lz.texi
@include texi/idn2_to_unicode_8zlz.texi
@include texi/idn2_to_unicode_lzlz.texi
@include texi/idn2_lookup_ul.texi
@include texi/idn2_register_ul.texi
@section Control Flags
The @code{flags} parameter can take on the following values, or a
bit-wise inclusive or of any subset of the parameters:
@deftypevr {Global flag} {idn2_flags} IDN2_NFC_INPUT
Apply NFC normalization on input.
@end deftypevr
@deftypevr {Global flag} {idn2_flags} IDN2_ALABEL_ROUNDTRIP
Apply additional round-trip conversion of A-label inputs.
@end deftypevr
@deftypevr {Global flag} {idn2_flags} IDN2_TRANSITIONAL
Perform Unicode TR46 transitional processing.
@end deftypevr
@deftypevr {Global flag} {idn2_flags} IDN2_NONTRANSITIONAL
Perform Unicode TR46 non-transitional processing (default).
@end deftypevr
@deftypevr {Global flag} {idn2_flags} IDN2_NO_TR46
Disable any TR#46 transitional or non-transitional processing.
@end deftypevr
@deftypevr {Global flag} {idn2_flags} IDN2_USE_STD3_ASCII_RULES
Use STD3 ASCII rules. This is a TR#46 flag and is a no-op when IDN2_NO_TR46 is specified.
@end deftypevr
@section Error Handling
@include texi/idn2_strerror.texi
@include texi/idn2_strerror_name.texi
@section Return Codes
The functions normally return 0 on success or a negative error code.
@deftypevr {Return code} {idn2_rc} IDN2_OK
Successful return.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_MALLOC
Memory allocation error.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_NO_CODESET
Could not determine locale string encoding format.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_ICONV_FAIL
Could not transcode locale string to UTF-8.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_ENCODING_ERROR
Unicode data encoding error.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_NFC
Error normalizing string.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_PUNYCODE_BAD_INPUT
Punycode invalid input.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_PUNYCODE_BIG_OUTPUT
Punycode output buffer too small.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_PUNYCODE_OVERFLOW
Punycode conversion would overflow.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_TOO_BIG_DOMAIN
Domain name longer than 255 characters.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_TOO_BIG_LABEL
Domain label longer than 63 characters.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_INVALID_ALABEL
Input A-label is not valid.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_UALABEL_MISMATCH
Input A-label and U-label do not match.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_INVALID_FLAGS
Invalid combination of flags.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_NOT_NFC
String is not NFC.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_2HYPHEN
String has forbidden two hyphens.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_HYPHEN_STARTEND
String has forbidden starting/ending hyphen.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_LEADING_COMBINING
String has forbidden leading combining character.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_DISALLOWED
String has disallowed character.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_CONTEXTJ
String has forbidden context-j character.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_CONTEXTJ_NO_RULE
String has context-j character with no rule.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_CONTEXTO
String has forbidden context-o character.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_CONTEXTO_NO_RULE
String has context-o character with no rule.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_UNASSIGNED
String has forbidden unassigned character.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_BIDI
String has forbidden bi-directional properties.
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_DOT_IN_LABEL
Label has forbidden dot (TR46).
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_INVALID_TRANSITIONAL
Label has character forbidden in transitional mode (TR46).
@end deftypevr
@deftypevr {Return code} {idn2_rc} IDN2_INVALID_NONTRANSITIONAL
Label has character forbidden in non-transitional mode (TR46).
@end deftypevr
@section Memory Handling
@include texi/idn2_free.texi
@section Version Check
It is often desirable to check that the version of Libidn2 used is
indeed one which fits all requirements. Even with binary
compatibility, new features may have been introduced, but due to a problem
with the dynamic linker an old version is actually used. So you may
want to check that the version is okay right after program startup.
@include texi/idn2_check_version.texi
The normal way to use the function is to put something similar to the
following first in your @code{main}:
@example
if (!idn2_check_version (IDN2_VERSION))
@{
printf ("idn2_check_version() failed:\n"
"Header file incompatible with shared library.\n");
exit(EXIT_FAILURE);
@}
@end example
@node Converting from libidn
@chapter Converting from libidn
@cindex libidn
This library is backwards (API) compatible with the libidn library
(@uref{https://www.gnu.org/software/libidn/}).
Although it is recommended for new software to use the native libidn2
functions (i.e., the ones prefixed with @code{idn2}), old software
isn't always feasible to modify.
@section Converting with minimal modifications
As such, libidn2 provides compatibility macros which switch all libidn
functions to libidn2 functions in a backwards compatible way. To take
advantage of these compatibility functions, it is sufficient to replace
the @code{idna.h} header in legacy code with @code{idn2.h}. That
would transform the software from using libidn, i.e., IDNA2003,
to using libidn2 with IDNA2008 non-transitional encoding.
@section Converting to native APIs
However, it is recommended to switch applications to the IDN2 native APIs.
The following table provides a mapping of libidn code snippets to
libidn2, for switching to IDNA2008.
@multitable @columnfractions .40 .40
@headitem libidn @tab libidn2
@item
@verbatim
rc = idna_to_ascii_8z (buf, &p, IDNA_USE_STD3_ASCII_RULES);
if (rc != IDNA_SUCCESS)
@end verbatim
@tab
@verbatim
rc = idn2_to_ascii_8z (buf, &p, IDN2_USE_STD3_ASCII_RULES);
if (rc != IDN2_OK)
@end verbatim
@item
@verbatim
rc = idna_to_ascii_8z (buf, &p, 0 /* any other flags */);
if (rc != IDNA_SUCCESS)
@end verbatim
@tab
@verbatim
/* we recommend to use the default flags (0), so that
* the default behavior of libidn2 applies. */
rc = idn2_to_ascii_8z (buf, &p, 0);
if (rc != IDN2_OK)
@end verbatim
@item
@verbatim
rc = idna_to_unicode_8z8z (buf, &p, 0 /* any flags */);
if (rc != IDNA_SUCCESS)
@end verbatim
@tab
@verbatim
rc = idn2_to_unicode_8z8z (buf, &p, 0);
if (rc != IDN2_OK)
@end verbatim
@end multitable
Note that, although the table only lists the UTF-8 functions, the mapping
is identical for every other one in the family of toUnicode and toAscii.
As the IDNA2003 details differ significantly from IDNA2008, not all flags used in
the libidn functions map to any specific flags; it is typically safe to use
the suggested libidn2 flags. Exceptionally the libidn flag @code{IDNA_USE_STD3_ASCII_RULES}
is mapped to @code{IDN2_USE_STD3_ASCII_RULES}.
@section Converting with backwards compatibility
In several cases where IDNA2008 mappings do not exist whereas IDNA2003
mappings do, software like browsers take a backwards compatible approach.
That is, convert the domain to IDNA2008 form, and if that fails try the
IDNA2003 conversion. The following example demonstrates that approach.
@verbatim
rc = idn2_to_ascii_8z (buf, &p, IDN2_NONTRANSITIONAL); /* IDNA2008 */
if (rc == IDN2_DISALLOWED)
rc = idn2_to_ascii_8z (buf, &p, IDN2_TRANSITIONAL); /* IDNA2003 - compatible */
@end verbatim
@section Using libidn and libidn2 code
In the special case of software that needs to support both
libraries (e.g., both IDNA2003 and IDNA2008), you must define
@code{IDN2_SKIP_LIBIDN_COMPAT} prior to including @code{idn2.h}
in order to disable compatibility code which overlaps with libidn
functionality. That would allow software to use both libraries' functions.
@section Stringprep and libidn2
The original libidn library includes functionality for the stringprep
processing in @code{stringprep.h}. That functionality was an integral part
of an IDNA2003 implementation, but it does not apply to IDNA2008. Furthermore,
stringprep processing has been replaced by the PRECIS framework (RFC8264).
For the reasons above, libidn2 does not implement stringprep or
any other string processing protocols unrelated to IDNA2008. Applications
requiring the stringprep processing should continue using the original
libidn, and new applications should consider using the PRECIS framework.
@node Examples
@chapter Examples
@cindex Examples
This chapter contains example code which illustrates how Libidn2 is
used when you write your own application.
@menu
* ToASCII:: Example using IDNA ToASCII.
* ToUnicode:: Example using IDNA ToUnicode.
* Lookup:: Example IDNA2008 Lookup domain name operation.
* Register:: Example IDNA2008 Register label operation.
@end menu
@node ToASCII
@section ToASCII example
This example demonstrates how the library is used to convert
internationalized domain names into ASCII compatible names (ACE).
It expects input to be in UTF-8 form.
@verbatiminclude example-toascii.c
@node ToUnicode
@section ToUnicode example
This example demonstrates how the library is used to convert ASCII
compatible names (ACE) to internationalized domain names.
Both input and output are in UTF-8 form.
@verbatiminclude example-tounicode.c
@node Lookup
@section Lookup
This example demonstrates how a domain name is processed before it is
looked up in the DNS. The input expected is in the locale encoding.
@verbatiminclude lookup.c
@node Register
@section Register
This example demonstrates how a domain label is processed before it is
registered in the DNS. The input expected is in the locale encoding.
@verbatiminclude register.c
@node Invoking idn2
@chapter Invoking idn2
@pindex idn2
@cindex invoking @command{idn2}
@cindex command line
@command{idn2} translates internationalized domain names to the
IDNA2008 encoded format, either for lookup or registration.
If strings are specified on the command line, they are used as input
and the computed output is printed to standard output @code{stdout}.
If no strings are specified on the command line, the program reads
data, line by line, from the standard input @code{stdin}, and prints
the computed output to standard output. What processing is performed
(e.g., lookup or register) is indicated by options. If any errors are
encountered, the execution of the application is aborted.
All strings are expected to be encoded in the preferred charset used
by your locale. Use @code{--debug} to find out what this charset is.
On POSIX systems you may use the @code{LANG} environment variable to
specify a different locale.
To process a string that starts with @code{-}, for example
@code{-foo}, use @code{--} to signal the end of parameters, as in
@code{idn2 -r -- -foo}.
@section Options
@code{idn2} recognizes these options:
@verbatiminclude idn2-help.texi
@section Environment Variables
On POSIX systems the @env{LANG} environment variable can be used to
override the system locale for the command being invoked. The system
locale may influence what character set is used to decode data (i.e.,
strings on the command line or data read from the standard input
stream), and to encode data to the standard output. If your system is
set up correctly, however, the application will use the correct locale
and character set automatically. Example usage:
@example
$ LANG=en_US.UTF-8 idn2
...
@end example
@section Examples
Standard usage, reading input from standard input and disabling
license and usage instructions:
@example
jas@@latte:~$ idn2 --quiet
r@"aksm@"org@aa{}s.se
xn--rksmrgs-5wao1o.se
...
@end example
Reading input from the command line:
@example
jas@@latte:~$ idn2 r@"aksm@"org@aa{}s.se bl@aa{}b@ae{}rgr@o{}d.no
xn--rksmrgs-5wao1o.se
xn--blbrgrd-fxak7p.no
jas@@latte:~$
@end example
Testing the IDNA2008 Register function:
@example
jas@@latte:~$ idn2 --register fu@ss{}ball
xn--fuball-cta
jas@@latte:~$
@end example
@section Troubleshooting
Getting character data encoded right, and making sure Libidn2 uses the
same encoding, can be difficult. The reason for this is that most
systems may encode character data in more than one character encoding,
e.g., using @code{UTF-8} together with @code{ISO-8859-1} or
@code{ISO-2022-JP}. This problem is likely to continue to exist until
only one character encoding comes out as the evolutionary winner, or
(more likely, at least to some extent) forever.
The first step to troubleshooting character encoding problems with
Libidn2 is to use the @samp{--debug} parameter to find out which
character set encoding @samp{idn2} believes your locale uses.
@example
jas@@latte:~$ idn2 --debug --quiet ""
Charset: UTF-8
jas@@latte:~$
@end example
If it prints @code{ANSI_X3.4-1968} (i.e., @code{US-ASCII}), this
indicates you have not configured your locale properly. To configure
the locale, you can, for example, use @samp{LANG=sv_SE.UTF-8; export
LANG} at a @code{/bin/sh} prompt, to set up your locale for a Swedish
environment using @code{UTF-8} as the encoding.
Sometimes @samp{idn2} appears to be unable to translate from your
system locale into @code{UTF-8} (which is used internally), and you
will get an error message like this:
@example
idn2: lookup: could not convert string to UTF-8
@end example
One explanation is that you didn't install the @samp{iconv} conversion
tools. You can find it as a standalone library in GNU Libiconv
(@uref{https://www.gnu.org/software/libiconv/}). On many GNU/Linux
systems, this library is part of the system, but you may have to
install additional packages to be able to use it.
Another explanation is that the error is correct and you are feeding
@samp{idn2} invalid data. This can happen inadvertently if you are
not careful with the character set encoding you use. For example, if
your shell runs in an @code{ISO-8859-1} environment, and you invoke
@samp{idn2} with the @samp{LANG} environment variable as follows, you
will feed it @code{ISO-8859-1} characters but force it to believe they
are @code{UTF-8}. Naturally this will lead to an error, unless the
byte sequences happen to be valid @code{UTF-8}. Note that even if you
don't get an error, the output may be incorrect in this situation,
because @code{ISO-8859-1} and @code{UTF-8} do not in general encode
the same characters as the same byte sequences.
@example
jas@@latte:~$ idn2 --quiet --debug ""
Charset: ISO-8859-1
jas@@latte:~$ LANG=sv_SE.UTF-8 idn2 --debug r@"aksm@"org@aa{}s
Charset: UTF-8
input[0] = 0x72
input[1] = 0xc3
input[2] = 0xa4
input[3] = 0xc3
input[4] = 0xa4
input[5] = 0x6b
input[6] = 0x73
input[7] = 0x6d
input[8] = 0xc3
input[9] = 0xb6
input[10] = 0x72
input[11] = 0x67
input[12] = 0xc3
input[13] = 0xa5
input[14] = 0x73
UCS-4 input[0] = U+0072
UCS-4 input[1] = U+00e4
UCS-4 input[2] = U+00e4
UCS-4 input[3] = U+006b
UCS-4 input[4] = U+0073
UCS-4 input[5] = U+006d
UCS-4 input[6] = U+00f6
UCS-4 input[7] = U+0072
UCS-4 input[8] = U+0067
UCS-4 input[9] = U+00e5
UCS-4 input[10] = U+0073
output[0] = 0x72
output[1] = 0xc3
output[2] = 0xa4
output[3] = 0xc3
output[4] = 0xa4
output[5] = 0x6b
output[6] = 0x73
output[7] = 0x6d
output[8] = 0xc3
output[9] = 0xb6
output[10] = 0x72
output[11] = 0x67
output[12] = 0xc3
output[13] = 0xa5
output[14] = 0x73
UCS-4 output[0] = U+0072
UCS-4 output[1] = U+00e4
UCS-4 output[2] = U+00e4
UCS-4 output[3] = U+006b
UCS-4 output[4] = U+0073
UCS-4 output[5] = U+006d
UCS-4 output[6] = U+00f6
UCS-4 output[7] = U+0072
UCS-4 output[8] = U+0067
UCS-4 output[9] = U+00e5
UCS-4 output[10] = U+0073
xn--rksmrgs-5waap8p
jas@@latte:~$
@end example
The moral here is to forget about @samp{LANG} (instead,
configure your system locale properly) unless you know what you are
doing, and if you want to use @samp{LANG}, do it carefully and after
verifying with @samp{--debug} that you get the desired results.
@node Interface Index
@unnumbered Interface Index
@printindex fn
@node Concept Index
@unnumbered Concept Index
@printindex cp
@bye
|