.\" Hey Emacs! This file is -*- nroff -*- source.
.\"
.\" Copyright (C) Markus Kuhn, 1996, 2001
.\"
.\" This is free documentation; you can redistribute it and/or
.\" modify it under the terms of the GNU General Public License as
.\" published by the Free Software Foundation; either version 2 of
.\" the License, or (at your option) any later version.
.\"
.\" The GNU General Public License's references to "object code"
.\" and "executables" are to be interpreted as the output of any
.\" document formatting or typesetting system, including
.\" intermediate and printed output.
.\"
.\" This manual is distributed in the hope that it will be useful,
.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
.\" GNU General Public License for more details.
.\"
.\" You should have received a copy of the GNU General Public
.\" License along with this manual; if not, write to the Free
.\" Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111,
.\" USA.
.\"
.\" 1995-11-26 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
.\" First version written
.\" 2001-05-11 Markus Kuhn <mgk25@cl.cam.ac.uk>
.\" Update
.\"
.\" Japanese Version Copyright (c) 1997 HANATAKA Shinya
.\" all rights reserved.
.\" Translated Thu Jun 3 20:40:01 JST 1997
.\" by HANATAKA Shinya <hanataka@abyss.rim.or.jp>
.\" Updated (add SECURITY section) & modified Mon Feb 26 2001
.\" by NAKANO Takeo <nakano@apm.seikei.ac.jp>
.\" Updated & Modified Sun Jul 1 09:28:47 JST 2001
.\" by Yuichi SATO <ysato@h4.dion.ne.jp>
.\"
.TH UTF-8 7 2001-05-11 "GNU" "Linux Programmer's Manual"
.SH 名前
UTF-8 \- ASCII と互換性のある多バイト Unicode 符号化
.SH 説明
.B "ユニコード (Unicode) 3.0"
文字集合は 16 ビットのコード空間を占める。
最も簡単な Unicode 符号化法
.RB ( UCS-2 )
では、文字列は 16 ビット・ワード列 (16 ビット文字) で構成される。
これには、
\&'\\0' や '/' のような (ファイル名や C のライブラリ関数の引き数で)
特殊な意味を持つ 16 ビット文字が含まれることがある。
更に、ほとんどの UNIX ツールは ASCII ファイルを入力として期待するので、
大幅な変更なしには 16 ビットワードを文字として読むことができない。
これらの理由から、
.B UCS-2
はファイル名・テキストファイル・環境変数などで用いる、外部用の
.B Unicode
符号としては不適切である。
Unicode のスーパーセットである
.B "ISO 10646 Universal Character Set (UCS)"
は 31 ビットのコード空間を占めるが、その最も簡単な符号化である
.B UCS-4
にも (32 ビット・ワードの列として) 同じ問題がある。
.PP
.B Unicode
や
.B UCS
の
.B UTF-8
符号化にはこれらの問題がないので、Unix 風の OS で
.B Unicode
文字を使用するための一般的な方法となっている。
.SS 特性
.B UTF-8
符号化は以下のような特性を備えている:
.TP 0.2i
*
.B UCS
文字のうち 0x00000000 から 0x0000007f まで (古典的な
.B US-ASCII
文字) は (ASCII との互換性のため) 単純に 0x00 から 0x7f のバイトに
符号化される。これは 7 ビット ASCII 文字のみを含むファイルや文字列
に関しては、
.B ASCII
と
.B UTF-8
が同じ符号化を行なうことを意味する。
.TP
*
0x7f より大きいすべての
.B UCS
文字は、 0x80 から 0xfd までの範囲のバイトのみを含む
多バイト文字列に符号化される。
従って文字の中に
ASCII バイトが含まれることがなく、'\\0' や '/' の問題は発生しない。
.TP
*
.B UCS-4
文字列では辞書的ソートの順序が保たれる。
.TP
*
2^31 個あるすべての UCS コードは
.B UTF-8
を用いて符号化できる。
.TP
*
.B UTF-8
符号化では 0xfe と 0xff のバイトは絶対に使用されない。
.TP
*
ASCII でない
.B UCS
文字の多バイト列の最初のバイトは、
0xc0 から 0xfd の範囲で表され、
文字が何バイトで構成されているかを示す。
多バイト列の残りの部分のバイトは、それぞれ 0x80 から 0xbf の範囲にある。
これにより再同期が容易になり、ステートレスな符号化が可能になり、
バイトの紛失に対して堅固になる。
.TP
*
.B UTF-8
を用いた
.B UCS
文字の符号化は最大 6 バイトの長さになる。
.B Unicode
規格では 0x10ffff より上の文字を規定しないので、Unicode 文字は
.B UTF-8
では 4 バイトまでにしかならない。
.SS 符号化
以下のバイト列が文字を表すために使用される。
どのバイト列が使用されるかは文字の UCS コード番号に依存する:
.TP 0.4i
0x00000000 \- 0x0000007F:
.RI 0 xxxxxxx
.TP
0x00000080 \- 0x000007FF:
.RI 110 xxxxx
.RI 10 xxxxxx
.TP
0x00000800 \- 0x0000FFFF:
.RI 1110 xxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.TP
0x00010000 \- 0x001FFFFF:
.RI 11110 xxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.TP
0x00200000 \- 0x03FFFFFF:
.RI 111110 xx
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.TP
0x04000000 \- 0x7FFFFFFF:
.RI 1111110 x
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.RI 10 xxxxxx
.PP
.I xxx
ビットの部分には 2 進数で表わした文字コードのビット部分が対応する。
文字を表すのに最も短いバイト列のみが使用できる。
.PP
0xd800\(en0xdfff (UTF-16 サロゲート) や
0xfffe, 0xffff (UCS non-character) という
.B UCS
コードの値は、
.B UTF-8
に準拠したストリームに入れるべきではない。
.SS 例
.B Unicode
文字 0xa9 = 1010 1001 (コピーライト・マーク) の UTF-8 符号化は
.sp
.RS
11000010 10101001 = 0xc2 0xa9
.RE
.sp
になる。
.PP
0x2260 = 0010 0010 0110 0000 (不等号) は
.sp
.RS
11100010 10001001 10100000 = 0xe2 0x89 0xa0
.RE
.sp
になる。
.SS アプリケーションにおける注意
ユーザーはアプリケーションの
.B UTF-8
サポートを有効にするために、以下のようにして
.B UTF-8
ロケールを選択しなければならない。
.PP
.RS
export LANG=en_GB.UTF-8
.RE
.PP
使用している文字の符号化が分かっていなければならない
アプリケーションソフトウェアは、
以下のようにして常にロケールを設定すべきである。
.PP
.RS
setlocale(LC_CTYPE, "")
.RE
.PP
また、
.B UTF-8
ロケールが選択されていて、プレーンテキストの標準入出力、端末通信、
プレーンテキストファイルの内容、ファイル名、環境変数が
.B UTF-8
で符号化されているかをチェックするために、
プログラマーは以下のような式を試すことができる。
.PP
.RS
strcmp(nl_langinfo(CODESET), "UTF-8") == 0
.RE
.PP
.B US-ASCII
や
.B ISO 8859
といったシングルバイトの符号化が習慣になっているプログラマーは、
今までの 2 つの仮定が
.B UTF-8
ロケールにおいては最早有効ではなくなったことを知っておくべきだ。
1 番目の変更は、1 バイトが必ずしも 1 つの文字に対応しないことである。
2 番目の変更は、最近の端末エミュレーターは
.B UTF-8
モードにおいて中国語・日本語・韓国・朝鮮語の
.B 倍幅文字
やスペースを取らない (non-spacing)
.B "結合文字 (combining characters)"
に対応しているので、
.B ASCII
のときのように 1 文字を出力すると
必ずしもカーソルは 1 つだけ進むわけではないことである。
今日では、文字数やカーソルの位置を数えるのに
.BR mbsrtowcs (3)
や
.BR wcswidth (3)
といったライブラリ関数を使うべきである。
.PP
(VT100 端末などで使われる)
.B ISO 2022
符号化から
.B UTF-8
へ切替える公式なエスケープシーケンスは ESC % G ("\\x1b%G") である。
これに対応した
.B UTF-8
から
.B ISO 2022
へのシーケンスは ESC % @ ("\\x1b%@") である。
(G0 セットと G1 セットを切替えるといった)
他の ISO 2022 シーケンスは、UTF-8 モードでは使えない。
.PP
予知できる将来では、POSIX システムの一般的な文字符号化の全てのレベルで
.B UTF-8
が
.B ASCII
や
.B ISO 8859
を置き換え、プレーンテキストを扱う優れた環境が得られることが期待できる。
.SS セキュリティ
.BR Unicode " と " UCS
の規格では、
.B UTF-8
の作成者はできるだけ短い形式を使用するように要求している。
例えば、先頭バイトが 0xc0 であるような 2 バイト列を
生成するのは準拠しているとはいえない。
.B Unicode 3.1
では、規格に準拠したプログラムは
最短表現ではない入力を受け付けないという条項が追加された。
これはセキュリティ上の理由による。
ユーザーの入力がセキュリティの危険に対しチェックされる場合、
プログラムは
.B ASCII
版の "/../" や ";" や "NUL" だけをチェックし、
最短で符号化されていない文字を見過ごしてしまうかもしれないからである。
なぜなら、最短ではない
.B UTF-8
符号化では、これらの文字を表すような様々な
.B ASCII
以外の形式が存在するためである。
.SS 規格
ISO/IEC 10646-1:2000, Unicode 3.1, RFC\ 2279, Plan 9.
.\" .SH
.\" Markus Kuhn <mgk25@cl.cam.ac.uk>
.SH 関連項目
.BR nl_langinfo (3),
.BR setlocale (3),
.BR charsets (7),
.BR unicode (7)