1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
|
\ UTF-8 handling 12dec04py
\ Copyright (C) 2004,2005,2006,2007,2008 Free Software Foundation, Inc.
\ This file is part of Gforth.
\ Gforth is free software; you can redistribute it and/or
\ modify it under the terms of the GNU General Public License
\ as published by the Free Software Foundation, either version 3
\ of the License, or (at your option) any later version.
\ This program is distributed in the hope that it will be useful,
\ but WITHOUT ANY WARRANTY; without even the implied warranty of
\ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\ GNU General Public License for more details.
\ You should have received a copy of the GNU General Public License
\ along with this program. If not, see http://www.gnu.org/licenses/.
\ short: u8 means utf-8 encoded address
s" malformed UTF-8 character" exception Constant UTF-8-err
$80 Value max-single-byte
: u8len ( u8 -- n )
dup max-single-byte u< IF drop 1 EXIT THEN \ special case ASCII
$800 2 >r
BEGIN 2dup u>= WHILE 5 lshift r> 1+ >r dup 0= UNTIL THEN
2drop r> ;
: u8@+ ( u8addr -- u8addr' u )
count dup max-single-byte u< ?EXIT \ special case ASCII
dup $C2 u< IF UTF-8-err throw THEN \ malformed character
$7F and $40 >r
BEGIN dup r@ and WHILE r@ xor
6 lshift r> 5 lshift >r >r count
dup $C0 and $80 <> IF UTF-8-err throw THEN
$3F and r> or
REPEAT rdrop ;
: u8!+ ( u u8addr -- u8addr' )
over max-single-byte u< IF tuck c! 1+ EXIT THEN \ special case ASCII
>r 0 swap $3F
BEGIN 2dup u> WHILE
2/ >r dup $3F and $80 or swap 6 rshift r>
REPEAT $7F xor 2* or r>
BEGIN over $80 u>= WHILE tuck c! 1+ REPEAT nip ;
\ plug-in so that char and '<char> work for UTF-8
[ifundef] char@ \ !! bootstrapping help
Defer char@ ( addr u -- char addr' u' )
:noname over c@ -rot 1 /string ; IS char@
[then]
:noname ( addr u -- char addr' u' )
\ !! the if here seems to work around some breakage, but not
\ entirely; e.g., try 'รง' with LANG=C.
dup 1 u<= IF defers char@ EXIT THEN
over + >r u8@+ swap r> over - ; IS char@
\ scan to next/previous character
\ alternative names: u8char+ u8char-
: u8>> ( u8addr -- u8addr' ) u8@+ drop ;
: u8<< ( u8addr -- u8addr' )
BEGIN 1- dup c@ $C0 and max-single-byte <> UNTIL ;
\ utf key and emit
Defer check-xy ' noop IS check-xy
: u8key ( -- u )
defers key dup max-single-byte u< ?EXIT \ special case ASCII
dup $FF = ?EXIT \ special resize character
dup $C2 u< IF UTF-8-err throw THEN \ malformed character
$7F and $40 >r
BEGIN dup r@ and WHILE r@ xor
6 lshift r> 5 lshift >r >r defers key
dup $C0 and $80 <> IF UTF-8-err throw THEN
$3F and r> or
REPEAT rdrop ;
: u8emit ( u -- )
dup max-single-byte u< IF defers emit EXIT THEN \ special case ASCII
0 swap $3F
BEGIN 2dup u> WHILE
2/ >r dup $3F and $80 or swap 6 rshift r>
REPEAT $7F xor 2* or
BEGIN dup $80 u>= WHILE defers emit REPEAT drop ;
\ utf-8 stuff for xchars
: +u8/string ( xc-addr1 u1 -- xc-addr2 u2 )
over dup u8>> swap - /string ;
: u8\string- ( xcaddr u -- xcaddr u' )
over + u8<< over - ;
: u8@ ( c-addr -- u )
u8@+ nip ;
: u8!+? ( xc xc-addr1 u1 -- xc-addr2 u2 f )
>r over u8len r@ over u< if ( xc xc-addr1 len r: u1 )
\ not enough space
drop nip r> false
else
>r u8!+ r> r> swap - true
then ;
: u8addrlen ( u8-addr u -- u ) drop
\ length of UTF-8 char starting at u8-addr (accesses only u8-addr)
c@
dup $80 u< if drop 1 exit endif
dup $c0 u< if UTF-8-err throw endif
dup $e0 u< if drop 2 exit endif
dup $f0 u< if drop 3 exit endif
dup $f8 u< if drop 4 exit endif
dup $fc u< if drop 5 exit endif
dup $fe u< if drop 6 exit endif
UTF-8-err throw ;
: -u8trailing-garbage ( addr u1 -- addr u2 )
2dup + dup u8<< ( addr u1 end1 end2 )
2dup dup over over - u8addrlen + = if \ last character ok
2drop
else
nip nip over -
then ;
[IFUNDEF] wcwidth
: wc,3 ( n low high -- ) 1+ , , , ;
Create wc-table \ derived from wcwidth source code, for UCS32
0 $0300 $0357 wc,3
0 $035D $036F wc,3
0 $0483 $0486 wc,3
0 $0488 $0489 wc,3
0 $0591 $05A1 wc,3
0 $05A3 $05B9 wc,3
0 $05BB $05BD wc,3
0 $05BF $05BF wc,3
0 $05C1 $05C2 wc,3
0 $05C4 $05C4 wc,3
0 $0600 $0603 wc,3
0 $0610 $0615 wc,3
0 $064B $0658 wc,3
0 $0670 $0670 wc,3
0 $06D6 $06E4 wc,3
0 $06E7 $06E8 wc,3
0 $06EA $06ED wc,3
0 $070F $070F wc,3
0 $0711 $0711 wc,3
0 $0730 $074A wc,3
0 $07A6 $07B0 wc,3
0 $0901 $0902 wc,3
0 $093C $093C wc,3
0 $0941 $0948 wc,3
0 $094D $094D wc,3
0 $0951 $0954 wc,3
0 $0962 $0963 wc,3
0 $0981 $0981 wc,3
0 $09BC $09BC wc,3
0 $09C1 $09C4 wc,3
0 $09CD $09CD wc,3
0 $09E2 $09E3 wc,3
0 $0A01 $0A02 wc,3
0 $0A3C $0A3C wc,3
0 $0A41 $0A42 wc,3
0 $0A47 $0A48 wc,3
0 $0A4B $0A4D wc,3
0 $0A70 $0A71 wc,3
0 $0A81 $0A82 wc,3
0 $0ABC $0ABC wc,3
0 $0AC1 $0AC5 wc,3
0 $0AC7 $0AC8 wc,3
0 $0ACD $0ACD wc,3
0 $0AE2 $0AE3 wc,3
0 $0B01 $0B01 wc,3
0 $0B3C $0B3C wc,3
0 $0B3F $0B3F wc,3
0 $0B41 $0B43 wc,3
0 $0B4D $0B4D wc,3
0 $0B56 $0B56 wc,3
0 $0B82 $0B82 wc,3
0 $0BC0 $0BC0 wc,3
0 $0BCD $0BCD wc,3
0 $0C3E $0C40 wc,3
0 $0C46 $0C48 wc,3
0 $0C4A $0C4D wc,3
0 $0C55 $0C56 wc,3
0 $0CBC $0CBC wc,3
0 $0CBF $0CBF wc,3
0 $0CC6 $0CC6 wc,3
0 $0CCC $0CCD wc,3
0 $0D41 $0D43 wc,3
0 $0D4D $0D4D wc,3
0 $0DCA $0DCA wc,3
0 $0DD2 $0DD4 wc,3
0 $0DD6 $0DD6 wc,3
0 $0E31 $0E31 wc,3
0 $0E34 $0E3A wc,3
0 $0E47 $0E4E wc,3
0 $0EB1 $0EB1 wc,3
0 $0EB4 $0EB9 wc,3
0 $0EBB $0EBC wc,3
0 $0EC8 $0ECD wc,3
0 $0F18 $0F19 wc,3
0 $0F35 $0F35 wc,3
0 $0F37 $0F37 wc,3
0 $0F39 $0F39 wc,3
0 $0F71 $0F7E wc,3
0 $0F80 $0F84 wc,3
0 $0F86 $0F87 wc,3
0 $0F90 $0F97 wc,3
0 $0F99 $0FBC wc,3
0 $0FC6 $0FC6 wc,3
0 $102D $1030 wc,3
0 $1032 $1032 wc,3
0 $1036 $1037 wc,3
0 $1039 $1039 wc,3
0 $1058 $1059 wc,3
1 $0000 $1100 wc,3
2 $1100 $115f wc,3
0 $1160 $11FF wc,3
0 $1712 $1714 wc,3
0 $1732 $1734 wc,3
0 $1752 $1753 wc,3
0 $1772 $1773 wc,3
0 $17B4 $17B5 wc,3
0 $17B7 $17BD wc,3
0 $17C6 $17C6 wc,3
0 $17C9 $17D3 wc,3
0 $17DD $17DD wc,3
0 $180B $180D wc,3
0 $18A9 $18A9 wc,3
0 $1920 $1922 wc,3
0 $1927 $1928 wc,3
0 $1932 $1932 wc,3
0 $1939 $193B wc,3
0 $200B $200F wc,3
0 $202A $202E wc,3
0 $2060 $2063 wc,3
0 $206A $206F wc,3
0 $20D0 $20EA wc,3
2 $2329 $232A wc,3
0 $302A $302F wc,3
2 $2E80 $303E wc,3
0 $3099 $309A wc,3
2 $3040 $A4CF wc,3
2 $AC00 $D7A3 wc,3
2 $F900 $FAFF wc,3
0 $FB1E $FB1E wc,3
0 $FE00 $FE0F wc,3
0 $FE20 $FE23 wc,3
2 $FE30 $FE6F wc,3
0 $FEFF $FEFF wc,3
2 $FF00 $FF60 wc,3
2 $FFE0 $FFE6 wc,3
0 $FFF9 $FFFB wc,3
0 $1D167 $1D169 wc,3
0 $1D173 $1D182 wc,3
0 $1D185 $1D18B wc,3
0 $1D1AA $1D1AD wc,3
2 $20000 $2FFFD wc,3
2 $30000 $3FFFD wc,3
0 $E0001 $E0001 wc,3
0 $E0020 $E007F wc,3
0 $E0100 $E01EF wc,3
here wc-table - Constant #wc-table
\ inefficient table walk:
: wcwidth ( xc -- n )
wc-table #wc-table over + swap ?DO
dup I 2@ within IF I 2 cells + @ UNLOOP EXIT THEN
3 cells +LOOP 1 ;
[THEN]
: u8width ( xcaddr u -- n )
0 rot rot over + swap ?DO
I xc@+ swap >r wcwidth +
r> I - +LOOP ;
: set-encoding-utf-8 ( -- )
['] u8emit is xemit
['] u8key is xkey
['] u8>> is xchar+
['] u8<< is xchar-
[ [IFDEF] xstring+ ]
['] u8\string- is xstring-
['] +u8/string is +xstring
[ [THEN] ]
[ [IFDEF] +x/string ]
['] u8\string- is x\string-
['] +u8/string is +x/string
[ [THEN] ]
['] u8@ is xc@
['] u8!+? is xc!+?
['] u8@+ is xc@+
['] u8len is xc-size
[ [IFDEF] x-width ]
['] u8width is x-width
[ [THEN] ]
[ [IFDEF] x-size ]
['] u8addrlen is x-size
[ [THEN] ]
['] -u8trailing-garbage is -trailing-garbage
;
: utf-8-cold ( -- )
s" LC_ALL" getenv 2dup d0= IF 2drop
s" LC_CTYPE" getenv 2dup d0= IF 2drop
s" LANG" getenv 2dup d0= IF 2drop
s" C" THEN THEN THEN
s" UTF-8" search nip nip
IF set-encoding-utf-8 ELSE set-encoding-fixed-width THEN ;
environment-wordlist set-current
: xchar-encoding ( -- addr u ) \ xchar-ext
\G Returns a printable ASCII string that reperesents the encoding,
\G and use the preferred MIME name (if any) or the name in
\G @url{http://www.iana.org/assignments/character-sets} like
\G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where
\G we prefer the alias ``ASCII''.
max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ;
forth definitions
:noname ( -- )
defers 'cold
utf-8-cold
; is 'cold
utf-8-cold
|