File: char.jl

package info (click to toggle)
julia 1.0.3%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 49,452 kB
  • sloc: lisp: 236,453; ansic: 55,579; cpp: 25,603; makefile: 1,685; pascal: 1,130; sh: 956; asm: 86; xml: 76
file content (299 lines) | stat: -rw-r--r-- 10,884 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# This file is a part of Julia. License is MIT: https://julialang.org/license

"""
The `AbstractChar` type is the supertype of all character implementations
in Julia.   A character represents a Unicode code point, and can be converted
to an integer via the [`codepoint`](@ref) function in order to obtain the
numerical value of the code point, or constructed from the same integer.
These numerical values determine how characters are compared with `<` and `==`,
for example.  New `T <: AbstractChar` types should define a `codepoint(::T)`
method and a `T(::UInt32)` constructor, at minimum.

A given `AbstractChar` subtype may be capable of representing only a subset
of Unicode, in which case conversion from an unsupported `UInt32` value
may throw an error.  Conversely, the built-in [`Char`](@ref) type represents
a *superset* of Unicode (in order to losslessly encode invalid byte streams),
in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
The [`isvalid`](@ref) function can be used to check which codepoints are
representable in a given `AbstractChar` type.

Internally, an `AbstractChar` type may use a variety of encodings.  Conversion
via `codepoint(char)` will not reveal this encoding because it always returns the
Unicode value of the character. `print(io, c)` of any `c::AbstractChar`
produces an encoding determined by `io` (UTF-8 for all built-in `IO`
types), via conversion to `Char` if necessary.

`write(io, c)`, in contrast, may emit an encoding depending on
`typeof(c)`, and `read(io, typeof(c))` should read the same encoding as `write`.
New `AbstractChar` types must provide their own implementations of
`write` and `read`.
"""
AbstractChar

"""
    Char(c::Union{Number,AbstractChar})

`Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
of characters in Julia.  `Char` is the type used for character literals like `'x'`
and it is also the element type of [`String`](@ref).

In order to losslessly represent arbitrary byte streams stored in a `String`,
a `Char` value may store information that cannot be converted to a Unicode
codepoint — converting such a `Char` to `UInt32` will throw an error.
The [`isvalid(c::Char)`](@ref) function can be used to query whether `c`
represents a valid Unicode character.
"""
Char

(::Type{T})(x::Number) where {T<:AbstractChar} = T(UInt32(x))
(::Type{AbstractChar})(x::Number) = Char(x)
(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
(::Type{T})(x::T) where {T<:AbstractChar} = x

codepoint(c::Char) = UInt32(c)

"""
    codepoint(c::AbstractChar)

Return the Unicode codepoint (an unsigned integer) corresponding
to the character `c` (or throw an exception if `c` does not represent
a valid character).   For `Char`, this is a `UInt32` value, but
`AbstractChar` types that represent only a subset of Unicode may
return a different-sized integer (e.g. `UInt8`).
"""
codepoint # defined for Char in boot.jl

struct InvalidCharError{T<:AbstractChar} <: Exception
    char::T
end
struct CodePointError{T<:Integer} <: Exception
    code::T
end
@noinline invalid_char(c::AbstractChar) = throw(InvalidCharError(c))
@noinline code_point_err(u::Integer) = throw(CodePointError(u))

function ismalformed(c::Char)
    u = reinterpret(UInt32, c)
    l1 = leading_ones(u) << 3
    t0 = trailing_zeros(u) & 56
    (l1 == 8) | (l1 + t0 > 32) |
    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end

@inline is_overlong_enc(u::UInt32) = (u >> 24 == 0xc0) | (u >> 24 == 0xc1) | (u >> 21 == 0x0704) | (u >> 20 == 0x0f08)

function isoverlong(c::Char)
    u = reinterpret(UInt32, c)
    is_overlong_enc(u)
end

# fallback: other AbstractChar types, by default, are assumed
#           not to support malformed or overlong encodings.

"""
    ismalformed(c::AbstractChar)

Return `true` if `c` represents malformed (non-Unicode) data according to the
encoding used by `c`.  Defaults to `false` for non-`Char` types.  See also
[`show_invalid`](@ref).
"""
ismalformed(c::AbstractChar) = false

"""
    isoverlong(c::AbstractChar)

Return `true` if `c` represents an overlong UTF-8 sequence.  Defaults
to `false` for non-`Char` types.  See also [`decode_overlong`](@ref)
and [`show_invalid`](@ref).
"""
isoverlong(c::AbstractChar) = false

function UInt32(c::Char)
    # TODO: use optimized inline LLVM
    u = reinterpret(UInt32, c)
    u < 0x80000000 && return u >> 24
    l1 = leading_ones(u)
    t0 = trailing_zeros(u) & 56
    (l1 == 1) | (8l1 + t0 > 32) |
    ((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) &&
        invalid_char(c)::Union{}
    u &= 0xffffffff >> l1
    u >>= t0
    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end

function decode_overlong(c::Char)
    u = reinterpret(UInt32, c)
    l1 = leading_ones(u)
    t0 = trailing_zeros(u) & 56
    u &= 0xffffffff >> l1
    u >>= t0
    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end

"""
    decode_overlong(c::AbstractChar)

When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
the Unicode codepoint value of `c`.   `AbstractChar` implementations
that support overlong encodings should implement `Base.decode_overlong`.
"""
decode_overlong

function Char(u::UInt32)
    u < 0x80 && return reinterpret(Char, u << 24)
    u < 0x00200000 || code_point_err(u)::Union{}
    c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
        ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
    c = u < 0x00000800 ? (c << 16) | 0xc0800000 :
        u < 0x00010000 ? (c << 08) | 0xe0808000 :
                         (c << 00) | 0xf0808080
    reinterpret(Char, c)
end

function (T::Union{Type{Int8},Type{UInt8}})(c::Char)
    i = reinterpret(Int32, c)
    i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c))
end

function Char(b::Union{Int8,UInt8})
    0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
end

convert(::Type{AbstractChar}, x::Number) = Char(x) # default to Char
convert(::Type{T}, x::Number) where {T<:AbstractChar} = T(x)
convert(::Type{T}, x::AbstractChar) where {T<:Number} = T(x)
convert(::Type{T}, c::AbstractChar) where {T<:AbstractChar} = T(c)
convert(::Type{T}, c::T) where {T<:AbstractChar} = c

rem(x::AbstractChar, ::Type{T}) where {T<:Number} = rem(codepoint(x), T)

typemax(::Type{Char}) = reinterpret(Char, typemax(UInt32))
typemin(::Type{Char}) = reinterpret(Char, typemin(UInt32))

size(c::AbstractChar) = ()
size(c::AbstractChar,d) = convert(Int, d) < 1 ? throw(BoundsError()) : 1
ndims(c::AbstractChar) = 0
ndims(::Type{<:AbstractChar}) = 0
length(c::AbstractChar) = 1
firstindex(c::AbstractChar) = 1
lastindex(c::AbstractChar) = 1
getindex(c::AbstractChar) = c
getindex(c::AbstractChar, i::Integer) = i == 1 ? c : throw(BoundsError())
getindex(c::AbstractChar, I::Integer...) = all(x -> x == 1, I) ? c : throw(BoundsError())
first(c::AbstractChar) = c
last(c::AbstractChar) = c
eltype(::Type{T}) where {T<:AbstractChar} = T

iterate(c::AbstractChar, done=false) = done ? nothing : (c, true)
isempty(c::AbstractChar) = false
in(x::AbstractChar, y::AbstractChar) = x == y

==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
    hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
hash(x::AbstractChar, h::UInt) = hash(Char(x), h)
widen(::Type{T}) where {T<:AbstractChar} = T

-(x::AbstractChar, y::AbstractChar) = Int(x) - Int(y)
-(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) - Int32(y))
+(x::T, y::Integer) where {T<:AbstractChar} = T(Int32(x) + Int32(y))
+(x::Integer, y::AbstractChar) = y + x

# `print` should output UTF-8 by default for all AbstractChar types.
# (Packages may implement other IO subtypes to specify different encodings.)
# In contrast, `write(io, c)` outputs a `c` in an encoding determined by typeof(c).
print(io::IO, c::Char) = (write(io, c); nothing)
print(io::IO, c::AbstractChar) = print(io, Char(c)) # fallback: convert to output UTF-8

const hex_chars = UInt8['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                        'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
                        's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

function show_invalid(io::IO, c::Char)
    write(io, 0x27)
    u = reinterpret(UInt32, c)
    while true
        a = hex_chars[((u >> 28) & 0xf) + 1]
        b = hex_chars[((u >> 24) & 0xf) + 1]
        write(io, 0x5c, UInt8('x'), a, b)
        (u <<= 8) == 0 && break
    end
    write(io, 0x27)
end

"""
    show_invalid(io::IO, c::AbstractChar)

Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
[`ismalformed(c)`](@ref) return `true`.   Subclasses
of `AbstractChar` should define `Base.show_invalid` methods
if they support storing invalid character data.
"""
show_invalid

# show c to io, assuming UTF-8 encoded output
function show(io::IO, c::AbstractChar)
    if c <= '\\'
        b = c == '\0' ? 0x30 :
            c == '\a' ? 0x61 :
            c == '\b' ? 0x62 :
            c == '\t' ? 0x74 :
            c == '\n' ? 0x6e :
            c == '\v' ? 0x76 :
            c == '\f' ? 0x66 :
            c == '\r' ? 0x72 :
            c == '\e' ? 0x65 :
            c == '\'' ? 0x27 :
            c == '\\' ? 0x5c : 0xff
        if b != 0xff
            write(io, 0x27, 0x5c, b, 0x27)
            return
        end
    end
    if isoverlong(c) || ismalformed(c)
        show_invalid(io, c)
    elseif isprint(c)
        write(io, 0x27)
        print(io, c) # use print, not write, to use UTF-8 for any AbstractChar
        write(io, 0x27)
    else # unprintable, well-formed, non-overlong Unicode
        u = codepoint(c)
        write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
        d = max(2, 8 - (leading_zeros(u) >> 2))
        while 0 < d
            write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
        end
        write(io, 0x27)
    end
    return
end

function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar}
    show(io, c)
    if !ismalformed(c)
        print(io, ": ")
        if isoverlong(c)
            print(io, "[overlong] ")
            u = decode_overlong(c)
            c = T(u)
        else
            u = codepoint(c)
        end
        h = string(u, base = 16, pad = u ≤ 0xffff ? 4 : 6)
        print(io, (isascii(c) ? "ASCII/" : ""), "Unicode U+", h)
    else
        print(io, ": Malformed UTF-8")
    end
    abr = Unicode.category_abbrev(c)
    str = Unicode.category_string(c)
    print(io, " (category ", abr, ": ", str, ")")
end