File: data_generator.jl

package info (click to toggle)
utf8proc 2.11.3-1
links: PTS, VCS
area: main
in suites: experimental
size: 2,836 kB
sloc: ansic: 19,168; lisp: 449; makefile: 242; sh: 7
file content (498 lines) | stat: -rw-r--r-- 18,758 bytes
parent folder | download | duplicates (3)
using OffsetArrays: Origin

"""
    parsehex(str)

Parse `str` as a hexadecimal number and return it as a `UInt32`.
"""
function parsehex(str)
    return parse(UInt32, str; base=16)
end

"""
    parse_hex_range(line)

Parse a data-file line of the form `XXXX ; desc` or `XXXX..YYYY ; desc`, where the
description ends at the first `#` (if any).

Return `(first:last, desc)` with the code points parsed by `parsehex` and `desc`
stripped of trailing whitespace, or `nothing` when `line` does not have that form.
"""
function parse_hex_range(line)
    m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
    m === nothing && return nothing
    lo = parsehex(m[1])
    # A line naming a single code point becomes a one-element range.
    hi = m[3] === nothing ? lo : parsehex(m[3])
    return (lo:hi, rstrip(m[4]))
end

"""
    read_hex_ranges(filename)

Apply `parse_hex_range` to every line of `filename` and return the successfully
parsed `(range, description)` entries in file order; other lines are dropped.
"""
function read_hex_ranges(filename)
    parsed = parse_hex_range.(readlines(filename))
    return [entry for entry in parsed if entry !== nothing]
end

"""
    collect_codepoints(range_desc, description)

Return a `Vector{UInt32}` holding, in iteration order, the elements of every range
in `range_desc` whose paired description equals `description`.

`range_desc` is an iterable of `(range, description)` pairs.
"""
function collect_codepoints(range_desc, description)
    codepoints = UInt32[]
    for (codes, label) in range_desc
        label == description || continue
        append!(codepoints, codes)
    end
    return codepoints
end

"""
    set_all!(d, keys, value)

Store `value` in `d` under every key obtained by iterating `keys`.
Returns `nothing`.
"""
function set_all!(d, keys, value)
    foreach(keys) do key
        d[key] = value
    end
    return nothing
end

#-------------------------------------------------------------------------------

# (range, property) entries parsed from DerivedCoreProperties.txt.
derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")

# One set of code points per derived property; `uppercase` and `lowercase` are
# global sets here, which is why the string function is spelled `Base.uppercase`
# elsewhere in this file.
ignorable, uppercase, lowercase = map(("Default_Ignorable_Code_Point", "Uppercase", "Lowercase")) do property
    Set(collect_codepoints(derived_core_properties, property))
end


#-------------------------------------------------------------------------------
"""
    derive_indic_conjunct_break(derived_core_properties)

Build a `Dict{UInt32, String}` mapping each code point listed under an `InCB`
description in `derived_core_properties` to `"LINKER"`, `"CONSONANT"` or `"EXTEND"`.
"""
function derive_indic_conjunct_break(derived_core_properties)
    # Applied in this order, so a code point listed under several values keeps the
    # last one applied.
    incb_values = (
        "InCB; Linker"    => "LINKER",
        "InCB; Consonant" => "CONSONANT",
        "InCB; Extend"    => "EXTEND",
    )
    props = Dict{UInt32, String}()
    for (description, value) in incb_values
        set_all!(props, collect_codepoints(derived_core_properties, description), value)
    end
    return props
end

let conjunct_break_of = derive_indic_conjunct_break(derived_core_properties)
    # Return the Indic_Conjunct_Break value recorded for `code`, or "NONE" when the
    # code point is not in the table. The table is built once and captured here.
    global function get_indic_conjunct_break(code)
        return get(conjunct_break_of, code, "NONE")
    end
end

#-------------------------------------------------------------------------------
"""
    read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)

Return a `Dict{UInt32, String}` of grapheme boundary classes.

Every entry of `grapheme_break_filename` contributes its description in upper case.
Entries of `emoji_data_filename` are applied afterwards (overriding earlier values):
`Extended_Pictographic` becomes `"EXTENDED_PICTOGRAPHIC"` and `Emoji_Modifier`
becomes `"EXTEND"`; all other emoji properties are ignored.
"""
function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
    boundclass_of = Dict{UInt32, String}()
    for (codes, property) in read_hex_ranges(grapheme_break_filename)
        set_all!(boundclass_of, codes, Base.uppercase(property))
    end

    # The only emoji-data properties that influence the boundary class.
    emoji_boundclass = Dict(
        "Extended_Pictographic" => "EXTENDED_PICTOGRAPHIC",
        "Emoji_Modifier"        => "EXTEND",
    )
    for (codes, property) in read_hex_ranges(emoji_data_filename)
        boundclass = get(emoji_boundclass, property, nothing)
        boundclass === nothing || set_all!(boundclass_of, codes, boundclass)
    end
    return boundclass_of
end

let boundclass_of = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
    # Return the grapheme boundary class recorded for `code`, or "OTHER" when the
    # code point is not in the table. The table is read once and captured here.
    global function get_grapheme_boundclass(code)
        return get(boundclass_of, code, "OTHER")
    end
end

#-------------------------------------------------------------------------------
"""
    read_composition_exclusions(pattern, filename="CompositionExclusions.txt")

Return the code points listed in one section of a composition-exclusions file.

`pattern` is a regex selecting the section of `filename` to scan. Within the matched
text, every line of the form `XXXX #...` (hex code point, spaces, then `#`)
contributes one `UInt32`, in file order; lines of any other form are skipped.

Throws an `ErrorException` naming the pattern and file when `pattern` does not
match anywhere in the file.
"""
function read_composition_exclusions(pattern, filename="CompositionExclusions.txt")
    section = match(pattern, read(filename, String))
    if isnothing(section)
        # Fail with a message that identifies the problem, rather than with a
        # field-access error on `nothing`.
        error("no section matching $(repr(pattern)) found in $(filename)")
    end
    es = UInt32[]
    for line in split(section.match, '\n')
        m = match(r"^([0-9A-F]+) +#"i, line)
        isnothing(m) && continue
        push!(es, parse(UInt32, m[1], base=16))
    end
    return es
end

# Code points from sections (1) and (2) of the composition exclusions file, as sets.
exclusions, excl_version = map((
    r"# \(1\) Script Specifics.*?# Total code points:"s,
    r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s,
)) do section_pattern
    Set(read_composition_exclusions(section_pattern))
end

#-------------------------------------------------------------------------------
"""
    read_case_folding(filename)

Read case foldings from `filename` into a `Dict{UInt32,Vector{UInt32}}`.

Only lines whose status field is `C` or `F` are used; each maps the leading code
point to the space-separated sequence of code points in the third field.
"""
function read_case_folding(filename)
    folding_of = Dict{UInt32,Vector{UInt32}}()
    for line in readlines(filename)
        m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
        if m !== nothing
            folding_of[parsehex(m[1])] = map(parsehex, split(m[2]))
        end
    end
    return folding_of
end

let folding_of = read_case_folding("CaseFolding.txt")
    # Return the case-folding sequence recorded for `code`, or `nothing` when the
    # code point has none. The table is read once and captured here.
    global function get_case_folding(code)
        return get(folding_of, code, nothing)
    end
end

#-------------------------------------------------------------------------------
# Utilities for reading per-char properties from UnicodeData.txt
"""
    split_unicode_data_line(line)

Parse one semicolon-separated record of UnicodeData.txt into a named tuple with the
fields `code`, `name`, `category`, `combining_class`, `bidi_class`, `decomp_type`,
`decomp_mapping`, `bidi_mirrored`, `uppercase_mapping`, `lowercase_mapping` and
`titlecase_mapping`.

`decomp_type` is the text inside the `<...>` tag, or `nothing` without a tag.
`decomp_mapping` is a vector of code points, or `nothing` when the field is empty.
The case mappings are code points or `nothing`; see the comments in the body for
the fallbacks applied when the record leaves them empty. Those fallbacks consult
the global `uppercase` and `lowercase` code point sets.

Throws an `ArgumentError` quoting `line` when it is not a well-formed record.
"""
function split_unicode_data_line(line)
    m = match(r"""
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decompomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$        # simple titlecase mapping
    """ix, line)
    if isnothing(m)
        # Report the offending text so a bad input file can be diagnosed.
        throw(ArgumentError("malformed UnicodeData.txt record: $(repr(line))"))
    end
    hex(str) = parse(UInt32, str, base=16)
    code = hex(m[1])
    return (
     code             = code,
     name             = m[2],
     category         = m[3],
     combining_class  = parse(Int, m[4]),
     bidi_class       = m[5],
     decomp_type      = m[7],
     decomp_mapping   = m[8] == "" ? nothing : hex.(split(m[8])),
     bidi_mirrored    = m[13] == "Y",
     # issue #130: use nonstandard uppercase ß -> ẞ
     # issue #195: if character is uppercase but has no lowercase mapping,
     #             then make lowercase mapping = itself (vice versa for lowercase)
     uppercase_mapping = m[16] != ""                      ? hex(m[16]) :
                         code  == 0x000000df              ? 0x00001e9e :
                         m[17] == "" && code in lowercase ? code       :
                         nothing,
     lowercase_mapping = m[17] != ""                      ? hex(m[17]) :
                         m[16] == "" && code in uppercase ? code       :
                         nothing,
     titlecase_mapping = m[18] != ""         ? hex(m[18]) :
                         code  == 0x000000df ? 0x00001e9e :
                         nothing,
    )
end

"""
    read_unicode_data(filename)

Parse every line of `filename` with `split_unicode_data_line` and return the records
as a zero-origin vector.

A pair of consecutive records whose names contain `, First>` and `, Last>` is
expanded into one record per code point of the enclosed range; each copy takes its
fields from the first record, with `, First` removed from the name.
"""
function read_unicode_data(filename)
    raw_char_props = split_unicode_data_line.(readlines(filename))
    expanded = Origin(0)(Vector{eltype(raw_char_props)}())
    @assert issorted(raw_char_props, by=c->c.code)
    pending = Iterators.Stateful(raw_char_props)
    while !isempty(pending)
        c = popfirst!(pending)
        if !occursin(", First>", c.name)
            # Ordinary record: one code point.
            push!(expanded, c)
            continue
        end
        # A range marker: the record that follows must close the range.
        nc = popfirst!(pending)
        @assert occursin(", Last>", nc.name)
        range_name = replace(c.name, ", First"=>"")
        for range_code in c.code:nc.code
            push!(expanded, (; c..., name=range_name, code=range_code))
        end
    end
    return expanded
end

# All records of UnicodeData.txt, with First/Last range markers expanded into one
# record per code point (see `read_unicode_data`).
char_props = read_unicode_data("UnicodeData.txt")
# Lookup from code point to its record in `char_props`.
char_hash = Dict(c.code=>c for c in char_props)

#-------------------------------------------------------------------------------
# Read character widths from UAX #11: East Asian Width
"""
    read_east_asian_widths(filename)

Return a `Dict{UInt32,Int}` of widths derived from the width codes in `filename`:
`2` for `W`/`F`, `1` for `Na`/`H` and `-1` for `A` (ambiguous). Code points with
any other width code get no entry.
"""
function read_east_asian_widths(filename)
    ea_widths = Dict{UInt32,Int}()
    for (rng, widthcode) in read_hex_ranges(filename)
        if widthcode in ("W", "F")          # wide or full
            width = 2
        elseif widthcode in ("Na", "H")     # narrow or half-width
            width = 1
        elseif widthcode == "A"             # ambiguous width
            width = -1
        else
            continue
        end
        set_all!(ea_widths, rng, width)
    end
    return ea_widths
end

let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
    # Following work by @jiahao, we compute character widths using a combination of
    #   * character category
    #   * UAX 11: East Asian Width
    #   * a few exceptions as needed
    # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
    #
    # The rules are checked from highest to lowest priority; the first one that
    # applies decides the width.
    global function derive_char_width(code, category)
        # A few exceptional cases, found by manual comparison to other wcwidth
        # functions and similar checks.
        if code == 0x00ad
            # Soft hyphen is typically printed as a hyphen (-) in terminals.
            return 1
        elseif code == 0x2028 || code == 0x2029
            #By definition, should have zero width (on the same line)
            #0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
            #0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
            return 0
        end
        if category == "Mn"
            return 0
        end

        # Widths from UAX #11: East Asian Width (ambiguous width counts as 1)
        eaw = get(ea_widths, code, nothing)
        if !isnothing(eaw)
            return eaw < 0 ? 1 : eaw
        end

        # Various zero-width categories
        #
        # "Sk" not included in zero width - see issue #167
        if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
            return 0
        end

        # Use a default width of 1 for all character categories that are
        # letter/symbol/number-like, as well as for unassigned/private-use chars.
        # This provides a useful nonzero fallback for new codepoints when a new
        # Unicode version has been released.
        return 1
    end
    # True when the East Asian Width table marks `code` as ambiguous (stored as a
    # negative width).
    global function is_ambiguous_width(code)
        stored_width = get(ea_widths, code, 0)
        return stored_width < 0
    end
end

#-------------------------------------------------------------------------------
# Construct data tables which will drive libutf8proc
#
# These tables are "compressed" with an ad-hoc compression scheme (largely some
# simple deduplication and indexing) which can easily and efficiently be
# decompressed on the C side at runtime.

# Inverse decomposition mapping tables for combining two characters into a single one.
#
# comb_mapping[dm0][dm1] is the code point that decomposes into (dm0, dm1);
# comb_issecond collects every code point that occurs as dm1.
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
comb_issecond = Set{UInt32}()
for char in char_props
    # Only untagged decompositions into exactly two code points are considered.
    # What happens with decompositions that are longer than 2?
    decomp = char.decomp_mapping
    (isnothing(char.decomp_type) && !isnothing(decomp) && length(decomp) == 2) || continue
    dm0, dm1 = decomp
    # Look the first code point up with `get`, so that one missing from
    # UnicodeData.txt is skipped instead of raising a KeyError.
    first_char = get(char_hash, dm0, nothing)
    (!isnothing(first_char) && first_char.combining_class == 0) || continue
    # Characters excluded from composition never take part in recombination.
    (char.code in exclusions || char.code in excl_version) && continue
    get!(Dict{UInt32, UInt32}, comb_mapping, dm0)[dm1] = char.code
    push!(comb_issecond, dm1)
end

# For each first code point of `comb_mapping`, taken in ascending order:
# comb_length is its number of second code points and comb_index is the running
# total of the lengths that precede it.
comb_index = Dict{UInt32, UInt32}()
comb_length = Dict{UInt32, UInt32}()
let next_index = 0
    for first_code in sort!(collect(keys(comb_mapping)))
        second_count = length(comb_mapping[first_code])
        comb_index[first_code] = next_index
        comb_length[first_code] = second_count
        next_index += second_count
    end
end

"""
    utf16_encode(utf32_seq)

Convert a vector of UTF-32 code units to its UTF-16 encoding (a `Vector{UInt16}`),
going through an intermediate `String`.
"""
function utf16_encode(utf32_seq)
    text = transcode(String, utf32_seq)
    return transcode(UInt16, text)
end

"""
    UTF16Sequences()
    UTF16Sequences(storage, indices)

Utility for packing all UTF-16 encoded sequences into one big array.
"""
struct UTF16Sequences
    # Concatenation of every stored sequence.
    storage::Vector{UInt16}
    # Position in `storage` recorded for each sequence already stored.
    indices::Dict{Vector{UInt16},Int}
end

function UTF16Sequences()
    return UTF16Sequences(Vector{UInt16}(), Dict{Vector{UInt16},Int}())
end

"""
Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
* The 14 low bits are the index into the `sequences.storage` array where the
  sequence resides
* The two top bits are the length of the sequence, or if equal to 3, the first
  entry of the sequence itself contains the length.
"""
function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
    if length(utf32_seq) == 0
        return typemax(UInt16)
    end
    # lencode contains the length of the UTF-32 sequence after decoding
    # No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
    # We have only 2 bits for the length, though, so longer sequences are
    # encoded in the sequence data itself.
    seq_lencode = length(utf32_seq) - 1
    utf16_seq = utf16_encode(utf32_seq)
    idx = get!(sequences.indices, utf16_seq) do
        i = length(sequences.storage)
        utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
                        pushfirst!(copy(utf16_seq), seq_lencode)
        append!(sequences.storage, utf16_seq_enc)
        i
    end
    @assert idx <= 0x3FFF
    seq_code = idx | (min(seq_lencode, 3) << 14)
    return seq_code
end

# A single code point is encoded as a one-element sequence.
encode_sequence!(sequences::UTF16Sequences, code::Integer) =
    encode_sequence!(sequences, [code])

# A missing mapping is represented by the largest UInt16 value; nothing is stored.
encode_sequence!(sequences::UTF16Sequences, ::Nothing) = typemax(UInt16)

"""
    char_table_properties!(sequences, char)

Build the named tuple of table properties for the character record `char`.

The decomposition, case-folding and case mappings are stored through
`encode_sequence!`, which may append to `sequences`; the remaining fields are
looked up in the tables and sets defined earlier in this file.
"""
function char_table_properties!(sequences, char)
    code = char.code

    # encode_sequence! appends to `sequences`, so these calls are made in the same
    # order as the corresponding fields appear in the result.
    decomp_seqindex    = encode_sequence!(sequences, char.decomp_mapping)
    casefold_seqindex  = encode_sequence!(sequences, get_case_folding(code))
    uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping)
    lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping)
    titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping)

    is_excluded = code in exclusions || code in excl_version
    # 0x200C and 0x200D are exempted from being control boundaries.
    is_control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
                          !(code in (0x200C, 0x200D))

    return (
        category             = char.category,
        combining_class      = char.combining_class,
        bidi_class           = char.bidi_class,
        decomp_type          = char.decomp_type,
        decomp_seqindex      = decomp_seqindex,
        casefold_seqindex    = casefold_seqindex,
        uppercase_seqindex   = uppercase_seqindex,
        lowercase_seqindex   = lowercase_seqindex,
        titlecase_seqindex   = titlecase_seqindex,
        comb_index           = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
        comb_length          = get(comb_length, code, 0),
        comb_issecond        = code in comb_issecond,
        bidi_mirrored        = char.bidi_mirrored,
        comp_exclusion       = is_excluded,
        ignorable            = code in ignorable,
        control_boundary     = is_control_boundary,
        charwidth            = derive_char_width(code, char.category),
        ambiguous_width      = is_ambiguous_width(code),
        boundclass           = get_grapheme_boundclass(code),
        indic_conjunct_break = get_indic_conjunct_break(code),
    )
end

# Many character properties are duplicates. Deduplicate them, constructing a
# per-character array of indices into the properties array
sequences = UTF16Sequences()
char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]

deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
char_property_indices = Origin(0)(zeros(Int, 0x00110000))
let slot_of = Dict{eltype(char_table_props),Int}()
    for (char, table_props) in zip(char_props, char_table_props)
        if !haskey(slot_of, table_props)
            # First occurrence: it takes the next free slot of deduplicated_props.
            slot_of[table_props] = length(deduplicated_props)
            push!(deduplicated_props, table_props)
        end
        # Add 1 because unassigned codes occupy slot at index 0
        char_property_indices[char.code] = slot_of[table_props] + 1
    end
end

# Now compress char_property_indices by breaking it into pages and
# deduplicating those (this works as compression because there are large
# contiguous ranges of code space with identical properties)
# prop_pages holds each distinct page once, back to back; prop_page_indices holds,
# for every page of char_property_indices in order, the offset of its contents
# within prop_pages.
prop_page_indices = Int[]
prop_pages = Int[]
let page_size = 0x100, offset_of_page = Dict{Vector{Int}, Int}()
    for page in Iterators.partition(char_property_indices, page_size)
        page_offset = get!(offset_of_page, page) do
            # A page not seen before is appended at the current end of prop_pages.
            new_offset = length(prop_pages)
            append!(prop_pages, page)
            new_offset
        end
        push!(prop_page_indices, page_offset)
    end
end

#-------------------------------------------------------------------------------
"""
    write_c_index_array(io, array, linelen)

Print the elements of `array` to `io` as the body of a C array initializer: an
opening brace, the elements each followed by `", "`, and a closing `};` followed by
a blank line.

A line break is inserted before every `linelen`-th element, so the first line
carries `linelen - 1` elements and the following lines `linelen` each.
"""
function write_c_index_array(io, array, linelen)
    print(io, "{\n  ")
    for (position, x) in enumerate(array)
        if linelen > 0 && position % linelen == 0
            print(io, "\n  ")
        end
        print(io, x, ", ")
    end
    print(io, "};\n\n")
end

"""
    c_enum_name(prefix, str)

Return the C enum constant name `UTF8PROC_<prefix>_<STR>`, where `<STR>` is `str`
in upper case. When `str` is `nothing` the result is `"0"`.
"""
c_enum_name(prefix, str) = "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
c_enum_name(prefix, ::Nothing) = "0"

"""
    c_uint16(seqindex)

Return the C source text for `seqindex`: the literal `UINT16_MAX` when it equals
`typemax(UInt16)`, otherwise its decimal representation.
"""
function c_uint16(seqindex)
    return seqindex == typemax(UInt16) ? "UINT16_MAX" : string(seqindex)
end

# Print one `utf8proc_int32_t` C array named `name` with a row per first code point
# of `combinations` (ascending) and, within a row, one entry per second code point
# (ascending). `entry(second, combined)` chooses the value printed for each pair.
function print_c_combinations(io, name, combinations, entry)
    print(io, "static const utf8proc_int32_t ", name, "[] = {\n")
    for dm0 in sort!(collect(keys(combinations)))
        seconds = combinations[dm0]
        print(io, " ")
        for dm1 in sort!(collect(keys(seconds)))
            print(io, " ", entry(dm1, seconds[dm1]), ",")
        end
        print(io, "\n")
    end
    print(io, "};\n\n")
end

"""
    print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
                        comb_index, comb_length, comb_issecond, combinations=comb_mapping)

Write the generated C data tables to `io`: the UTF-16 sequence storage, the two
stage tables, the property records and the two combination arrays.

`combinations` is the nested first => (second => combined) mapping used for the
combination arrays; it defaults to the global `comb_mapping`. The `comb_index`,
`comb_length` and `comb_issecond` arguments are accepted but not read here; the
corresponding values are taken from the fields of each property record.
"""
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
                             comb_index, comb_length, comb_issecond,
                             combinations=comb_mapping)
    print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
    write_c_index_array(io, sequences.storage, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
    write_c_index_array(io, prop_page_indices, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
    write_c_index_array(io, prop_pages, 8)

    # The hard-coded first record is printed ahead of the deduplicated ones.
    print(io, """
        static const utf8proc_property_t utf8proc_properties[] = {
          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  0x3FF,0,false,  false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
        """)
    for prop in deduplicated_props
        print(io, "  {",
              c_enum_name("CATEGORY", prop.category), ", ",
              prop.combining_class, ", ",
              c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
              c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
              c_uint16(prop.decomp_seqindex), ", ",
              c_uint16(prop.casefold_seqindex), ", ",
              c_uint16(prop.uppercase_seqindex), ", ",
              c_uint16(prop.lowercase_seqindex), ", ",
              c_uint16(prop.titlecase_seqindex), ", ",
              c_uint16(prop.comb_index), ", ",
              c_uint16(prop.comb_length), ", ",
              prop.comb_issecond, ", ",
              prop.bidi_mirrored, ", ",
              prop.comp_exclusion, ", ",
              prop.ignorable, ", ",
              prop.control_boundary, ", ",
              prop.charwidth, ", ",
              prop.ambiguous_width, ", ",
              "0, ", # bitfield padding
              c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
              c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
              "},\n"
        )
    end
    print(io, "};\n\n")

    # Both arrays walk the pairs in the same order, so entries at equal positions
    # belong together.
    print_c_combinations(io, "utf8proc_combinations_second", combinations,
                         (second, combined) -> second)
    print_c_combinations(io, "utf8proc_combinations_combined", combinations,
                         (second, combined) -> combined)
end


# Emit the generated C tables on stdout, except in an interactive session.
isinteractive() || print_c_data_tables(
    stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
    comb_index, comb_length, comb_issecond,
)