File: wcwidth.py

package info (click to toggle)
kitty 0.42.1-1
links: PTS, VCS
area: main
in suites: experimental
size: 28,564 kB
sloc: ansic: 82,787; python: 55,191; objc: 5,122; sh: 1,295; xml: 364; makefile: 143; javascript: 78
file content (1233 lines) | stat: -rwxr-xr-x 45,935 bytes
#!/usr/bin/env python
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

# Imports {{{
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from collections.abc import Generator, Hashable, Iterable
from contextlib import contextmanager
from functools import lru_cache, partial
from html.entities import html5
from io import StringIO
from math import ceil, log
from typing import (
    Callable,
    DefaultDict,
    Iterator,
    Literal,
    NamedTuple,
    Optional,
    Protocol,
    Sequence,
    TypedDict,
    Union,
)
from urllib.request import urlopen

if __name__ == '__main__' and not __package__:
    import __main__
    __main__.__package__ = 'gen'
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# }}}

# Fetching data {{{
non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
if len(non_characters) != 66:
    raise SystemExit('non_characters table incorrect')
emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))


def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        with open(local, 'rb') as f:
            data = f.read()
    else:
        data = urlopen(url).read()
        with open(local, 'wb') as f:
            f.write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line


@lru_cache(maxsize=2)
def unicode_version() -> tuple[int, int, int]:
    for line in get_data("ReadMe.txt"):
        m = re.search(r'Version\s+(\d+)\.(\d+)\.(\d+)', line)
        if m is not None:
            return int(m.group(1)), int(m.group(2)), int(m.group(3))
    raise ValueError('Could not find Unicode Version')
# }}}

# Parsing Unicode databases {{{
# Map of class names to set of codepoints in class
class_maps: dict[str, set[int]] = {}
all_symbols: set[int] = set()
name_map: dict[int, str] = {}
word_search_map: DefaultDict[str, set[int]] = defaultdict(set)
soft_hyphen = 0xad
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# See https://github.com/harfbuzz/harfbuzz/issues/169
marks = set(emoji_skin_tone_modifiers) | flag_codepoints
not_assigned = set(range(0, sys.maxunicode))
property_maps: dict[str, set[int]] = defaultdict(set)
grapheme_segmentation_maps: dict[str, set[int]] = defaultdict(set)
grapheme_break_as_int: dict[str, int] = {}
int_as_grapheme_break: tuple[str, ...] = ()
incb_as_int: dict[str, int] = {}
int_as_incb: tuple[str, ...] = ()
incb_map: dict[str, set[int]] = defaultdict(set)
extended_pictographic: set[int] = set()


def parse_prop_list() -> None:
    global marks
    for line in get_data('ucd/PropList.txt'):
        if line.startswith('#'):
            continue
        cp_or_range, rest = line.split(';', 1)
        chars = parse_range_spec(cp_or_range.strip())
        name = rest.strip().split()[0]
        property_maps[name] |= chars
    # see https://www.unicode.org/faq/unsup_char.html#3
    marks |= property_maps['Other_Default_Ignorable_Code_Point']


def parse_ucd() -> None:

    def add_word(w: str, c: int) -> None:
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    first: Optional[int] = None
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        name = parts[1] or parts[10]
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[tuple[int, ...], Iterable[int]] = (codepoint,)
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                marks.add(codepoint)
            elif category.startswith('S'):
                all_symbols.add(codepoint)
            elif category == 'Cf':
                # we add Cf to marks as it contains things like tags and zero
                # width chars. Not sure if *all* of Cf should be treated as
                # combining chars, might need to add individual exceptions in
                # the future.
                marks.add(codepoint)

    with open('gen/nerd-fonts-glyphs.txt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)

    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']


def parse_range_spec(spec: str) -> set[int]:
    spec = spec.strip()
    if '..' in spec:
        chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        chars = set(range(chars_[0], chars_[1] + 1))
    else:
        chars = {int(spec, 16)}
    return chars


def split_two(line: str) -> tuple[set[int], str]:
    spec, rest = line.split(';', 1)
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    return parse_range_spec(spec), rest


all_emoji: set[int] = set()
emoji_presentation_bases: set[int] = set()
narrow_emoji: set[int] = set()
wide_emoji: set[int] = set()
flags: dict[int, list[int]] = {}


def parse_basic_emoji(spec: str) -> None:
    parts = list(filter(None, spec.split()))
    has_emoji_presentation = len(parts) < 2
    chars = parse_range_spec(parts[0])
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars)


def parse_keycap_sequence(spec: str) -> None:
    base, fe0f, cc = list(filter(None, spec.split()))
    chars = parse_range_spec(base)
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    narrow_emoji.update(chars)


def parse_flag_emoji_sequence(spec: str) -> None:
    a, b = list(filter(None, spec.split()))
    left, right = int(a, 16), int(b, 16)
    chars = {left, right}
    all_emoji.update(chars)
    wide_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    flags.setdefault(left, []).append(right)


def parse_emoji_tag_sequence(spec: str) -> None:
    a = int(spec.split()[0], 16)
    all_emoji.add(a)
    wide_emoji.add(a)
    emoji_presentation_bases.add(a)


def parse_emoji_modifier_sequence(spec: str) -> None:
    a, b = list(filter(None, spec.split()))
    char, mod = int(a, 16), int(b, 16)
    mod
    all_emoji.add(char)
    wide_emoji.add(char)
    emoji_presentation_bases.add(char)


def parse_emoji() -> None:
    for line in get_data('emoji-sequences.txt', 'emoji'):
        parts = [x.strip() for x in line.split(';')]
        if len(parts) < 2:
            continue
        data, etype = parts[:2]
        if etype == 'Basic_Emoji':
            parse_basic_emoji(data)
        elif etype == 'Emoji_Keycap_Sequence':
            parse_keycap_sequence(data)
        elif etype == 'RGI_Emoji_Flag_Sequence':
            parse_flag_emoji_sequence(data)
        elif etype == 'RGI_Emoji_Tag_Sequence':
            parse_emoji_tag_sequence(data)
        elif etype == 'RGI_Emoji_Modifier_Sequence':
            parse_emoji_modifier_sequence(data)


doublewidth: set[int] = set()
ambiguous: set[int] = set()


def parse_eaw() -> None:
    global doublewidth, ambiguous
    seen: set[int] = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw in ('W', 'F'):
            doublewidth |= chars
            seen |= chars
    doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
    doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
    doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
    doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
    doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen


def parse_grapheme_segmentation() -> None:
    global extended_pictographic, grapheme_break_as_int, incb_as_int, int_as_grapheme_break, int_as_incb
    global seg_props_from_int, seg_props_as_int
    grapheme_segmentation_maps['AtStart']  # this is used by the segmentation algorithm, no character has it
    grapheme_segmentation_maps['None']  # this is used by the segmentation algorithm, no character has it
    for line in get_data('ucd/auxiliary/GraphemeBreakProperty.txt'):
        chars, category = split_two(line)
        grapheme_segmentation_maps[category] |= chars
    grapheme_segmentation_maps['Private_Expecting_RI']  # this is used by the segmentation algorithm, no character has it
    grapheme_break_as_int = {x: i for i, x in enumerate(grapheme_segmentation_maps)}
    int_as_grapheme_break = tuple(grapheme_break_as_int)
    incb_map['None']   # used by segmentation algorithm no character has it
    for line in get_data('ucd/DerivedCoreProperties.txt'):
        spec, rest = line.split(';', 1)
        category = rest.strip().split(' ', 1)[0].strip().rstrip(';')
        chars = parse_range_spec(spec.strip())
        if category == 'InCB':
            # Most InCB chars also have a GBP categorization, but not all,
            # there exist some InCB chars that do not have a GBP category
            subcat = rest.strip().split(';')[1].strip().split()[0].strip()
            incb_map[subcat] |= chars
    incb_as_int = {x: i for i, x in enumerate(incb_map)}
    int_as_incb = tuple(incb_as_int)
    for line in get_data('ucd/emoji/emoji-data.txt'):
        chars, category = split_two(line)
        if 'Extended_Pictographic#' == category:
            extended_pictographic |= chars
    seg_props_from_int = {'grapheme_break': int_as_grapheme_break, 'indic_conjunct_break': int_as_incb}
    seg_props_as_int = {'grapheme_break': grapheme_break_as_int, 'indic_conjunct_break': incb_as_int}


class GraphemeSegmentationTest(TypedDict):
    data: tuple[str, ...]
    comment: str


grapheme_segmentation_tests: list[GraphemeSegmentationTest] = []


def parse_test_data() -> None:
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        t, comment = line.split('#')
        t = t.lstrip('÷').strip().rstrip('÷').strip()
        chars: list[list[str]] = [[]]
        for x in re.split(r'([÷×])', t):
            x = x.strip()
            match x:
                case '÷':
                    chars.append([])
                case '×':
                    pass
                case '':
                    pass
                case _:
                    ch = chr(int(x, 16))
                    chars[-1].append(ch)
        c = tuple(''.join(c) for c in chars)
        grapheme_segmentation_tests.append({'data': c, 'comment': comment.strip()})
    grapheme_segmentation_tests.append({
        'data': (' ', '\xad', ' '),
        'comment': '÷ [0.2] SPACE (Other) ÷ [0.4] SOFT HYPHEN ÷ [999.0] SPACE (Other) ÷ [0.3]'
    })
    grapheme_segmentation_tests.append({
        'data': ('\U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466',),
        'comment': '÷ [0.2] MAN × [9.0] ZERO WIDTH JOINER × [11.0] WOMAN × [9.0] ZERO WIDTH JOINER × [11.0] GIRL × [9.0] ZERO WIDTH JOINER × [11.0] BOY ÷ [0.3]'
    })
# }}}


def write_case(spec: Union[tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
    if isinstance(spec, tuple):
        if for_go:
            v = ', '.join(f'0x{x:x}' for x in range(spec[0], spec[1] + 1))
            p(f'\t\tcase {v}:')
        else:
            p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p(f'\t\tcase 0x{spec:x}:')


@contextmanager
def create_header(path: str, include_data_types: bool = True) -> Generator[Callable[..., None], None, None]:
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// Unicode data, built from the Unicode Standard', '.'.join(map(str, unicode_version())))
        p(f'// Code generated by {os.path.basename(__file__)}, DO NOT EDIT.', end='\n\n')
        if path.endswith('.h'):
            p('#pragma once')
        if include_data_types:
            p('#include "data-types.h"\n')
            p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        if include_data_types:
            p('END_ALLOW_CASE_RANGE')


def gen_names() -> None:
    aliases_map: dict[int, set[str]] = {}
    for word, codepoints in word_search_map.items():
        for cp in codepoints:
            aliases_map.setdefault(cp, set()).add(word)
    if len(name_map) > 0xffff:
        raise Exception('Too many named codepoints')
    with open('tools/unicode_names/names.txt', 'w') as f:
        print(len(name_map), len(word_search_map), file=f)
        for cp in sorted(name_map):
            name = name_map[cp]
            words = name.lower().split()
            aliases = aliases_map.get(cp, set()) - set(words)
            end = '\n'
            if aliases:
                end = '\t' + ' '.join(sorted(aliases)) + end
            print(cp, *words, end=end, file=f)


def gofmt(*files: str) -> None:
    subprocess.check_call(['gofmt', '-w', '-s'] + list(files))


def gen_rowcolumn_diacritics() -> None:
    # codes of all row/column diacritics
    codes = []
    with open("gen/rowcolumn-diacritics.txt") as file:
        for line in file.readlines():
            if line.startswith('#'):
                continue
            code = int(line.split(";")[0], 16)
            codes.append(code)

    go_file = 'tools/utils/images/rowcolumn_diacritics.go'
    with create_header('kitty/rowcolumn-diacritics.c') as p, create_header(go_file, include_data_types=False) as g:
        p('int diacritic_to_num(char_type code) {')
        p('\tswitch (code) {')
        g('package images')
        g(f'var NumberToDiacritic = [{len(codes)}]rune''{')
        g(', '.join(f'0x{x:x}' for x in codes) + ',')
        g('}')

        range_start_num = 1
        range_start = 0
        range_end = 0

        def print_range() -> None:
            if range_start >= range_end:
                return
            write_case((range_start, range_end), p)
            p('\t\treturn code - ' + hex(range_start) + ' + ' +
              str(range_start_num) + ';')

        for code in codes:
            if range_end == code:
                range_end += 1
            else:
                print_range()
                range_start_num += range_end - range_start
                range_start = code
                range_end = code + 1
        print_range()

        p('\t}')
        p('\treturn 0;')
        p('}')
    gofmt(go_file)


def gen_test_data() -> None:
    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
        f.write(json.dumps(grapheme_segmentation_tests, indent=2, ensure_ascii=False).encode())


def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    if maxdata < 65536:
        return 2
    return 4


def mask_for(bits: int) -> int:
    return ~((~0) << bits)


def splitbins[T: Hashable](t: tuple[T, ...], property_size: int, use_fixed_shift: int = 0) -> tuple[list[int], list[int], list[T], int]:
    if use_fixed_shift:
        candidates = range(use_fixed_shift, use_fixed_shift + 1)
    else:
        n = len(t)-1    # last valid index
        maxshift = 0    # the most we can shift n and still have something left
        if n > 0:
            while n >> 1:
                n >>= 1
                maxshift += 1
        candidates = range(maxshift + 1)
    t3: list[T] = []
    tmap: dict[T, int] = {}
    seen = set()
    for x in t:
        if x not in seen:
            seen.add(x)
            tmap[x] = len(t3)
            t3.append(x)
    t_int = tuple(tmap[x] for x in t)
    bytesz = sys.maxsize

    def memsize() -> int:
        ans = len(t1)*getsize(t1)
        sz3 = len(t3)*property_size + len(t2)*getsize(t2)
        sz2 = len(t2) * property_size
        return ans + min(sz2, sz3)
    for shift in candidates:
        t1: list[int] = []
        t2: list[int] = []
        size = 2**shift
        bincache: dict[tuple[int, ...], int] = {}
        for i in range(0, len(t_int), size):
            bin = t_int[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = memsize()
        if b < bytesz:
            best = t1, t2, shift
            bytesz = b
    t1, t2, shift = best
    return t1, t2, t3, shift


class Property(Protocol):
    @property
    def as_c(self) -> str:
        return ''

    @property
    def as_go(self) -> str:
        return ''

    @classmethod
    def bitsize(cls) -> int:
        return 0


def get_types(sz: int) -> tuple[str, str]:
    sz *= 8
    return f'uint{sz}_t', f'uint{sz}'


def gen_multistage_table(
    c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[int], t3: Sequence[Property], shift: int, input_max_val: int
) -> None:
    t1_type_sz = getsize(t1)
    ctype_t1, gotype_t1 = get_types(t1_type_sz)
    mask = mask_for(shift)
    name = t3[0].__class__.__name__
    t2_type_sz = getsize(tuple(range(len(t3))))
    ctype_t2, gotype_t2 = get_types(t2_type_sz)
    t3_type_sz = t3[0].bitsize() // 8
    lname = name.lower()
    input_type = get_types(getsize((input_max_val,)))[1]

    # Output t1
    c(f'static const char_type {name}_mask = {mask}u;')
    c(f'static const char_type {name}_shift = {shift}u;')
    c(f'static const {ctype_t1} {name}_t1[{len(t1)}] = ''{')
    c(f'\t{", ".join(map(str, t1))}')
    c('};')
    g(f'const {lname}_mask = {mask}')
    g(f'const {lname}_shift = {shift}')
    g(f'var {lname}_t1 = [{len(t1)}]{gotype_t1}''{')
    g(f'\t{", ".join(map(str, t1))},')
    g('}')
    bytesz = len(t1) * t1_type_sz

    if t3_type_sz > t2_type_sz:  # needs 3 levels
        bytesz += len(t2) * t2_type_sz + len(t3) * t3_type_sz
        c(f'static const {ctype_t2} {name}_t2[{len(t2)}] = ''{')
        c(f'\t{", ".join(map(str, t2))}')
        c('};')
        items = '\n\t'.join(x.as_c + f', // {i}' for i, x in enumerate(t3))
        c(f'static const {name} {name}_t3[{len(t3)}] = ''{')
        c(f'\t{items}')
        c('};')

        g(f'var {lname}_t2 = [{len(t2)}]{gotype_t2}''{')
        g(f'\t{", ".join(map(str, t2))},')
        g('}')
        items = '\n\t'.join(x.as_go + f', // {i}' for i, x in enumerate(t3))
        g(f'var {lname}_t3 = [{len(t3)}]{name}''{')
        g(f'\t{items}')
        g('}')

        g(f'''
        // Array accessor function that avoids bounds checking
        func {lname}_for(x {input_type}) {name} {{
            t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz})))
            t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask)
            t2 := uintptr(*(*{gotype_t2})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t2_type_sz})))
            return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t3[0])) + t2*{t3_type_sz}))
        }}
        ''')
    else:
        t3 = tuple(t3[i] for i in t2)
        bytesz += len(t3) * t3_type_sz
        items = '\n\t'.join(x.as_c + ',' for x in t3)
        c(f'static const {name} {name}_t2[{len(t3)}] = ''{')
        c(f'\t{items}')
        c('};')
        items = '\n\t'.join(x.as_go + ',' for x in t3)
        g(f'var {lname}_t2 = [{len(t3)}]{name}''{')
        g(f'\t{items}')
        g('}')
        g(f'''
        // Array accessor function that avoids bounds checking
        func {lname}_for(x {input_type}) {name} {{
            t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz})))
            t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask)
            return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t3_type_sz}))
        }}
        ''')
    print(f'Size of {name} table: {ceil(bytesz/1024)}KB with {shift} bit shift')


width_shift = 4


def bitsize(maxval: int) -> int:  # number of bits needed to store maxval
    return ceil(log(maxval, 2))


def clamped_bitsize(val: int) -> int:
    if val <= 8:
        return 8
    if val <= 16:
        return 16
    if val <= 32:
        return 32
    if val <= 64:
        return 64
    raise ValueError('Too many fields')


def bitfield_from_int(
    fields: dict[str, int], x: int, int_to_str: dict[str, tuple[str, ...]]
) -> dict[str, str | bool]:
    # first field is least significant, last field is most significant
    args: dict[str, str | bool] = {}
    for f, shift in fields.items():
        mask = mask_for(shift)
        val = x & mask
        if shift == 1:
            args[f] = bool(val)
        else:
            args[f] = int_to_str[f][val]
        x >>= shift
    return args


def bitfield_as_int(
    fields: dict[str, int], vals: Sequence[bool | str], str_maps: dict[str, dict[str, int]]
) -> int:
    # first field is least significant, last field is most significant
    ans = shift = 0
    for i, (f, width) in enumerate(fields.items()):
        qval = vals[i]
        if isinstance(qval, str):
            val = str_maps[f][qval]
        else:
            val = int(qval)
        ans |= val << shift
        shift += width
    return ans


seg_props_from_int: dict[str, tuple[str, ...]] = {}
seg_props_as_int: dict[str, dict[str, int]] = {}


class GraphemeSegmentationProps(NamedTuple):

    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def used_bits(cls) -> int:
        return sum(int(cls._field_defaults[f]) for f in cls._fields)

    @classmethod
    def bitsize(cls) -> int:
        return clamped_bitsize(cls.used_bits())

    @classmethod
    def fields(cls) -> dict[str, int]:
        return {f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationProps':
        args = bitfield_from_int(cls.fields(), x, seg_props_from_int)
        return cls(**args)  # type: ignore

    def __int__(self) -> int:
        return bitfield_as_int(self.fields(), self, seg_props_as_int)


control_grapheme_breaks = 'CR', 'LF', 'Control'
linker_or_extend = 'Linker', 'Extend'


def bitfield_declaration_as_c(name: str, fields: dict[str, int], *alternate_fields: dict[str, int]) -> str:
    base_size = clamped_bitsize(sum(fields.values()))
    base_type = f'uint{base_size}_t'
    ans = [f'// {name}Declaration: uses {sum(fields.values())} bits {{''{{', f'typedef union {name} {{']
    def struct(fields: dict[str, int]) -> Iterator[str]:
        if not fields:
            return
        empty = base_size - sum(fields.values())
        yield '    struct __attribute__((packed)) {'
        yield '#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__'
        for f, width in reversed(fields.items()):
            yield f'        uint{clamped_bitsize(width)}_t {f} : {width};'
        if empty:
            yield f'        uint{clamped_bitsize(empty)}_t : {empty};'
        yield '#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__'
        if empty:
            yield f'        uint{clamped_bitsize(empty)}_t : {empty};'
        for f, width in fields.items():
            yield f'        uint{clamped_bitsize(width)}_t {f} : {width};'
        yield '#else'
        yield '#error "Unsupported endianness"'
        yield '#endif'
        yield '    };'
    ans.extend(struct(fields))
    for fields in alternate_fields:
        ans.extend(struct(fields))
    ans.append(f'    {base_type} val;')
    ans.append(f'}} {name};')
    ans.append(f'static_assert(sizeof({name}) == sizeof({base_type}), "Fix the ordering of {name}");')
    ans.append(f'// End{name}Declaration }}''}}')
    return '\n'.join(ans)


class GraphemeSegmentationState(NamedTuple):
    grapheme_break: str = ''  # set at runtime
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}*
    incb_consonant_extended: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker
    incb_consonant_extended_linker: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker {extend|linker}*
    incb_consonant_extended_linker_extended: bool = True
    # True if the last character ends an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence: bool = True
    # True if the last character was immediately preceded by an emoji modifier sequence   \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence_before_last_char: bool = True

    @classmethod
    def make(cls) -> 'GraphemeSegmentationState':
        return GraphemeSegmentationState('AtStart', False, False, False, False, False)

    @classmethod
    def fields(cls) -> dict[str, int]:
        return {f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationState':
        args = bitfield_from_int(cls.fields(), x, {'grapheme_break': int_as_grapheme_break})
        return cls(**args)  # type: ignore

    def __int__(self) -> int:
        return bitfield_as_int(self.fields(), self, seg_props_as_int)

    @classmethod
    def c_declaration(cls) -> str:
        return bitfield_declaration_as_c(cls.__name__, cls.fields())

    @classmethod
    def used_bits(cls) -> int:
        return sum(int(cls._field_defaults[f]) for f in cls._fields)

    @classmethod
    def bitsize(cls) -> int:
        return clamped_bitsize(cls.used_bits())

    def add_to_current_cell(self, p: GraphemeSegmentationProps) -> 'GraphemeSegmentationResult':
        prev = self.grapheme_break
        prop = p.grapheme_break
        incb = p.indic_conjunct_break
        add_to_cell = False
        if self.grapheme_break == 'AtStart':
            add_to_cell = True
            if prop == 'Regional_Indicator':
                prop = 'Private_Expecting_RI'
        else:
            # No break between CR and LF (GB3).
            if prev == 'CR' and prop == 'LF':
                add_to_cell = True
            # Break before and after controls (GB4, GB5).
            elif prev in control_grapheme_breaks or prop in control_grapheme_breaks:
                pass
            # No break between Hangul syllable sequences (GB6, GB7, GB8).
            elif (
                (prev == 'L' and prop in ('L', 'V', 'LV', 'LVT')) or
                (prev in ('LV', 'V') and prop in ('V', 'T')) or
                (prev in ('LVT', 'T') and prop == 'T')
            ):
                add_to_cell = True
            # No break before: extending characters or ZWJ (GB9), SpacingMarks (GB9a), Prepend characters (GB9b).
            elif prop in ('Extend', 'ZWJ', 'SpacingMark') or prev == 'Prepend':
                add_to_cell = True
            # No break within certain combinations of Indic_Conjunct_Break values
            # Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).
            elif self.incb_consonant_extended_linker_extended and incb == 'Consonant':
                add_to_cell = True
            # No break within emoji modifier sequences or emoji zwj sequences (GB11).
            elif prev == 'ZWJ' and self.emoji_modifier_sequence_before_last_char and p.is_extended_pictographic:
                add_to_cell = True
            # No break between RI if there is an odd number of RI characters before (GB12, GB13).
            elif prop == 'Regional_Indicator':
                if prev == 'Private_Expecting_RI':
                    add_to_cell = True
                else:
                    prop = 'Private_Expecting_RI'
            # Break everywhere else GB999

        incb_consonant_extended_linker = self.incb_consonant_extended and incb == 'Linker'
        incb_consonant_extended_linker_extended = incb_consonant_extended_linker or (
                self.incb_consonant_extended_linker_extended and incb in linker_or_extend)
        incb_consonant_extended = incb == 'Consonant' or (
            self.incb_consonant_extended and incb in linker_or_extend)
        emoji_modifier_sequence_before_last_char = self.emoji_modifier_sequence
        emoji_modifier_sequence = (self.emoji_modifier_sequence and prop == 'Extend') or p.is_extended_pictographic

        return GraphemeSegmentationResult(GraphemeSegmentationState(
            grapheme_break=prop, incb_consonant_extended=incb_consonant_extended,
            incb_consonant_extended_linker=incb_consonant_extended_linker,
            incb_consonant_extended_linker_extended=incb_consonant_extended_linker_extended,
            emoji_modifier_sequence=emoji_modifier_sequence, emoji_modifier_sequence_before_last_char=emoji_modifier_sequence_before_last_char
        ), add_to_cell)


def split_into_graphemes(props: Sequence[GraphemeSegmentationProps], text: str) -> Iterator[str]:
    s = GraphemeSegmentationState.make()
    pos = 0
    for i, ch in enumerate(text):
        p = props[ord(ch)]
        s, add_to_cell = s.add_to_current_cell(p)
        if not add_to_cell:
            yield text[pos:i]
            pos = i
    if pos < len(text):
        yield text[pos:]


def split_into_graphemes_with_table(
    props: Sequence['GraphemeSegmentationProps'], table: Sequence['GraphemeSegmentationResult'], text: str,
) -> Iterator[str]:
    s = GraphemeSegmentationResult.make()
    pos = 0
    for i, ch in enumerate(text):
        k = int(GraphemeSegmentationKey(s.new_state, props[ord(ch)]))
        s = table[k]
        if not s.add_to_current_cell:
            yield text[pos:i]
            pos = i
    if pos < len(text):
        yield text[pos:]


def test_grapheme_segmentation(split_into_graphemes: Callable[[str], Iterator[str]]) -> None:
    for test in grapheme_segmentation_tests:
        expected = test['data']
        actual = tuple(split_into_graphemes(''.join(test['data'])))
        if expected != actual:
            def as_codepoints(text: str) -> str:
                return ' '.join(hex(ord(x))[2:] for x in text)
            qe = tuple(map(as_codepoints, expected))
            qa = tuple(map(as_codepoints, actual))
            raise SystemExit(f'Failed to split graphemes for: {test["comment"]}\n{expected!r} {qe} != {actual!r} {qa}')


class GraphemeSegmentationKey(NamedTuple):
    state: GraphemeSegmentationState
    char: GraphemeSegmentationProps

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationKey':
        shift = GraphemeSegmentationProps.used_bits()
        mask = mask_for(shift)
        state = GraphemeSegmentationState.from_int(x >> shift)
        char = GraphemeSegmentationProps.from_int(x & mask)
        return GraphemeSegmentationKey(state, char)

    def __int__(self) -> int:
        shift = GraphemeSegmentationProps.used_bits()
        return int(self.state) << shift | int(self.char)

    def result(self) -> 'GraphemeSegmentationResult':
        return self.state.add_to_current_cell(self.char)

    @classmethod
    def code_to_convert_to_int(cls, for_go: bool = False) -> str:
        lines: list[str] = []
        a = lines.append
        shift = GraphemeSegmentationProps.used_bits()
        if for_go:
            base_type = f'uint{GraphemeSegmentationState.bitsize()}'
            a(f'func grapheme_segmentation_key(r GraphemeSegmentationResult, ch CharProps) ({base_type}) ''{')
            a(f'\treturn (r.State() << {shift}) | ch.GraphemeSegmentationProperty()')
            a('}')
        else:
            base_type = f'uint{GraphemeSegmentationState.bitsize()}_t'
            a(f'static inline {base_type} {cls.__name__}(GraphemeSegmentationResult r, CharProps ch)' '{')
            a(f'\treturn (r.state << {shift}) | ch.grapheme_segmentation_property;')
            a('}')
        return '\n'.join(lines)


class GraphemeSegmentationResult(NamedTuple):
    new_state: GraphemeSegmentationState = GraphemeSegmentationState()
    add_to_current_cell: bool = True

    @classmethod
    def used_bits(cls) -> int:
        return sum(int(GraphemeSegmentationState._field_defaults[f]) for f in GraphemeSegmentationState._fields) + 1

    @classmethod
    def bitsize(cls) -> int:
        return clamped_bitsize(cls.used_bits())

    @classmethod
    def make(cls) -> 'GraphemeSegmentationResult':
        return GraphemeSegmentationResult(GraphemeSegmentationState.make(), False)

    @classmethod
    def go_fields(cls) -> Sequence[str]:
        ans = []
        ans.append('add_to_current_cell 1')
        for f, width in reversed(GraphemeSegmentationState.fields().items()):
            ans.append(f'{f} {width}')
        return tuple(ans)

    @property
    def as_go(self) -> str:
        shift = 0
        parts = []
        for f in reversed(GraphemeSegmentationResult.go_fields()):
            f, _, w = f.partition(' ')
            bits = int(w)
            if f != 'add_to_current_cell':
                x = getattr(self.new_state, f)
                if f == 'grapheme_break':
                    x = f'GraphemeSegmentationResult(GBP_{x})'
                else:
                    x = int(x)
            else:
                x = int(self.add_to_current_cell)
            mask = '0b' + '1' * bits
            parts.append(f'(({x} & {mask}) << {shift})')
            shift += bits
        return ' | '.join(parts)

    @classmethod
    def go_extra(cls) -> str:
        bits = GraphemeSegmentationState.used_bits()
        base_type = f'uint{GraphemeSegmentationState.bitsize()}'
        return f'''
func (r GraphemeSegmentationResult) State() (ans {base_type}) {{
    return {base_type}(r) & {mask_for(bits)}
}}
    '''

    @property
    def as_c(self) -> str:
        parts = []
        for f in GraphemeSegmentationState._fields:
            x = getattr(self.new_state, f)
            match f:
                case 'grapheme_break':
                    x = f'GBP_{x}'
                case _:
                    x = int(x)
            parts.append(f'.{f}={x}')
        parts.append(f'.add_to_current_cell={int(self.add_to_current_cell)}')
        return '{' + ', '.join(parts) + '}'

    @classmethod
    def c_declaration(cls) -> str:
        fields = {'add_to_current_cell': 1}
        sfields = GraphemeSegmentationState.fields()
        fields.update(sfields)
        bits = sum(sfields.values())
        # dont know if the alternate state access works in big endian
        return bitfield_declaration_as_c('GraphemeSegmentationResult', fields, {'state': bits})


class CharProps(NamedTuple):

    width: int = 3
    is_emoji: bool = True
    category: str = ''  # set at runtime
    is_emoji_presentation_base: bool = True

    # derived properties for fast lookup
    is_invalid: bool = True
    is_non_rendered: bool = True
    is_symbol: bool = True
    is_combining_char: bool = True
    is_word_char: bool = True
    is_punctuation: bool = True

    # needed for grapheme segmentation set as LSB bits for easy conversion to GraphemeSegmentationProps
    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def bitsize(cls) -> int:
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
        return clamped_bitsize(ans)

    @classmethod
    def go_fields(cls) -> Sequence[str]:
        ans = []
        for f in cls._fields:
            bits = int(cls._field_defaults[f])
            if f == 'width':
                f = 'shifted_width'
            ans.append(f'{f} {bits}')
        return tuple(ans)

    @property
    def as_go(self) -> str:
        shift = 0
        parts = []
        for f in reversed(self.go_fields()):
            f, _, w = f.partition(' ')
            if f == 'shifted_width':
                f = 'width'
            x = getattr(self, f)
            match f:
                case 'width':
                    x += width_shift
                case 'grapheme_break':
                    x = f'CharProps(GBP_{x})'
                case 'indic_conjunct_break':
                    x = f'CharProps(ICB_{x})'
                case 'category':
                    x = f'CharProps(UC_{x})'
                case _:
                    x = int(x)
            bits = int(w)
            mask = '0b' + '1' * bits
            parts.append(f'(({x} & {mask}) << {shift})')
            shift += bits
        return ' | '.join(parts)

    @classmethod
    def go_extra(cls) -> str:
        base_type = f'uint{GraphemeSegmentationState.bitsize()}'
        f = GraphemeSegmentationProps.fields()
        s = f['grapheme_break'] + f['indic_conjunct_break']
        return f'''
func (s CharProps) Width() int {{
	return int(s.Shifted_width()) - {width_shift}
}}

func (s CharProps) GraphemeSegmentationProperty() {base_type} {{
    return {base_type}(s.Grapheme_break() | (s.Indic_conjunct_break() << {f["grapheme_break"]}) | (s.Is_extended_pictographic() << {s}))
}}
    '''

    @property
    def as_c(self) -> str:
        parts = []
        for f in self._fields:
            x = getattr(self, f)
            match f:
                case 'width':
                    x += width_shift
                    f = 'shifted_width'
                case 'grapheme_break':
                    x = f'GBP_{x}'
                case 'indic_conjunct_break':
                    x = f'ICB_{x}'
                case 'category':
                    x = f'UC_{x}'
                case _:
                    x = int(x)
            parts.append(f'.{f}={x}')
        return '{' + ', '.join(parts) + '}'

    @classmethod
    def fields(cls) -> dict[str, int]:
        return {'shifted_width' if f == 'width' else f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def c_declaration(cls) -> str:
        # Dont know if grapheme_segmentation_property in alternate works on big endian
        alternate = {
            'grapheme_segmentation_property': sum(int(cls._field_defaults[f]) for f in GraphemeSegmentationProps._fields)
        }
        return bitfield_declaration_as_c(cls.__name__, cls.fields(), alternate)


def generate_enum(p: Callable[..., None], gp: Callable[..., None], name: str, *items: str, prefix: str = '') -> None:
    p(f'typedef enum {name} {{')  # }}
    gp(f'type {name} uint8\n')
    gp('const (')  # )
    for i, x in enumerate(items):
        x = prefix + x
        p(f'\t{x},')
        if i == 0:
            gp(f'{x} {name} = iota')
        else:
            gp(x)
    p(f'}} {name};')
    gp(')')
    p('')
    gp('')


def category_set(predicate: Callable[[str], bool]) -> set[int]:
    ans = set()
    for c, chs in class_maps.items():
        if predicate(c):
            ans |= chs
    return ans


def top_level_category(q: str) -> set[int]:
    return category_set(lambda x: x[0] in q)


def patch_declaration(name: str, decl: str, raw: str) -> str:
    begin = f'// {name}Declaration'
    end = f'// End{name}Declaration }}''}}'
    return re.sub(rf'{begin}.+?{end}', decl.rstrip(), raw, flags=re.DOTALL)


def gen_char_props() -> None:
    CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps)))
    CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map)))
    CharProps._field_defaults['category'] = str(bitsize(len(class_maps) + 1))
    GraphemeSegmentationProps._field_defaults['grapheme_break'] = CharProps._field_defaults['grapheme_break']
    GraphemeSegmentationProps._field_defaults['indic_conjunct_break'] = CharProps._field_defaults['indic_conjunct_break']
    GraphemeSegmentationState._field_defaults['grapheme_break'] = GraphemeSegmentationProps._field_defaults['grapheme_break']
    invalid = class_maps['Cc'] | class_maps['Cs'] | non_characters
    non_printing = invalid | class_maps['Cf']
    non_rendered = non_printing | property_maps['Other_Default_Ignorable_Code_Point'] | set(range(0xfe00, 0xfe0f + 1))
    is_word_char = top_level_category('LN')
    is_punctuation = top_level_category('P')
    width_map: dict[int, int] = {}
    cat_map: dict[int, str] = {}
    for cat, chs in class_maps.items():
        for ch in chs:
            cat_map[ch] = cat
    def aw(s: Iterable[int], width: int) -> None:
        nonlocal width_map
        d = dict.fromkeys(s, width)
        d.update(width_map)
        width_map = d

    aw(flag_codepoints, 2)
    aw(doublewidth, 2)
    aw(wide_emoji, 2)
    aw(marks | {0}, 0)
    aw(non_printing, -1)
    aw(ambiguous, -2)
    aw(class_maps['Co'], -3)  # Private use
    aw(not_assigned, -4)

    gs_map: dict[int, str] = {}
    icb_map: dict[int, str] = {}
    for name, cps in grapheme_segmentation_maps.items():
        gs_map.update(dict.fromkeys(cps, name))
    for name, cps in incb_map.items():
        icb_map.update(dict.fromkeys(cps, name))
    prop_array = tuple(
        CharProps(
            width=width_map.get(ch, 1), grapheme_break=gs_map.get(ch, 'None'), indic_conjunct_break=icb_map.get(ch, 'None'),
            is_invalid=ch in invalid, is_non_rendered=ch in non_rendered, is_emoji=ch in all_emoji, is_symbol=ch in all_symbols,
            is_extended_pictographic=ch in extended_pictographic, is_emoji_presentation_base=ch in emoji_presentation_bases,
            is_combining_char=ch in marks, category=cat_map.get(ch, 'Cn'), is_word_char=ch in is_word_char,
            is_punctuation=ch in is_punctuation,
        ) for ch in range(sys.maxunicode + 1))
    gsprops = tuple(GraphemeSegmentationProps(
        grapheme_break=x.grapheme_break, indic_conjunct_break=x.indic_conjunct_break,
        is_extended_pictographic=x.is_extended_pictographic) for x in prop_array)
    test_grapheme_segmentation(partial(split_into_graphemes, gsprops))
    gseg_results = tuple(GraphemeSegmentationKey.from_int(i).result() for i in range(1 << 16))

    test_grapheme_segmentation(partial(split_into_graphemes_with_table, gsprops, gseg_results))
    t1, t2, t3, t_shift = splitbins(prop_array, CharProps.bitsize() // 8)
    g1, g2, g3, g_shift = splitbins(gseg_results, GraphemeSegmentationResult.bitsize() // 8)

    from .bitfields import make_bitfield
    buf = StringIO()
    cen = partial(print, file=buf)
    with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
        gp = partial(print, file=gof)
        gp('package wcswidth')
        gp('import "unsafe"')
        gp(f'const MAX_UNICODE = {sys.maxunicode}')
        gp(f'const UNICODE_LIMIT = {sys.maxunicode + 1}')
        cen('// UCBDeclaration {{''{')
        cen(f'#define MAX_UNICODE ({sys.maxunicode}u)')
        generate_enum(cen, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_')
        generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_')
        generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_')
        cen('// EndUCBDeclaration }}''}')
        gp(make_bitfield('tools/wcswidth', 'CharProps', *CharProps.go_fields(), add_package=False)[1])
        gp(make_bitfield('tools/wcswidth', 'GraphemeSegmentationResult', *GraphemeSegmentationResult.go_fields(), add_package=False)[1])
        gp(CharProps.go_extra())
        gp(GraphemeSegmentationResult.go_extra())
        gen_multistage_table(c, gp, t1, t2, t3, t_shift, len(prop_array)-1)
        gen_multistage_table(c, gp, g1, g2, g3, g_shift, len(gseg_results)-1)
        c(GraphemeSegmentationKey.code_to_convert_to_int())
        c(GraphemeSegmentationState.c_declaration())
        gp(GraphemeSegmentationKey.code_to_convert_to_int(for_go=True))
    gofmt(gof.name)
    with open('kitty/char-props.h', 'r+') as f:
        raw = f.read()
        nraw = re.sub(r'\d+/\*=width_shift\*/', f'{width_shift}/*=width_shift*/', raw)
        nraw = patch_declaration('CharProps', CharProps.c_declaration(), nraw)
        nraw = patch_declaration('GraphemeSegmentationResult', GraphemeSegmentationResult.c_declaration(), nraw)
        nraw = patch_declaration('UCB', buf.getvalue(), nraw)
        if nraw != raw:
            f.seek(0)
            f.truncate()
            f.write(nraw)


def main(args: list[str]=sys.argv) -> None:
    parse_ucd()
    parse_prop_list()
    parse_emoji()
    parse_eaw()
    parse_grapheme_segmentation()
    parse_test_data()
    gen_names()
    gen_rowcolumn_diacritics()
    gen_test_data()
    gen_char_props()


if __name__ == '__main__':
    import runpy
    m = runpy.run_path(os.path.dirname(os.path.abspath(__file__)))
    m['main']([sys.executable, 'wcwidth'])