File: ocaml.py

package info (click to toggle)
ocaml 5.3.0-3
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 43,124 kB
sloc: ml: 355,439; ansic: 51,636; sh: 25,098; asm: 5,413; makefile: 3,673; python: 919; javascript: 273; awk: 253; perl: 59; fortran: 21; cs: 9
file content (645 lines) | stat: -rw-r--r-- 23,706 bytes
#**************************************************************************
#*                                                                        *
#*                                 OCaml                                  *
#*                                                                        *
#*                           Nick Barnes, Tarides                         *
#*                                                                        *
#*   Copyright 2024 Tarides.                                              *
#*                                                                        *
#*   All rights reserved.  This file is distributed under the terms of    *
#*   the GNU Lesser General Public License version 2.1, with the          *
#*   special exception on linking described in the file LICENSE.          *
#*                                                                        *
#**************************************************************************

# This file contains any debugger-agnostic code for debugger plugins.
#
# Each debugger front end has three ciasses - targets, types, and
# values, which must provide these slots and methods:
#
# targets:
#
#     word_size: size of a word in 8-bit bytes.
#
#     double_size: the number of 8-bit bytes in a double-precision
#        float (i.e. 8).
#
#     value(address): a debugger "value" representing the given address
#         with type `value`.
#
#     global_variable(name): the value of a named global variable.
#
#     type(name): a named type.
#
#     symbol(address): the symbol name associated with the given address.
#
#     mapping(address): a string describing any file mapping
#         associated with the address, or None. Blocks on the Caml
#         heap do not have an associated file mapping.
#
# types:
#
#     size(): the number of bytes required for this type.
#
#     pointer(): the type of a pointer to this type.
#
#     array(size): the type of an array of given size of this type.
#
# values:
#
#     valid(): False if this value is somehow "invalid" (for example,
#       optimised away.
#
#     type(): The type of this value.
#
#     unsigned(): the value as an unsigned integer.
#
#     signed(): the value as a signed integer.
#
#     cast(t): the value cast to type `t`.
#
#     value(): the value cast to the "value" Caml runtime type.
#
#     pointer(): the value cast to "value*"
#
#     dereference(): the result of dereferencing the value, which must
#       be a pointer.
#
#     array_size(): only used when the value is an array, returns the
#       number of entries in the array.
#
#     sub(index): only used when the value is an array, returns an
#       entry, as a debugger value. TODO: switch to __getitem__ to make
#       this more transparent.
#
#     struct(): a dictionary {slot_name: value} representing the value,
#       which must be a struct. TODO: could use __getattribute__ to make
#       this more transparent.
#
#     field(index): the `index`th field of a value array whose address is
#       in the value, which can have any scalar type. `index` is a Python
#       number which can have any value, including negative.
#
#     field_pointer(index): a pointer to the `index`th field (see above).
#
#     byte_field(index): a byte value, from a byte array.
#
#     double_field(index): a double-precision floating-point value
#       from an array, as a Python float.
#
#     string(length): treating the value as an address, `length` bytes from
#       memory, decoded as UTF-8, as a Python string.
#
#     c_string(): treating the value as an address, returns the NUL-terminated
#       C string at the location as a Python string.
#
#     field_array(offset, size): an array of elements [offset,
#       offset+size), as a debugger-native array.
#
#     double_array(size): an array of double-precision floating point
#       members, as a debugger-native array.

MAX_BLOCK_SLOTS = 8
MAX_STRING_LEN = 80
STRING_SUFFIX = 8
STRING_PREFIX = MAX_STRING_LEN - STRING_SUFFIX - 5
MAX_STRING_SUMMARY = 20
STRING_SUMMARY_PREFIX = 8
STRING_SUMMARY_SUFFIX = MAX_STRING_SUMMARY - STRING_SUMMARY_PREFIX - 5

TAGS = {
    244: 'Forcing',
    245: 'Cont',
    246: 'Lazy',
    247: 'Closure',
    248: 'Object',
    249: 'Infix',
    250: 'Forward',
    251: 'Abstract',
    252: 'String',
    253: 'Double',
    254: 'Double_array',
    255: 'Custom'
}

# specific tag values which we display in particular ways.

TAG_CLOSURE = 247
TAG_INFIX = 249
TAG_STRING = 252
TAG_DOUBLE = 253
TAG_DOUBLE_ARRAY = 254
TAG_CUSTOM = 255

# constants for header word decoding.

HEADER_TAG_BITS = 8
HEADER_TAG_MASK = (1 << HEADER_TAG_BITS) - 1

HEADER_COLOR_BITS = 2
HEADER_COLOR_SHIFT = HEADER_TAG_BITS
HEADER_COLOR_MASK = ((1 << HEADER_COLOR_BITS) - 1) << HEADER_COLOR_SHIFT
NOT_MARKABLE = 3 << HEADER_COLOR_SHIFT

HEADER_WOSIZE_SHIFT = HEADER_TAG_BITS + HEADER_COLOR_BITS

# tags of this or above indicate blocks with no scannable fields
NO_SCAN_TAG = 251

# The debug runtime fills free and uninitialized memory with words:
#
#             D7xx D6D8 on 32-bit platforms
#   D7xx D7D7 D7xx D6D8 in 64-bit platforms
#
# where xx is one of the following 8-bit values, depending on the
# context of the memory word.

DEBUG_TAGS = {
    0x00: 'free minor',
    0x01: 'free major',
    0x03: 'free shrink',
    0x04: 'free truncate', # obsolete
    0x05: 'free unused',
    0x10: 'uninit minor',
    0x11: 'uninit major',
    0x15: 'uninit align',
    0x85: 'filler align',
    0x99: 'pool magic',
}

DEBUG_LOW_BYTES = [0xd8, 0xd6]
DEBUG_OTHER = 0xd7
DEBUG_TAG_BYTES = [2, 6]

def debug_decode(word, word_size):
    """If `word` is a debug padding word, return a string representation of
it. Otherwise, return None. `target` is used for word size."""

    if (word >> (word_size * 8)) not in {0,-1}:
        return
    bytes = [(word >> (i * 8)) & 0xff for i in range(word_size)]
    if bytes[:len(DEBUG_LOW_BYTES)] != DEBUG_LOW_BYTES:
        return
    pads = set(bytes[i]
               for i in range(len(DEBUG_LOW_BYTES), word_size)
               if i not in DEBUG_TAG_BYTES)
    if pads != {DEBUG_OTHER}: # not all pad bytes DEBUG_OTHER
        return
    tags = set(bytes[i] for i in DEBUG_TAG_BYTES if i < word_size)
    if len(tags) != 1: # differing tags on 64-bits
        return
    tag = list(tags)[0] # unique tag byte
    if tag not in DEBUG_TAGS:
        return f'Debug(0x{tag:x}?!)'
    return f'Debug({DEBUG_TAGS[tag]})'

# we show colors as [x], for some character x:

COLOR_SUMMARY = {
    'MARKED': 'm',
    'UNMARKED': 'u',
    'GARBAGE': 'g',
    'NOT MARKABLE': '-',
}

def colors(target):
    """Return a dictionary value -> name of the current GC colors
    (MARKED, UNMARKED, GARBAGE, NOT MARKABLE).
    """

    heapState = target.global_variable('caml_global_heap_state').struct()
    cols = {v.unsigned(): (f, COLOR_SUMMARY[f])
            for f, v in heapState.items()}
    cols[NOT_MARKABLE] = ('NOT MARKABLE', '-')
    return cols

class Value:
    def __init__(self, value, target):
        self._value = value
        self._target = target
        self.children = False
        self.num_children = 0
        self.valid = value.valid()
        if not self.valid:
            return

        self.word = value.signed()
        if self.word == 0:
            self.valid = False
            return

        self.immediate = (self.word & 1) == 1
        if self.immediate:
            return

        self.debug = debug_decode(self.word, target.word_size)
        if self.debug is not None:
            return

        self.pointer = value.pointer()
        self._header = self.pointer.field(-1).unsigned()
        self._wosize = self._header >> HEADER_WOSIZE_SHIFT
        self._tag = self._header & HEADER_TAG_MASK
        self._color_bits = self._header & HEADER_COLOR_MASK
        self.children = True
        self.num_children = self._wosize # overridden for some tags
        if self._tag == TAG_DOUBLE:
            self.children = False
            self.num_children = 0
        elif self._tag == TAG_DOUBLE_ARRAY:
            self.num_children = ((self.num_children * target.word_size)
                                 // target.double_size)
        elif self._tag == TAG_STRING:
            self.children = False
            self.num_children = 0
            byteSize = target.word_size * self._wosize
            lastByte = value.byte_field(byteSize-1).unsigned()
            self._length = byteSize-1-lastByte
            if self._length > 0:
                self._string = value.string(self._length)
            else:
                self._string = ''
        elif self._tag == TAG_CLOSURE:
            # collect code pointers and metadata for all functions
            # in this closure.
            self._functions = []
            # list of (code, arity [, additional code]) tuples.
            self._infix_map = {}
            # map from infix offset to tuple
            arity_shift = target.word_size * 8 - 8
            closinfo = value.field(1).signed()
            self._start_env = (closinfo & ((1 << arity_shift) - 1)) >> 1
            self.num_children = self._wosize - self._start_env
            block = 0
            while block < self._start_env:
                code = value.field(block).unsigned()
                closinfo = value.field(block+1).unsigned()
                arity = closinfo >> arity_shift
                if (arity == 0) or (arity == 1):
                    fn = (code, arity)
                    bump = 0
                else: # higher arity, so code is curry/tuplify
                    true_code = value.field(block+2).unsigned()
                    fn = (true_code, arity, code)
                    bump = 1
                self._functions.append(fn)
                self._infix_map[block] = fn
                block += 3 +bump # code, closinfo, [extra code], [infix header]
        elif self._tag == TAG_INFIX:
            self._container = Value(value.field_pointer(-self._wosize), target)
            self.num_children = 0
            self.children = False
        elif self._tag == TAG_CUSTOM:
            ptr_type = target.type('struct custom_operations').pointer()
            self._ops = value.field(0).cast(ptr_type).dereference().struct()
            self._id = self._ops['identifier'].c_string()
            self.children = False
            self.num_children = 0

    def tag_part(self):
        if self._tag in TAGS:
            return f'{TAGS[self._tag]}'
        elif self._tag == 0:
            return ''
        else:
            return f't{self._tag}'

    def infix_sym(self):
        cont = self._container
        sym = f'+{self._wosize}'
        # try to find symbol in infix map of container
        if cont._tag == TAG_CLOSURE: # always true!
            if self._wosize in cont._infix_map:
                code = cont._infix_map[self._wosize][0]
                sym = self._target.symbol(code)
                if sym is None:
                    sym = f'0x{code:x}'
        return sym

    def code_sym(self, t):
        code = t[0]
        sym = self._target.symbol(code)
        if sym is None:
            sym = f'0x{code:x}'
        if len(t) == 2:
            return sym
        else:
            return f'{sym}({self._target.symbol(t[2])})'

    def closure_syms(self):
        return [self.code_sym(t) for t in self._functions]

    def array_contents(self, short=False):
        if self._tag == TAG_DOUBLE_ARRAY:
            if self.num_children <= MAX_BLOCK_SLOTS:
                return [str(self._value.double_field(i))
                        for i in range(self.num_children)]
            return ([str(self._value.double_field(i))
                     for i in range(MAX_BLOCK_SLOTS-2)]
                    + ['...',
                       str(self._value.double_field(
                           self.num_children - 1))])

        if self.num_children < MAX_BLOCK_SLOTS:
            return [self.field_summary(i, short)
                    for i in range(self.num_children)]
        else:
            return ([self.field_summary(i, short)
                     for i in range(MAX_BLOCK_SLOTS-2)]
                    + ['...',
                       self.field_summary(self.num_children - 1, short)])

    def summary(self, short=False):
        """Return a short value summary string, suitable for display in a
        larger aggregate. If `short` then summarise the summary.
        """
        if not self.valid:
            return '[invalid]'
        if self.immediate:
            return f'{self.word // 2}'
        if self.debug is not None:
            return self.debug

        if self._tag == TAG_DOUBLE:
            return str(self._value.double_field(0))
        elif self._tag == TAG_STRING:
            if self._length > MAX_STRING_SUMMARY:
                return (repr(self._string[:STRING_SUMMARY_PREFIX])
                        + '...'
                        + repr(self._string[-STRING_SUMMARY_SUFFIX:])
                        + f'<{self._length}>')
            return repr(self._string)
        elif self._tag == TAG_INFIX:
            sym = self.infix_sym()
            return f'infix({sym}) in ' + self._container.summary(short=True)
        elif self._tag == TAG_CLOSURE:
            syms = self.closure_syms()
            if len(syms) > 1:
                sym = f'{syms[0]}, +{len(syms)-1}'
            else:
                sym = syms[0]
            return f'closure({sym})<{self.num_children}>'
        elif self._tag == TAG_CUSTOM:
            return f"custom {self._id}<{self._wosize}>"

        tag_part = self.tag_part()

        if short:
            if not tag_part:
                tag_part = 't0'
            return f'<{tag_part}:{self.num_children}>'

        if tag_part:
            tag_part += ':'

        contents = self.array_contents(short=True)

        return (f'({tag_part}' + ', '.join(contents) + ')')


    def field_summary(self, index, short=False):
        return (Value(self._value.field(index), self._target).
                summary(short))

    def __str__(self):
        if not self.valid:
            return '[invalid]'
        if self.immediate:
            return f'caml:{self.word // 2}'
        if self.debug is not None:
            return f'Caml:{self.debug}'

        color_char = colors(self._target).get(self._color_bits,
                                         f'BAD COLOR {self._color_bits}')[1]
        prefix = f'caml({color_char}):'

        if self._tag == TAG_DOUBLE:
            val = str(self._value.double_field(0))
            return f'{prefix}{val}'
        elif self._tag == TAG_STRING:
            if self._length > MAX_STRING_LEN:
                s = (repr(self._string[:STRING_PREFIX])
                     + '...' + repr(self._string[-STRING_SUFFIX:]))
            else:
                s = repr(self._string)
            return (f'{prefix}{s}<{self._length}>')
        elif self._tag == TAG_INFIX:
            sym = self.infix_sym()
            return (f'{prefix}infix({sym}) in'
                    + f' 0x{self._container._value.unsigned():x} '
                    + self._container.summary())
        elif self._tag == TAG_CLOSURE:
            syms = ', '.join(self.closure_syms())
            return (f'{prefix}closure({syms})'
                    + f' arity {self._functions[0][1]} ('
                    + ', '.join(self.field_summary(i + self._start_env)
                                for i in range(self.num_children))
                    + ')')
        elif self._tag == TAG_CUSTOM:
            return (f"{prefix}custom {self._id}"
                    f"<{self._wosize}>")

        tag_part = self.tag_part()
        if self._tag != 0:
            tag_part += ': '
        suffix = ('' if self.num_children <= MAX_BLOCK_SLOTS
                  else f'<{self.num_children}>')

        contents = self.array_contents()

        return (f'{prefix}({tag_part}'
                + ', '.join(contents)
                + f'){suffix}')

    # Useful in GDB and maybe one day in LLDB too.

    def child(self, index):
        if (not self.children) or index < 0 or index >= self.num_children:
            return
        if self._tag == TAG_DOUBLE_ARRAY:
            return self._value.double_field(index)
        elif self._tag == TAG_CLOSURE:
            return self._value.field(index + self._start_env).value()
        else:
            return self._value.field(index).value()

    # Useful in GDB and maybe one day in LLDB too.

    def child_array(self):
        """If the value is a block which can be regarded as an array,
        return the array as a debugger-native value."""
        if (not self.children):
            return
        if self._tag == TAG_DOUBLE_ARRAY:
            return self._value.double_array(self.num_children)
        elif self._tag == TAG_CLOSURE:
            return self._value.field_array(self._start_env, self.num_children)
        else:
            return self._value.field_array(0, self.num_children)

POOL_WSIZE = 4096

class Finder:
    def __init__(self, target):
        self._sizeclasses = None
        self._wsize_sizeclass = None
        self._target = target
        self.debug = False

    def sizeclasses(self):
        if self._sizeclasses is None:
            pool_freelist = (self._target.global_variable('pool_freelist').
                             struct())
            self._sizeclasses = (
                pool_freelist['global_avail_pools'].type().size() //
                pool_freelist['global_avail_pools'].sub(0).type().size())
        return self._sizeclasses

    def wsize_sizeclass(self, sz):
        if self._wsize_sizeclass is None:
            self._wsize_sizeclass = (self._target.
                                     global_variable('wsize_sizeclass'))
        return self._wsize_sizeclass.sub(sz).unsigned()

    def _log(self, *args):
        if self.debug:
            print(*args)

    def _found(self, where):
        if self.debug:
            print(f"FOUND 0x{self.address:x} {where}")
        self.found.append(where)
        self.keep_going = self.debug

    def search_pool_list(self, description, pool_list):
        "Search a single pool list for `self.address`."
        count = 0
        while self.keep_going and pool_list.unsigned():
            count += 1
            base = pool_list.unsigned()
            limit = base + POOL_WSIZE * self._target.word_size
            if base < self.address < limit:
                self._found(f"{description}: pool 0x{base:x}-0x{limit:x}")
            pool_list = pool_list.dereference().struct()['next']
        self._log(f"    searched {count} pools of {description}")

    def search_pools(self, description, pools):
        "Search an array `pool *pools[NUM_SIZECLASSES]` for `self.address`."
        self._log(f"  searching {description} pools")
        for i in range(self.sizeclasses()):
            pool_list = pools.sub(i)
            if pool_list.unsigned() == 0:
                continue
            self.search_pool_list(f"{description} "
                                  f"wsize={self.wsize_sizeclass(i)}",
                                  pool_list)
            if not self.keep_going:
                break

    def search_large(self, description, large_list):
        "Search a `large_alloc *` linked list for `self.address`."
        if large_list.unsigned() == 0:
            return
        count = 0
        while self.keep_going and large_list.unsigned():
            count += 1
            base = large_list.unsigned()
            block = base + large_list.dereference().type().size()
            val = self._target.value(block + self._target.word_size)
            val_ptr = val.cast(val.type().pointer())
            oval = Value(val_ptr, self._target)
            limit = block + (oval._wosize + 1) * self._target.word_size
            if base < self.address < limit:
                self._found(f"{description} large 0x{block:x}-0x{limit:x}")
            large_list = large_list.dereference().struct()['next']
        self._log(f"  searched {count} large blocks of {description}")

    def search_heap(self, description, heap_state_p):
        "Searches a single `struct caml_heap_state *` for self.address."
        heap_state = heap_state_p.dereference().struct()
        if self.keep_going:
            self.search_pools(f"{description} avail",
                              heap_state['avail_pools'])
        if self.keep_going:
            self.search_pools(f"{description} full",
                              heap_state['full_pools'])
        if self.keep_going:
            self.search_pools(f"{description} unswept avail",
                              heap_state['unswept_avail_pools'])
        if self.keep_going:
            self.search_pools(f"{description} unswept full",
                              heap_state['unswept_full_pools'])
        if self.keep_going:
            self.search_large(f"{description}",
                              heap_state['swept_large'])
        if self.keep_going:
            self.search_large(f"{description} unswept",
                              heap_state['unswept_large'])

    def search_domain(self, index, dom_state_p):
        "Search a single domain's heap for `self.address`."
        dom_state = dom_state_p.dereference().struct()
        young_start = dom_state['young_start'].unsigned()
        young_end = dom_state['young_end'].unsigned()
        description = f"domain {index}"
        self._log(f"searching {description}")
        if self.keep_going and (young_start <= self.address <= young_end):
                self._found(f"{description} minor heap "
                            f"0x{young_start:x}-0x{young_end:x}")
        if self.keep_going:
            self.search_heap(description, dom_state['shared_heap'])

    def find(self, expr, val):
        if not val.valid:
            print(f"{expr} not a valid expression")
            return
        if val.immediate:
            print(f"{expr} is immediate: {str(val)}")
            return
        if val.debug is not None:
            print(f"{expr} is a debug padding value: {val.debug}")
            return

        self.address = val.pointer.unsigned()
        mapping = self._target.mapping(self.address)
        if mapping:
            print(f"{expr} {str(val)} is from {mapping}, "
                  "not the heap.")
            return

        self.found = []
        self.keep_going = True

        # Search per-domain heaps.
        all_domains = self._target.global_variable('all_domains')
        Max_domains = all_domains.array_size()
        self._log(f"{Max_domains} domains.")
        for i in range(Max_domains):
            dom = all_domains.sub(i).struct()
            dom_state_p = dom['state']
            if dom_state_p.unsigned() == 0: # null pointer: no domain
                continue
            self.search_domain(i, dom_state_p)
            if not self.keep_going:
                break

        # Global (orphaned) heap
        pool_freelist = self._target.global_variable('pool_freelist').struct()
        if self.keep_going:
            self.search_pools('global avail',
                              pool_freelist['global_avail_pools'])
        if self.keep_going:
            self.search_pools('global full',
                              pool_freelist['global_full_pools'])
        if self.keep_going:
            self.search_large("global",
                              pool_freelist['global_large'])

        if self.found:
            print(f"{expr} {str(val)}: 0x{self.address:x} found:")
            for where in self.found:
                print(f"  {where}")
        else:
            print(f"{expr} {str(val)} not found on heap")

        self.debug = True