File: xam.py

package info (click to toggle)
psyco-doc 1.6-1
links: PTS
area: contrib
in suites: lenny
size: 1,832 kB
ctags: 3,236
sloc: ansic: 23,895; python: 5,646; perl: 1,309; makefile: 153
file content (570 lines) | stat: -rw-r--r-- 20,039 bytes
parent folder | download | duplicates (6)
from __future__ import nested_scopes
import os, sys, re, htmlentitydefs, struct, bisect
__metaclass__ = type

# scratch file (relative to the current directory) that receives the raw
# machine code before it is handed to the external disassembler; see
# machine_code_dump().
tmpfile = '~tmpfile.tmp'

# the disassembler to use. 'objdump' writes GNU-style instructions.
# 'ndisasm' uses Intel syntax.
# The command line is a %-format template: %(origin)d is replaced by the
# address of the first byte of code and %(file)s by the name of the file
# to disassemble.

objdump = 'objdump -b binary -m i386 --adjust-vma=%(origin)d -D %(file)s'
if sys.platform == "win32":
    # On Windows the disassembler is taken from the "win32" directory that
    # sits next to the xam module itself.
    try:
        from xam import __file__ as _xamfile
    except ImportError:
        raise ImportError, "could not import xam module"
    _win32_path = os.path.join(os.path.split(_xamfile)[0], "win32")
    objdump = os.path.join(_win32_path, objdump)
    # NOTE(review): the executable name is obtained by splitting the command
    # line on whitespace, which presumably breaks if _win32_path contains
    # spaces -- confirm.
    _objdumpexe = objdump.split()[0]+".exe"
    # test whether it works:
    # (os.system() returns a non-zero status when the command fails)
    if os.system(_objdumpexe + " -v"):
        raise IOError, "file %s and cygwin1.dll must exist" % _objdumpexe
# alternative disassembler command line, disabled:
#objdump = 'ndisasm -o %(origin)d -u %(file)s'

# the files from which symbols are loaded.
# the order and number of files must match
# psyco_dump_code_buffers() in psyco.c.
# readdump() reads one "name: 0xaddress" line from the dump per entry of
# this list and passes it to load_symbol_file().
symbolfiles = [sys.executable]
try:
    from psyco import _psyco
    symbolfiles.append(_psyco.__file__)
except ImportError:
    # psyco is not importable here: only the interpreter's symbols are used
    pass

# the program that lists symbols, and the output it gives
# 'symbollister' is either a command-line template taking the file name
# (run through os.popen() by load_symbol_file()) or a callable returning a
# file-like object. Each output line is matched against re_symbolentry:
# group 1 is the hexadecimal address, group 2 the symbol name.
symbollister = 'nm %s'
re_symbolentry = re.compile(r'([0-9a-fA-F]+)\s\w\s(.*)')

if sys.platform == "win32":
    # no way to get full info into the executables by
    # VC7. /PDB:NONE no longer supported.
    # so we have to read the map files.
    if sys.executable.lower().endswith("_d.exe"):
        _mapfiles = ("python23_d.map", "_psyco_d.map")
    else:
        _mapfiles = ("python23.map", "_psyco.map")
    symbolfiles = [os.path.join(_win32_path, x) for x in _mapfiles]
    for _filepath in symbolfiles:
        if not os.path.exists(_filepath):
            raise IOError, "please make sure that '%s' exists" % _filepath
        
    class symbollister:
        def __init__(self, filename):
            self.file = file(filename)
            self.generator = self._readline()

        def _readline(self):
            for line in self.file:
                #  0001:000661e0       _PyEval_CallFunction       1e0671e0 f   modsupport.obj
                #  0003:0000e770       _PyClass_Type              1e0d8770     classobject.obj
                pieces = line.split()
                if len(pieces) == 5:
                    colonadr, name, adr, dummy, obj = pieces
                elif len(pieces) == 4:
                    colonadr, name, adr, obj = pieces
                    dummy = "d"
                else:
                    continue
                if colonadr.count(":") == 1 and obj.endswith(".obj"):
                    yield "%s %s %s\n" % (adr, dummy, name[1:])

        def readline(self):
            try:
                return self.generator.next()
            except StopIteration:
                return ""

        def close(self):
            self.file.close()
            
        def __iter__(self):
            return self.generator


# re_addr finds hexadecimal operands ("0x...") preceded by whitespace, ','
# or '$' inside a disassembly line; re_lineaddr extracts the hexadecimal
# number at the start of a line. In both, group 1 holds the hex digits.
re_addr = re.compile(r'[\s,$]0x([0-9a-fA-F]+)')
re_lineaddr = re.compile(r'\s*0?x?([0-9a-fA-F]+)')


# maps an address to the name of the symbol at that address; filled in by
# load_symbol_file().
symbols = {}
#rawtargets = {}
# list of (key, object) pairs that readdump() sorts and codeat() searches
# with bisect. A CodeBuf registers itself under its start address minus
# 0.5, a BigBuffer under its end address minus 0.25.
codeboundary = []

try:
    from xamsupport import any_pointer
except ImportError:
    # Fallback implementation used when the 'xamsupport' module cannot be
    # imported.
    def any_pointer(addr0, data, start, end, unpack=struct.unpack):
        """Tell whether 'data' seems to contain a pointer into [start, end).

        Every 4-byte window of 'data' is read as a signed 32-bit integer
        in native byte order.  The window counts as a pointer if the value
        falls in the range either directly (absolute address) or after
        adding the address just past the window, addr0+i (relative
        displacement), where 'addr0' is the address of data[0].

        Returns 1 if such a window exists and 0 otherwise (including when
        'data' is shorter than 4 bytes).
        """
        for i in range(4, len(data)+1):
            # '=l' forces the standard 4-byte size; a plain 'l' is 8 bytes
            # wide on LP64 platforms and cannot unpack a 4-byte slice there.
            offset, = unpack('=l', data[i-4:i])
            if start <= addr0+i+offset < end or start <= offset < end:
                return 1
        return 0

def machine_code_dump(data, originaddr, format):
    """Disassemble the machine code 'data' located at address 'originaddr'.

    'format' selects the disassembler:
      'ivm'   the result of ivmdump.dump(data, originaddr) is returned;
      'i386'  'data' is written to the scratch file 'tmpfile', the external
              'objdump' command is run on it and its output is returned as
              a list of lines.

    Raises ValueError for any other format (this used to surface as an
    UnboundLocalError on the return statement).
    """
    if format == 'ivm':
        import ivmdump
        return ivmdump.dump(data, originaddr)
    if format != 'i386':
        raise ValueError("unknown machine code format %r" % (format,))
    f = open(tmpfile, 'wb')
    try:
        # the scratch file is closed before the disassembler reads it and
        # removed afterwards, whether or not a step fails
        try:
            f.write(data)
        finally:
            f.close()
        g = os.popen(objdump % {'file': tmpfile, 'origin': originaddr}, 'r')
        try:
            return g.readlines()
        finally:
            g.close()
    finally:
        os.unlink(tmpfile)

def load_symbol_file(filename, symb1, addr1):
    """Add the symbols of 'filename' to the global 'symbols' table.

    The symbols are listed with 'symbollister' (a command-line template
    run through os.popen() when it is a string, otherwise a callable
    returning a file-like object).  'symb1' is a symbol whose actual
    address 'addr1' is known; every listed address is shifted by the
    difference so that 'symb1' ends up at 'addr1'.  If 'symb1' is not
    listed, a warning is printed to stderr and no shift is applied.
    """
    if type(symbollister) is not str:
        listing = symbollister(filename)
    else:
        listing = os.popen(symbollister % filename, "r")
    addresses = {}
    line = listing.readline()
    while line:
        entry = re_symbolentry.match(line)
        if entry:
            addresses[entry.group(2)] = long(entry.group(1), 16)
        line = listing.readline()
    listing.close()
    try:
        delta = addr1 - addresses[symb1]
    except KeyError:
        delta = 0
        print >> sys.stderr,"Warning: no symbol '%s' in '%s'" % (symb1, filename)
    for name in addresses:
        symbols[addresses[name] + delta] = name


def symtext(sym, addr, inbuf=None, lineaddr=None):
    """Return the text describing what the address 'addr' refers to.

    'sym' is either a symbol name, returned unchanged, or a CodeBuf.  A
    CodeBuf is described as 'top' when it is 'inbuf' itself and by its
    mode and address otherwise, followed by the decimal distance of
    'addr' past the start of the buffer when that is positive.
    'lineaddr' is accepted but not used.
    """
    if not isinstance(sym, CodeBuf):
        return sym
    if sym is not inbuf:
        label = '%s codebuf 0x%x' % (sym.mode, sym.addr)
    else:
        label = 'top'
    if addr > sym.addr:
        label = label + ' + %d' % (addr-sym.addr)
    return label

# reverse of htmlentitydefs.entitydefs: maps the replacement text of each
# entity to its '&name;' reference. Only string values are kept.
# htmlquote() looks up one character at a time in this table.
revmap = {}
for key, value in htmlentitydefs.entitydefs.items():
    if type(value) is type(' '):
        revmap[value] = '&%s;' % key

def htmlquote(text):
    """Return 'text' with each character found in 'revmap' replaced by
    its HTML entity reference; other characters are kept unchanged."""
    quoted = []
    for char in text:
        quoted.append(revmap.get(char, char))
    return ''.join(quoted)

def lineaddresses(line):
    """Return the list of addresses that 're_addr' finds in 'line', in
    order of appearance, as long integers."""
    # finditer() yields the same successive non-overlapping matches as
    # repeatedly searching from the end of the previous match
    return [long(found.group(1), 16) for found in re_addr.finditer(line)]

def codeat(addr):
    """Return the CodeBuf registered in 'codeboundary' just below 'addr'.

    Returns None when no entry lies below 'addr' or when the closest
    entry below it is not a CodeBuf.  'codeboundary' must be sorted.
    """
    index = bisect.bisect(codeboundary, (addr, None)) - 1
    if index < 0:
        return None
    boundary, candidate = codeboundary[index]
    if not isinstance(candidate, CodeBuf):
        return None
    return candidate


# patterns for the lines read by CodeBuf.load_vi_array():
#   re_int      a decimal integer (an array length or a vinfo address)
#   re_ctvinfo  "ct <flags> <value>"  -> CompileTimeVInfo
#   re_rtvinfo  "rt <source>"         -> RunTimeVInfo
#   re_vtvinfo  "vt 0x<hex>"          -> VirtualTimeVInfo
re_int = re.compile(r"(\-?\d+)$")
re_ctvinfo = re.compile(r"ct (\d+) (\-?\d+)$")
re_rtvinfo = re.compile(r"rt (\-?\d+)$")
re_vtvinfo = re.compile(r"vt 0x([0-9a-fA-F]+)$")

# NOTE(review): not referenced anywhere in this part of the file;
# presumably used by modules importing xam -- confirm.
LOC_LOCALS_PLUS = 3

class CodeBuf:
    """A code buffer described by a 'CodeBufferObject' line of a Psyco dump.

    Several attributes are computed by __getattr__() the first time they
    are read and are then stored in their slot:
      data            the raw machine code, loaded from the BigBuffer
      cache_text      the disassembly listing, as a list of lines
      disass_text     cache_text followed by the 'do_promotion' dictionary
      reverse_lookup  list of (offset, codebuf) for buffers pointing here
      vlocals         the vinfo array read from the dump file
    'dumpfile', 'vlocalsofs', 'complete_list' and 'codemap' are not set by
    __init__(); they must be assigned by the code creating the buffers.
    """
    __slots__ = ['mode', 'co_filename', 'co_name', 'nextinstr', 'addr',
                 'stackdepth', 'specdict', 'data', 'cache_text',
                 'disass_text', 'reverse_lookup', 'vlocals',
                 'complete_list', 'dumpfile', 'vlocalsofs', 'codemap']
    # format name passed to machine_code_dump(); set by readdump()
    machine_code_format = '?'

    def __init__(self, mode, co_filename, co_name, nextinstr,
                 addr, stackdepth):
        self.mode = mode
        self.co_filename = co_filename
        self.co_name = co_name
        self.nextinstr = nextinstr
        self.addr = addr
        #self.data = data
        self.stackdepth = stackdepth
        #self.reverse_lookup = []  # list of (offset, codebuf pointing there)
        self.specdict = []    # list of (key, value), see spec_dict()
        if self.mode != "proxy":
            # register the start of this buffer; the -0.5 keeps the key
            # distinct from any integer address looked up by codeat()
            codeboundary.append((self.addr-0.5, self))
        else:
            # proxies are not registered and have no machine code
            self.data = ""
        #for i in range(4, len(data)+1):
        #    offset, = struct.unpack('l', data[i-4:i])
        #    rawtargets.setdefault(addr+i+offset, {})[self] = 1

    def getboundary(self):
        """Return (prev, next, bigbuf) from the sorted 'codeboundary' list:
        the objects registered just before and just after this buffer's
        own key, and the first BigBuffer found from 'next' onwards."""
        i = bisect.bisect(codeboundary, (self.addr-0.5, self))
        prev = codeboundary[i-1][1]
        next = codeboundary[i][1]
        #while not isinstance(next, BigBuffer) and next.addr == self.addr:
        #    i = i + 1
        #    next = codeboundary[i][1]
        while not isinstance(codeboundary[i][1], BigBuffer):
            i = i + 1
        bigbuf = codeboundary[i][1]
        return prev, next, bigbuf

    def splitheader(self):
        """Return (data, addr, next, key) with the optional header removed.

        If, after any leading 0xCC bytes, the data starts with four 0x66
        bytes, the two following 32-bit words are returned as 'next' and
        'key', and 'data' and 'addr' are advanced past them.  Otherwise
        the data and address are returned unchanged and 'next' and 'key'
        are None.
        """
        data = self.data
        addr = self.addr
        k = 0
        while data[k:k+1] == '\xCC':
            k = k + 1
        if data[k:k+4] == '\x66\x66\x66\x66':
            # detected a rt_local_buf_t structure
            # ('=' forces 4-byte words: a plain 'LL' needs 16 bytes on
            # LP64 platforms and cannot unpack this 8-byte slice there)
            next, key = struct.unpack('=LL', data[k+4:k+12])
            data = data[k+12:]
            addr += k+12
        else:
            next = key = None
        return data, addr, next, key

    def __getattr__(self, attr):
        # Only called for attributes whose slot is still empty: compute
        # the value, store it in the slot and return it.  Each value is
        # stored only once it is complete, so that a failure does not
        # leave a partial result behind.
        if attr == 'data':
            prev, next, bigbuf = self.getboundary()
            assert prev is self
            self.data = data = bigbuf.load(self.addr, next.addr)
            return data
        if attr == 'cache_text':
            # produce the disassembly listing
            data, addr, next, key = self.splitheader()
            text = []
            if key is not None:
                text.append(
                    'Created by promotion of the value 0x%x\n' % key)
            if next is not None:
                text.append(
                    'Next promoted value at buffer 0x%x\n' % next)
            text += machine_code_dump(data, addr,
                                      CodeBuf.machine_code_format)
            self.cache_text = text
            return text
        if attr == 'disass_text':
            # work on a copy: appending to cache_text itself would repeat
            # the dictionary each time disass_text is rebuilt after a call
            # to spec_dict()
            txt = list(self.cache_text)
            if self.specdict:
                txt.append('\n')
                txt.append("'do_promotion' dictionary:\n")
                for key, value in self.specdict:
                    txt.append('.\t%s:\t\t\n' % htmlquote(key))
                    txt.append('.\t\t0x%x\t\t\n' % value)
            self.disass_text = txt
            return txt
        if attr == 'reverse_lookup':
            # 'reverse_lookup' is a list of (offset, codebuf pointing there)
            lookup = []
            start = self.addr
            end = start + len(self.data)
            for codebuf in self.complete_list:
                # any_pointer() is a quick filter on the raw bytes; only
                # the buffers passing it have their listing scanned
                if any_pointer(codebuf.addr, codebuf.data, start, end):
                    for line in codebuf.disass_text:
                        for addr in lineaddresses(line):
                            if start <= addr < end:
                                lookup.append((addr-start, codebuf))
            self.reverse_lookup = lookup
            return lookup
        if attr == 'vlocals':
            self.dumpfile.seek(self.vlocalsofs)
            # address 0 stands for "no vinfo" and maps to None
            self.vlocals = self.load_vi_array({0: None})
            return self.vlocals
        raise AttributeError(attr)

    def load_vi_array(self, d):
        """Read a vinfo array from the current position of the dump file.

        The array starts with a line holding its length, followed for each
        item by a line holding the item's address.  An address not yet in
        'd' is followed by the item's description ("ct", "rt" or "vt"
        line) and, recursively, by the item's own sub-array.  'd' maps the
        addresses already seen to their VInfo object and is updated in
        place.  The items are returned in reverse order of appearance.
        """
        dumpfile = self.dumpfile
        match = re_int.match(dumpfile.readline())
        assert match
        count = int(match.group(1))
        a = []
        for i in range(count):
            line = dumpfile.readline()
            match = re_int.match(line)
            assert match
            addr = long(match.group(1))
            if addr in d:
                vi = d[addr]
            else:
                line = dumpfile.readline()
                match = re_ctvinfo.match(line)
                if match:
                    vi = CompileTimeVInfo(int(match.group(1)),
                                          long(match.group(2)))
                else:
                    match = re_rtvinfo.match(line)
                    if match:
                        vi = RunTimeVInfo(long(match.group(1)), self.stackdepth)
                    else:
                        match = re_vtvinfo.match(line)
                        assert match
                        vi = VirtualTimeVInfo(long(match.group(1), 16))
                # record the item before reading its sub-array, so that
                # later references to the same address reuse it
                d[addr] = vi
                vi.addr = addr
                vi.array = self.load_vi_array(d)
            a.append(vi)
        a.reverse()
        return a

    def get_next_instr(self):
        """Return 'nextinstr', or None when it is negative."""
        if self.nextinstr >= 0:
            return self.nextinstr
        return None

    def spec_dict(self, key, value):
        """Record an entry of the 'do_promotion' dictionary and drop the
        computed attributes that depend on it."""
        self.specdict.append((key, value))
        #rawtargets.setdefault(value, {})[self] = 1
        for attr in ('disass_text', 'reverse_lookup'):
            try:
                delattr(self, attr)
            except AttributeError:
                # the attribute was not computed yet
                pass

##    def build_reverse_lookup(self):
##        for line in self.disass_text:
##            for addr in lineaddresses(line):
##                sym = symbols.get(addr)
##                if isinstance(sym, CodeBuf):
##                    sym.reverse_lookup.append((addr-sym.addr, self))

    def disassemble(self, symtext=symtext, linetext=None, snapshot=None):
        """Return the annotated disassembly listing as a single string.

        symtext(sym, addr, codebuf, lineaddr) returns the annotation for
        an operand address resolved to 'sym'; only the first operand of a
        line that resolves to a symbol or a code buffer is annotated.
        linetext(line, lineaddr), if given, rewrites the lines that other
        code refers to.  snapshot(proxy), if given, is called for each
        buffer that 'codemap' lists at the address of a line, and its
        result is inserted before the first line seen at that address.
        """
        seen = {}
        data = []
        for line in self.disass_text:
            if line.endswith('\n'):
                line = line[:-1]
            match = re_lineaddr.match(line)
            if match:
                lineaddr = long(match.group(1), 16)
                if lineaddr not in seen:
                    if lineaddr in self.codemap and snapshot:
                        for proxy in self.codemap[lineaddr]:
                            data.append(snapshot(proxy))
                    seen[lineaddr] = 1
                ofs = lineaddr - self.addr
                sources = [c for o, c in self.reverse_lookup if o == ofs]
                if sources and linetext:
                    line = linetext(line, lineaddr)
                # a blank line precedes the lines referenced from a
                # buffer other than this one
                if sources != [self]*len(sources):
                    data.append('\n')
            else:
                lineaddr = None
            for addr in lineaddresses(line):
                sym = symbols.get(addr) or codeat(addr)
                if sym:
                    line = '%s\t(%s)' % (line, symtext(sym, addr, self,lineaddr))
                    break
            data.append(line + '\n')
        return ''.join(data)


class BigBuffer:
    """A region of 'length' bytes of raw code stored in the dump file,
    corresponding to the addresses from 'start' up to 'addr' (the end
    address, exclusive)."""
    __slots__ = ['file', 'offset', 'start', 'length', 'addr', 'priority']

    def __init__(self, file, start, length):
        # 'file' must be positioned on the first byte of the region; the
        # region is skipped here and only read later on, by load().
        end = start + length
        self.file = file
        self.start = start
        self.length = length
        self.addr = end
        self.offset = file.tell()
        self.priority = -len(codeboundary)
        # registered under the end address; the -0.25 keeps the key
        # distinct from the keys of the code buffers (address - 0.5)
        codeboundary.append((end - 0.25, self))
        file.seek(length, 1)

    def load(self, begin, end):
        """Return the bytes stored for the addresses from 'begin' up to
        'end', cut off at the end of the region."""
        in_region = self.start <= begin <= self.addr
        assert in_region, \
               (hex(self.start), hex(begin), hex(end), hex(self.addr))
        stop = min(self.addr, end)
        position = self.offset + (begin - self.start)
        self.file.seek(position)
        return self.file.read(stop - begin)


class VInfo:
    # Base class of the vinfo descriptions built by
    # CodeBuf.load_vi_array(), which sets both slots after creating the
    # object: 'addr' is the address read from the dump and 'array' the
    # list of sub-items.
    __slots__ = ['addr', 'array']

class CompileTimeVInfo(VInfo):
    """Description of a compile-time value (a "ct" line of the dump).

    Bit 0 of 'flags' is reported as "fixed" and bit 1 as "reference".
    """
    __slots__ = ['flags', 'value']

    def __init__(self, flags, value):
        self.flags = flags
        self.value = value

    def gettext(self):
        """Return the full description of the value."""
        parts = ["Compile-time value 0x%x" % self.value]
        if self.flags & 1:
            parts.append(", fixed")
        if self.flags & 2:
            parts.append(", reference")
        return "".join(parts)

    def getsummarytext(self):
        """Return the short description of the value."""
        if self.flags & 1:
            prefix = "Compile-time fixed"
        else:
            prefix = "Compile-time"
        return prefix + " 0x%x" % self.value

class RunTimeVInfo(VInfo):
    """Description of a run-time value (an "rt" line of the dump).

    'source' is a bit field: the bits from 28 up give the register
    number, the mask 0x03FFFFFC gives the stack position, a clear bit
    0x08000000 is reported as "holding a reference" and a set bit
    0x04000000 as ">=0".  'stackdepth' may be None.
    """
    __slots__ = ['source', 'stackdepth']
    REG_NAMES = ["eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"]

    def __init__(self, source, stackdepth=None):
        self.source = source
        self.stackdepth = stackdepth

    def gettext(self):
        """Return the full description; the wording of the location
        depends on CodeBuf.machine_code_format."""
        source = self.source
        reg = source >> 28
        stack = source & 0x03FFFFFC
        parts = ["Run-time source,"]
        if CodeBuf.machine_code_format == 'ivm':
            if reg:
                parts.append(" in a register ??????")
            if stack:
                parts.append(" in stack [%d] or from top #%d" % (
                    (self.stackdepth-stack)/4,
                    stack/4))
            else:
                parts.append(" not in stack ??????")
        else:
            if 0 <= reg < 8:
                parts.append(" in register %s" % self.REG_NAMES[reg].upper())
                if stack:
                    parts.append(" and")
            if stack:
                if self.stackdepth is not None:
                    sd = "[ESP+0x%x] or " % (self.stackdepth - stack)
                else:
                    sd = ""
                parts.append(" in stack %sfrom top %d" % (sd, stack))
        if not (source & 0x08000000):
            parts.append(" holding a reference")
        if source & 0x04000000:
            parts.append(" >=0")
        return "".join(parts)

    def getsummarytext(self):
        """Return the short description."""
        return "Run-time"

class VirtualTimeVInfo(VInfo):
    """Description of a virtual-time value (a "vt" line of the dump);
    'vs' is the number read from that line."""
    __slots__ = ['vs']

    def __init__(self, vs):
        self.vs = vs

    def gettext(self):
        """Return the full description."""
        source = self.vs
        return "Virtual-time source (%x)" % source

    def getsummarytext(self):
        """Return the short description."""
        source = self.vs
        return "Virtual-time (%x)" % source

def readdump(filename = 'psyco.dump'):
    del codeboundary[:]
    re_header = re.compile(r"Psyco dump [[](\w+?)[]]")
    re_symb1 = re.compile(r"(\w+?)[:]\s0x([0-9a-fA-F]+)")
    re_codebuf = re.compile(r"CodeBufferObject 0x([0-9a-fA-F]+) (\-?\d+) \'(.*?)\' \'(.*?)\' (\-?\d+) \'(.*?)\'$")
    re_specdict = re.compile(r"spec_dict 0x([0-9a-fA-F]+)")
    re_vinfo_array = re.compile(r"vinfo_array")
    re_spec1 = re.compile(r"0x([0-9a-fA-F]+)\s(.*)$")
    re_bigbuffer = re.compile(r"BigBuffer 0x([0-9a-fA-F]+) (\d+)$")
    
    codebufs = []
    dumpfile = open(filename, 'rb')
    match = re_header.match(dumpfile.readline())
    if not match:
        raise ValueError, "'%s' does not look like a Psyco dump" % filename
    CodeBuf.machine_code_format = match.group(1)
    
    bufcount, = struct.unpack("i", dumpfile.read(4))
    buftable = list(struct.unpack("l"*bufcount, dumpfile.read(4*bufcount)))
    buftable.reverse()
    if buftable:
        filesize = buftable[-1]
    else:
        filesize = sys.maxint
    filesize *= 1.0
    nextp = 0.1
    cbsortedsize = 0
    for filename in symbolfiles:
        line = dumpfile.readline()
        match = re_symb1.match(line)
        assert match
        load_symbol_file(filename, match.group(1), long(match.group(2), 16))
    while 1:
        line = dumpfile.readline()
        if not line:
            print "Note: unexpected end of file"
            break
        #print line.strip()
        match = re_codebuf.match(line)
        if match:
            percent = dumpfile.tell() / filesize
            if percent >= nextp:
                print >> sys.stderr, '%d%%...' % int(100*percent),
                nextp += 0.1
            #size = int(match.group(2))
            #data = dumpfile.read(size)
            #assert len(data) == size
            codebuf = CodeBuf(match.group(6), match.group(3), match.group(4),
                              int(match.group(5)), long(match.group(1), 16),
                              int(match.group(2)))
            codebuf.dumpfile = dumpfile
            codebuf.vlocalsofs = buftable.pop()
            codebufs.append(codebuf)
        else:
            match = re_specdict.match(line)
            if match:
                addr = long(match.group(1), 16)
                if len(codeboundary) != cbsortedsize:
                    codeboundary.sort()
                    cbsortedsize = len(codeboundary)
                codebuf = codeat(addr-4)
                if codebuf is None:
                    raise "spec_dict with no matching code buffer", line
                while 1:
                    line = dumpfile.readline()
                    if len(line)<=1:
                        break
                    match = re_spec1.match(line)
                    assert match
                    codebuf.spec_dict(match.group(2), long(match.group(1), 16))
            elif re_vinfo_array.match(line):
                assert len(codebufs) == bufcount
                break
            else:
                match = re_bigbuffer.match(line)
                if match:
                    BigBuffer(dumpfile, long(match.group(1), 16),
                              int(match.group(2)))
                else:
                    raise "invalid line", line
    print >> sys.stderr, 'sorting...',
    if len(codeboundary) != cbsortedsize:
        codeboundary.sort()
    codemap = {}
    #cblist = []
    codebufs.reverse()
    for codebuf in codebufs:
        codebuf.complete_list = codebufs
        codebuf.codemap = codemap
        codemap.setdefault(codebuf.addr, []).insert(0, codebuf)
        #prev, next, bigbuf = codebuf.getboundary()
        #cblist.append((bigbuf.priority, codebuf.addr, codebuf))
    #cblist.sort()
    #codebufs[:] = [codebuf for priority, addr, codebuf in cblist]
    print >> sys.stderr, '100%'
    return codebufs

if __name__ == '__main__':
    if len(sys.argv) > 1:
        codebufs = readdump(sys.argv[1])
    else:
        codebufs = readdump()
    for codebuf in codebufs:
        print codebuf.disassemble()