File: xedscan.py

package info (click to toggle)
acl2 8.6%2Bdfsg-2
  • links: PTS
  • area: main
  • in suites: trixie
  • size: 1,111,420 kB
  • sloc: lisp: 17,818,294; java: 125,359; python: 28,122; javascript: 23,458; cpp: 18,851; ansic: 11,569; perl: 7,678; xml: 5,591; sh: 3,976; makefile: 3,833; ruby: 2,633; yacc: 1,126; ml: 763; awk: 295; csh: 233; lex: 197; php: 178; tcl: 49; asm: 23; haskell: 17
file content (201 lines) | stat: -rwxr-xr-x 7,529 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env python3
# X86ISA Library
# Copyright (C) 2024 Kestrel Technology, LLC
#
# License: A 3-clause BSD license. See the file books/3BSD-mod.txt.
#
# Author: Sol Swords (sswords@gmail.com)

import sys
import re

usage = """Usage: xedscan.py datafile.txt output.lsp

NOTE: (Sol Swords, 1/4/2025)

At the moment this is a small proof of concept for generating
inst-listing.lisp opcode map entries by parsing XED data files.
See the comments in this python script for more details.
"""


# This was designed to pick up the X87 instruction set which was
# missing from our opcode maps. These use the "escape" opcodes D8
# through DF so we look for these opcodes and ignore all others. While
# the SDM treats these instructions specially (has tables for them
# separate from the standard opcode maps), their decoding isn't much
# different -- they just pack a lot of instructions into 8 opcodes by
# using MOD and sometimes RM bits as opcode extensions.  These are
# denoted in xed-isa.txt (from the xed source distribution) as in the
# following examples:

# PATTERN   : 0xDC MOD[mm] MOD!=3 REG[0b110] RM[nnn] MODRM()

# This breaks down as:
# 0xDC -- the opcode (multibyte opcodes are formatted as e.g. 0x0F 0x01)
# MOD[mm] -- I think this notation just means that this field isn't fixed, maybe binds some variable mm to the decoded value?
# MOD!=3 -- signifies the MOD field of MODRM can't be 3
# REG[0b110] -- REG field of MODRM must be 6
# RM[nnn] -- again signifies RM field isn't fixed, maybe bound to nnn
# MODRM() -- maybe signifies that a MODRM byte is required? not sure, not always present.

# PATTERN   : 0xD9 MOD[0b11] MOD=3 REG[0b100] RM[0b001]

# This breaks down as:
# 0xD9: opcode
# MOD[0b11] -- MOD field must be 3
# MOD=3     -- MOD field must be 3, not sure what difference is to the above -- they always coincide
# REG[0b100] -- REG field must be 4
# RM[0b001] -- RM field must be 1.

# Despite not understanding a fair amount about these, this is
# sufficient to make basic opcode map entries for these x87 instructions.

# One thing we haven't explored is generating ARG entries for better instruction decoding.

# Exception conditions don't seem to be covered by the XED data files
# unless I'm missing something. We've hardcoded exceptions common to
# x87 instructions.

# Further use of the XED data files to extend / check accuracy of our
# opcode maps would be a good idea.  This script maybe provides a
# starting point, though there are lots of cases it won't handle.



# Exactly two positional arguments are required: the xed data file to
# read and the Lisp file to write.  On any other argument count, print
# the usage text and stop, rather than continuing and failing below
# with an IndexError (too few) or silently ignoring extras (too many).
if len(sys.argv) != 3:
    print(usage, file=sys.stderr)
    sys.exit(1)

inname = sys.argv[1]    # path of the xed data file to parse
outname = sys.argv[2]   # path of the generated output file

# Integer constants in xed datafiles seem to be either decimal or
# prefixed 0x or 0b.  But many places can have either an integer or
# some other string value, e.g. the RM field may be an integer or
# 'nnn'. So in those cases we catch the exception and return the input string.
def maybe_parse_integer(tok):
    """Convert a token to an int when possible, else return it unchanged.

    A token whose first two characters are "0x" has the remainder read
    as hexadecimal; one starting with "0b" has the remainder read as
    binary; any other token is handed to int() as-is.  When the
    conversion raises ValueError (for instance for a placeholder such
    as 'nnn'), the original token is returned.
    """
    radix_by_prefix = {"0x": 16, "0b": 2}
    radix = radix_by_prefix.get(tok[0:2])
    try:
        if radix is None:
            return int(tok)
        return int(tok[2:], radix)
    except ValueError:
        # Not a number after all: hand the token back untouched.
        return tok


# At the moment I'm recognizing patterns that contain the following sorts of elements:
# 0xAB -- opcode bytes (only at the beginning)
# KEY[val] where val is perhaps a number
# KEY=val where val is perhaps a number (not sure if there's any difference)
#  (note we don't include KEY!=val here -- we just say key can't end in !)
# Other strings, which are just stored as keys associated to True.
def parse_pattern(pat):
    """Parse the value of a PATTERN line into a dictionary.

    The pattern is split on whitespace.  Leading tokens that start with
    "0x" are opcode bytes; they are folded, first byte most significant,
    into one integer stored under 'opcode' (0 when there are none).
    Each remaining token is then classified as one of:

      KEY[val]  -> obj[KEY] = maybe_parse_integer(val)
      KEY=val   -> obj[KEY] = maybe_parse_integer(val)
                   (KEY may not end in '!', so 'MOD!=3' is not split)
      other     -> obj[token] = True      (e.g. 'MOD!=3', 'MODRM()')

    A later token with the same KEY overwrites an earlier one.

    Raises ValueError if a leading "0x" token is not valid hexadecimal.
    """
    obj = {}
    # split() without an argument, so that runs of spaces or tabs do not
    # produce empty tokens that would end the opcode prefix early.
    tokens = pat.split()
    opcode = 0
    first_field = 0
    for tok in tokens:
        if not tok.startswith("0x"):
            break
        opcode = (opcode << 8) + int(tok[2:], 16)
        first_field += 1
    obj['opcode'] = opcode
    # Slice the token list instead of sharing one iterator between the
    # two loops: the token that ends the opcode prefix is itself a field
    # and must be classified, not consumed and discarded.
    for tok in tokens[first_field:]:
        if (m := re.search(r"^(?P<key>.*)\[(?P<val>.*)\]$", tok)):
            obj[m.group('key')] = maybe_parse_integer(m.group('val'))
        elif (m := re.search(r"^(?P<key>.*[^!])=(?P<val>.*)$", tok)):
            obj[m.group('key')] = maybe_parse_integer(m.group('val'))
        else:
            obj[tok] = True
    return obj
        
            
    
# This just reads lines of the form KEY : VAL until we reach a closing
# } (on its own line).  For the PATTERN key, we parse the value using
# parse_pattern above, otherwise we just store the value as a string.
def parse_inst(infile):
    """Read one instruction record from infile and return it as a dict.

    infile is an iterator over text lines.  Lines are consumed until one
    that consists only of "}" (after stripping) is reached:

      * a line whose backslash is immediately followed by the line
        terminator is joined with the following line first;
      * blank lines and lines starting with "#" are skipped;
      * lines without a ":" are reported on stderr and skipped;
      * otherwise the line is split at the first ":" into KEY and VAL,
        both stripped.  The PATTERN value is parsed with parse_pattern;
        every other value is stored as a string.

    A repeated key overwrites the earlier value.  If the input ends
    before the closing "}", a warning is printed on stderr and the
    entries gathered so far are returned.
    """
    obj = {}
    for line in infile:
        while len(line) >= 2 and line[-2] == '\\':
            # Drop the backslash and line terminator and append the next
            # line.  next() is given a default so that a continuation
            # marker on the very last line of the input does not escape
            # from this function as StopIteration.
            following = next(infile, None)
            if following is None:
                line = line[:-2]
                break
            line = line[:-2] + following
        line = line.strip()
        if line == "}":
            return obj
        if not line or line.startswith("#"):
            continue
        sides = line.split(":", 1)
        if len(sides) != 2:
            print("bad line: " + line, file=sys.stderr)
            continue
        key = sides[0].strip()
        val = sides[1].strip()
        if key == "PATTERN":
            val = parse_pattern(val)
        obj[key] = val
    # Input exhausted without seeing "}": return what was read instead
    # of falling off the end and implicitly returning None.
    print("unterminated instruction block (missing '}')", file=sys.stderr)
    return obj

# Read and parse the xed data file into a bunch of inst objects, which
# are just dictionaries; at the moment only the PATTERN entry is
# parsed further, the rest are just stored as strings.
# List of instruction records (dictionaries) in the order they appear
# in the data file.
insts = []
with open(inname, "r", encoding="utf-8") as infile:
    for line in infile:
        while len(line) >= 2 and line[-2] == '\\':
            # Join subsequent lines while they end in a backslash.  The
            # default passed to next() keeps a continuation marker on
            # the last line of the file from raising StopIteration.
            following = next(infile, None)
            if following is None:
                line = line[:-2]
                break
            line = line[:-2] + following
        # A record starts at a line consisting only of "{"; parse_inst
        # then consumes lines from the same file object, so this loop
        # resumes after the record.  All other lines are ignored here.
        if line.strip() == "{":
            insts.append(parse_inst(infile))

def parse_features(extension):
    """Return the :FEAT clause text for an EXTENSION value.

    "X87" yields " :FEAT '(:FPU)" (including its leading space); every
    other value yields the empty string.
    """
    # Only X87 is mapped so far.  A generic fallback of the form
    # " :FEAT '(:<extension>)" was sketched here but is not enabled.
    return " :FEAT '(:FPU)" if extension == "X87" else ""

def lisp_comment(string):
    """Return string with ";; " inserted at the start of every line.

    Line breaks are kept as they are, so the result has the same line
    structure as the input.  An empty input gives an empty result.
    """
    return "".join(";; " + piece for piece in string.splitlines(True))

# Write out a list of INST forms (opcode map entries) based on what was read from the file
# Template for one opcode map entry.  Fields, in order: instruction
# name (ICLASS), opcode in hex, REG value, MOD value, optional
# " :R/M n" clause, optional " :FEAT ..." clause.  The exception list
# is the same hardcoded one for every entry.
inst_template = '''\
    (INST "%s"
          (OP :OP #x%x :REG %s :MOD %s%s%s)
          (ARG) ;; bozo x87 conventions
          nil
          '((:UD (UD-LOCK-USED))
            (:NM (NM-CR0-TS-IS-1)
                 (NM-CR0-EM-IS-1))))\n'''

with open(outname, "w", encoding="utf-8") as outfile:
    outfile.write(";; Generated using:\n;; xedscan.py %s %s\n\n" % (inname, outname))
    for inst in insts:
        pattern = inst["PATTERN"]
        opcode = pattern['opcode']
        # Only the single-byte opcodes D8 through DF are emitted;
        # every other instruction record is skipped.
        if not (0xd8 <= opcode <= 0xdf):
            continue

        rm = pattern["RM"]
        name_field = inst["ICLASS"]
        reg_field = str(pattern["REG"])
        # A 'MOD!=3' token in the pattern is written as :MEM; otherwise
        # the parsed MOD value is used.
        if 'MOD!=3' in pattern:
            mod_field = ":MEM"
        else:
            mod_field = str(pattern["MOD"])
        # RM of 'nnn' means the field is not fixed, so no :R/M clause.
        if rm == 'nnn':
            rm_field = ""
        else:
            rm_field = " :R/M " + str(rm)
        if "EXTENSION" in inst:
            feat_field = parse_features(inst["EXTENSION"])
        else:
            feat_field = ""

        inst_entry = inst_template % (name_field, opcode, reg_field,
                                      mod_field, rm_field, feat_field)

        # Entries marked UNDOCUMENTED are still written, but commented out.
        if "ATTRIBUTES" in inst and "UNDOCUMENTED" in inst["ATTRIBUTES"]:
            inst_entry = ";; Undocumented (from xed-isa.txt):\n" + lisp_comment(inst_entry)

        outfile.write(inst_entry)