1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
|
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// X86spec reads the ``Intel® 64 and IA-32 Architectures Software Developer's Manual''
// to collect instruction encoding details and writes those details to standard output
// in CSV format.
//
// Usage:
//
// x86spec [-f file] [-u url] >x86.csv
//
// The -f flag specifies the input file (default x86manual.pdf), the Intel instruction
// set reference manual in PDF form.
// If the input file does not exist, it will be created by downloading the manual.
//
// The -u flag specifies the URL from which to download the manual
// (default https://golang.org/s/x86manual, which redirects to Intel's site).
// The URL is downloaded only when the file named by the -f flag is missing.
//
// There are additional debugging flags, not shown. Run x86spec -help for the list.
//
// File Format
//
// TODO: Mention comments at top of file.
// TODO: Mention that this is version 0.2 of the file.
// TODO: Mention that file format will change incompatibly until version 1.0.
//
// Each CSV line contains these fields:
//
// 1. The Intel manual instruction mnemonic. For example, "SHR r/m32, imm8".
//
// 2. The Go assembler instruction mnemonic. For example, "SHRL imm8, r/m32".
//
// 3. The GNU binutils instruction mnemonic. For example, "shrl imm8, r/m32".
//
// 4. The instruction encoding. For example, "C1 /5 ib".
//
// 5. The validity of the instruction in 32-bit (aka compatibility, legacy) mode.
//
// 6. The validity of the instruction in 64-bit mode.
//
// 7. The CPUID feature flags that signal support for the instruction.
//
// 8. Additional comma-separated tags containing hints about the instruction.
//
// 9. The read/write actions of the instruction on the arguments used in
// the Intel mnemonic. For example, "rw,r" to denote that "SHR r/m32, imm8"
// reads and writes its first argument but only reads its second argument.
//
// 10. Whether the opcode used in the Intel mnemonic has encoding forms
// distinguished only by operand size, like most arithmetic instructions.
// The string "Y" indicates yes, the string "" indicates no.
//
// 11. The data size of the operation in bits. In general this is the size corresponding
// to the Go and GNU assembler opcode suffix.
//
// The complete line used for the above examples is:
//
// "SHR r/m32, imm8","SHRL imm8, r/m32","shrl imm8, r/m32","C1 /5 ib","V","V","","operand32","rw,r","Y","32"
//
// Mnemonics
//
// The instruction mnemonics are as used in the Intel manual, with a few exceptions.
//
// Mnemonics claiming general memory forms but that really require fixed addressing modes
// are omitted in favor of their equivalents with implicit arguments.
// For example, "CMPS m16, m16" (really CMPS [SI], [DI]) is omitted in favor of "CMPSW".
//
// Instruction forms with an explicit REP, REPE, or REPNE prefix are also omitted.
// Encoders and decoders are expected to handle those prefixes separately.
//
// Perhaps most significantly, the argument syntaxes used in the mnemonic indicate
// exactly how to derive the argument from the instruction encoding, or vice versa.
//
// Immediate values: imm8, imm8u, imm16, imm16u, imm32, imm64.
// Immediates are signed by default; the u suffix indicates an unsigned value.
//
// Memory operands. The forms m, m128, m14/28byte, m16, m16&16, m16&32, m16&64, m16:16, m16:32,
// m16:64, m16int, m256, m2byte, m32, m32&32, m32fp, m32int, m512byte, m64, m64fp, m64int,
// m8, m80bcd, m80dec, m80fp, m94/108byte. These operands always correspond to the
// memory address specified by the r/m half of the modrm encoding.
//
// Integer registers.
// The forms r8, r16, r32, r64 indicate a register selected by the modrm reg encoding.
// The forms rmr16, rmr32, rmr64 indicate a register (never memory) selected by the modrm r/m encoding.
// The forms r/m8, r/m16, r/m32, and r/m64 indicate a register or memory selected by the modrm r/m encoding.
// Forms with two sizes, like r32/m16 also indicate a register or memory selected by the modrm r/m encoding,
// but the size for a register argument differs from the size of a memory argument.
// The forms r8V, r16V, r32V, r64V indicate a register selected by the VEX.vvvv bits.
//
// Multimedia registers.
// The forms mm1, xmm1, and ymm1 indicate a multimedia register selected by the
// modrm reg encoding.
// The forms mm2, xmm2, and ymm2 indicate a register (never memory) selected by
// the modrm r/m encoding.
// The forms mm2/m64, xmm2/m128, and so on indicate a register or memory
// selected by the modrm r/m encoding.
// The forms xmmV and ymmV indicate a register selected by the VEX.vvvv bits.
// The forms xmmI and ymmI indicate a register selected by the top four bits of an /is4 immediate byte.
//
// Bound registers.
// The form bnd1 indicates a bound register selected by the modrm reg encoding.
// The form bnd2 indicates a bound register (never memory) selected by the modrm r/m encoding.
// The forms bnd2/m64 and bnd2/m128 indicate a register or memory selected by the modrm r/m encoding.
// TODO: Describe mib.
//
// One-of-a-kind operands: rel8, rel16, rel32, ptr16:16, ptr16:32,
// moffs8, moffs16, moffs32, moffs64, vm32x, vm32y, vm64x, and vm64y
// are all as in the Intel manual.
//
// Encodings
//
// The encodings are also as used in the Intel manual, with automated corrections.
// For example, the Intel manual sometimes omits the modrm /r indicator or other trailing bytes,
// and it also contains typographical errors.
// These problems are corrected so that the CSV data may be used to generate
// tools for processing x86 machine code.
// See https://golang.org/x/arch/x86/x86map for one such generator.
//
// Valid32 and Valid64
//
// These columns hold validity abbreviations as defined in the Intel manual:
// V, I, N.E., N.P., N.S., or N.I.
// Tools processing the data are typically only concerned with whether the
// column is "V" (valid) or not.
// This data is also corrected compared to the manual.
// For example, the manual lists many instruction forms using REX bytes
// with an incorrect "V" in the Valid32 column.
//
// CPUID Feature Flags
//
// This column specifies CPUID feature flags that must be present in order
// to use the instruction. If multiple flags are required,
// they are listed separated by plus signs, as in PCLMULQDQ+AVX.
// The column can also list one of the values 486, Pentium, PentiumII, and P6,
// indicating that the instruction was introduced on that architecture version.
//
// Tags
//
// The tag column does not correspond to a traditional column in the Intel manual tables.
// Instead, it is itself a comma-separated list of tags or hints derived by analysis
// of the instruction set or the instruction encodings.
//
// The tags address16, address32, and address64 indicate that the instruction form
// applies when using the specified addressing size. It may therefore be necessary to use an
// address size prefix byte to access the instruction.
// If two address tags are listed, the instruction can be used with either of those
// address sizes. An instruction will never list all three address sizes.
// (In fact, today, no instruction lists two address sizes, but that may change.)
//
// The tags operand16, operand32, and operand64 indicate that the instruction form
// applies when using the specified operand size. It may therefore be necessary to use an
// operand size prefix byte to access the instruction.
// If two operand tags are listed, the instruction can be used with either of those
// operand sizes. An instruction will never list all three operand sizes.
//
// The tags modrm_regonly or modrm_memonly indicate that the modrm byte's
// r/m encoding must specify a register or memory, respectively.
// Especially in newer instructions, the modrm constraint may be the only way
// to distinguish two instruction forms. For example the MOVHLPS and MOVLPS
// instructions share the same encoding, except that the former requires the
// modrm byte's r/m to indicate a register, while the latter requires it to indicate memory.
//
// The tags pseudo and pseudo64 indicate that this instruction form is redundant
// with others listed in the table and should be ignored when generating disassembly
// or instruction scanning programs. The pseudo64 tag is reserved for the case where
// the manual lists an instruction twice, once with the optional 64-bit mode REX byte.
// Since most decoders will handle the REX byte separately, the form with the
// unnecessary REX is tagged pseudo64.
//
// Corrections and Additions
//
// The x86spec program makes various corrections to the Intel manual data
// as part of extracting the information. Those corrections are described above.
//
// The x86spec program also adds a few well-known undocumented instructions,
// such as UD1 and FFREEP.
//
// Examples
//
// The latest version of the CSV file is available in this Git repository and also
// online at https://golang.org/s/x86.csv. It is meant to be human-readable for
// quick reference and also to be input for generating tools that operate on
// x86 machine code.
//
// To print instruction syntaxes introduced by the Pentium II and P6,
// using https://rsc.io/csv2tsv to prepare the table for processing by awk:
//
// csv2tsv x86.csv | awk -F'\t' '$7 == "PentiumII" || $7 == "P6" { print $1 }'
//
// The x86map program (https://golang.org/x/arch/x86/x86map)
// reads the CSV file and generates an x86 instruction decoder in the form
// of a simple byte-code program. This decoder is the core of the disassembler
// in the x86asm package (https://golang.org/x/arch/x86/x86asm).
//
package main
import (
"bufio"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"sort"
"strings"
)
// specFormatVersion is presumably the version of the CSV file format
// (the package comment refers to version 0.2 of the file); it is not
// referenced by the code in this part of the file.
const specFormatVersion = "0.2"
// Command-line flags.
var (
	flagFile      = flag.String("f", "x86manual.pdf", "read manual from `file`, downloading if necessary")
	flagURL       = flag.String("u", "https://golang.org/s/x86manual", "use `url` for download if needed")
	flagDebugPage = flag.String("debugpage", "", "debug page `n` of the manual (can be comma-separated list)")
	flagCompat    = flag.Bool("compat", false, "print compatibility statements")
)

// State derived from the command line. Both values are set by flags
// and are true exactly when -debugpage was given a non-empty value.
var (
	debugging, onlySomePages bool
)
// instruction describes one instruction form collected from the manual.
// The fields that write emits become the columns of the CSV output; the
// column numbers below refer to the field list in the package comment.
type instruction struct {
// NOTE(review): presumably the manual page the form was found on — confirm against parse.
page int
// CSV column 4: the instruction encoding.
opcode string
// CSV column 1: the Intel manual instruction mnemonic.
syntax string
// CSV column 6: validity in 64-bit mode.
valid64 string
// CSV column 5: validity in 32-bit mode.
valid32 string
// CSV column 7: CPUID feature flags.
cpuid string
// Not written to the CSV by write.
// NOTE(review): presumably the manual's description text — confirm.
desc string
// CSV column 8: tags, joined with commas when written.
tags []string
// Not referenced in this part of the file.
// NOTE(review): presumably the parsed arguments of syntax — confirm.
args []string
seq int // for use by cleanup
// Not written to the CSV by write.
// NOTE(review): possibly related to the -compat flag — confirm.
compat string
// CSV column 9: read/write actions on the arguments.
action string
// CSV column 10: multisize indicator.
multisize string
// CSV column 11: data size in bits; zero is written as an empty field.
datasize int
// CSV column 3: the GNU binutils instruction mnemonic.
gnuSyntax string
// CSV column 2: the Go assembler instruction mnemonic.
goSyntax string
}
// main drives the program: it processes the command line, makes sure the
// manual file is present, obtains the instruction list from parse and
// cleanup, runs format over it, sorts it with bySyntax, and writes the
// result to standard output as CSV.
func main() {
	log.SetPrefix("x86spec: ")
	log.SetFlags(0)

	flags()
	download()

	insts := cleanup(parse())
	format(insts)
	sort.Sort(bySyntax(insts))
	write(os.Stdout, insts)
}
func flags() {
flag.Usage = func() {
fmt.Fprintf(os.Stderr, "usage: x86spec [options]\n")
flag.PrintDefaults()
os.Exit(2)
}
flag.Parse()
if flag.NArg() != 0 {
flag.Usage()
}
debugging = *flagDebugPage != ""
onlySomePages = *flagDebugPage != ""
}
// download fetches the manual from *flagURL into *flagFile when os.Stat
// reports that the file does not exist; otherwise it does nothing.
// Any failure is fatal. A partially written file is removed before
// exiting, because a leftover truncated file would make later runs skip
// the download and read a corrupt manual.
func download() {
	if _, err := os.Stat(*flagFile); !os.IsNotExist(err) {
		return
	}

	// Try downloading.
	log.Printf("downloading manual to %s", *flagFile)
	resp, err := http.Get(*flagURL)
	if err != nil {
		log.Fatal(err)
	}
	// Release the connection on the normal return path. The fatal paths
	// below exit the process, which does not run deferred calls.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Fatal(resp.Status)
	}

	f, err := os.Create(*flagFile)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := io.Copy(f, resp.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		f.Close()
		os.Remove(*flagFile)
		log.Fatal(err)
	}
	if err := f.Close(); err != nil {
		// A failed close can mean the data never reached the disk.
		os.Remove(*flagFile)
		log.Fatal(err)
	}
}
// write emits one CSV line per instruction to w through a buffered writer.
// The columns are, in order: Intel syntax, Go syntax, GNU syntax, opcode,
// valid32, valid64, cpuid, comma-joined tags, action, multisize, and
// datasize. A zero datasize is written as an empty field.
//
// A failure to write the output is fatal, so a truncated CSV is never
// produced with a successful exit status.
func write(w io.Writer, insts []*instruction) {
	bw := bufio.NewWriter(w)
	for _, inst := range insts {
		datasize := ""
		if inst.datasize != 0 {
			datasize = fmt.Sprint(inst.datasize)
		}
		writeCSV(bw, inst.syntax, inst.goSyntax, inst.gnuSyntax, inst.opcode, inst.valid32, inst.valid64, inst.cpuid, strings.Join(inst.tags, ","), inst.action, inst.multisize, datasize)
	}
	// A bufio.Writer remembers the first write error and returns it from
	// Flush, so checking here covers every line written above.
	if err := bw.Flush(); err != nil {
		log.Fatal(err)
	}
}
// writeCSV writes args to w as a single CSV record terminated by a newline.
// Every field is wrapped in double quotes, and any double quote inside a
// field is doubled.
//
// The encoding/csv package is deliberately not used: it has no "always
// quote" writing mode, and always-quoted output is a little easier to
// process with non-CSV tools like grep.
func writeCSV(w io.Writer, args ...string) {
	for idx := range args {
		if idx != 0 {
			fmt.Fprintf(w, ",")
		}
		escaped := strings.Replace(args[idx], `"`, `""`, -1)
		fmt.Fprintf(w, `"%s"`, escaped)
	}
	fmt.Fprintf(w, "\n")
}
|