File: avx512bf16.asm

package info (click to toggle)
nasm 3.01-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 23,660 kB
  • sloc: ansic: 129,101; asm: 40,471; perl: 8,238; sh: 4,146; makefile: 1,281; xml: 726; python: 582; lisp: 578; sed: 11
file content (108 lines) | stat: -rw-r--r-- 3,094 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
; Assembler input listing operand forms of three AVX512-BF16 mnemonics:
; vcvtne2ps2bf16, vcvtneps2bf16 and vdpbf16ps.
; This part covers the unmasked forms (no {k} / {z} decorators).
; Code is assembled for 32-bit mode, so addresses use eax/ecx/edx.
BITS 32
	; Register-only operands at xmm, ymm and zmm width.
	vcvtne2ps2bf16 xmm1, xmm2, xmm3
	vcvtne2ps2bf16 ymm1, ymm2, ymm3
	vcvtne2ps2bf16 zmm1, zmm2, zmm3

	; Two-operand conversion: xmm and ymm sources both write an xmm
	; destination, a zmm source writes a ymm destination.
	vcvtneps2bf16 xmm1, xmm2
	vcvtneps2bf16 xmm1, ymm2
	vcvtneps2bf16 ymm1, zmm2

	vdpbf16ps xmm1, xmm2, xmm3
	vdpbf16ps ymm1, ymm2, ymm3
	vdpbf16ps zmm1, zmm2, zmm3

	; Same forms with the last operand in memory (full-width load).
	; NOTE(review): the displacements (none, +1, +64) look chosen to cover
	; no-displacement, a non-compressible and an EVEX disp8*N-compressible
	; offset -- confirm against the expected encodings.
	vcvtne2ps2bf16 xmm1, xmm2, [eax]
	vcvtne2ps2bf16 ymm1, ymm2, [ecx+1]
	vcvtne2ps2bf16 zmm1, zmm2, [2*edx+64]

	; The destination is xmm for both the 128-bit and the 256-bit source,
	; so the register alone does not fix the memory size: oword/yword
	; spell it out. The ymm destination needs no size keyword.
	vcvtneps2bf16 xmm1, oword [eax]
	vcvtneps2bf16 xmm1, yword [ecx+1]
	vcvtneps2bf16 ymm1, [2*edx+64]

	vdpbf16ps xmm1, xmm2, [eax]
	vdpbf16ps ymm1, ymm2, [ecx+1]
	vdpbf16ps zmm1, zmm2, [2*edx+64]

	; Same forms with an embedded-broadcast memory operand. The count in
	; {1toN} follows the source width: 4 / 8 / 16 for xmm / ymm / zmm.
	vcvtne2ps2bf16 xmm1, xmm2, [eax]{1to4}
	vcvtne2ps2bf16 ymm1, ymm2, [ecx+1]{1to8}
	vcvtne2ps2bf16 zmm1, zmm2, [2*edx+4]{1to16}

	; Here {1to4} vs {1to8} is what distinguishes the two xmm-destination
	; forms, taking the place of oword/yword above.
	vcvtneps2bf16 xmm1, [eax]{1to4}
	vcvtneps2bf16 xmm1, [ecx+1]{1to8}
	vcvtneps2bf16 ymm1, [2*edx+4]{1to16}

	vdpbf16ps xmm1, xmm2, [eax]{1to4}
	vdpbf16ps ymm1, ymm2, [ecx+1]{1to8}
	vdpbf16ps zmm1, zmm2, [2*edx+4]{1to16}

	; Opmask forms: every destination carries {k1} without {z}.
	; Operand shapes follow three groups: register, memory, broadcast.

	; Register-only operands at xmm, ymm and zmm width.
	vcvtne2ps2bf16 xmm1 {k1}, xmm2, xmm3
	vcvtne2ps2bf16 ymm1 {k1}, ymm2, ymm3
	vcvtne2ps2bf16 zmm1 {k1}, zmm2, zmm3

	; xmm and ymm sources both write an xmm destination; zmm writes ymm.
	vcvtneps2bf16 xmm1 {k1}, xmm2
	vcvtneps2bf16 xmm1 {k1}, ymm2
	vcvtneps2bf16 ymm1 {k1}, zmm2

	vdpbf16ps xmm1 {k1}, xmm2, xmm3
	vdpbf16ps ymm1 {k1}, ymm2, ymm3
	vdpbf16ps zmm1 {k1}, zmm2, zmm3

	; Last operand in memory (full-width load).
	vcvtne2ps2bf16 xmm1 {k1}, xmm2, [eax]
	vcvtne2ps2bf16 ymm1 {k1}, ymm2, [ecx+1]
	vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*edx+64]

	; oword/yword select between the two xmm-destination forms, which the
	; destination register alone cannot tell apart.
	vcvtneps2bf16 xmm1 {k1}, oword [eax]
	vcvtneps2bf16 xmm1 {k1}, yword [ecx+1]
	vcvtneps2bf16 ymm1 {k1}, [2*edx+64]

	vdpbf16ps xmm1 {k1}, xmm2, [eax]
	vdpbf16ps ymm1 {k1}, ymm2, [ecx+1]
	vdpbf16ps zmm1 {k1}, zmm2, [2*edx+64]

	; Embedded-broadcast memory operand; {1toN} follows the source width.
	vcvtne2ps2bf16 xmm1 {k1}, xmm2, [eax]{1to4}
	vcvtne2ps2bf16 ymm1 {k1}, ymm2, [ecx+1]{1to8}
	vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*edx+4]{1to16}

	; {1to4} vs {1to8} distinguishes the two xmm-destination forms.
	vcvtneps2bf16 xmm1 {k1}, [eax]{1to4}
	vcvtneps2bf16 xmm1 {k1}, [ecx+1]{1to8}
	vcvtneps2bf16 ymm1 {k1}, [2*edx+4]{1to16}

	vdpbf16ps xmm1 {k1}, xmm2, [eax]{1to4}
	vdpbf16ps ymm1 {k1}, ymm2, [ecx+1]{1to8}
	vdpbf16ps zmm1 {k1}, zmm2, [2*edx+4]{1to16}

	; Zeroing-masked ({k1}{z}) register-only forms of all three mnemonics,
	; at xmm, ymm and zmm width.
	; All nine lines carry {z} so that the register-register zeroing
	; encodings of the two conversion instructions are covered as well,
	; not just those of vdpbf16ps.
	; NOTE(review): if a reference binary or listing is stored for this
	; test, regenerate it -- the first six lines now assemble with the
	; zeroing bit set.
	vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, xmm3
	vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, ymm3
	vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, zmm3

	; xmm and ymm sources both write an xmm destination; zmm writes ymm.
	vcvtneps2bf16 xmm1 {k1}{z}, xmm2
	vcvtneps2bf16 xmm1 {k1}{z}, ymm2
	vcvtneps2bf16 ymm1 {k1}{z}, zmm2

	vdpbf16ps xmm1 {k1}{z}, xmm2, xmm3
	vdpbf16ps ymm1 {k1}{z}, ymm2, ymm3
	vdpbf16ps zmm1 {k1}{z}, zmm2, zmm3

	; Zeroing-masked ({k1}{z}) forms with the last operand in memory:
	; first as a full-width load, then as an embedded broadcast.

	; Full-width memory operand.
	vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [eax]
	vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [ecx+1]
	vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*edx+64]

	; Both of the first two lines write xmm1, so oword/yword is what
	; selects the 128-bit or 256-bit source.
	vcvtneps2bf16 xmm1 {k1}{z}, oword [eax]
	vcvtneps2bf16 xmm1 {k1}{z}, yword [ecx+1]
	vcvtneps2bf16 ymm1 {k1}{z}, [2*edx+64]

	vdpbf16ps xmm1 {k1}{z}, xmm2, [eax]
	vdpbf16ps ymm1 {k1}{z}, ymm2, [ecx+1]
	vdpbf16ps zmm1 {k1}{z}, zmm2, [2*edx+64]

	; Embedded-broadcast memory operand; {1toN} follows the source width
	; (4 / 8 / 16 for xmm / ymm / zmm).
	vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [eax]{1to4}
	vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [ecx+1]{1to8}
	vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*edx+4]{1to16}

	; {1to4} vs {1to8} distinguishes the two xmm-destination forms.
	vcvtneps2bf16 xmm1 {k1}{z}, [eax]{1to4}
	vcvtneps2bf16 xmm1 {k1}{z}, [ecx+1]{1to8}
	vcvtneps2bf16 ymm1 {k1}{z}, [2*edx+4]{1to16}

	vdpbf16ps xmm1 {k1}{z}, xmm2, [eax]{1to4}
	vdpbf16ps ymm1 {k1}{z}, ymm2, [ecx+1]{1to8}
	vdpbf16ps zmm1 {k1}{z}, zmm2, [2*edx+4]{1to16}