File: amx.asm

package info (click to toggle)
nasm 3.01-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 23,660 kB
  • sloc: ansic: 129,101; asm: 40,471; perl: 8,238; sh: 4,146; makefile: 1,281; xml: 726; python: 582; lisp: 578; sed: 11
file content (125 lines) | stat: -rw-r--r-- 4,376 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
	bits 64

;-----------------------------------------------------------------------
; amx_mem3 mnemonic, tile
;   Helper for amx: emits "mnemonic tile, mem" three times, once for
;   each of the address forms [rax], [rax+rdx] and [rax+rdx*2], in
;   that order.
;-----------------------------------------------------------------------
%macro amx_mem3 2
	%1	%2, [rax]
	%1	%2, [rax+rdx]
	%1	%2, [rax+rdx*2]
%endmacro

;-----------------------------------------------------------------------
; amx_row2 mnemonic, zmm, tile, imm
;   Helper for amx: emits "mnemonic zmm, tile, src" twice, first with
;   the register eax as the last operand, then with the immediate imm.
;-----------------------------------------------------------------------
%macro amx_row2 4
	%1	%2, %3, eax
	%1	%2, %3, %4
%endmacro

;-----------------------------------------------------------------------
; amx a, b, c
;   Emits one copy of every instruction form listed below, using
;       treg  = tmm<a>      treg2 = tmm<b>      treg3 = tmm<c>
;       zreg  = zmm<a>
;   The first argument is additionally used verbatim as the immediate
;   last operand of the zmm-destination instructions.
;
;   treg/treg2/treg3/zreg are plain %define's, so they stay defined
;   (with the values of the most recent invocation) after the macro
;   has been expanded.
;
;   The ";AMX..." tag at the end of a line is carried over from the
;   original listing; presumably it names the ISA extension that the
;   instruction(s) on that line belong to.
;-----------------------------------------------------------------------
%macro amx 3
  %define zreg  zmm %+ %1
  %define treg  tmm %+ %1
  %define treg2 tmm %+ %2
  %define treg3 tmm %+ %3

	; ---- configuration load/store, tile zeroing ----
	ldtilecfg	[rsi]				;AMX_TILE
	sttilecfg	[rdi]				;AMX_TILE

	tilezero	treg				;AMX_TILE

	; ---- loads: three address forms per mnemonic ----
	amx_mem3	tileloadd, treg			;AMX_TILE
	amx_mem3	tileloaddt1, treg		;AMX_TILE
	amx_mem3	tileloaddrs, treg		;AMX-MOVRS
	amx_mem3	tileloaddrst1, treg		;AMX-MOVRS

	; ---- three-tile-operand forms ----
	tdpbf16ps	treg, treg2, treg3		;AMX-BF16
	tdpbssd		treg, treg2, treg3		;AMX_INT8
	tdpbusd		treg, treg2, treg3		;AMX_INT8
	tdpbsud		treg, treg2, treg3		;AMX_INT8
	tdpbuud		treg, treg2, treg3		;AMX_INT8
	tdpfp16ps	treg, treg2, treg3		;AMX-FP16
	tcmmimfp16ps	treg, treg2, treg3		;AMX-COMPLEX
	tcmmrlfp16ps	treg, treg2, treg3		;AMX-COMPLEX

	tmmultf32ps	treg, treg2, treg3		;AMX_TF32

	tdpbf8ps	treg, treg2, treg3		;AMX-FP8
	tdpbhf8ps	treg, treg2, treg3		;AMX-FP8
	tdphbf8ps	treg, treg2, treg3		;AMX-FP8
	tdphf8ps	treg, treg2, treg3		;AMX-FP8

	; ---- zmm <- tile forms: eax variant, then immediate variant ----
	amx_row2	tcvtrowd2ps, zreg, treg, %1	;AMX-AVX512
	amx_row2	tcvtrowps2bf16h, zreg, treg, %1	;AMX-AVX512
	amx_row2	tcvtrowps2bf16l, zreg, treg, %1	;AMX-AVX512
	amx_row2	tcvtrowps2phh, zreg, treg, %1	;AMX-AVX512
	amx_row2	tcvtrowps2phl, zreg, treg, %1	;AMX-AVX512
	amx_row2	tilemovrow, zreg, treg, %1	;AMX-AVX512

	; The 16 AMX-TRANSPOSE instructions that follow were all removed in
	; the 59th edition of "Intel Architecture Instruction Set Extensions
	; and Future Features Programming Reference" (September 2025,
	; 319433-059).  Like PCOMMIT, they are tagged 'NEVER', so the
	; obsolete-removed warning is switched off while they are assembled
	; and put back to its original setting afterwards.

	[warning -obsolete-removed]

	amx_mem3	t2rpntlvwz0, treg		;AMX-TRANSPOSE
	amx_mem3	t2rpntlvwz0t1, treg		;AMX-TRANSPOSE
	amx_mem3	t2rpntlvwz1, treg		;AMX-TRANSPOSE
	amx_mem3	t2rpntlvwz1t1, treg		;AMX-TRANSPOSE

	ttransposed	treg, treg			;AMX-TRANSPOSE

	amx_mem3	t2rpntlvwz0rs, treg		;AMX-TRANSPOSE + AMX-MOVRS
	amx_mem3	t2rpntlvwz0rst1, treg		;AMX-TRANSPOSE + AMX-MOVRS
	amx_mem3	t2rpntlvwz1rs, treg		;AMX-TRANSPOSE + AMX-MOVRS
	amx_mem3	t2rpntlvwz1rst1, treg		;AMX-TRANSPOSE + AMX-MOVRS

	ttdpbf16ps		treg, treg2, treg3	;AMX-TRANSPOSE + AMX-BF16
	ttdpfp16ps		treg, treg2, treg3	;AMX-TRANSPOSE + AMX-FP16
	ttcmmimfp16ps		treg, treg2, treg3	;AMX-TRANSPOSE + AMX-COMPLEX
	ttcmmrlfp16ps		treg, treg2, treg3	;AMX-TRANSPOSE + AMX-COMPLEX
	tconjtcmmimfp16ps	treg, treg2, treg3	;AMX-TRANSPOSE + AMX-COMPLEX
	tconjtfp16		treg, treg		;AMX-TRANSPOSE + AMX-COMPLEX

	ttmmultf32ps		treg, treg2, treg3	;AMX-TRANSPOSE + AMX_TF32

	[warning *obsolete-removed]

	; ---- stores ----
	; Written out literally (not via a helper) because these use the
	; comma-separated address form, and a comma inside a macro argument
	; would split it into two parameters.
	tilestored	[rax], treg			;AMX_TILE
	tilestored	[rax,rdx], treg			;AMX_TILE
	tilestored	[rax,rdx*2], treg		;AMX_TILE

	tilerelease					;AMX_TILE
%endmacro

; Expand amx eight times, once for each n = 0..7, with the register
; triple (n, n+1, n+2) taken modulo 8:
;     (0,1,2) (1,2,3) ... (6,7,0) (7,0,1)
; Invariant at every amx call: m == (n+1) % 8 and l == (n+2) % 8.
; After the loop n, m, l are back at 0, 1, 2.
%assign n 0
%assign m 1
%assign l 2
%rep 8
	amx	n, m, l
  ; Advance n first, then re-derive m and l from the new n.
  %assign n ((n + 1) % 8)
  %assign m ((n + 1) % 8)
  %assign l ((n + 2) % 8)
%endrep