1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte0:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0)
ret float %ret
}
define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
ret float %ret
}
define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
ret float %ret
}
define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3)
ret float %ret
}
define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1)
store i32 %ret, ptr addrspace(1) %out
ret void
}
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%tmp1 = bitcast i32 %tmp0 to float
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2)
store i32 %ret, ptr addrspace(1) %out
ret void
}
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }
|