1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
  
     | 
    
      // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -DWMMA_GFX1100_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
typedef float  v4f   __attribute__((ext_vector_type(4)));
typedef half   v8h   __attribute__((ext_vector_type(8)));
typedef half   v16h  __attribute__((ext_vector_type(16)));
typedef int    v2i   __attribute__((ext_vector_type(2)));
typedef int    v4i   __attribute__((ext_vector_type(4)));
typedef short  v8s   __attribute__((ext_vector_type(8)));
typedef short  v16s  __attribute__((ext_vector_type(16)));
#ifdef WMMA_GFX1100_TESTS
// Wave64
//
// amdgcn_wmma_f32_16x16x16_f16
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a, b, c);
}
//
// amdgcn_wmma_f32_16x16x16_bf16
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f c)
{
  *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a, b, c);
}
//
// amdgcn_wmma_f16_16x16x16_f16
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h c)
{
  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a, b, c, true);
}
//
// amdgcn_wmma_bf16_16x16x16_bf16
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8s c)
{
  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
}
//
// amdgcn_wmma_f16_16x16x16_f16_tied
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c)
{
  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
}
//
// amdgcn_wmma_bf16_16x16x16_bf16_tied
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c)
{
  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a, b, c, true);
}
//
// amdgcn_wmma_i32_16x16x16_iu8
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
{
  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a, true, b, c, false);
}
//
// amdgcn_wmma_i32_16x16x16_iu4
//
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
// CHECK-GFX1100-NEXT:  entry:
// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT:    ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c)
{
  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a, true, b, c, false);
}
#endif
 
     |