// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
// Test ARM64 SIMD fused multiply-add and multiply-subtract intrinsics
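// Each vfma intrinsic computes a1 + a2 * a3, with a1 as the accumulator; it is
// lowered to llvm.fma(a2, a3, a1), and the CHECK lines verify that operand order.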
#include <arm_neon.h>
float32x2_t test_vfma_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
// CHECK: test_vfma_f32
return vfma_f32(a1, a2, a3);
// CHECK: llvm.fma.v2f32({{.*a2, .*a3, .*a1}})
// CHECK-NEXT: ret
}
float32x4_t test_vfmaq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
// CHECK: test_vfmaq_f32
return vfmaq_f32(a1, a2, a3);
// CHECK: llvm.fma.v4f32({{.*a2, .*a3, .*a1}})
// CHECK-NEXT: ret
}
float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
// CHECK: test_vfmaq_f64
return vfmaq_f64(a1, a2, a3);
// CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
// CHECK-NEXT: ret
}
float32x2_t test_vfma_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
// CHECK: test_vfma_lane_f32
return vfma_lane_f32(a1, a2, a3, 1);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 1 (usually a shufflevector)
// CHECK: llvm.fma.v2f32(<2 x float> %a2, <2 x float> {{.*}}, <2 x float> %a1)
// CHECK-NEXT: ret
}
float32x4_t test_vfmaq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
// CHECK: test_vfmaq_lane_f32
return vfmaq_lane_f32(a1, a2, a3, 1);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 1 (usually a shufflevector)
// CHECK: llvm.fma.v4f32(<4 x float> %a2, <4 x float> {{.*}}, <4 x float> %a1)
// CHECK-NEXT: ret
}
float64x2_t test_vfmaq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
// CHECK: test_vfmaq_lane_f64
return vfmaq_lane_f64(a1, a2, a3, 0);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 0 (usually a shufflevector)
// CHECK: llvm.fma.v2f64(<2 x double> %a2, <2 x double> {{.*}}, <2 x double> %a1)
// CHECK-NEXT: ret
}
float32x2_t test_vfma_n_f32(float32x2_t a1, float32x2_t a2, float32_t a3) {
// CHECK: test_vfma_n_f32
return vfma_n_f32(a1, a2, a3);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 0 (usually two insertelements)
// CHECK: llvm.fma.v2f32
// CHECK-NEXT: ret
}
float32x4_t test_vfmaq_n_f32(float32x4_t a1, float32x4_t a2, float32_t a3) {
// CHECK: test_vfmaq_n_f32
return vfmaq_n_f32(a1, a2, a3);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 0 (usually four insertelements)
// CHECK: llvm.fma.v4f32
// CHECK-NEXT: ret
}
float64x2_t test_vfmaq_n_f64(float64x2_t a1, float64x2_t a2, float64_t a3) {
// CHECK: test_vfmaq_n_f64
return vfmaq_n_f64(a1, a2, a3);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 0 (usually two insertelements)
// CHECK: llvm.fma.v2f64
// CHECK-NEXT: ret
}
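// The vfms variants compute a1 - a2 * a3. The negation of one multiplicand
// shows up as an fsub feeding the same llvm.fma call, so the checks below
// match the fsub first and then the fma.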
float32x2_t test_vfms_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
// CHECK: test_vfms_f32
return vfms_f32(a1, a2, a3);
// CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
// CHECK: llvm.fma.v2f32(<2 x float> %a3, <2 x float> [[NEG]], <2 x float> %a1)
// CHECK-NEXT: ret
}
float32x4_t test_vfmsq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
// CHECK: test_vfmsq_f32
return vfmsq_f32(a1, a2, a3);
// CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
// CHECK: llvm.fma.v4f32(<4 x float> %a3, <4 x float> [[NEG]], <4 x float> %a1)
// CHECK-NEXT: ret
}
float64x2_t test_vfmsq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
// CHECK: test_vfmsq_f64
return vfmsq_f64(a1, a2, a3);
// CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
// CHECK: llvm.fma.v2f64(<2 x double> %a3, <2 x double> [[NEG]], <2 x double> %a1)
// CHECK-NEXT: ret
}
float32x2_t test_vfms_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
// CHECK: test_vfms_lane_f32
return vfms_lane_f32(a1, a2, a3, 1);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 1 (usually a shufflevector)
// CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
// CHECK: llvm.fma.v2f32(<2 x float> {{.*}}, <2 x float> [[LANE]], <2 x float> %a1)
// CHECK-NEXT: ret
}
float32x4_t test_vfmsq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
// CHECK: test_vfmsq_lane_f32
return vfmsq_lane_f32(a1, a2, a3, 1);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 1 (usually a shufflevector)
// CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
// CHECK: llvm.fma.v4f32(<4 x float> {{.*}}, <4 x float> [[LANE]], <4 x float> %a1)
// CHECK-NEXT: ret
}
float64x2_t test_vfmsq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
// CHECK: test_vfmsq_lane_f64
return vfmsq_lane_f64(a1, a2, a3, 0);
// NB: the test below is deliberately loose, so that we don't depend too much
// upon the exact IR used to select lane 0 (usually a shufflevector)
// CHECK: [[NEG:%.*]] = fsub <1 x double> {{.*}}, %a3
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[NEG]]
// CHECK: llvm.fma.v2f64(<2 x double> {{.*}}, <2 x double> [[LANE]], <2 x double> %a1)
// CHECK-NEXT: ret
}