1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; RUN: llc -O0 < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -check-prefix=CHECK_O0
; CHECK: vmovaps
; CHECK: vmovaps
; CHECK: vmovaps
; CHECK: vmovaps
; CHECK: vmovaps
; CHECK: vmovaps
define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp {
entry:
%0 = bitcast double* %d to <4 x double>*
%tmp1.i = load <4 x double>, <4 x double>* %0, align 32
%1 = bitcast float* %f to <8 x float>*
%tmp1.i17 = load <8 x float>, <8 x float>* %1, align 32
%tmp1.i16 = load <4 x i64>, <4 x i64>* %i, align 32
tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind
store <4 x double> %tmp1.i, <4 x double>* %0, align 32
store <8 x float> %tmp1.i17, <8 x float>* %1, align 32
store <4 x i64> %tmp1.i16, <4 x i64>* %i, align 32
ret void
}
declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
;;
;; The two tests below check that we must fold load + scalar_to_vector
;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
; CHECK: mov00
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
%val = load float, float* %ptr
; CHECK: vmovss (%
%i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %i0
; CHECK: ret
}
; CHECK: mov01
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
%val = load double, double* %ptr
; CHECK: vmovsd (%
%i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
ret <4 x double> %i0
; CHECK: ret
}
; CHECK: vmovaps %ymm
define void @storev16i16(<16 x i16> %a) nounwind {
store <16 x i16> %a, <16 x i16>* undef, align 32
unreachable
}
; CHECK: storev16i16_01
; CHECK: vextractf128
; CHECK: vmovups %xmm
define void @storev16i16_01(<16 x i16> %a) nounwind {
store <16 x i16> %a, <16 x i16>* undef, align 4
unreachable
}
; CHECK: storev32i8
; CHECK: vmovaps %ymm
define void @storev32i8(<32 x i8> %a) nounwind {
store <32 x i8> %a, <32 x i8>* undef, align 32
unreachable
}
; CHECK: storev32i8_01
; CHECK: vextractf128
; CHECK: vmovups %xmm
define void @storev32i8_01(<32 x i8> %a) nounwind {
store <32 x i8> %a, <32 x i8>* undef, align 4
unreachable
}
; It is faster to make two saves, if the data is already in XMM registers. For
; example, after making an integer operation.
; CHECK: _double_save
; CHECK-NOT: vinsertf128 $1
; CHECK-NOT: vinsertf128 $0
; CHECK: vmovaps %xmm
; CHECK: vmovaps %xmm
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
entry:
%Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %Z, <8 x i32>* %P, align 16
ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
; CHECK_O0: _f_f
; CHECK-O0: vmovss LCPI
; CHECK-O0: vxorps %xmm
; CHECK-O0: vmovss %xmm
define void @f_f() nounwind {
allocas:
br i1 undef, label %cif_mask_all, label %cif_mask_mixed
cif_mask_all: ; preds = %allocas
unreachable
cif_mask_mixed: ; preds = %allocas
br i1 undef, label %cif_mixed_test_all, label %cif_mixed_test_any_check
cif_mixed_test_all: ; preds = %cif_mask_mixed
call void @llvm.x86.avx.maskstore.ps.256(i8* undef, <8 x i32> <i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x float> undef) nounwind
unreachable
cif_mixed_test_any_check: ; preds = %cif_mask_mixed
unreachable
}
; CHECK: add8i32
; CHECK: vmovups
; CHECK: vmovups
; CHECK-NOT: vinsertf128
; CHECK-NOT: vextractf128
; CHECK: vmovups
; CHECK: vmovups
define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
%b = load <8 x i32>, <8 x i32>* %bp, align 1
%x = add <8 x i32> zeroinitializer, %b
store <8 x i32> %x, <8 x i32>* %ret, align 1
ret void
}
; CHECK: add4i64a64
; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
%b = load <4 x i64>, <4 x i64>* %bp, align 64
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 64
ret void
}
; CHECK: add4i64a16
; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
%b = load <4 x i64>, <4 x i64>* %bp, align 16
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 16
ret void
}
|