; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s
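; Interleaved-access generation is disabled on the RUN line, so the strided
; loads from @src_data are costed as gathers rather than as interleave groups.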
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@kernel = global [512 x float] zeroinitializer, align 16
@kernel2 = global [512 x float] zeroinitializer, align 16
@kernel3 = global [512 x float] zeroinitializer, align 16
@kernel4 = global [512 x float] zeroinitializer, align 16
@src_data = global [1536 x float] zeroinitializer, align 16
@r_ = global i8 0, align 1
@g_ = global i8 0, align 1
@b_ = global i8 0, align 1
; We don't want to vectorize most loops containing gathers because they are
; expensive. This function sits right at the point where vectorization would
; start to become beneficial.
; Make sure the cost model stays conservative and does not vectorize it.
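;
; For reference, a rough C equivalent of the loop below, reconstructed from
; the IR (variable names are approximate, not from the original source):
;
;   void test(unsigned long size, unsigned long offset) {
;     float r = 0.0f, g = 0.0f, b = 0.0f;
;     for (unsigned long v = 0; v < size; ++v) {
;       // The kernel product is shared by all three accumulators; the
;       // fast-math flags permit the reassociation seen in the IR.
;       float k = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
;       r += src_data[3 * (v + offset) + 0] * k;
;       g += src_data[3 * (v + offset) + 1] * k;
;       b += src_data[3 * (v + offset) + 2] * k;
;     }
;     r_ = (unsigned char)r;
;     g_ = (unsigned char)g;
;     b_ = (unsigned char)b;
;   }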
define void @_Z4testmm(i64 %size, i64 %offset) {
; CHECK-LABEL: @_Z4testmm(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[V_055:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET:%.*]]
; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[ADD]], 3
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[V_055]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[V_055]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[V_055]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[V_055]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
; CHECK-NEXT:    [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
; CHECK-NEXT:    [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[ARRAYIDX_SUM]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
; CHECK-NEXT:    [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
; CHECK-NEXT:    [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
; CHECK-NEXT:    [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[ARRAYIDX_SUM52]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4
; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
; CHECK-NEXT:    [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
; CHECK-NEXT:    [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
; CHECK-NEXT:    [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
; CHECK-NEXT:    [[INC]] = add i64 [[V_055]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
; CHECK-NEXT:    [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
; CHECK-NEXT:    [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    store i8 [[R_0_LCSSA]], ptr @r_, align 1
; CHECK-NEXT:    store i8 [[G_0_LCSSA]], ptr @g_, align 1
; CHECK-NEXT:    store i8 [[B_0_LCSSA]], ptr @b_, align 1
; CHECK-NEXT:    ret void
;
entry:
  %cmp53 = icmp eq i64 %size, 0
  br i1 %cmp53, label %for.end, label %for.body.lr.ph
for.body.lr.ph:
  br label %for.body
for.body:
  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
  %add = add i64 %v.055, %offset
  %mul = mul i64 %add, 3
  %arrayidx = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %v.055
  %1 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %0, %1
  %arrayidx4 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %v.055
  %2 = load float, ptr %arrayidx4, align 4
  %mul5 = fmul fast float %mul3, %2
  %arrayidx6 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %v.055
  %3 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul fast float %mul5, %3
  %arrayidx8 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %v.055
  %4 = load float, ptr %arrayidx8, align 4
  %mul9 = fmul fast float %mul7, %4
  %add10 = fadd fast float %r.057, %mul9
  %arrayidx.sum = add i64 %mul, 1
  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum
  %5 = load float, ptr %arrayidx11, align 4
  %mul13 = fmul fast float %1, %5
  %mul15 = fmul fast float %2, %mul13
  %mul17 = fmul fast float %3, %mul15
  %mul19 = fmul fast float %4, %mul17
  %add20 = fadd fast float %g.056, %mul19
  %arrayidx.sum52 = add i64 %mul, 2
  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum52
  %6 = load float, ptr %arrayidx21, align 4
  %mul23 = fmul fast float %1, %6
  %mul25 = fmul fast float %2, %mul23
  %mul27 = fmul fast float %3, %mul25
  %mul29 = fmul fast float %4, %mul27
  %add30 = fadd fast float %b.054, %mul29
  %inc = add i64 %v.055, 1
  %exitcond = icmp ne i64 %inc, %size
  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge:
  %add30.lcssa = phi float [ %add30, %for.body ]
  %add20.lcssa = phi float [ %add20, %for.body ]
  %add10.lcssa = phi float [ %add10, %for.body ]
  %phitmp = fptoui float %add10.lcssa to i8
  %phitmp60 = fptoui float %add20.lcssa to i8
  %phitmp61 = fptoui float %add30.lcssa to i8
  br label %for.end
for.end:
  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  store i8 %r.0.lcssa, ptr @r_, align 1
  store i8 %g.0.lcssa, ptr @g_, align 1
  store i8 %b.0.lcssa, ptr @b_, align 1
  ret void
}