# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops -verify-machineinstrs %s -o - | FileCheck %s
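#
# The loop below ANDs its VCTP predicate with a loop-invariant mask that is
# spilled to the stack (VSTR_P0_off) and reloaded on every iteration
# (VLDR_P0_off). The autogenerated CHECK lines expect the explicit MVE_VCTP32
# instructions to be folded away and the loop to be converted into
# tail-predicated DLSTP/LETP form.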
--- |
  ; Function Attrs: nofree norecurse nounwind
  define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i16 zeroext %mask) local_unnamed_addr #0 {
  bb:
    %tmp = icmp eq i32 %arg2, 0
    %tmp1 = add i32 %arg2, 3
    %tmp2 = lshr i32 %tmp1, 2
    %tmp3 = shl nuw i32 %tmp2, 2
    %tmp4 = add i32 %tmp3, -4
    %tmp5 = lshr i32 %tmp4, 2
    %tmp6 = add nuw nsw i32 %tmp5, 1
    %conv.mask = zext i16 %mask to i32
    %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask)
    br i1 %tmp, label %bb27, label %bb3
  bb3:                                              ; preds = %bb
    %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6)
    br label %bb9
  bb9:                                              ; preds = %bb9, %bb3
    %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
    %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
    %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ]
    %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
    %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
    %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
    %and = and <4 x i1> %vctp, %invariant.mask
    %tmp11 = sub i32 %tmp8, 4
    %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef)
    %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
    %tmp20 = and <4 x i1> %tmp18, %vctp
    %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef)
    %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20)
    %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
    %tmp13 = icmp ne i32 %tmp12, 0
    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
    %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
    br i1 %tmp13, label %bb9, label %bb27
  bb27:                                             ; preds = %bb9, %bb
    ret void
  }
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
  declare i32 @llvm.start.loop.iterations.i32(i32)
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
...
---
name:            test
alignment:       2
exposesReturnsTwice: false
legalized:       false
regBankSelected: false
selected:        false
failedISel:      false
tracksRegLiveness: true
hasWinCFI:       false
registers:       []
liveins:
  - { reg: '$r0', virtual-reg: '' }
  - { reg: '$r1', virtual-reg: '' }
  - { reg: '$r2', virtual-reg: '' }
  - { reg: '$r3', virtual-reg: '' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap:     false
  hasPatchPoint:   false
  stackSize:       12
  offsetAdjustment: -4
  maxAlignment:    4
  adjustsStack:    false
  hasCalls:        false
  stackProtector:  ''
  maxCallFrameSize: 0
  cvBytesOfCalleeSavedRegisters: 0
  hasOpaqueSPAdjustment: false
  hasVAStart:      false
  hasMustTailInVarArgFunc: false
  localFrameSize:  0
  savePoint:       ''
  restorePoint:    ''
fixedStack:      []
stack:
  - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
  - { id: 1, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
  - { id: 2, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites:       []
constants:       []
machineFunctionInfo: {}
body:             |
  ; CHECK-LABEL: name: test
  ; CHECK: bb.0.bb:
  ; CHECK:   successors: %bb.3(0x30000000), %bb.1(0x50000000)
  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
  ; CHECK:   dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg
  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_register $r7
  ; CHECK:   $sp = frame-setup tSUBspi $sp, 1, 14 /* CC::al */, $noreg
  ; CHECK:   tCBZ $r2, %bb.3
  ; CHECK: bb.1.bb3:
  ; CHECK:   successors: %bb.2(0x80000000)
  ; CHECK:   liveins: $r0, $r1, $r2, $r3
  ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
  ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store (s32) into %stack.0)
  ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
  ; CHECK: bb.2.bb9:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
  ; CHECK:   liveins: $lr, $r0, $r1, $r3
  ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load (s32) from %stack.0)
  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr, $noreg :: (load (s128) from %ir.lsr.iv24, align 4)
  ; CHECK:   MVE_VPTv4i32r 8, renamable $q0, $zr, 1, implicit-def $vpr
  ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr, $noreg :: (load (s128) from %ir.lsr.iv1, align 4)
  ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $noreg, undef renamable $q0
  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.lsr.iv1, align 4)
  ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
  ; CHECK: bb.3.bb27:
  ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
  bb.0.bb:
    successors: %bb.3(0x30000000), %bb.1(0x50000000)
    liveins: $r0, $r1, $r2, $r3, $lr
    frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
    $r7 = frame-setup tMOVr $sp, 14, $noreg
    frame-setup CFI_INSTRUCTION def_cfa_register $r7
    $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
    tCBZ $r2, %bb.3
  bb.1.bb3:
    successors: %bb.2(0x80000000)
    liveins: $r0, $r1, $r2, $r3
    renamable $r12 = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    $vpr = VMSR_P0 killed $r3, 14, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store (s32) into %stack.0)
    $r3 = tMOVr $r0, 14, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
    $lr = t2DoLoopStart renamable $lr
  bb.2.bb9:
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
    liveins: $lr, $r0, $r1, $r2, $r3
    renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load (s32) from %stack.0)
    MVE_VPST 4, implicit $vpr
    renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr, $noreg
    renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr, $noreg :: (load (s128) from %ir.lsr.iv24, align 4)
    renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg, $noreg
    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
    MVE_VPST 4, implicit $vpr
    renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr, $noreg
    renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr, $noreg :: (load (s128) from %ir.lsr.iv1, align 4)
    renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $noreg, undef renamable $q0
    MVE_VPST 8, implicit $vpr
    MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.lsr.iv1, align 4)
    renamable $lr = t2LoopDec killed renamable $lr, 1
    $r0 = tMOVr $r3, 14, $noreg
    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
    tB %bb.3, 14, $noreg
  bb.3.bb27:
    $sp = tADDspi $sp, 1, 14, $noreg
    tPOP_RET 14, $noreg, def $r7, def $pc
...