# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs -enable-subreg-liveness -start-before=greedy %s -o - | FileCheck %s
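# The pipeline starts at the greedy register allocator (-start-before=greedy) with subregister
# liveness tracking enabled, and -force-streaming puts the functions in streaming SVE mode so the
# SME2 multi-vector instructions are legal. The tests check how the strided register tuples formed
# by the FORM_TRANSPOSED_REG_TUPLE pseudos below are allocated.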
# No group of four strided registers is available for the x4 tuples, so fall back on the default allocation order.
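# $z0 and $z17 are live across the udot instructions in addition to the loaded tuples, so the
# FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO results are assembled with the mov z*.d copies checked below,
# forming the contiguous z8-z11, z20-z23, z24-z27 and z28-z31 groups used by the udots.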
---
name: form_4x_tuple_many_live
tracksRegLiveness: true
stack:
- { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16,
stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
body: |
bb.0.entry:
liveins: $x0, $x1, $z0, $z17
; CHECK-LABEL: form_4x_tuple_many_live
; CHECK: stp d11, d10, [sp, #-48]!
; CHECK-NEXT: stp d9, d8, [sp, #16]
; CHECK-NEXT: str x29, [sp, #32]
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
; CHECK-NEXT: .cfi_offset b10, -40
; CHECK-NEXT: .cfi_offset b11, -48
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x10]
; CHECK-NEXT: mov z8.d, z16.d
; CHECK-NEXT: mov z9.d, z18.d
; CHECK-NEXT: mov z21.d, z22.d
; CHECK-NEXT: mov z10.d, z19.d
; CHECK-NEXT: mov z22.d, z23.d
; CHECK-NEXT: mov z25.d, z26.d
; CHECK-NEXT: mov z11.d, z4.d
; CHECK-NEXT: mov z23.d, z5.d
; CHECK-NEXT: mov z26.d, z27.d
; CHECK-NEXT: mov z27.d, z6.d
; CHECK-NEXT: mov z29.d, z30.d
; CHECK-NEXT: mov z30.d, z31.d
; CHECK-NEXT: mov z31.d, z7.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: st1b { z17.b }, p0, [x0]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldp d9, d8, [sp, #16]
; CHECK-NEXT: ldr x29, [sp, #32]
; CHECK-NEXT: ldp d11, d10, [sp], #48
; CHECK-NEXT: ret
%0:gpr64common = COPY $x0
%1:gpr64 = COPY $x1
%2:zpr = COPY $z0
%3:zpr = COPY $z17
%5:matrixindexgpr32_8_11 = COPY $wzr
%6:gpr64 = UBFMXri %1, 63, 62
%pred:pnr_p8to15 = PTRUE_C_B implicit $vg
%7:ppr_3b = PTRUE_B 31, implicit $vg
%8:gpr64 = ADDXrr %6, %1
%9:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0
%10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1
%11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6
%12:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8
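; Each FORM_TRANSPOSED pseudo below concatenates the same zsub lane of the four strided loads;
; the register allocator is hinted to pick matching strided registers so that the result forms a
; contiguous multiple-of-4 tuple for the udot operands.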
%13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub0, %10.zsub0, %11.zsub0, %12.zsub0
%14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub1, %10.zsub1, %11.zsub1, %12.zsub1
%15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub2, %10.zsub2, %11.zsub2, %12.zsub2
%16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub3, %10.zsub3, %11.zsub3, %12.zsub3
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0
ST1B_IMM %2, %7, %0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
ST1B_IMM %3, %7, %0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
RET_ReallyLR
...
# The first multi-vector load to be allocated is not the first operand of the FORM_TRANSPOSED pseudos.
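# The loads are defined in decreasing offset order (%9 uses the largest offset, %12 the immediate
# form), while %12's subregisters are the first operands of each pseudo. The expected allocation
# still places the four loads in one strided group (z16-z31), so the udots need no extra copies.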
---
name: form_4x_tuple_allocation_order
tracksRegLiveness: true
stack:
- { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16,
stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
body: |
bb.0.entry:
liveins: $x0, $x1, $z0
; CHECK-LABEL: form_4x_tuple_allocation_order
; CHECK: str x29, [sp, #-16]!
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16
; CHECK-NEXT: ret
%0:gpr64common = COPY $x0
%1:gpr64 = COPY $x1
%2:zpr = COPY $z0
%5:matrixindexgpr32_8_11 = COPY $wzr
%6:gpr64 = UBFMXri %1, 63, 62
%pred:pnr_p8to15 = PTRUE_C_B implicit $vg
%7:ppr_3b = PTRUE_B 31, implicit $vg
%8:gpr64 = ADDXrr %6, %1
%9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8
%10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6
%11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1
%12:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0
%13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub0, %11.zsub0, %10.zsub0, %9.zsub0
%14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub1, %11.zsub1, %10.zsub1, %9.zsub1
%15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub2, %11.zsub2, %10.zsub2, %9.zsub2
%16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub3, %11.zsub3, %10.zsub3, %9.zsub3
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0
$za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0
ST1B_IMM %2, %7, %0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
RET_ReallyLR
...
# Strided order is [ $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 ]
# Ensure we don't allocate $z23_z31 & $z0_z8, even though they are consecutive in this order.
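# Both pairs must come from the same strided offset: the expected output assigns $z0_z8 and $z1_z9
# to the two loads, so {z0, z1} and {z8, z9} can be passed to the udots without copies.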
---
name: udot_form_2x_tuple_live_reg_order
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x0, $x1, $z16, $z17, $z18, $z19, $z20, $z21, $z22
; CHECK-LABEL: udot_form_2x_tuple_live_reg_order
; CHECK: stp d9, d8, [sp, #-16]!
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset b8, -8
; CHECK-NEXT: .cfi_offset b9, -16
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b
; CHECK-NEXT: ldp d9, d8, [sp], #16
; CHECK-NEXT: ret
%0:gpr64 = COPY $x1
%1:gpr64common = COPY $x0
%2:zpr = COPY $z16
%3:zpr = COPY $z17
%4:zpr = COPY $z18
%5:zpr = COPY $z19
%6:zpr = COPY $z20
%7:zpr = COPY $z21
%8:zpr = COPY $z22
%9:matrixindexgpr32_8_11 = COPY $wzr
%10:pnr_p8to15 = PTRUE_C_B implicit $vg
%11:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO %10, %1, 0
%12:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO %10, %1, %0
%13:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub0, %12.zsub0
%14:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub1, %12.zsub1
$za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %13, undef %15:zpr_4b
$za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %14, undef %16:zpr_4b
RET_ReallyLR
...