;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2023 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================
;
; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXLoadStoreLowering -march=genx64 -mcpu=Gen9 -mtriple=spir64-unknown-unknown -enable-ldst-lowering=true -mattr=+ocl_runtime -S < %s 2>&1 | FileCheck %s
; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXLoadStoreLowering -march=genx64 -mcpu=XeHPC -mtriple=spir64-unknown-unknown -enable-ldst-lowering=true -mattr=+ocl_runtime -S < %s 2>&1 | FileCheck --check-prefix=CHECK-LSC %s
;
; CHECK-NOT: WARNING
; CHECK: CheckModuleDebugify: PASS
; CHECK-LSC-NOT: WARNING
; CHECK-LSC: CheckModuleDebugify: PASS
; COM: Basic test on load lowering pass
; COM: @llvm.masked.gather from addrspace(3)
target datalayout = "e-p:64:64-p3:32:32-i64:64-n8:16:32:64"
target triple = "genx64-unknown-unknown"
; Address space 3 (local) operations are lowered into bti(254)/slm intrinsics
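; COM: On pre-LSC targets (Gen9) the gathers below are expected to lower to
; COM: @llvm.genx.gather.scaled / @llvm.genx.gather4.scaled with BTI 254 (SLM),
; COM: while LSC-capable targets (XeHPC) use @llvm.vc.internal.lsc.load.slm.
; COM: In both cases sub-dword elements are zero-extended to i32 for the load
; COM: and truncated back afterwards.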
declare <8 x i8> @llvm.masked.gather.v8i8.v8p3i8(<8 x i8 addrspace(3)*>, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p3i16(<8 x i16 addrspace(3)*>, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p3i32(<8 x i32 addrspace(3)*>, i32, <8 x i1>, <8 x i32>)
declare <8 x i64> @llvm.masked.gather.v8i64.v8p3i64(<8 x i64 addrspace(3)*>, i32, <8 x i1>, <8 x i64>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p3f16(<8 x half addrspace(3)*>, i32, <8 x i1>, <8 x half>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p3f32(<8 x float addrspace(3)*>, i32, <8 x i1>, <8 x float>)
declare <8 x double> @llvm.masked.gather.v8f64.v8p3f64(<8 x double addrspace(3)*>, i32, <8 x i1>, <8 x double>)
declare <8 x i8*> @llvm.masked.gather.v8p0i8.v8p3p0i8(<8 x i8* addrspace(3)*>, i32, <8 x i1>, <8 x i8*>)

define <8 x i8> @test_i8(<8 x i8 addrspace(3)*> %pi8, <8 x i1> %mask, <8 x i8> %passthru) {
; CHECK: [[ZEXT8:[^ ]+]] = zext <8 x i8> %passthru to <8 x i32>
; CHECK: [[ADDR8:[^ ]+]] = ptrtoint <8 x i8 addrspace(3)*> %pi8 to <8 x i32>
; CHECK: [[LOAD8:[^ ]+]] = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 0, i16 0, i32 254, i32 0, <8 x i32> [[ADDR8]], <8 x i32> [[ZEXT8]])
; CHECK: %res = trunc <8 x i32> [[LOAD8]] to <8 x i8>
; CHECK-LSC-DAG: [[PASSTHRU8:[^ ]+]] = zext <8 x i8> %passthru to <8 x i32>
; CHECK-LSC-DAG: [[ADDR8:[^ ]+]] = ptrtoint <8 x i8 addrspace(3)*> %pi8 to <8 x i32>
; CHECK-LSC: [[DATA8:[^ ]+]] = call <8 x i32> @llvm.vc.internal.lsc.load.slm.v8i32.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 5, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDR8]], i16 1, i32 0, <8 x i32> [[PASSTHRU8]])
; CHECK-LSC: %res = trunc <8 x i32> [[DATA8]] to <8 x i8>
%res = call <8 x i8> @llvm.masked.gather.v8i8.v8p3i8(<8 x i8 addrspace(3)*> %pi8, i32 1, <8 x i1> %mask, <8 x i8> %passthru)
ret <8 x i8> %res
}

define <8 x i16> @test_i16(<8 x i16 addrspace(3)*> %pi16, <8 x i1> %mask, <8 x i16> %passthru) {
; CHECK: [[ZEXT16:[^ ]+]] = zext <8 x i16> %passthru to <8 x i32>
; CHECK: [[ADDR16:[^ ]+]] = ptrtoint <8 x i16 addrspace(3)*> %pi16 to <8 x i32>
; CHECK: [[LOAD16:[^ ]+]] = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 1, i16 0, i32 254, i32 0, <8 x i32> [[ADDR16]], <8 x i32> [[ZEXT16]])
; CHECK: %res = trunc <8 x i32> [[LOAD16]] to <8 x i16>
; CHECK-LSC-DAG: [[PASSTHRU16:[^ ]+]] = zext <8 x i16> %passthru to <8 x i32>
; CHECK-LSC-DAG: [[ADDR16:[^ ]+]] = ptrtoint <8 x i16 addrspace(3)*> %pi16 to <8 x i32>
; CHECK-LSC: [[DATA16:[^ ]+]] = call <8 x i32> @llvm.vc.internal.lsc.load.slm.v8i32.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 6, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDR16]], i16 1, i32 0, <8 x i32> [[PASSTHRU16]])
; CHECK-LSC: %res = trunc <8 x i32> [[DATA16]] to <8 x i16>
%res = call <8 x i16> @llvm.masked.gather.v8i16.v8p3i16(<8 x i16 addrspace(3)*> %pi16, i32 2, <8 x i1> %mask, <8 x i16> %passthru)
ret <8 x i16> %res
}

define <8 x i32> @test_i32(<8 x i32 addrspace(3)*> %pi32, <8 x i1> %mask, <8 x i32> %passthru) {
; CHECK: [[ADDR32:[^ ]+]] = ptrtoint <8 x i32 addrspace(3)*> %pi32 to <8 x i32>
; CHECK: %res = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 2, i16 0, i32 254, i32 0, <8 x i32> [[ADDR32]], <8 x i32> %passthru)
; CHECK-LSC: [[ADDR32:[^ ]+]] = ptrtoint <8 x i32 addrspace(3)*> %pi32 to <8 x i32>
; CHECK-LSC: %res = call <8 x i32> @llvm.vc.internal.lsc.load.slm.v8i32.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 3, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDR32]], i16 1, i32 0, <8 x i32> %passthru)
%res = call <8 x i32> @llvm.masked.gather.v8i32.v8p3i32(<8 x i32 addrspace(3)*> %pi32, i32 4, <8 x i1> %mask, <8 x i32> %passthru)
ret <8 x i32> %res
}
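
; COM: On pre-LSC targets 64-bit elements are gathered as two dword channels via
; COM: @llvm.genx.gather4.scaled on <16 x i32>, with @llvm.genx.rdregioni
; COM: reshuffling the data to and from the channel layout; LSC targets load the
; COM: <8 x i64> elements directly.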
define <8 x i64> @test_i64(<8 x i64 addrspace(3)*> %pi64, <8 x i1> %mask, <8 x i64> %passthru) {
; CHECK: [[ADDR64:[^ ]+]] = ptrtoint <8 x i64 addrspace(3)*> %pi64 to <8 x i32>
; CHECK: [[CAST64:[^ ]+]] = bitcast <8 x i64> %passthru to <16 x i32>
; CHECK: [[STOA64:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[CAST64]], i32 1, i32 8, i32 2, i16 0, i32 undef)
; CHECK: [[LOAD64:[^ ]+]] = call <16 x i32> @llvm.genx.gather4.scaled.v16i32.v8i1.v8i32(<8 x i1> %mask, i32 12, i16 0, i32 254, i32 0, <8 x i32> [[ADDR64]], <16 x i32> [[STOA64]])
; CHECK: [[ATOS64:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[LOAD64]], i32 1, i32 2, i32 8, i16 0, i32 undef)
; CHECK: %res = bitcast <16 x i32> [[ATOS64]] to <8 x i64>
; CHECK-LSC: [[ADDR64:[^ ]+]] = ptrtoint <8 x i64 addrspace(3)*> %pi64 to <8 x i32>
; CHECK-LSC: %res = call <8 x i64> @llvm.vc.internal.lsc.load.slm.v8i64.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 4, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDR64]], i16 1, i32 0, <8 x i64> %passthru)
%res = call <8 x i64> @llvm.masked.gather.v8i64.v8p3i64(<8 x i64 addrspace(3)*> %pi64, i32 8, <8 x i1> %mask, <8 x i64> %passthru)
ret <8 x i64> %res
}
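
; COM: half data is bitcast to i16 and then follows the same zext/trunc path as
; COM: the integer sub-dword cases above.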
define <8 x half> @test_f16(<8 x half addrspace(3)*> %pi16, <8 x i1> %mask, <8 x half> %passthru) {
; CHECK: [[CASTH:[^ ]+]] = bitcast <8 x half> %passthru to <8 x i16>
; CHECK: [[ZEXTH:[^ ]+]] = zext <8 x i16> [[CASTH]] to <8 x i32>
; CHECK: [[ADDRH:[^ ]+]] = ptrtoint <8 x half addrspace(3)*> %pi16 to <8 x i32>
; CHECK: [[LOADH:[^ ]+]] = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 1, i16 0, i32 254, i32 0, <8 x i32> [[ADDRH]], <8 x i32> [[ZEXTH]])
; CHECK: [[TRUNCH:[^ ]+]] = trunc <8 x i32> [[LOADH]] to <8 x i16>
; CHECK: %res = bitcast <8 x i16> [[TRUNCH]] to <8 x half>
; CHECK-LSC-DAG: [[CASTH:[^ ]+]] = bitcast <8 x half> %passthru to <8 x i16>
; CHECK-LSC-DAG: [[PASSTHRUH:[^ ]+]] = zext <8 x i16> [[CASTH]] to <8 x i32>
; CHECK-LSC-DAG: [[ADDRH:[^ ]+]] = ptrtoint <8 x half addrspace(3)*> %pi16 to <8 x i32>
; CHECK-LSC: [[DATAH:[^ ]+]] = call <8 x i32> @llvm.vc.internal.lsc.load.slm.v8i32.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 6, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDRH]], i16 1, i32 0, <8 x i32> [[PASSTHRUH]])
; CHECK-LSC: [[TRUNC:[^ ]+]] = trunc <8 x i32> [[DATAH]] to <8 x i16>
; CHECK-LSC: %res = bitcast <8 x i16> [[TRUNC]] to <8 x half>
%res = call <8 x half> @llvm.masked.gather.v8f16.v8p3f16(<8 x half addrspace(3)*> %pi16, i32 2, <8 x i1> %mask, <8 x half> %passthru)
ret <8 x half> %res
}

define <8 x float> @test_f32(<8 x float addrspace(3)*> %pi32, <8 x i1> %mask, <8 x float> %passthru) {
; CHECK: [[ADDRF:[^ ]+]] = ptrtoint <8 x float addrspace(3)*> %pi32 to <8 x i32>
; CHECK: %res = call <8 x float> @llvm.genx.gather.scaled.v8f32.v8i1.v8i32(<8 x i1> %mask, i32 2, i16 0, i32 254, i32 0, <8 x i32> [[ADDRF]], <8 x float> %passthru)
; CHECK-LSC: [[ADDRF:[^ ]+]] = ptrtoint <8 x float addrspace(3)*> %pi32 to <8 x i32>
; CHECK-LSC: %res = call <8 x float> @llvm.vc.internal.lsc.load.slm.v8f32.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 3, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDRF]], i16 1, i32 0, <8 x float> %passthru)
%res = call <8 x float> @llvm.masked.gather.v8f32.v8p3f32(<8 x float addrspace(3)*> %pi32, i32 4, <8 x i1> %mask, <8 x float> %passthru)
ret <8 x float> %res
}

define <8 x double> @test_f64(<8 x double addrspace(3)*> %pi64, <8 x i1> %mask, <8 x double> %passthru) {
; CHECK: [[ADDRD:[^ ]+]] = ptrtoint <8 x double addrspace(3)*> %pi64 to <8 x i32>
; CHECK: [[CASTD:[^ ]+]] = bitcast <8 x double> %passthru to <16 x i32>
; CHECK: [[STOAD:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[CASTD]], i32 1, i32 8, i32 2, i16 0, i32 undef)
; CHECK: [[LOADD:[^ ]+]] = call <16 x i32> @llvm.genx.gather4.scaled.v16i32.v8i1.v8i32(<8 x i1> %mask, i32 12, i16 0, i32 254, i32 0, <8 x i32> [[ADDRD]], <16 x i32> [[STOAD]])
; CHECK: [[ATOSD:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[LOADD]], i32 1, i32 2, i32 8, i16 0, i32 undef)
; CHECK: %res = bitcast <16 x i32> [[ATOSD]] to <8 x double>
; CHECK-LSC: [[ADDRD:[^ ]+]] = ptrtoint <8 x double addrspace(3)*> %pi64 to <8 x i32>
; CHECK-LSC: %res = call <8 x double> @llvm.vc.internal.lsc.load.slm.v8f64.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 4, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDRD]], i16 1, i32 0, <8 x double> %passthru)
%res = call <8 x double> @llvm.masked.gather.v8f64.v8p3f64(<8 x double addrspace(3)*> %pi64, i32 8, <8 x i1> %mask, <8 x double> %passthru)
ret <8 x double> %res
}
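
; COM: Pointer elements are converted with ptrtoint/inttoptr and otherwise follow
; COM: the 64-bit integer lowering above.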
define <8 x i8*> @test_ptr(<8 x i8* addrspace(3)*> %pptr, <8 x i1> %mask, <8 x i8*> %passthru) {
; CHECK: [[PTI:[^ ]+]] = ptrtoint <8 x i8*> %passthru to <8 x i64>
; CHECK: [[ADDRP:[^ ]+]] = ptrtoint <8 x i8* addrspace(3)*> %pptr to <8 x i32>
; CHECK: [[CASTP:[^ ]+]] = bitcast <8 x i64> [[PTI]] to <16 x i32>
; CHECK: [[STOAP:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[CASTP]], i32 1, i32 8, i32 2, i16 0, i32 undef)
; CHECK: [[LOADP:[^ ]+]] = call <16 x i32> @llvm.genx.gather4.scaled.v16i32.v8i1.v8i32(<8 x i1> %mask, i32 12, i16 0, i32 254, i32 0, <8 x i32> [[ADDRP]], <16 x i32> [[STOAP]])
; CHECK: [[ATOSP:[^ ]+]] = call <16 x i32> @llvm.genx.rdregioni.v16i32.v16i32.i16(<16 x i32> [[LOADP]], i32 1, i32 2, i32 8, i16 0, i32 undef)
; CHECK: [[ITP:[^ ]+]] = bitcast <16 x i32> [[ATOSP]] to <8 x i64>
; CHECK: %res = inttoptr <8 x i64> [[ITP]] to <8 x i8*>
; CHECK-LSC-DAG: [[PTI:[^ ]+]] = ptrtoint <8 x i8*> %passthru to <8 x i64>
; CHECK-LSC-DAG: [[ADDRP:[^ ]+]] = ptrtoint <8 x i8* addrspace(3)*> %pptr to <8 x i32>
; CHECK-LSC: [[DATAP:[^ ]+]] = call <8 x i64> @llvm.vc.internal.lsc.load.slm.v8i64.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 4, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDRP]], i16 1, i32 0, <8 x i64> [[PTI]])
; CHECK-LSC: %res = inttoptr <8 x i64> [[DATAP]] to <8 x i8*>
%res = call <8 x i8*> @llvm.masked.gather.v8p0i8.v8p3p0i8(<8 x i8* addrspace(3)*> %pptr, i32 8, <8 x i1> %mask, <8 x i8*> %passthru)
ret <8 x i8*> %res
}
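
; COM: For an under-aligned 64-bit gather (align 1 below) the pre-LSC lowering
; COM: splits each element into low/high dwords and issues two
; COM: @llvm.genx.gather.scaled loads (global offsets 0 and 4), recombining the
; COM: halves with @llvm.genx.wrregioni; the LSC lowering still emits a single load.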
define <8 x i64> @test_i64_unaligned(<8 x i64 addrspace(3)*> %pi64, <8 x i1> %mask, <8 x i64> %passthru) {
; CHECK: [[ADDRU64:[^ ]+]] = ptrtoint <8 x i64 addrspace(3)*> %pi64 to <8 x i32>
; CHECK: [[CASTU64:[^ ]+]] = bitcast <8 x i64> %passthru to <16 x i32>
; CHECK: [[LOWU64:[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[CASTU64]], i32 2, i32 1, i32 0, i16 0, i32 undef)
; CHECK: [[HIGHU64:[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[CASTU64]], i32 2, i32 1, i32 0, i16 4, i32 undef)
; CHECK: [[LOADL64:[^ ]+]] = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 2, i16 0, i32 254, i32 0, <8 x i32> [[ADDRU64]], <8 x i32> [[LOWU64]])
; CHECK: [[LOADH64:[^ ]+]] = call <8 x i32> @llvm.genx.gather.scaled.v8i32.v8i1.v8i32(<8 x i1> %mask, i32 2, i16 0, i32 254, i32 4, <8 x i32> [[ADDRU64]], <8 x i32> [[HIGHU64]])
; CHECK: [[INSL64:[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[LOADL64]], i32 2, i32 1, i32 0, i16 0, i32 undef, i1 true)
; CHECK: [[INSH64:[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[INSL64]], <8 x i32> [[LOADH64]], i32 2, i32 1, i32 0, i16 4, i32 undef, i1 true)
; CHECK: %res = bitcast <16 x i32> [[INSH64]] to <8 x i64>
; CHECK-LSC: [[ADDRU64:[^ ]+]] = ptrtoint <8 x i64 addrspace(3)*> %pi64 to <8 x i32>
; CHECK-LSC: %res = call <8 x i64> @llvm.vc.internal.lsc.load.slm.v8i64.v8i1.v2i8.v8i32(<8 x i1> %mask, i8 2, i8 4, i8 1, <2 x i8> zeroinitializer, i32 0, <8 x i32> [[ADDRU64]], i16 1, i32 0, <8 x i64> %passthru)
%res = call <8 x i64> @llvm.masked.gather.v8i64.v8p3i64(<8 x i64 addrspace(3)*> %pi64, i32 1, <8 x i1> %mask, <8 x i64> %passthru)
ret <8 x i64> %res
}