File: v8.6a-neon-intrinsics.c

package info (click to toggle)
llvm-toolchain-21 1%3A21.1.0-1
links: PTS, VCS
area: main
in suites: sid
size: 2,235,796 kB
sloc: cpp: 7,617,614; ansic: 1,433,901; asm: 1,058,726; python: 252,096; f90: 94,671; objc: 70,753; lisp: 42,813; pascal: 18,401; sh: 10,032; ml: 5,111; perl: 4,720; awk: 3,523; makefile: 3,401; javascript: 2,272; xml: 892; fortran: 770
file content (205 lines) | stat: -rw-r--r-- 12,243 bytes
parent folder | download | duplicates (4)
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>

// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT:    ret <4 x i32> [[VMMLA1_I]]
//
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(r, a, b);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> [[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT:    ret <4 x i32> [[VMMLA1_I]]
//
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
  return vmmlaq_u32(r, a, b);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[VUSMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT:    ret <4 x i32> [[VUSMMLA1_I]]
//
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(r, a, b);
}

// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32(
// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT:    ret <2 x i32> [[VUSDOT1_I]]
//
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(r, a, b);
}

// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[TMP3]])
// CHECK-NEXT:    ret <2 x i32> [[VUSDOT1_I]]
//
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[TMP3]], <8 x i8> [[A]])
// CHECK-NEXT:    ret <2 x i32> [[VUSDOT1_I]]
//
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
  return vsudot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_laneq_s32(
// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[TMP3]])
// CHECK-NEXT:    ret <2 x i32> [[VUSDOT1_I]]
//
int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
  return vusdot_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_laneq_s32(
// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[TMP3]], <8 x i8> [[A]])
// CHECK-NEXT:    ret <2 x i32> [[VUSDOT1_I]]
//
int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
  return vsudot_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT:    ret <4 x i32> [[VUSDOT1_I]]
//
int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_s32(r, a, b);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[TMP3]])
// CHECK-NEXT:    ret <4 x i32> [[VUSDOT1_I]]
//
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
  return vusdotq_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[TMP3]], <16 x i8> [[A]])
// CHECK-NEXT:    ret <4 x i32> [[VUSDOT1_I]]
//
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
  return vsudotq_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_laneq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[TMP3]])
// CHECK-NEXT:    ret <4 x i32> [[VUSDOT1_I]]
//
int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_laneq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8>
// CHECK-NEXT:    [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-NEXT:    [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[TMP3]], <16 x i8> [[A]])
// CHECK-NEXT:    ret <4 x i32> [[VUSDOT1_I]]
//
int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
  return vsudotq_laneq_s32(r, a, b, 0);
}