File: arm-v8.6a-neon-intrinsics.c

package info (click to toggle)
llvm-toolchain-13 1%3A13.0.1-6~deb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,418,812 kB
  • sloc: cpp: 5,290,827; ansic: 996,570; asm: 544,593; python: 188,212; objc: 72,027; lisp: 30,291; f90: 25,395; sh: 24,900; javascript: 9,780; pascal: 9,398; perl: 7,484; ml: 5,432; awk: 3,523; makefile: 2,892; xml: 953; cs: 573; fortran: 539
file content (87 lines) | stat: -rw-r--r-- 4,228 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \
// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -mem2reg -sroa \
// RUN: | FileCheck %s

// REQUIRES: arm-registered-target

#include <arm_neon.h>

// CHECK-LABEL: test_vmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(r, a, b);
}

// CHECK-LABEL: test_vmmlaq_u32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
  return vmmlaq_u32(r, a, b);
}

// CHECK-LABEL: test_vusmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(r, a, b);
}

// CHECK-LABEL: test_vusdot_s32
// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
// CHECK: ret <2 x i32> [[VAL]]
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(r, a, b);
}

// CHECK-LABEL: test_vusdot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP3]])
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP3]], <8 x i8> %a)
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
  return vsudot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vusdotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
  return vusdotq_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %3, <16 x i8> %a)
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
  return vsudotq_lane_s32(r, a, b, 0);
}