File: fmac.sdwa.ll

package info (click to toggle)
llvm-toolchain-9 1%3A9.0.1-16
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 882,436 kB
  • sloc: cpp: 4,167,636; ansic: 714,256; asm: 457,610; python: 155,927; objc: 65,094; sh: 42,856; lisp: 26,908; perl: 7,786; pascal: 7,722; makefile: 6,881; ml: 5,581; awk: 3,648; cs: 2,027; xml: 888; javascript: 381; ruby: 156
file content (76 lines) | stat: -rw-r--r-- 3,001 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s

; GCN-LABEL: {{^}}addMul2D:
; GFX1010: v_fmac_f16
; GFX1010: v_fmac_f16
; addMul2D: accumulate a <4 x half> sum of products over a 2-D index space.
;
; Unnamed parameters, in order:
;   %0  <4 x i8>*              base of the byte-vector input (read only)
;   %1  float addrspace(4)*    base of the float input (read only)
;   %2  <2 x i32>              loop extents: element 0 bounds the inner loop,
;                              element 1 bounds the outer loop
;   %3  i32                    per-outer-iteration stride into %0, counted in
;                              <4 x i8> elements
;
; For every outer index y in [0, %2[1]) and inner index x in [0, %2[0]):
;   acc = fmuladd(convert_half4(%0[x + y * %3]),
;                 splat(half(%1[x + y * %2[0]])),
;                 acc)
; The accumulator starts at zero; zero is returned when either extent is <= 0.
;
; The FileCheck directives preceding this function expect two v_fmac_f16
; instructions in the code generated for it.
define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
  ; Skip everything when the outer extent is not positive.
  %5 = extractelement <2 x i32> %2, i64 1
  %6 = icmp sgt i32 %5, 0
  br i1 %6, label %7, label %38

7:                                                ; preds = %4
  ; The inner-extent test is loop invariant, so it is evaluated once here.
  %8 = extractelement <2 x i32> %2, i64 0
  %9 = icmp sgt i32 %8, 0
  br label %10

10:                                               ; preds = %34, %7
  ; Outer loop header: %11 is the running accumulator, %12 the outer index.
  %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
  %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
  br i1 %9, label %13, label %34

13:                                               ; preds = %10
  ; Row offsets for this outer iteration: %14 indexes %0, %15 indexes %1.
  %14 = mul nsw i32 %12, %3
  %15 = mul nsw i32 %12, %8
  br label %16

16:                                               ; preds = %16, %13
  ; Inner loop body: %17 is the accumulator, %18 the inner index.
  %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
  %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
  ; Load one <4 x i8> element and widen it to <4 x half>.
  %19 = add nsw i32 %18, %14
  %20 = sext i32 %19 to i64
  %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
  %22 = load <4 x i8>, <4 x i8>* %21, align 4
  %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22)
  ; Load one float, truncate it to half and broadcast it to all four lanes.
  %24 = add nsw i32 %18, %15
  %25 = sext i32 %24 to i64
  %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
  %27 = load float, float addrspace(4)* %26, align 4
  %28 = fptrunc float %27 to half
  %29 = insertelement <4 x half> undef, half %28, i32 0
  %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
  ; acc = widened * broadcast + acc
  %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
  ; Bottom-tested loop: leave once the incremented index equals the inner extent.
  %32 = add nuw nsw i32 %18, 1
  %33 = icmp eq i32 %32, %8
  br i1 %33, label %34, label %16

34:                                               ; preds = %16, %10
  ; Outer latch: take the accumulator from whichever path reached this block.
  %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
  %36 = add nuw nsw i32 %12, 1
  %37 = icmp eq i32 %36, %5
  br i1 %37, label %38, label %10

38:                                               ; preds = %34, %4
  ; Result is zero on the early-out path, otherwise the final accumulator.
  %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
  ret <4 x half> %39
}

; convert_half4 (Itanium-mangled, taking a 4-element unsigned char vector):
; widens every lane of the unnamed <4 x i8> argument %0 to half.
;
; Each lane is converted with uitofp, i.e. interpreted as an unsigned byte,
; and stored in the same lane of the returned <4 x half>.
define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
  ; Split the input vector into its four byte lanes.
  %byte0 = extractelement <4 x i8> %0, i64 0
  %byte1 = extractelement <4 x i8> %0, i64 1
  %byte2 = extractelement <4 x i8> %0, i64 2
  %byte3 = extractelement <4 x i8> %0, i64 3

  ; Unsigned integer to half conversion, one lane at a time.
  %half0 = uitofp i8 %byte0 to half
  %half1 = uitofp i8 %byte1 to half
  %half2 = uitofp i8 %byte2 to half
  %half3 = uitofp i8 %byte3 to half

  ; Rebuild the result vector lane by lane, starting from undef.
  %vec0 = insertelement <4 x half> undef, half %half0, i32 0
  %vec1 = insertelement <4 x half> %vec0, half %half1, i32 1
  %vec2 = insertelement <4 x half> %vec1, half %half2, i32 2
  %vec3 = insertelement <4 x half> %vec2, half %half3, i32 3
  ret <4 x half> %vec3
}

; Multiply-add intrinsic called by @addMul2D. Per the LLVM Language Reference,
; llvm.fmuladd may be lowered as either a fused or an unfused multiply-add.
declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)

; Attribute group #0 is attached to @addMul2D.
attributes #0 = { convergent nounwind readonly}
; Attribute group #1 is attached to @_Z13convert_half4Dv4_h.
attributes #1 = { norecurse nounwind readnone }