File: microbench-scf-async-parallel-for.mlir

package info (click to toggle)
llvm-toolchain-13 1%3A13.0.1-11
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,418,840 kB
  • sloc: cpp: 5,290,826; ansic: 996,570; asm: 544,593; python: 188,212; objc: 72,027; lisp: 30,291; f90: 25,395; sh: 24,898; javascript: 9,780; pascal: 9,398; perl: 7,484; ml: 5,432; awk: 3,523; makefile: 2,913; xml: 953; cs: 573; fortran: 539
file content (151 lines) | stat: -rw-r--r-- 6,708 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// RUN:   mlir-opt %s                                                          \
// RUN:               -async-parallel-for                                      \
// RUN:               -async-to-async-runtime                                  \
// RUN:               -async-runtime-ref-counting                              \
// RUN:               -async-runtime-ref-counting-opt                          \
// RUN:               -convert-async-to-llvm                                   \
// RUN:               -convert-linalg-to-loops                                 \
// RUN:               -convert-scf-to-std                                      \
// RUN:               -std-expand                                              \
// RUN:               -convert-vector-to-llvm                                  \
// RUN:               -convert-memref-to-llvm                                  \
// RUN:               -convert-std-to-llvm -print-ir-after-all                 \
// RUN: | mlir-cpu-runner                                                      \
// RUN: -e entry -entry-point-result=void -O3                                  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext\
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
// RUN: | FileCheck %s --dump-input=always

// RUN:   mlir-opt %s                                                          \
// RUN:               -async-parallel-for=async-dispatch=false                 \
// RUN:               -async-to-async-runtime                                  \
// RUN:               -async-runtime-ref-counting                              \
// RUN:               -async-runtime-ref-counting-opt                          \
// RUN:               -convert-async-to-llvm                                   \
// RUN:               -convert-linalg-to-loops                                 \
// RUN:               -convert-scf-to-std                                      \
// RUN:               -std-expand                                              \
// RUN:               -convert-vector-to-llvm                                  \
// RUN:               -convert-memref-to-llvm                                  \
// RUN:               -convert-std-to-llvm                                     \
// RUN: | mlir-cpu-runner                                                      \
// RUN: -e entry -entry-point-result=void -O3                                  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext\
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
// RUN: | FileCheck %s --dump-input=always

// RUN:   mlir-opt %s                                                          \
// RUN:               -convert-linalg-to-loops                                 \
// RUN:               -convert-scf-to-std                                      \
// RUN:               -convert-vector-to-llvm                                  \
// RUN:               -convert-memref-to-llvm                                  \
// RUN:               -convert-std-to-llvm                                     \
// RUN: | mlir-cpu-runner                                                      \
// RUN: -e entry -entry-point-result=void -O3                                  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext  \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext\
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
// RUN: | FileCheck %s --dump-input=always

#map0 = affine_map<(d0, d1) -> (d0, d1)>

func @scf_parallel(%lhs: memref<?x?xf32>,
                   %rhs: memref<?x?xf32>,
                   %sum: memref<?x?xf32>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index

  %d0 = memref.dim %lhs, %c0 : memref<?x?xf32>
  %d1 = memref.dim %lhs, %c1 : memref<?x?xf32>

  scf.parallel (%i, %j) = (%c0, %c0) to (%d0, %d1) step (%c1, %c1) {
    %lv = memref.load %lhs[%i, %j] : memref<?x?xf32>
    %rv = memref.load %lhs[%i, %j] : memref<?x?xf32>
    %r = addf %lv, %rv : f32
    memref.store %r, %sum[%i, %j] : memref<?x?xf32>
  }

  return
}

func @entry() {
  %f1 = constant 1.0 : f32
  %f4 = constant 4.0 : f32
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %cM = constant 1000 : index

  //
  // Sanity check for the function under test.
  //

  %LHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  %RHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  %DST10 = memref.alloc() {alignment = 64} : memref<1x10xf32>

  linalg.fill(%f1, %LHS10) : f32, memref<1x10xf32>
  linalg.fill(%f1, %RHS10) : f32, memref<1x10xf32>

  %LHS = memref.cast %LHS10 : memref<1x10xf32> to memref<?x?xf32>
  %RHS = memref.cast %RHS10 : memref<1x10xf32> to memref<?x?xf32>
  %DST = memref.cast %DST10 : memref<1x10xf32> to memref<?x?xf32>

  call @scf_parallel(%LHS, %RHS, %DST)
    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

  // CHECK: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  %U = memref.cast %DST10 :  memref<1x10xf32> to memref<*xf32>
  call @print_memref_f32(%U): (memref<*xf32>) -> ()

  memref.dealloc %LHS10: memref<1x10xf32>
  memref.dealloc %RHS10: memref<1x10xf32>
  memref.dealloc %DST10: memref<1x10xf32>

  //
  // Allocate data for microbenchmarks.
  //

  %LHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  %RHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  %DST1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>

  %LHS0 = memref.cast %LHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
  %RHS0 = memref.cast %RHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
  %DST0 = memref.cast %DST1024 : memref<1024x1024xf32> to memref<?x?xf32>

  //
  // Warm up.
  //

  call @scf_parallel(%LHS0, %RHS0, %DST0)
    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

  //
  // Measure execution time.
  //

  %t0 = call @rtclock() : () -> f64
  scf.for %i = %c0 to %cM step %c1 {
    call @scf_parallel(%LHS0, %RHS0, %DST0)
      : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
  }
  %t1 = call @rtclock() : () -> f64
  %t1024 = subf %t1, %t0 : f64

  // Print timings.
  vector.print %t1024 : f64

  // Free.
  memref.dealloc %LHS1024: memref<1024x1024xf32>
  memref.dealloc %RHS1024: memref<1024x1024xf32>
  memref.dealloc %DST1024: memref<1024x1024xf32>

  return
}

func private @rtclock() -> f64

func private @print_memref_f32(memref<*xf32>)
  attributes { llvm.emit_c_interface }