File: transform-create-async-groups.mlir

// RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file --verify-diagnostics | FileCheck %s
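
// This file exercises transform.nvgpu.create_async_groups: eligible copies
// from global to workgroup memory are rewritten into nvgpu.device_async_copy
// ops, batched by nvgpu.device_async_create_group, and synchronized with
// nvgpu.device_async_wait.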

// Check that we produce async copies from vector.transfer_read /
// vector.transfer_write pairs.
builtin.module {
  // CHECK-LABEL: @copies_to_asyncs
  func.func @copies_to_asyncs(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Make sure we emit the bypassL1 attribute.
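    // Note: bypassL1 corresponds to the L1-bypassing (cp.async.cg) variant,
    // which is only valid for 16-byte copies; this 4 x f32 transfer is exactly
    // 16 bytes.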
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4  {bypassL1} :
    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK-NOT: nvgpu.device_async_create_group
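    // (The CHECK-NOT above guarantees that no group is created between the two
    // copies, i.e. both copies land in the single group checked below.)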

    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
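    // This copy moves a single f32 (4 bytes), below the 16-byte requirement
    // for L1 bypass, so no bypassL1 attribute is expected here even though the
    // transform requests bypass_l1.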
    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

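  // The transform script below matches the payload func.func and applies
  // create_async_groups to it; the {bypass_l1} unit attribute requests the
  // L1-bypassing variant of the generated copies.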
  transform.sequence failures(propagate) {
  ^bb1(%variant_op: !transform.any_op):
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
  }
}

// -----

// Check that we properly take the absence of `bypass_l1` into account,
// i.e., no bypassL1 attributes should be generated.
builtin.module {
  // CHECK-LABEL: @copies_to_asyncs_no_mma
  func.func @copies_to_asyncs_no_mma(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Make sure we don't emit the bypassL1 attribute.
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
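    // (The " :" immediately after the element count ensures that no {bypassL1}
    // attribute was emitted before the type.)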
    %1 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK-NOT: nvgpu.device_async_create_group

    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
    %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
    vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

  transform.sequence failures(propagate) {
  ^bb1(%variant_op: !transform.any_op):
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
  }
}

// -----

// Check that the pattern also works with vector.load/vector.store.
builtin.module {
  // CHECK-LABEL: @copies_to_asyncs_load_store
  func.func @copies_to_asyncs_load_store(%a: memref<1024x1024xf32>) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4 :
    %1 = vector.load %a[%c0, %c0] : memref<1024x1024xf32>, vector<4xf32>
    vector.store %1, %0[%c0, %c0, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<4xf32>
    // CHECK-NOT: nvgpu.device_async_create_group

    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
    %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
    vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
    // CHECK: nvgpu.device_async_wait %[[G]]
    return
  }

  transform.sequence failures(propagate) {
  ^bb1(%variant_op: !transform.any_op):
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
  }
}

// -----

// Check that the pattern skips unaligned and unsupported copy sizes.
builtin.module {
  // CHECK-LABEL: @copies_to_asyncs_load_store
  func.func @copies_to_asyncs_load_store(%a: memref<1024x1024xf32>, %b: memref<1024x1024xf16>) {
    %alloc = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %alloc_1 = memref.alloc() : memref<4x32x16xf16, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %cst_0 = arith.constant 0.000000e+00 : f32

    // Requires 1-D vector load
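    // cp.async transfers a contiguous 1-D run of elements, so the 2-D
    // vector<2x2xf32> copy below must stay a plain load/store pair.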
    // CHECK-NOT: nvgpu.device_async_copy
    //     CHECK: vector.load
    //     CHECK: vector.store
    %1 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<2x2xf32>
    vector.store %1, %alloc[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<2x2xf32>
    // CHECK-NOT: nvgpu.device_async_create_group

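    // Requires a supported copy size: vector<1xf16> is only 2 bytes, but
    // cp.async only handles 4-, 8-, and 16-byte copies.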
    // CHECK-NOT: nvgpu.device_async_copy
    //     CHECK: vector.load
    //     CHECK: vector.store
    %2 = vector.load %b[%c0, %c4] : memref<1024x1024xf16>, vector<1xf16>
    vector.store %2, %alloc_1[%c0, %c4, %c0] : memref<4x32x16xf16, #gpu.address_space<workgroup>>, vector<1xf16>
    // CHECK-NOT: nvgpu.device_async_create_group
    return
  }

  transform.sequence failures(propagate) {
  ^bb1(%variant_op: !transform.any_op):
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.create_async_groups %top_level_func : (!transform.any_op) -> (!transform.any_op)
  }
}

// -----

// vector.transfer_read with a mask: the mask bound becomes an operand of the
// generated async copy.
builtin.module {
  // CHECK-LABEL: @read_with_mask(
  // CHECK-SAME: %{{.*}}: memref<1024x1024xf32>, %[[sz:.*]]: index
  func.func @read_with_mask(%a: memref<1024x1024xf32>, %sz: index) {
    %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // CHECK: nvgpu.device_async_copy {{.*}}, {{.*}}, 4, %[[sz]] {bypassL1} :
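    // The trailing %[[sz]] is the optional srcElements operand: only %sz
    // elements are read from the source and the rest of the 4-element copy is
    // zero-filled, matching cp.async's src-size semantics.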
    %mask = vector.create_mask %sz : vector<4xi1>
    %1 = vector.transfer_read %a[%c0, %c0], %cst_0, %mask {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32>
    vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>

    return
  }

  transform.sequence failures(propagate) {
  ^bb1(%variant_op: !transform.any_op):
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
  }
}