//
//
// NOTE: this test requires gpu-sm80
//
// with RT lib:
//
// RUN: mlir-opt %s \
// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// RUN: mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
// RUN: --e entry --entry-point-result=void \
// RUN: | FileCheck %s
//
// without RT lib:
//
// RUN: mlir-opt %s \
// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// RUN: mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \
// RUN: --e entry --entry-point-result=void \
// RUN: | FileCheck %s
//
// Runtime filename handle (a C-string pointer), as returned by the
// runner-utils helper @getTensorFilename below.
!Filename = !llvm.ptr<i8>
// CSR storage for the sampling matrix: level 0 (rows) dense,
// level 1 (columns) compressed.
#CSR = #sparse_tensor.encoding<{
lvlTypes = ["dense", "compressed"]
}>
// Trait for a sampled dense-dense matrix multiplication (SDDMM):
// S is both an input and the output; only positions where S has a
// stored entry contribute (the "spy" function in the doc string).
#trait_sampled_dense_dense = {
indexing_maps = [
affine_map<(i,j,k) -> (i,k)>, // A
affine_map<(i,j,k) -> (k,j)>, // B
affine_map<(i,j,k) -> (i,j)> // S (in/out)
],
iterator_types = ["parallel", "parallel", "reduction"],
doc = "S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j)"
}
//
// Integration test that lowers a kernel annotated as sparse to
// actual sparse code, initializes sparse storage schemes, and
// runs the resulting code with the JIT compiler.
//
module {
// Environment setup/teardown hooks provided by the CUDA runtime support
// library (loaded via --shared-libs=%mlir_cuda_runtime in the RUN lines).
llvm.func @mgpuCreateSparseEnv()
llvm.func @mgpuDestroySparseEnv()
//
// A kernel that computes a sampled dense matrix matrix multiplication
// using a "spy" function and in-place update of the sampling sparse matrix.
//
func.func @sampled_dense_dense(%args: tensor<?x?xf32, #CSR>,
%arga: tensor<?x?xf32>,
%argb: tensor<?x?xf32>) -> tensor<?x?xf32, #CSR> {
// The unary op implements the "spy" function: when the output sparse
// matrix has a stored entry at (i,j), the `present` region yields the
// dense product a*b; the empty `absent` region means unstored positions
// contribute nothing. The reduce op then folds the sampled products into
// the stored value with addition (identity %f0 = 0).
%result = linalg.generic #trait_sampled_dense_dense
ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
outs(%args: tensor<?x?xf32, #CSR>) {
^bb(%a: f32, %b: f32, %s: f32):
%f0 = arith.constant 0.0 : f32
%u = sparse_tensor.unary %s : f32 to f32
present={
^bb0(%p: f32):
%mul = arith.mulf %a, %b : f32
sparse_tensor.yield %mul : f32
}
absent={}
%r = sparse_tensor.reduce %s, %u, %f0 : f32 {
^bb0(%p: f32, %q: f32):
%add = arith.addf %p, %q : f32
sparse_tensor.yield %add : f32
}
linalg.yield %r : f32
} -> tensor<?x?xf32, #CSR>
return %result : tensor<?x?xf32, #CSR>
}
// Resolved by the C runner utils library; returns the path stored in the
// TENSOR<id> environment variable (TENSOR0 is set in the RUN lines above).
func.func private @getTensorFilename(index) -> (!Filename)
//
// Main driver.
//
func.func @entry() {
llvm.call @mgpuCreateSparseEnv() : () -> ()
%d0 = arith.constant 0.0 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c5 = arith.constant 5 : index
%c10 = arith.constant 10 : index
// Initialize dense matrices.
// A is 5x10 with A(i,j) = i+1 (constant along each row).
%a = tensor.generate %c5, %c10 {
^bb0(%i: index, %j: index):
%p = arith.addi %i, %c1 : index
%q = arith.index_cast %p : index to i32
%d = arith.sitofp %q : i32 to f32
tensor.yield %d : f32
} : tensor<?x?xf32>
// B is 10x5 with B(i,j) = j+1 (constant along each column).
%b = tensor.generate %c10, %c5 {
^bb0(%i: index, %j: index):
%p = arith.addi %j, %c1 : index
%q = arith.index_cast %p : index to i32
%d = arith.sitofp %q : i32 to f32
tensor.yield %d : f32
} : tensor<?x?xf32>
// Read the sparse matrix from file, construct sparse storage.
%fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
%s = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
// Call the kernel.
%0 = call @sampled_dense_dense(%s, %a, %b)
: (tensor<?x?xf32, #CSR>,
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
//
// Print the result for verification.
//
// CHECK: ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 )
//
// Dump the stored values of the CSR result directly (9 nonzeros expected
// per the CHECK line above).
%vm = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
%vv = vector.transfer_read %vm[%c0], %d0 : memref<?xf32>, vector<9xf32>
vector.print %vv : vector<9xf32>
// Create a much sparser sampling matrix: 8x8 with 5 stored entries.
%t = arith.constant sparse<[[0,0], [0,1], [1,0], [3,4], [7,7]],
[1.0, 2.0, 3.0, 4.0, 5.0]
> : tensor<8x8xf32>
%q = sparse_tensor.convert %t : tensor<8x8xf32> to tensor<?x?xf32, #CSR>
%a2 = arith.constant dense<2.0> : tensor<8x8xf32>
%b1 = arith.constant dense<1.0> : tensor<8x8xf32>
%a2c = tensor.cast %a2 : tensor<8x8xf32> to tensor<?x?xf32>
%b1c = tensor.cast %b1 : tensor<8x8xf32> to tensor<?x?xf32>
// Call the kernel again.
%1 = call @sampled_dense_dense(%q, %a2c, %b1c)
: (tensor<?x?xf32, #CSR>,
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
//
// Print the result for verification.
//
// CHECK: ( ( 17, 18, 0, 0, 0, 0, 0, 0 ), ( 19, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 20, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 21 ) )
//
// Convert to dense and print the full 8x8 matrix: only the 5 sampled
// positions are nonzero (per the CHECK line above, each equals the
// original sample plus SUM_k 2*1 over k=0..7, i.e. sample + 16).
%d = sparse_tensor.convert %1 : tensor<?x?xf32, #CSR> to tensor<?x?xf32>
%mm = vector.transfer_read %d[%c0, %c0], %d0 : tensor<?x?xf32>, vector<8x8xf32>
vector.print %mm : vector<8x8xf32>
// Release the resources.
bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
bufferization.dealloc_tensor %1 : tensor<?x?xf32, #CSR>
llvm.call @mgpuDestroySparseEnv() : () -> ()
return
}
}
//