File: Ch2.py

# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
#  Chapter 2 : 2D Saxpy with TMA
# ===----------------------------------------------------------------------===//
#
# This program demonstrates 2D SAXPY. It is the same as Chapter 1,
# but it loads data using TMA (the Tensor Memory Accelerator).
#
# This chapter demonstrates:
#  1. Computing 2D SAXPY the same way as Ch1.py, but loading data with TMA
#  2. Creating and initializing one asynchronous transactional barrier (mbarrier)
#  3. Thread 0 of each thread block issuing the TMA load requests
#  4. Each thread block loading a <1x32xf32> tile for x and one for y
#  5. Waiting for completion of the TMA loads on the mbarrier
#
# ===----------------------------------------------------------------------===//
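
# A high-level sketch of the per-block flow (informal pseudocode, not the
# literal nvdsl API; names like `tma.load` here are illustrative only):
#
#   if tid == 0:
#       mbarrier.init(count=1)
#       tma.load(x tile -> x_smem)           # asynchronous, hardware-driven
#       tma.load(y tile -> y_smem)
#       mbarrier.arrive(txcount = bytes(x tile) + bytes(y tile))
#   mbarrier.try_wait()                      # all threads block here
#   y_dev[bid][tid] = y_smem[tid] + alpha * x_smem[tid]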

from mlir import ir
from mlir.dialects import nvgpu, scf, arith, memref, vector, gpu
from tools.nvdsl import *
from mlir import runtime as rt
from mlir.extras import types as T
import numpy as np


@NVDSL.mlir_func
def saxpy(x, y, alpha):
    token_ty = gpu.AsyncTokenType.get()
    t1 = gpu.wait(token_ty, [])
    x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
    y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
    t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
    t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
    t6 = gpu.wait(token_ty, [t5])
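    # The async tokens t1..t6 chain these operations: each gpu op waits on
    # the token it is handed, so the device allocations and host-to-device
    # copies complete in order before the kernel launches.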

    x_tma = TMA([1, N], x.type)
    y_tma = TMA([1, N], y.type)
    x_tma.create_descriptor(x_dev)
    y_tma.create_descriptor(y_dev)
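    # Each descriptor describes a <1xN> tile of the corresponding device
    # buffer; the TMA hardware uses it to copy that tile from global memory
    # into shared memory asynchronously.
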
    sz_x = get_type_size(x_tma.tma_memref)
    sz_y = get_type_size(y_tma.tma_memref)
    sz = sz_x + sz_y
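    # The dynamic shared memory for the launch must hold both tiles: sz_x
    # bytes for x followed by sz_y bytes for y (y_smem sits at offset sz_x).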

    @NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1), smem=sz)
    def saxpy_tma_kernel():
        bidx = gpu.block_id(gpu.Dimension.x)
        tidx = gpu.thread_id(gpu.Dimension.x)
        isThread0 = tidx == 0
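        # Only thread 0 of each block drives the data transfer; this
        # predicate guards the mbarrier init, the TMA loads, and the arrive.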

        # 1. Create and initialize asynchronous transactional barrier (mbarrier)
        mbar_group = Mbarriers(number_of_barriers=1)
        mbar_group[0].init(1, predicate=isThread0)
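        # The arrival count is 1 because only thread 0 arrives on the
        # barrier; completion is tracked via the transaction byte count.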

        # 2. Execute Tensor Memory Accelerator (TMA) Load
        x_smem = get_dynamic_shared_memory([1, N], T.f32())
        y_smem = get_dynamic_shared_memory([1, N], T.f32(), offset=sz_x)
        x_tma.load(x_smem, mbar_group[0], coords=[0, bidx], predicate=isThread0)
        y_tma.load(y_smem, mbar_group[0], coords=[0, bidx], predicate=isThread0)
        mbar_group[0].arrive(txcount=sz, predicate=isThread0)
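        # The arrive registers an expected transaction count of sz bytes;
        # the barrier phase completes once the TMA engine has delivered both
        # tiles into shared memory.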

        # 3. Wait for completion of TMA load with mbarrier
        mbar_group[0].try_wait()
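        # Every thread in the block waits here until the barrier phase
        # completes, i.e. both tiles are resident in shared memory.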

        x_val = memref.load(x_smem, [const(0), tidx])
        y_val = memref.load(y_smem, [const(0), tidx])

        # SAXPY: y[i] += a * x[i];
        y_val += x_val * alpha

        memref.store(y_val, y_dev, [bidx, tidx])
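        # Each thread writes its result straight back to global memory;
        # no further synchronization is needed within the block.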

    saxpy_tma_kernel()

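    # Copy the computed y back to the host and wait for the copy to finish.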
    t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
    gpu.wait(token_ty, [t7])


# 4. Pass numpy arrays to MLIR
M = 256
N = 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

# 5. Verify the MLIR result against a NumPy reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements