File: cuda.py

package info (click to toggle)
python-gimmik 3.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 204 kB
  • sloc: python: 323; makefile: 4
file content (57 lines) | stat: -rw-r--r-- 2,240 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: utf-8 -*-

from gimmik.base import MatMul


class CUDAMatMul(MatMul):
    """CUDA specialisation of ``MatMul``.

    Enumerates the candidate kernel variants for the ``'cuda'`` platform
    and fills in the launch grid of a kernel's metadata.
    """

    platform = 'cuda'

    # Baseline metadata values.  The per-kernel ``meta`` dictionaries
    # yielded by ``_kernel_generators`` only list the keys which differ
    # from these; NOTE(review): the merge itself is presumably done by
    # ``MatMul`` -- it is not visible from this class.
    basemeta = {
        'block': (128, 1, 1),
        'width': 1,
        'shared': 0,
        'dynamic_shared': 0,
    }

    def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
        """Yield ``(name, args, meta)`` triples, one per candidate kernel.

        ``name`` is the kernel variant identifier, ``args`` holds the
        variant-specific arguments and ``meta`` holds the metadata
        entries which deviate from ``basemeta``.

        :param dtype: Element type name; the vectorized variants are only
                      produced when this equals ``'float'``.
        :param dsize: Multiplier applied to every ``'shared'`` entry;
                      presumably the size of one element in bytes.
        :param compute_capability: Accepted for interface compatibility;
                                   not consulted by this method.
        """
        # B loading, C streaming kernel
        yield ('cstream', {}, {})

        # B streaming, C accumulation kernel
        yield ('bstream', {}, {})

        # Four-way m-split B streaming, C accumulation kernel
        msplit, bsz, blockx = 4, 24, 32
        yield (
            'bstream-msplit',
            {'msplit': msplit, 'bsz': bsz, 'blockx': blockx},
            {'block': (blockx, msplit, 1), 'shared': 2*bsz*blockx*dsize},
        )

        # Two-way k-split B loading, C streaming kernel
        ksplit, csz, blockx = 2, 24, 32
        yield (
            'cstream-ksplit',
            {'ksplit': ksplit, 'csz': csz, 'blockx': blockx},
            {'block': (blockx, ksplit, 1),
             'shared': (ksplit - 1)*csz*blockx*dsize},
        )

        # Vectorized kernels are only considered at single precision and
        # when the aligned extent is known and even
        vectorize = (dtype == 'float' and
                     self.aligne is not None and self.aligne % 2 == 0)
        if not vectorize:
            return

        # Vector B loading, C streaming kernel
        yield ('cstream', {'dtype': 'float2', 'width': 2}, {'width': 2})

        # Vector four-way m-split B streaming, C accumulation kernel
        msplit, bsz, blockx = 4, 16, 32
        yield (
            'bstream-msplit',
            {'dtype': 'float2', 'width': 2, 'msplit': msplit,
             'bsz': bsz, 'blockx': blockx},
            {'block': (blockx, msplit, 1), 'width': 2,
             'shared': 2*blockx*bsz*2*dsize},
        )

        # Vector two-way k-split B loading, C streaming kernel
        ksplit, csz, blockx = 2, 24, 32
        yield (
            'cstream-ksplit',
            {'dtype': 'float2', 'width': 2, 'ksplit': ksplit,
             'csz': csz, 'blockx': blockx},
            {'block': (blockx, ksplit, 1), 'width': 2,
             'shared': 2*(ksplit - 1)*csz*blockx*dsize},
        )

    def _process_meta(self, meta):
        """Add a ``'grid'`` entry to *meta* when ``self.n`` is known.

        The grid is ``(ceil(n / (block[0]*width)), 1, 1)``.  *meta* is
        updated in place and is left untouched if ``self.n`` is ``None``.
        """
        n = self.n
        if n is None:
            return

        # Amount of n handled by a single block along x
        per_block = meta['block'][0]*meta['width']

        # Negated floor division rounds up
        nblocks = -(-n // per_block)
        meta['grid'] = (nblocks, 1, 1)