File: measure_gpuarray_speed.py

package info (click to toggle)
pycuda 2016.1.2%2Bgit20161024-1
  • links: PTS, VCS
  • area: contrib
  • in suites: stretch
  • size: 1,560 kB
  • ctags: 2,268
  • sloc: python: 11,951; cpp: 9,839; makefile: 139; sh: 1
file content (89 lines) | stat: -rwxr-xr-x 2,191 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from __future__ import absolute_import
from __future__ import print_function
#! /usr/bin/env python
import pycuda.driver as drv
import pycuda.autoinit
import numpy
import numpy.linalg as la
from six.moves import range
from six.moves import zip




def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2
    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1<<power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a+b

        end.record()
        end.synchronize()

        secs = start.time_till(end)*1e-3

        times_gpu.append(secs/count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        #start timer
        from time import time
        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs/count)
        flops_cpu.append(size)


    # calculate pseudo flops
    flops_gpu = [f/t for f, t in zip(flops_gpu,times_gpu)]
    flops_cpu = [f/t for f, t in zip(flops_cpu,times_cpu)]

    from pytools import Table
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
        "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f/f_cpu))
    print(tbl)





if __name__ == "__main__":
    main()