File: demo_meta_template.py

package info (click to toggle)
pyopencl 0.92.dfsg-1
  • links: PTS, VCS
  • area: contrib
  • in suites: squeeze
  • size: 572 kB
  • ctags: 843
  • sloc: python: 3,982; cpp: 3,333; makefile: 101; sh: 2
file content (53 lines) | stat: -rw-r--r-- 1,467 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pyopencl as cl
import numpy
import numpy.linalg as la

# Demo: generate an OpenCL vector-add kernel from a Jinja2 template, run it
# on two random vectors, and check the result against numpy.
#
# Work partitioning (see the kernel template below):
#   * the kernel is launched with `macroblock_count` work-groups of
#     `local_size` work-items each;
#   * every work-group covers `local_size * thread_strides` consecutive
#     elements;
#   * every work-item handles `thread_strides` elements, spaced `local_size`
#     apart, so the whole launch covers exactly `total_size` elements.
local_size = 256
thread_strides = 32
macroblock_count = 33
dtype = numpy.float32
total_size = local_size*thread_strides*macroblock_count

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Host-side input vectors.
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

# Device buffers: the inputs are initialized from the host arrays, the output
# buffer is only sized (same byte count as the inputs).
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

from jinja2 import Template

# The `for` loop is expanded by Jinja2 at render time, so the generated
# kernel contains `thread_strides` explicit assignment statements with
# constant offsets instead of a run-time loop.
tpl = Template("""
    __kernel void add(
            __global {{ type_name }} *tgt, 
            __global const {{ type_name }} *op1, 
            __global const {{ type_name }} *op2)
    {
      int idx = get_local_id(0)
        + {{ local_size }} * {{ thread_strides }}
        * get_group_id(0);

      {% for i in range(thread_strides) %}
          {% set offset = i*local_size %}
          tgt[idx + {{ offset }}] = 
            op1[idx + {{ offset }}] 
            + op2[idx + {{ offset }}];
      {% endfor %}
    }""")

# NOTE: type_name is hard-coded to "float" and must be kept in step with
# `dtype` above (numpy.float32) if either one is changed.
rendered_tpl = tpl.render(type_name="float",
    local_size=local_size, thread_strides=thread_strides)

knl = cl.Program(ctx, str(rendered_tpl)).build().add

# Global size is one work-item per (work-group, local id) pair; each
# work-item then strides over its `thread_strides` elements inside the kernel.
knl(queue, (local_size*macroblock_count,), (local_size,),
        c_buf, a_buf, b_buf)

# Read the result back to the host. Newer PyOpenCL releases provide
# enqueue_copy(queue, dest, src) in place of enqueue_read_buffer; prefer it
# when present and fall back to the older call otherwise.
c = numpy.empty_like(a)
if hasattr(cl, "enqueue_copy"):
    cl.enqueue_copy(queue, c, c_buf).wait()
else:
    cl.enqueue_read_buffer(queue, c_buf, c).wait()

# The device result is expected to match the host-side sum exactly.
assert la.norm(c-(a+b)) == 0