1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
/*
Github Issue #1435
*/
#include "pocl_opencl.h"
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/opencl.hpp>
#include <cassert>
#include <iostream>
using namespace std;
const char *SOURCE = R"RAW(
__kernel void medfilt2d(__global float *image, // input image
__global float *result, // output array
__local float4 *l_data,// local storage 4x the number of threads
int khs1, // Kernel half-size along dim1 (nb lines)
int khs2, // Kernel half-size along dim2 (nb columns)
int height, // Image size along dim1 (nb lines)
int width) // Image size along dim2 (nb columns)
{
int threadid = get_local_id(0);
int x = get_global_id(1);
if (x < width)
{
union
{
float ary[8];
float8 vec;
} output, input;
input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT);
int kfs1 = 2 * khs1 + 1;
int kfs2 = 2 * khs2 + 1;
int nbands = (kfs1 + 7) / 8;
for (int y=0; y<height; y++)
{
//Select only the active threads, some may remain inactive
int nb_threads = (nbands * kfs2);
int band_nr = threadid / kfs2;
int band_id = threadid % kfs2;
int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1);
int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8);
if (y == 0)
{
for (int i=0; i<max_vec; i++)
{
if (threadid<nb_threads)
{
int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1);
input.ary[i] = image[pos_x + width * pos_y];
}
}
}
else
{
//store storage.s0 to some shared memory to retrieve it from another thread.
l_data[threadid].s0 = input.vec.s0;
//Offset to the bottom
input.vec = (float8)(input.vec.s1,
input.vec.s2,
input.vec.s3,
input.vec.s4,
input.vec.s5,
input.vec.s6,
input.vec.s7,
MAXFLOAT);
barrier(CLK_LOCAL_MEM_FENCE);
int read_from = threadid + kfs2;
if (read_from < nb_threads)
input.vec.s7 = l_data[read_from].s0;
else if (threadid < nb_threads) //we are on the last band
{
int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1);
input.ary[max_vec - 1] = image[pos_x + width * pos_y];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
)RAW";
#if 0
// the shorter code that should trigger the same issue
const char *SOURCE = R"RAW(
__kernel void testkernel(__local float2 *b) {
struct {
int c[1];
float2 d;
} e;
for (int f = 0; f < 2; f++) {
if (f)
for (int g; g < (int)b[0].x; g++)
e.c[g] = 0;
else if (b)
e.d.s0 = b[0].s0;
barrier(0);
}
}
)RAW";
#endif
int main(int argc, char *argv[]) {
cl::Device device = cl::Device::getDefault();
cl::Program program(SOURCE);
program.build("-cl-std=CL1.2");
// This triggers compilation of dynamic WG binaries.
cl::Program::Binaries binaries{};
int err = program.getInfo<>(CL_PROGRAM_BINARIES, &binaries);
if (err == CL_SUCCESS) {
printf("OK\n");
return EXIT_SUCCESS;
} else {
printf("FAIL\n");
return EXIT_FAILURE;
}
}
|