File: locallaplacian.cl

package info (click to toggle)
darktable 5.4.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 64,644 kB
  • sloc: ansic: 367,957; cpp: 102,838; xml: 20,091; lisp: 15,092; sh: 3,767; javascript: 3,264; perl: 1,925; python: 1,551; ruby: 975; makefile: 543; asm: 46; sql: 38; awk: 21
file content (317 lines) | stat: -rw-r--r-- 10,713 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/*
    This file is part of darktable,
    copyright (c) 2016 johannes hanika.

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "common.h"

kernel void
pad_input(
    read_only  image2d_t input,
    write_only image2d_t padded,
    const int wd,                  // dimensions of input
    const int ht,
    const int max_supp,            // size of border
    const int wd2,                 // padded dimensions
    const int ht2)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  int cx = x - max_supp, cy = y - max_supp;

  if(x >= wd2 || y >= ht2) return;
  // fill boundary with max_supp px:
  if(cx >= wd) cx = wd-1;
  if(cy >= ht) cy = ht-1;
  if(cx < 0) cx = 0;
  if(cy < 0) cy = 0;

  float4 pixel = read_imagef(input, sampleri, (int2)(cx, cy));
  write_imagef (padded, (int2)(x, y), pixel.x*0.01f);
}

float expand_gaussian(
    read_only image2d_t coarse,
    const int i,
    const int j,
    const int wd,
    const int ht)
{
  float c = 0.0f;
  const float w[5] = {1.0f/16.0f, 4.0f/16.0f, 6.0f/16.0f, 4.0f/16.0f, 1.0f/16.0f};
  const int cx = i/2;
  const int cy = j/2;
  switch((i&1) + 2*(j&1))
  {
    case 0: // both are even, 3x3 stencil
      for(int ii=-1;ii<=1;ii++) for(int jj=-1;jj<=1;jj++)
      {
        float4 pixel = read_imagef(coarse, sampleri, (int2)(cx+ii, cy+jj));
        c += pixel.x*w[2*jj+2]*w[2*ii+2];
      }
      break;
    case 1: // i is odd, 2x3 stencil
      for(int ii=0;ii<=1;ii++) for(int jj=-1;jj<=1;jj++)
      {
        float4 pixel = read_imagef(coarse, sampleri, (int2)(cx+ii, cy+jj));
        c += pixel.x*w[2*jj+2]*w[2*ii+1];
      }
      break;
    case 2: // j is odd, 3x2 stencil
      for(int ii=-1;ii<=1;ii++) for(int jj=0;jj<=1;jj++)
      {
        float4 pixel = read_imagef(coarse, sampleri, (int2)(cx+ii, cy+jj));
        c += pixel.x*w[2*jj+1]*w[2*ii+2];
      }
      break;
    default: // case 3: // both are odd, 2x2 stencil
      for(int ii=0;ii<=1;ii++) for(int jj=0;jj<=1;jj++)
      {
        float4 pixel = read_imagef(coarse, sampleri, (int2)(cx+ii, cy+jj));
        c += pixel.x*w[2*jj+1]*w[2*ii+1];
      }
      break;
  }
  return 4.0f * c;
}

kernel void
gauss_expand(
    read_only  image2d_t coarse, // coarse input
    write_only image2d_t fine,   // upsampled blurry output
    const int wd,                // resolution of fine, also run kernel on fine res
    const int ht)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  int cx = x, cy = y;

  float4 pixel;
  if(x >= wd || y >= ht) return;
  // fill boundary with 1 or 2 px:
  if(wd & 1) { if(x > wd-2) cx = wd-2; }
  else       { if(x > wd-3) cx = wd-3; }
  if(ht & 1) { if(y > ht-2) cy = ht-2; }
  else       { if(y > ht-3) cy = ht-3; }
  if(cx <= 0) cx = 1;
  if(cy <= 0) cy = 1;

  pixel.x = expand_gaussian(coarse, cx, cy, wd, ht);
  write_imagef (fine, (int2)(x, y), pixel);
}

kernel void
gauss_reduce(
    read_only  image2d_t input,  // fine input buffer
    write_only image2d_t coarse, // coarse scale, blurred input buf
    const int wd,                // coarse res (also run this kernel on coarse res only)
    const int ht)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  int cx = x, cy = y;

  float4 pixel;
  if(x >= wd || y >= ht) return;
  // fill boundary with 1 px:
  if(x >= wd-1) cx = wd-2;
  if(y >= ht-1) cy = ht-2;
  if(cx <= 0) cx = 1;
  if(cy <= 0) cy = 1;

  // blur, store only coarse res
  pixel.x = 0.0f;
  const float w[5] = {1.0f/16.0f, 4.0f/16.0f, 6.0f/16.0f, 4.0f/16.0f, 1.0f/16.0f};
  // direct 5x5 stencil only on required pixels:
  for(int jj=-2;jj<=2;jj++) for(int ii=-2;ii<=2;ii++)
    pixel.x += read_imagef(input, sampleri, (int2)(2*cx+ii, 2*cy+jj)).x * w[ii+2] * w[jj+2];
  write_imagef (coarse, (int2)(x, y), pixel);
}

float laplacian(
    read_only image2d_t coarse, // coarse res gaussian
    read_only image2d_t fine,   // fine res gaussian
    const int i,                // fine index
    const int j,
    const int ci,               // clamped fine index
    const int cj,
    const int wd,               // fine width
    const int ht)               // fine height
{
  const float c = expand_gaussian(coarse, ci, cj, wd, ht);
  return read_imagef(fine, sampleri, (int2)(i, j)).x - c;
}

kernel void
laplacian_assemble(
    read_only  image2d_t input,      // original input buffer, gauss at current fine pyramid level
    read_only  image2d_t output1,    // state of reconstruction, coarse output buffer
    write_only image2d_t output0,    // reconstruction, one level finer, run kernel on this dimension
    read_only  image2d_t buf_g0_l0,  // image2d_array_t only supported in ocl 2.0 :(
    read_only  image2d_t buf_g0_l1,
    read_only  image2d_t buf_g1_l0,
    read_only  image2d_t buf_g1_l1,
    read_only  image2d_t buf_g2_l0,
    read_only  image2d_t buf_g2_l1,
    read_only  image2d_t buf_g3_l0,
    read_only  image2d_t buf_g3_l1,
    read_only  image2d_t buf_g4_l0,
    read_only  image2d_t buf_g4_l1,
    read_only  image2d_t buf_g5_l0,
    read_only  image2d_t buf_g5_l1,
    // read_only  image2d_t buf_g6_l0,
    // read_only  image2d_t buf_g6_l1,
    // read_only  image2d_t buf_g7_l0,
    // read_only  image2d_t buf_g7_l1,
    const int  pw,                   // width and height of the fine buffers (l0)
    const int  ph)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  int i = x, j = y;

  if(x >= pw || y >= ph) return;
  // fill boundary with 1 or 2 px:
  if(pw & 1) { if(x > pw-2) i = pw-2; }
  else       { if(x > pw-3) i = pw-3; }
  if(ph & 1) { if(y > ph-2) j = ph-2; }
  else       { if(y > ph-3) j = ph-3; }
  if(x <= 0) i = 1;
  if(y <= 0) j = 1;

  float4 pixel;
  pixel.x = expand_gaussian(output1, i, j, pw, ph);

  const int num_gamma = 6; // this sucks, have to hardcode for the lack of arrays
  const float v = read_imagef(input, sampleri, (int2)(x, y)).x;
  int hi = 1;
  // what we mean is this:
  // for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
  for(;hi<num_gamma-1 && ((float)hi+.5f)/(float)num_gamma <= v;hi++);
  int lo = hi-1;
  // const float a = fmin(fmax((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f), 1.0f);
  const float a = fmin(fmax(v*num_gamma - ((float)lo+.5f), 0.0f), 1.0f);
#ifdef AMD
  // See #3756 for further information why this is necessary on AMD using both
  // AMDGPU-Pro and ROCm drivers.
  float r;
  r = select(r, laplacian(buf_g0_l1, buf_g0_l0, x, y, i, j, pw, ph) * (1.0f-a) + laplacian(buf_g1_l1, buf_g1_l0, x, y, i, j, pw, ph) * a, lo == 0);
  r = select(r, laplacian(buf_g1_l1, buf_g1_l0, x, y, i, j, pw, ph) * (1.0f-a) + laplacian(buf_g2_l1, buf_g2_l0, x, y, i, j, pw, ph) * a, lo == 1);
  r = select(r, laplacian(buf_g2_l1, buf_g2_l0, x, y, i, j, pw, ph) * (1.0f-a) + laplacian(buf_g3_l1, buf_g3_l0, x, y, i, j, pw, ph) * a, lo == 2);
  r = select(r, laplacian(buf_g3_l1, buf_g3_l0, x, y, i, j, pw, ph) * (1.0f-a) + laplacian(buf_g4_l1, buf_g4_l0, x, y, i, j, pw, ph) * a, lo == 3);
  r = select(r, laplacian(buf_g4_l1, buf_g4_l0, x, y, i, j, pw, ph) * (1.0f-a) + laplacian(buf_g5_l1, buf_g5_l0, x, y, i, j, pw, ph) * a, lo >= 4);
  pixel.x += r;
#else
  float l0, l1;
  switch(lo)
  { // oh man, this sucks:
    case 0:
      l0 = laplacian(buf_g0_l1, buf_g0_l0, x, y, i, j, pw, ph);
      l1 = laplacian(buf_g1_l1, buf_g1_l0, x, y, i, j, pw, ph);
      break;
    case 1:
      l0 = laplacian(buf_g1_l1, buf_g1_l0, x, y, i, j, pw, ph);
      l1 = laplacian(buf_g2_l1, buf_g2_l0, x, y, i, j, pw, ph);
      break;
    case 2:
      l0 = laplacian(buf_g2_l1, buf_g2_l0, x, y, i, j, pw, ph);
      l1 = laplacian(buf_g3_l1, buf_g3_l0, x, y, i, j, pw, ph);
      break;
    case 3:
      l0 = laplacian(buf_g3_l1, buf_g3_l0, x, y, i, j, pw, ph);
      l1 = laplacian(buf_g4_l1, buf_g4_l0, x, y, i, j, pw, ph);
      break;
    default: //case 4:
      l0 = laplacian(buf_g4_l1, buf_g4_l0, x, y, i, j, pw, ph);
      l1 = laplacian(buf_g5_l1, buf_g5_l0, x, y, i, j, pw, ph);
      break;
    // case 5:
    //   l0 = laplacian(buf_g5_l1, buf_g5_l0, x, y, i, j, pw, ph);
    //   l1 = laplacian(buf_g6_l1, buf_g6_l0, x, y, i, j, pw, ph);
    //   break;
    // default: // case 6:
    //   l0 = laplacian(buf_g6_l1, buf_g6_l0, x, y, i, j, pw, ph);
    //   l1 = laplacian(buf_g7_l1, buf_g7_l0, x, y, i, j, pw, ph);
    //   break;
  }
  pixel.x += l0 * (1.0f-a) + l1 * a;
#endif
  write_imagef (output0, (int2)(x, y), pixel);
}

float curve(
    const float x,
    const float g,
    const float sigma,
    const float shadows,
    const float highlights,
    const float clarity)
{
  const float c = x-g;
  float val;
  const float ssigma = c > 0.0f ? sigma : - sigma;
  const float shadhi = c > 0.0f ? shadows : highlights;
  if (fabs(c) > 2*sigma) val = g + ssigma + shadhi * (c-ssigma); // linear part
  else
  { // blend in via quadratic bezier
    const float t = clipf(c / (2.0f*ssigma));
    const float t2 = t * t;
    const float mt = 1.0f-t;
    val = g + ssigma * 2.0f*mt*t + t2*(ssigma + ssigma*shadhi);
  }
  // midtone local contrast
  val += clarity * c * dt_fast_expf(-c*c/(2.0f*sigma*sigma/3.0f));
  return val;
}

kernel void
process_curve(
    read_only  image2d_t input,
    write_only image2d_t output,
    const float g,
    const float sigma,
    const float shadows,
    const float highlights,
    const float clarity,
    const int wd,
    const int ht)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  if(x >= wd || y >= ht) return;

  float4 pixel = read_imagef(input, sampleri, (int2)(x, y));
  pixel.x = curve(pixel.x, g, sigma, shadows, highlights, clarity);
  write_imagef (output, (int2)(x, y), pixel);
}

kernel void
write_back(
    read_only  image2d_t input,
    read_only  image2d_t processed,
    write_only image2d_t output,
    const int max_supp,
    const int wd,
    const int ht)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  if(x >= wd || y >= ht) return;

  float4 pixel = read_imagef(input, sampleri, (int2)(x, y));
  pixel.x = 100.0f*read_imagef(processed, sampleri, (int2)(x+max_supp, y+max_supp)).x;
  write_imagef (output, (int2)(x, y), pixel);
}