File: unsharp_generator.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (114 lines) | stat: -rw-r--r-- 4,184 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include "Halide.h"

namespace {

/**
 * Halide generator for an unsharp-mask style pipeline.
 *
 * Channels 0..2 of the input are collapsed into a single weighted value per
 * pixel ("gray"). That image is blurred with a 7-tap Gaussian-weighted sum
 * along y and then along x, and combined as `2 * gray - blurred`. The ratio
 * of that result to the unblurred gray value then scales every channel of
 * the input to produce the output.
 */
class Unsharp : public Halide::Generator<Unsharp> {
public:
    // Standard deviation used in the Gaussian weight formula below.
    GeneratorParam<float> sigma{"sigma", 1.5f};

    // Three-dimensional float images, indexed as (x, y, c).
    Input<Buffer<float, 3>> input{"input"};
    Output<Buffer<float, 3>> output{"output"};

    void generate() {
        Var x("x"), y("y"), c("c");

        // Gaussian weight as a function of tap distance x:
        //   exp(-x^2 / (2 * sigma^2)) / (sqrt(2 * pi) * sigma)
        // Only taps 0..3 are ever sampled.
        const float pi = 3.14159265358979310000f;
        const float two_sigma_sq = 2 * sigma * sigma;
        const float norm = sqrtf(2 * pi) * sigma;

        Func kernel("kernel");
        kernel(x) = exp(-x * x / two_sigma_sq) / norm;

        // Wrap the input with the repeat_edge boundary condition so the blur
        // taps may read outside the image.
        Func clamped = Halide::BoundaryConditions::repeat_edge(input);

        // One weighted value per pixel from channels 0, 1 and 2.
        Func gray("gray");
        gray(x, y) = (0.299f * clamped(x, y, 0) +
                      0.587f * clamped(x, y, 1) +
                      0.114f * clamped(x, y, 2));

        // Symmetric 7-tap weighted sum of `src` centred on (x, y). Taps are
        // taken along x when `along_x` is true, and along y otherwise.
        const auto blur7 = [&](const Func &src, const bool along_x) {
            // Sum of the two samples at distance r on either side of centre.
            const auto neighbours = [&](const int r) {
                return along_x ? src(x - r, y) + src(x + r, y) :
                                 src(x, y - r) + src(x, y + r);
            };
            return kernel(0) * src(x, y) +
                   kernel(1) * neighbours(1) +
                   kernel(2) * neighbours(2) +
                   kernel(3) * neighbours(3);
        };

        Func blur_y("blur_y");
        blur_y(x, y) = blur7(gray, false);

        Func blur_x("blur_x");
        blur_x(x, y) = blur7(blur_y, true);

        // gray + (gray - blurred)
        Func sharpen("sharpen");
        sharpen(x, y) = 2 * gray(x, y) - blur_x(x, y);

        // Per-pixel gain, shared by all channels of the output.
        Func ratio("ratio");
        ratio(x, y) = sharpen(x, y) / gray(x, y);

        output(x, y, c) = ratio(x, y) * input(x, y, c);

        // Size estimates (consumed by the autoscheduler; ignored otherwise).
        // Input and output share the same estimated extents per dimension.
        const int extent_estimates[] = {1536, 2560, 3};
        for (int d = 0; d < 3; d++) {
            input.dim(d).set_estimate(0, extent_estimates[d]);
        }
        for (int d = 0; d < 3; d++) {
            output.dim(d).set_estimate(0, extent_estimates[d]);
        }

        // Everything below is the manual schedule.
        if (using_autoscheduler()) {
            return;
        }

        const Target target = get_target();
        if (target.has_gpu_feature()) {
            // Some Intel Mac Minis have GPUs with too few registers for
            // 32x32 tiles on this pipeline, so Metal targets drop to 16x16
            // to avoid unexpected crashes in CI.
            int tile_dim = 32;
            if (target.has_feature(Target::Metal)) {
                tile_dim = 16;
            }

            // Timing of this schedule is oddly noisy: runs range from 0.4ms
            // to 0.5ms on a 2060 RTX, and the better runtimes occur when
            // running under nvprof.
            Var xi, yi;
            output.compute_root()
                .reorder(c, x, y)
                .gpu_tile(x, y, xi, yi, tile_dim, tile_dim)
                .bound(c, 0, 3)
                .unroll(c);

            ratio.compute_at(output, xi);

            gray.compute_at(output, x)
                .tile(x, y, xi, yi, 2, 2)
                .unroll(xi)
                .unroll(yi)
                .gpu_threads(x, y);

            blur_y.compute_at(output, x)
                .unroll(x, 2)
                .gpu_threads(x, y);
        } else {
            // 1.93ms on an Intel i9-9960X using 16 threads
            Var yo, yi;
            const int lanes = natural_vector_size<float>();

            output.split(y, yo, yi, 32)
                .vectorize(x, lanes)
                .parallel(yo)
                .reorder(x, c, yi, yo);

            // The intermediates all get the same treatment: computed per
            // inner row, stored per strip of rows, vectorized across x.
            // Func is a handle type, so scheduling through the loop copy
            // schedules the underlying function.
            for (Func stage : {gray, blur_y, ratio}) {
                stage.compute_at(output, yi)
                    .store_at(output, yo)
                    .vectorize(x, lanes);
            }
        }
    }
};

}  // namespace

// Register the Unsharp generator class under the registry name "unsharp".
HALIDE_REGISTER_GENERATOR(Unsharp, unsharp)