File: compositing_generator.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (204 lines) | stat: -rw-r--r-- 7,308 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#include "Halide.h"

namespace {

// This app does Porter-Duff compositing using a runtime-provided list of blend
// modes and layers. It demonstrates how to write a mini-interpreter in Halide
// that ingests some byte code to determine what to do. It also demonstrates
// some fixed-point math patterns useful in compositing.
class Compositing : public Halide::Generator<Compositing> {
    Var x{"x"}, y{"y"}, c{"c"};

    static constexpr int num_layers = 6;
    static constexpr int num_blend_modes = 5;

public:
    // A stack of RGBA layers to composite.
    Input<Buffer<uint8_t, 3>[num_layers]> layer_rgba { "layer_rgba" };

    // The blend modes to use for each layer after the first.
    Input<Buffer<int, 1>> ops{"ops"};

    // An RGBA output buffer.
    Output<Buffer<uint8_t, 3>> output{"output"};

    Tuple premultiply_alpha(Tuple in) {
        if (get_target().has_gpu_feature()) {
            const float scale = 1 / 255.0f;
            // Use floats on GPU for intermediates
            in[3] = in[3] * scale;
            for (int i = 0; i < 3; i++) {
                in[i] = in[i] * (scale * in[3]);
            }
        } else {
            // On CPU, use uint16s for color components with premultiplied alpha, and uint8 for alpha.
            for (int i = 0; i < 3; i++) {
                in[i] = widening_mul(in[i], in[3]);
            }
        }
        return in;
    }

    Tuple normalize(Tuple in) {
        if (in[0].type().is_float()) {
            for (int i = 0; i < 4; i++) {
                Expr scale = i < 3 ? 255.0f / in[3] : 255.0f;
                in[i] *= scale;
                in[i] = saturating_cast(UInt(8), in[i]);
            }
        } else {
            for (int i = 0; i < 3; i++) {
                in[i] = fast_integer_divide(in[i] + in[3] / 2, in[3]);
                in[i] = saturating_cast(UInt(8), in[i]);
            }
        }
        return in;
    }

    Expr scale(Expr a, Expr b) {
        if (a.type().is_float()) {
            return a * b;
        } else {
            assert(b.type() == UInt(8));
            Expr c = widening_mul(a, cast(a.type(), b));
            // The below is equivalent to c = (c + 127) / 255;
            c += rounding_shift_right(c, 8);
            c = rounding_shift_right(c, 8);
            return cast(a.type(), c);
        }
    }

    Expr invert(const Expr &e) {
        if (e.type().is_float()) {
            return 1 - e;
        } else {
            return ~e;
        }
    }

    // Various Porter-Duff blend modes, in terms of the operators above.
    Tuple over(const Tuple &a, const Tuple &b) {
        std::vector<Expr> c(4);
        for (int i = 0; i < 3; i++) {
            c[i] = b[i] + scale(a[i], invert(b[3]));
        }
        c[3] = b[3] + scale(a[3], invert(b[3]));
        return Tuple{c};
    }

    Tuple atop(const Tuple &a, const Tuple &b) {
        std::vector<Expr> c(4);
        for (int i = 0; i < 3; i++) {
            c[i] = scale(b[i], a[3]) + scale(a[i], invert(b[3]));
        }
        c[3] = a[3];
        return Tuple{c};
    }

    Tuple xor_(const Tuple &a, const Tuple &b) {
        std::vector<Expr> c(4);
        for (int i = 0; i < 3; i++) {
            c[i] = scale(b[i], invert(a[3])) + scale(a[i], invert(b[3]));
        }
        c[3] = scale(b[3], invert(a[3])) + scale(a[3], invert(b[3]));
        return Tuple{c};
    }

    Tuple in(const Tuple &a, const Tuple &b) {
        std::vector<Expr> c(4);
        for (int i = 0; i < 3; i++) {
            c[i] = scale(a[i], b[3]);
        }
        c[3] = scale(a[3], b[3]);
        return Tuple{c};
    }

    Tuple out(const Tuple &a, const Tuple &b) {
        std::vector<Expr> c(4);
        for (int i = 0; i < 3; i++) {
            c[i] = scale(a[i], invert(b[3]));
        }
        c[3] = scale(a[3], invert(b[3]));
        return Tuple{c};
    }

    void generate() {
        // RGB and alpha potentially have different types, so we store them as a Tuple
        std::vector<Tuple> layer_vec;
        for (int i = 0; i < num_layers; i++) {
            layer_vec.push_back({layer_rgba[i](x, y, 0), layer_rgba[i](x, y, 1), layer_rgba[i](x, y, 2), layer_rgba[i](x, y, 3)});
        }

        // Combine the separate layers into a single Func
        Func layer_muxed{"layer_muxed"};
        Var k{"k"};
        layer_muxed(x, y, k) = mux(k, layer_vec);

        // Convert to premultiplied alpha in the working type (float on GPU, uint16 on CPU)
        Func blended{"blended"};
        blended(x, y) = premultiply_alpha(layer_vec[0]);

        // We will perform all blend modes on all layers, and then use an
        // RDom::where clause to restrict it to the desired blend mode for each
        // layer. If we then unroll over r[0], this compiles to a switch
        // statement. It is a useful pattern for writing mini interpreters that
        // ingest a bytecode and use it to switch between various ops.
        RDom r(0, num_blend_modes, 0, num_layers - 1);
        r.where(r[0] == ops(r[1]));
        Tuple a = blended(x, y);
        Tuple b = premultiply_alpha(layer_muxed(x, y, r[1] + 1));
        std::vector<Tuple> blends = {over(a, b), atop(a, b), xor_(a, b), in(a, b), out(a, b)};
        blended(x, y) = mux(r[0], blends);

        // Divide by alpha and convert back to uint8.
        output(x, y, c) = mux(c, normalize(blended(x, y)));

        /* ESTIMATES */
        for (int i = 0; i < num_layers; i++) {
            layer_rgba[i].set_estimates({{0, 1536}, {0, 2560}, {0, 4}});
        }
        output.set_estimates({{0, 1536}, {0, 2560}, {0, 4}});
        ops.set_estimates({{0, num_layers - 1}});

        /* THE SCHEDULE */
        if (using_autoscheduler()) {
            // Nothing.
        } else if (get_target().has_gpu_feature()) {
            // GPU schedule. 2.4ms on an RTX 2060
            Var xi, yi;
            output.gpu_tile(x, y, xi, yi, 32, 8);
            blended.update().unroll(r[0]).unroll(r[1]);
        } else {
            // CPU schedule. 2ms on an i9-9960X at 3.1 GHz with 16 threads
            const int vec = natural_vector_size<uint8_t>();
            Var yo{"yo"}, yi{"yi"};
            output.split(y, yo, yi, 8)
                .parallel(yo)
                .vectorize(x, vec)
                .reorder(c, x, yi, yo)
                .bound(c, 0, 4)
                .unroll(c);

            // Compute the intermediate state per row of the output, so that our
            // switch over the op codes can be outside the loop over x.
            blended.store_in(MemoryType::Stack)
                .compute_at(output, yi)
                .vectorize(x, vec)
                .update()
                .reorder(x, r[0], r[1])
                // Unroll over the possible blend modes to get a switch statement.
                .unroll(r[0])
                // Unroll over layers to remove the mux in layer_muxed. Ideally
                // this wouldn't be necessary because LLVM should really convert
                // a select of loads of the same index into a select between the
                // base pointers hoisted outside of the inner loop, but
                // unfortunately it doesn't.
                .unroll(r[1])
                .vectorize(x, vec);
        }
    }
};

}  // namespace

HALIDE_REGISTER_GENERATOR(Compositing, compositing)