1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
|
#include "Halide.h"
#include "halide_benchmark.h"
#include "halide_test_dirs.h"
#include <cstdio>
using namespace Halide;
using namespace Halide::Tools;
// Scheduling strategies for the transposed 8x8 block. One of these is passed
// as the 'mode' argument of the test functions below.
enum {
    scalar_trans = 0,  // transposed block fully unrolled in x and y
    vec_y_trans = 1,   // transposed block vectorized in y, unrolled in x
    vec_x_trans = 2    // transposed block vectorized in x, unrolled in y
};
/* Transposes a 1024x1024 uint16_t image in 8x8 tiles, staging each tile
 * through the explicit dummy Funcs 'block' and 'block_transpose'.
 *
 * 'mode' selects how 'block_transpose' is scheduled: scalar_trans,
 * vec_y_trans or vec_x_trans. Any other value leaves 'block_transpose'
 * without an explicit schedule and skips the assembly dump.
 *
 * The pipeline is benchmarked, the measured output bandwidth is printed to
 * stdout, and the realized output buffer is returned. */
Buffer<uint16_t> test_transpose(int mode) {
    // Edge length (in elements) of the square image being transposed.
    constexpr int size = 1024;

    Func input, block, block_transpose, output;
    Var x, y;

    input(x, y) = cast<uint16_t>(x + y);
    input.compute_root();

    block(x, y) = input(x, y);
    block_transpose(x, y) = block(y, x);
    output(x, y) = block_transpose(x, y);

    Var xi, yi;
    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);

    // Do 8 vectorized loads from the input.
    block.compute_at(output, x).vectorize(x).unroll(y);

    std::string algorithm;
    switch (mode) {
    case scalar_trans:
        block_transpose.compute_at(output, x).unroll(x).unroll(y);
        algorithm = "Scalar transpose";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector<Argument>());
        break;
    case vec_y_trans:
        block_transpose.compute_at(output, x).vectorize(y).unroll(x);
        algorithm = "Transpose vectorized in y";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector<Argument>());
        break;
    case vec_x_trans:
        block_transpose.compute_at(output, x).vectorize(x).unroll(y);
        algorithm = "Transpose vectorized in x";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector<Argument>());
        break;
    }

    Buffer<uint16_t> result(size, size);
    output.compile_jit();

    // Run once before timing (presumably so one-time costs are not measured).
    output.realize(result);

    double t = benchmark([&]() {
        output.realize(result);
    });

    // The figure is labelled byte/s, so count bytes rather than elements:
    // every output element is a 2-byte uint16_t.
    const double output_bytes = static_cast<double>(size) * size * sizeof(uint16_t);
    std::cout << "Dummy Func version: " << algorithm << " bandwidth " << output_bytes / t << " byte/s.\n";
    return result;
}
/* This illustrates how to achieve the same scheduling behavior using the 'in()'
 * directive as opposed to creating dummy Funcs as done in 'test_transpose()'.
 *
 * 'mode' selects how the transposing wrapper is scheduled: scalar_trans,
 * vec_y_trans or vec_x_trans. Any other value creates no second wrapper and
 * skips the assembly dump.
 *
 * The pipeline is benchmarked, the measured output bandwidth is printed to
 * stdout, and the realized 1024x1024 output buffer is returned. */
Buffer<uint16_t> test_transpose_wrap(int mode) {
    // Edge length (in elements) of the square image being transposed.
    constexpr int size = 1024;

    Func input, block_transpose, block, output;
    Var x, y;

    input(x, y) = cast<uint16_t>(x + y);
    input.compute_root();

    output(x, y) = input(y, x);

    Var xi, yi;
    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);

    // Do 8 vectorized loads from the input.
    block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y);

    // NOTE(review): the assembly files below use the same names as the ones
    // written by test_transpose(), so whichever function runs last for a given
    // mode overwrites the other's dump.
    std::string algorithm;
    switch (mode) {
    case scalar_trans:
        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).unroll(x).unroll(y);
        algorithm = "Scalar transpose";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector<Argument>());
        break;
    case vec_y_trans:
        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(y).unroll(x);
        algorithm = "Transpose vectorized in y";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector<Argument>());
        break;
    case vec_x_trans:
        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);
        algorithm = "Transpose vectorized in x";
        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector<Argument>());
        break;
    }

    Buffer<uint16_t> result(size, size);
    output.compile_jit();

    // Run once before timing (presumably so one-time costs are not measured).
    output.realize(result);

    double t = benchmark([&]() {
        output.realize(result);
    });

    // The figure is labelled byte/s, so count bytes rather than elements:
    // every output element is a 2-byte uint16_t.
    const double output_bytes = static_cast<double>(size) * size * sizeof(uint16_t);
    std::cout << "Wrapper version: " << algorithm << " bandwidth " << output_bytes / t << " byte/s.\n";
    return result;
}
int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
if (target.arch == Target::WebAssembly) {
printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
return 0;
}
test_transpose(scalar_trans);
test_transpose_wrap(scalar_trans);
test_transpose(vec_y_trans);
test_transpose_wrap(vec_y_trans);
Buffer<uint16_t> im1 = test_transpose(vec_x_trans);
Buffer<uint16_t> im2 = test_transpose_wrap(vec_x_trans);
// Check correctness of the wrapper version
for (int y = 0; y < im2.height(); y++) {
for (int x = 0; x < im2.width(); x++) {
if (im2(x, y) != im1(x, y)) {
printf("wrapper(%d, %d) = %d instead of %d\n",
x, y, im2(x, y), im1(x, y));
return 1;
}
}
}
printf("Success!\n");
return 0;
}
|