1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
|
#include "Halide.h"
using namespace Halide;
// Implements a simple gather pipeline that exercises the VTCM available on
// Hexagon DSPs with HVX v65 or later.
template<typename ITYPE>
bool test() {
const Target target = get_jit_target_from_environment();
const int W_img = 128;
const int H_img = 8;
const int W_lut = 256;
const int H_lut = (target.has_feature(Target::HVX_v65)) ? 32 : 1;
srand(time(0));
// Separate channel for xCoord and yCoord for LUT index.
Buffer<ITYPE> input(W_img, 2);
for (int x = 0; x < W_img; x++) {
input(x, 0) = (ITYPE)rand() % W_lut;
input(x, 1) = (ITYPE)rand() % H_lut;
}
// Two Dimensional LUT.
Buffer<ITYPE> lut(W_lut, H_lut);
for (int y = 0; y < H_lut; y++) {
for (int x = 0; x < W_lut; x++) {
lut(x, y) = (ITYPE)rand();
}
}
Var x, y;
Func lut_vtcm, output_vtcm, output;
// Implement: output(x, y) = lut(input(x, 0), input(x, 1))
// output and lut must have store_in(MemoryType::VTCM) to generate vgathers.
Expr xCoord = clamp(cast<int32_t>(input(x, 0)), 0, W_lut - 1);
Expr yCoord = clamp(cast<int32_t>(input(x, 1)), 0, H_lut - 1);
lut_vtcm(x, y) = lut(x, y);
output_vtcm(x, y) = lut_vtcm(xCoord, yCoord);
output(x, y) = output_vtcm(x, y);
if (target.has_feature(Target::HVX)) {
const int vector_size = target.has_feature(Target::HVX) ? 128 : 64;
Var yi;
output
.hexagon()
.split(y, y, yi, H_img / 2)
.parallel(y)
.vectorize(x, vector_size);
if (target.features_any_of({Target::HVX_v65, Target::HVX_v66,
Target::HVX_v68})) {
lut_vtcm
.store_in(MemoryType::VTCM)
.compute_at(output, Var::outermost())
.vectorize(x, vector_size);
output_vtcm
.store_in(MemoryType::VTCM)
.compute_at(output, y)
.vectorize(x, vector_size);
}
}
Buffer<ITYPE> output_buf = output.realize({W_img, H_img});
for (int y = 0; y < H_img; y++) {
for (int x = 0; x < W_img; x++) {
int xCoord = std::max(std::min((int)(input(x, 0)), W_lut - 1), 0);
int yCoord = std::max(std::min((int)(input(x, 1)), H_lut - 1), 0);
ITYPE correct = lut(xCoord, yCoord);
if (output_buf(x, y) != correct) {
printf("output(%d, %d) = %d instead of %d\n", x, y, output_buf(x, y), correct);
return false;
}
}
}
return true;
}
// Entry point: runs the gather test for each supported element type in turn,
// stopping at the first failure. Exits with 1 on failure, 0 on success.
int main() {
    // With hexagon targets >=v65 with hvx, we expect to see gathers for
    // uint16_t, int16_t, uint32_t, int32_t
    // For targets <v65 with hvx, we should generate dynamic_shuffle which are
    // compiled to vlut instructions.
    // '&&' short-circuits, so later types are skipped once one test fails.
    const bool all_passed = test<uint8_t>() &&
                            test<int8_t>() &&
                            test<uint16_t>() &&
                            test<int16_t>() &&
                            test<uint32_t>() &&
                            test<int32_t>();
    if (!all_passed) {
        return 1;
    }

    printf("Success!\n");
    return 0;
}
|