File: 08_masked_load_store.cpp

package info (click to toggle)
ispc 1.26.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 95,356 kB
  • sloc: cpp: 55,778; python: 6,681; yacc: 3,074; lex: 1,095; ansic: 714; sh: 283; makefile: 16
file content (123 lines) | stat: -rw-r--r-- 7,369 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
// Copyright (c) 2023-2024, Intel Corporation
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h>
#include <bitset>
#include <cstdint>
#include <stdio.h>

#include "../common.h"
#include "08_masked_load_store_ispc.h"

// Human-readable description of this benchmark, stored in a file-local Docs object.
// NOTE(review): Docs is not declared in this file (presumably it comes from
// ../common.h); how the text is consumed is not visible here — confirm there.
static Docs
    docs("Check masked load/store implementation.\n"
         "ISPC has several approaches for implementations of masked_load/masked_store functionality:\n"
         "1. generic implementation using per lane traversing\n"
         "2. generic implementation using llvm.masked.load/store intrinsics\n"
         "3. target-specific implementation using target-specific instructions (starting avx)\n"
         "This benchmark allows to effectively compare and tune different implementations for different types.\n"
         "Expectation:\n"
         " - No regressions\n");

// NOTE(review): WARM_UP_RUN is defined outside this file (presumably in
// ../common.h). Judging by its name it sets up a warm-up pass ahead of the
// measured benchmarks — confirm against its definition.
WARM_UP_RUN();

// Argument chain appended to every BENCHMARK(...) registration in this file;
// the value is read back in the benchmark bodies through state.range(0).
// The minimum size is the maximum target width, i.e. 64.
// A larger buffer is better, but it should preferably still fit within L1.
#define ARGS Arg(8192)
// Alternative kept for reference: sweep a range of sizes and request a complexity report.
// #define ARGS RangeMultiplier(2)->Range(64, 64<<15)->Complexity(benchmark::oN)

// Populates the two input buffers and zeroes the output buffer for `count` elements.
//
// Both quotients below are integer divisions evaluated before the cast to T.
// Because every index is smaller than `count`, src_a ends up all zeros and
// src_b is zero everywhere except its last element, which becomes one.
// Nothing is written when `count` is not positive.
template <typename T> static void init(T *src_a, T *src_b, T *dst, int count) {
    int idx = 0;
    while (idx < count) {
        const int next = idx + 1;
        src_a[idx] = static_cast<T>(idx / count);
        src_b[idx] = static_cast<T>(next / count);
        dst[idx] = 0;
        idx = next;
    }
}

// Switches on every one of the first `count` entries of `mask`.
// Nothing is written when `count` is not positive.
static void init_mask_all_on(bool *mask, int count) {
    int lane = 0;
    while (lane < count) {
        mask[lane++] = true;
    }
}

// Writes an alternating pattern into the first `count` entries of `mask`:
// even indices are switched off, odd indices are switched on.
static void init_mask_half_on(bool *mask, int count) {
    bool active = false; // index 0 is even, so the pattern starts switched off
    for (int lane = 0; lane < count; ++lane) {
        mask[lane] = active;
        active = !active;
    }
}

// Verifies the first `count` entries of `dst` against the inputs and the mask.
//
// Where the mask is on, dst must equal src_b - src_a; where it is off, dst
// must still be zero. The first mismatch is reported with printf and the scan
// stops there; nothing is printed when every entry matches.
template <typename T> static void check(T *src_a, T *src_b, T *dst, bool *mask, int count) {
    for (int lane = 0; lane < count; ++lane) {
        if (!mask[lane]) {
            // Masked-off entry: the destination must be left at zero.
            if (dst[lane] == 0) {
                continue;
            }
            printf("Error kuku i=%d\n", lane);
            return;
        }
        // Masked-on entry: the destination must hold the difference of the inputs.
        if (dst[lane] != src_b[lane] - src_a[lane]) {
            printf("Error i=%d\n", lane);
            return;
        }
    }
}

// Defines the benchmark function masked_load_store_all_on_<T_C> and registers
// it through BENCHMARK(...)->ARGS.
//   T_C    - C++ element type of the three data buffers.
//   T_ISPC - suffix selecting the kernel ispc::masked_load_store_<T_ISPC>.
// The generated function sizes its buffers from state.range(0), fills them with
// init(), switches every mask entry on, runs the kernel once per benchmark
// iteration, and afterwards verifies the destination with check() before
// releasing the buffers.
// Only block comments may be used inside the macro body: a line comment would
// swallow the line continuation that follows it.
#define LOAD_STORE_ALL_ON(T_C, T_ISPC) \
    static void masked_load_store_all_on_##T_C(benchmark::State &state) { \
        const auto requested = state.range(0); \
        int n_elems = static_cast<int>(requested); \
        T_C *in_a = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        T_C *in_b = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        T_C *out = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        bool *lanes = static_cast<bool *>(aligned_alloc_helper(sizeof(bool) * n_elems)); \
        init(in_a, in_b, out, n_elems); \
        init_mask_all_on(lanes, n_elems); \
        for (auto _ : state) { \
            ispc::masked_load_store_##T_ISPC(in_a, in_b, out, n_elems, lanes); \
        } \
        state.SetComplexityN(requested); \
        check(in_a, in_b, out, lanes, n_elems); \
        aligned_free_helper(lanes); \
        aligned_free_helper(out); \
        aligned_free_helper(in_b); \
        aligned_free_helper(in_a); \
    } \
    BENCHMARK(masked_load_store_all_on_##T_C)->ARGS;

// Defines the benchmark function masked_load_store_half_on_<T_C> and registers
// it through BENCHMARK(...)->ARGS.
//   T_C    - C++ element type of the three data buffers.
//   T_ISPC - suffix selecting the kernel ispc::masked_load_store_<T_ISPC>.
// The generated function sizes its buffers from state.range(0), fills them with
// init(), sets the mask with init_mask_half_on(), runs the kernel once per
// benchmark iteration, and afterwards verifies the destination with check()
// before releasing the buffers.
// Only block comments may be used inside the macro body: a line comment would
// swallow the line continuation that follows it.
#define LOAD_STORE_HALF_ON(T_C, T_ISPC) \
    static void masked_load_store_half_on_##T_C(benchmark::State &state) { \
        const auto requested = state.range(0); \
        int n_elems = static_cast<int>(requested); \
        T_C *in_a = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        T_C *in_b = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        T_C *out = static_cast<T_C *>(aligned_alloc_helper(sizeof(T_C) * n_elems)); \
        bool *lanes = static_cast<bool *>(aligned_alloc_helper(sizeof(bool) * n_elems)); \
        init(in_a, in_b, out, n_elems); \
        init_mask_half_on(lanes, n_elems); \
        for (auto _ : state) { \
            ispc::masked_load_store_##T_ISPC(in_a, in_b, out, n_elems, lanes); \
        } \
        state.SetComplexityN(requested); \
        check(in_a, in_b, out, lanes, n_elems); \
        aligned_free_helper(lanes); \
        aligned_free_helper(out); \
        aligned_free_helper(in_b); \
        aligned_free_helper(in_a); \
    } \
    BENCHMARK(masked_load_store_half_on_##T_C)->ARGS;

// Benchmarks with every mask entry switched on, one per element type.
// Each line pairs the C++ element type with the suffix of the ISPC kernel name.
LOAD_STORE_ALL_ON(int8_t, int8)
LOAD_STORE_ALL_ON(int16_t, int16)
LOAD_STORE_ALL_ON(int32_t, int32)
LOAD_STORE_ALL_ON(float, float)
LOAD_STORE_ALL_ON(double, double)
LOAD_STORE_ALL_ON(int64_t, int64)

// Benchmarks with an alternating mask (odd indices on, even indices off), one
// per element type. Each line pairs the C++ element type with the suffix of
// the ISPC kernel name.
LOAD_STORE_HALF_ON(int8_t, int8)
LOAD_STORE_HALF_ON(int16_t, int16)
LOAD_STORE_HALF_ON(int32_t, int32)
LOAD_STORE_HALF_ON(float, float)
LOAD_STORE_HALF_ON(double, double)
LOAD_STORE_HALF_ON(int64_t, int64)

// Google Benchmark macro that supplies the program's main() and runs the
// benchmarks registered above.
BENCHMARK_MAIN();