File: 04_fastdiv.cpp

package info (click to toggle)
ispc 1.26.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 95,356 kB
  • sloc: cpp: 55,778; python: 6,681; yacc: 3,074; lex: 1,095; ansic: 714; sh: 283; makefile: 16
file content (93 lines) | stat: -rw-r--r-- 4,162 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// Copyright (c) 2021-2024, Intel Corporation
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h>
#include <bitset>
#include <cstdint>
#include <stdio.h>

#include "../common.h"
#include "04_fastdiv_ispc.h"

// Human-readable description of this benchmark file, handed to the Docs helper.
// Docs is not defined in this file (presumably declared in ../common.h — confirm there).
// The text lists the covered type/divisor combinations and the conditions under which
// the ISPC compiler is expected to apply the fast_idiv optimization.
static Docs docs("Check fast_idiv implementation of stdlib functions:\n"
                 "[int8, uint8, int16, uint16, int32, uint32, int64, uint64] x [13, 16] versions.\n"
                 "Conditions to trigger fast_idiv:\n"
                 " - The value being divided must be an int8/16/32.\n"
                 " - The divisor must be the same compile-time constant value for all of the vector lanes.\n"
                 "Expectation:\n"
                 " - No regressions\n");

// Expands the WARM_UP_RUN macro, which is not defined in this file (presumably it comes from
// ../common.h and sets up a warm-up run ahead of the measured benchmarks — TODO confirm).
WARM_UP_RUN();

// Argument list appended to every benchmark registered through BENCHMARK(...)->ARGS in this file.
// The single argument is read back via state.range(0) and used as the element count of the buffers.
// The minimum size is the maximum target width, i.e. 64.
// A larger buffer is better, but it should preferably stay within L1.
#define ARGS Arg(8192)
// Disabled alternative: sweep sizes from 64 up to 64<<15 with a multiplier of 2 and request O(N) complexity.
// #define ARGS RangeMultiplier(2)->Range(64, 64<<15)->Complexity(benchmark::oN)

// Fills the first `count` elements of `src` with a ramp centred around zero:
// element k receives (T)k minus half of (T)count.
//   src   - destination buffer, must hold at least `count` elements.
//   count - number of elements to write; nothing is written when count <= 0.
template <typename T> static void init_src(T *src, int count) {
    // Keep the promoted type of the expression (hence `auto`) so the subtraction below is
    // evaluated exactly as if the midpoint were recomputed on every iteration.
    const auto midpoint = static_cast<T>(count) / 2;
    for (int idx = 0; idx < count; ++idx) {
        // Wrap-around for unsigned types and truncation for narrow types are intentional here.
        src[idx] = static_cast<T>(idx) - midpoint;
    }
}

// Zeroes the first `count` elements of `dst`.
//   dst   - buffer to clear, must hold at least `count` elements.
//   count - number of elements to clear; nothing is written when count <= 0.
template <typename T> static void init_dst(T *dst, int count) {
    T *cursor = dst;
    for (int remaining = count; remaining > 0; --remaining) {
        *cursor = 0;
        ++cursor;
    }
}

// Verifies that dst[k] equals src[k] / divisor for the first `count` elements.
// The reference quotient is computed with the built-in division and stored as T.
// On the first mismatch the index is printed to stdout and the scan stops;
// no status is returned to the caller.
//   src     - input values that were divided.
//   dst     - results to validate.
//   divisor - divisor used for the reference computation.
//   count   - number of elements to compare.
template <typename T> static void check(T *src, T *dst, int divisor, int count) {
    int pos = 0;
    while (pos < count) {
        const T expected = src[pos] / divisor;
        if (expected != dst[pos]) {
            printf("Error i=%d\n", pos);
            break;
        }
        ++pos;
    }
}

// Defines and registers one benchmark for a single (element type, divisor) pair.
//   T_C     - C++ element type of the buffers (e.g. int32_t).
//   T_ISPC  - type token pasted into the function names (e.g. int32).
//   DIV_VAL - divisor; pasted into the function names and passed to check() as the reference divisor.
// The generated function fastdiv_<T_ISPC>_<DIV_VAL>:
//   1. reads the element count from state.range(0),
//   2. allocates the destination and source buffers and fills them with init_dst()/init_src(),
//   3. calls ispc::fastdiv_<T_ISPC>_<DIV_VAL>(src, dst, count) inside the benchmark loop,
//   4. validates the output with check() after the loop, frees both buffers and reports
//      state.range(0) as the complexity N.
// The function is then registered with BENCHMARK(...) using the ARGS argument list.
#define FASTDIV(T_C, T_ISPC, DIV_VAL)                                                                                  \
    static void fastdiv_##T_ISPC##_##DIV_VAL(benchmark::State &state) {                                                \
        const int elem_count = static_cast<int>(state.range(0));                                                       \
        const auto buf_bytes = sizeof(T_C) * elem_count;                                                               \
        T_C *const out_buf = static_cast<T_C *>(aligned_alloc_helper(buf_bytes));                                      \
        T_C *const in_buf = static_cast<T_C *>(aligned_alloc_helper(buf_bytes));                                       \
        init_src(in_buf, elem_count);                                                                                  \
        init_dst(out_buf, elem_count);                                                                                 \
                                                                                                                       \
        for (auto _ : state) {                                                                                         \
            ispc::fastdiv_##T_ISPC##_##DIV_VAL(in_buf, out_buf, elem_count);                                           \
        }                                                                                                              \
                                                                                                                       \
        check(in_buf, out_buf, DIV_VAL, elem_count);                                                                   \
        aligned_free_helper(in_buf);                                                                                   \
        aligned_free_helper(out_buf);                                                                                  \
        state.SetComplexityN(state.range(0));                                                                          \
    }                                                                                                                  \
    BENCHMARK(fastdiv_##T_ISPC##_##DIV_VAL)->ARGS;

// Benchmark instances: every element type (unsigned and signed, 64/32/16/8 bits) is
// benchmarked with two divisors.

// Divisor 13 (not a power of two).
FASTDIV(uint64_t, uint64, 13)
FASTDIV(int64_t, int64, 13)

FASTDIV(uint32_t, uint32, 13)
FASTDIV(int32_t, int32, 13)

FASTDIV(uint16_t, uint16, 13)
FASTDIV(int16_t, int16, 13)

FASTDIV(uint8_t, uint8, 13)
FASTDIV(int8_t, int8, 13)

// Divisor 16 (a power of two).
FASTDIV(uint64_t, uint64, 16)
FASTDIV(int64_t, int64, 16)

FASTDIV(uint32_t, uint32, 16)
FASTDIV(int32_t, int32, 16)

FASTDIV(uint16_t, uint16, 16)
FASTDIV(int16_t, int16, 16)

FASTDIV(uint8_t, uint8, 16)
FASTDIV(int8_t, int8, 16)

// Google Benchmark macro that provides main(); it runs the benchmarks registered above.
BENCHMARK_MAIN();