File: 03_popcnt.cpp

package info (click to toggle)
ispc 1.26.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 95,356 kB
  • sloc: cpp: 55,778; python: 6,681; yacc: 3,074; lex: 1,095; ansic: 714; sh: 283; makefile: 16
file content (87 lines) | stat: -rw-r--r-- 4,399 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// Copyright (c) 2021-2024, Intel Corporation
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h>
#include <bitset>
#include <cstdint>
#include <stdio.h>

#include "../common.h"
#include "03_popcnt_ispc.h"

// Human-readable description of this benchmark, passed to the project's Docs helper
// (declared outside this file, presumably in ../common.h - TODO confirm how it is displayed).
// The text is assembled from adjacent string literals, so every line must carry its own
// trailing "\n"; otherwise two bullets are fused into a single output line.
static Docs docs("Check popcnt implementation of stdlib functions:\n"
                 "[int32, int64] x [uniform, varying] x [all, even] versions.\n"
                 "Observations:\n"
                 " - popcnt is very lightweight, so 8 popcnt are chained.\n"
                 " - chaining doesn't cause optimizing out popcnt, 8 seems to be good enough\n"
                 " - varying versions have overhead on insert/extract\n"
                 " - for even versions mask is statically known and compiler is able to optimize out inactive lanes\n"
                 "Expectation:\n"
                 " - No regressions\n");

// Expands the WARM_UP_RUN macro, which is not defined in this file
// (presumably provided by ../common.h - TODO confirm what it registers).
WARM_UP_RUN();

// ARGS is appended to every BENCHMARK(...) registration generated by the POPCNT macro,
// so it selects the element count that each benchmark receives through state.range(0).
// Minimum size is maximum target width, i.e. 64.
// Larger buffer is better, but preferably to stay within L1.
#define ARGS Arg(8192)
// Alternative configuration (disabled): sweep a range of sizes and request complexity reporting.
// #define ARGS RangeMultiplier(2)->Range(64, 64<<15)->Complexity(benchmark::oN)

// Prepares the benchmark buffers: element k of `src` receives the value k converted to T,
// and element k of `dst` is cleared to zero, for k in [0, count).
// Nothing is written when `count` is zero or negative.
template <typename T> static void init(T *src, T *dst, int count) {
    int idx = 0;
    while (idx < count) {
        // Source first, destination second - same write order per element as before.
        src[idx] = static_cast<T>(idx);
        dst[idx] = static_cast<T>(0);
        ++idx;
    }
}

// Verifies every element of `dst` in [0, count) after the benchmark has run.
// Because 8 popcnt() calls are chained, the expected result is 0 at index 0 and 1 at
// every other index. For a single popcnt() the reference value would instead be
//   std::bitset< std::numeric_limits<T>::digits >(src[i]).count();
// which is why `src` is still part of the signature although it is not read here.
// On the first mismatch "Error i=<index>" is printed and the scan stops.
template <typename T> static void check_all(T *src, T *dst, int count) {
    (void)src;
    int idx = 0;
    while (idx < count) {
        const int expected = (idx != 0) ? 1 : 0;
        if (dst[idx] == expected) {
            ++idx;
            continue;
        }
        printf("Error i=%d\n", idx);
        return;
    }
}

// Verifies only the even-indexed elements of `dst` in [0, count); odd indices are never read.
// Because 8 popcnt() calls are chained, the expected result is 0 at index 0 and 1 at
// every other checked index. For a single popcnt() the reference value would instead be
//   std::bitset< std::numeric_limits<T>::digits >(src[i]).count();
// which is why `src` is still part of the signature although it is not read here.
// On the first mismatch "Error i=<index>" is printed and the scan stops.
template <typename T> static void check_even(T *src, T *dst, int count) {
    (void)src;
    int idx = 0;
    while (idx < count) {
        const int expected = (idx != 0) ? 1 : 0;
        if (dst[idx] == expected) {
            idx += 2;
            continue;
        }
        printf("Error i=%d\n", idx);
        return;
    }
}

// POPCNT(T_C, T_ISPC, V, ALL) defines and registers one benchmark function named
// popcnt_stdlib_<V>_<T_ISPC>_<ALL>.
//   T_C    - C++ element type of the source and destination buffers.
//   T_ISPC - type token pasted into the benchmark name and the ispc:: kernel name.
//   V      - variant token pasted into both names.
//   ALL    - lane-selection token pasted into both names; it also picks the verifier check_<ALL>.
// The generated function allocates two buffers of state.range(0) elements, fills them with init(),
// calls ispc::popcnt_<V>_<T_ISPC>_<ALL> once per benchmark iteration, verifies the destination
// buffer after the timed loop, frees both buffers and reports the element count via SetComplexityN.
// Comments are kept out of the macro body because every line ends in a continuation.
#define POPCNT(T_C, T_ISPC, V, ALL) \
    static void popcnt_stdlib_##V##_##T_ISPC##_##ALL(benchmark::State &state) { \
        const auto elems = state.range(0); \
        int count = static_cast<int>(elems); \
        const auto bytes = sizeof(T_C) * count; \
        auto *src = static_cast<T_C *>(aligned_alloc_helper(bytes)); \
        auto *dst = static_cast<T_C *>(aligned_alloc_helper(bytes)); \
        init(src, dst, count); \
        for (auto _ : state) { \
            ispc::popcnt_##V##_##T_ISPC##_##ALL(src, dst, count); \
        } \
        check_##ALL(src, dst, count); \
        aligned_free_helper(src); \
        aligned_free_helper(dst); \
        state.SetComplexityN(elems); \
    } \
    BENCHMARK(popcnt_stdlib_##V##_##T_ISPC##_##ALL)->ARGS;

// Instantiate one benchmark per combination of element type (int / int64_t),
// variant (uniform / varying) and lane selection (all / even).
// The statement order is the benchmark registration order, so it is kept as is.
POPCNT(int, int32, uniform, all)
POPCNT(int, int32, varying, all)
POPCNT(int, int32, uniform, even)
POPCNT(int, int32, varying, even)
POPCNT(int64_t, int64, uniform, all)
POPCNT(int64_t, int64, varying, all)
POPCNT(int64_t, int64, uniform, even)
POPCNT(int64_t, int64, varying, even)

// Google Benchmark macro that supplies the program's main().
BENCHMARK_MAIN();