// File: benchmark_config_dispatch.cpp
//
// Package: rocprim 6.4.3-2 (area: main; suites: forky, sid)
// Package size: 11,428 kB
// Package sloc: cpp: 153,383; python: 1,397; sh: 404; xml: 217; makefile: 119
// File content: 128 lines, 3,820 bytes, mode -rw-r--r--

#include "benchmark_utils.hpp"
#include "cmdparser.hpp"
#include <rocprim/device/config_types.hpp>

#include <benchmark/benchmark.h>

#include <hip/hip_runtime.h>

#include <chrono>
#include <iostream>

// Default value offered for the "size" command-line option, in bytes
// (1024 * 1024 * 32 * 4 = 128 MiB).
// The guard tests the same name it protects: a build that predefines
// DEFAULT_BYTES (e.g. -DDEFAULT_BYTES=...) replaces this definition instead of
// colliding with it, and defining an unrelated macro can no longer remove it.
#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif


// Selects which HIP stream BM_host_target_arch hands to
// rocprim::detail::host_target_arch.
enum class stream_kind
{
    // The stream handle 0.
    default_stream,

    // hipStreamPerThread.
    per_thread_stream,

    // A stream created with hipStreamCreate.
    explicit_stream,

    // A stream created with hipStreamCreateWithFlags and hipStreamNonBlocking.
    async_stream,
};

// Benchmarks rocprim::detail::host_target_arch for the stream selected by
// `stream_kind`.
//
// state       - Google Benchmark state object driving the measurement loop.
// stream_kind - selects the stream passed to host_target_arch. The explicit and
//               async kinds create a stream here and destroy it before returning.
//
// main() registers this benchmark with UseManualTime(), so every iteration
// measures its own host-side wall-clock time and reports it (in seconds) through
// state.SetIterationTime(); without that call the reported manual time stays zero.
static void BM_host_target_arch(benchmark::State& state, const stream_kind stream_kind)
{
    // The parameter shares its name with the enum type. `stream_kind::xxx` still
    // names an enumerator, because lookup of a name followed by `::` only
    // considers types and namespaces.
    const bool owns_stream = stream_kind == stream_kind::explicit_stream
                             || stream_kind == stream_kind::async_stream;

    const hipStream_t stream = [stream_kind]() -> hipStream_t
    {
        hipStream_t created = 0;
        switch(stream_kind)
        {
            case stream_kind::default_stream: break;
            case stream_kind::per_thread_stream: return hipStreamPerThread;
            case stream_kind::explicit_stream: HIP_CHECK(hipStreamCreate(&created)); break;
            case stream_kind::async_stream:
                HIP_CHECK(hipStreamCreateWithFlags(&created, hipStreamNonBlocking));
                break;
        }
        // Also reached for a value outside the enumerators, so every path
        // returns a stream (stream 0 in that case) instead of flowing off the
        // end of a lambda with a non-void return type.
        return created;
    }();

    for(auto _ : state)
    {
        const auto start = std::chrono::steady_clock::now();

        rocprim::detail::target_arch target_arch;
        HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch));
        // Keep the queried value observable so the call cannot be optimized away.
        benchmark::DoNotOptimize(target_arch);

        const auto stop = std::chrono::steady_clock::now();
        state.SetIterationTime(std::chrono::duration<double>(stop - start).count());
    }

    // Only streams created above are destroyed; stream 0 and
    // hipStreamPerThread were not created by this function.
    if(owns_stream)
    {
        HIP_CHECK(hipStreamDestroy(stream));
    }
}

// Kernel with an empty body. BM_kernel_launch launches it with one block of one
// thread.
__global__ void empty_kernel()
{}

// An empty kernel launch for baseline.
//
// Each iteration launches empty_kernel (one block of one thread) on stream 0
// and checks hipGetLastError(). The stream is synchronized only once, after the
// loop, so the time reported per iteration is the host-side time of the launch
// and error check, not a per-launch synchronization.
//
// main() registers this benchmark with UseManualTime(), so every iteration
// reports its own wall-clock time (in seconds) through state.SetIterationTime();
// without that call the reported manual time stays zero.
//
// state - Google Benchmark state object driving the measurement loop.
static void BM_kernel_launch(benchmark::State& state)
{
    static constexpr hipStream_t stream = 0;

    for(auto _ : state)
    {
        const auto start = std::chrono::steady_clock::now();

        hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, stream);
        HIP_CHECK(hipGetLastError());

        const auto stop = std::chrono::steady_clock::now();
        state.SetIterationTime(std::chrono::duration<double>(stop - start).count());
    }
    HIP_CHECK(hipStreamSynchronize(stream));
}

// Registers BM_host_target_arch with Google Benchmark for one stream kind.
//   ST - token that is stringified into the "algo:" field of the benchmark name.
//   SK - stream_kind value forwarded to BM_host_target_arch.
// Expands to the benchmark::RegisterBenchmark(...) call expression.
// The final line carries no line continuation, so the macro ends at its closing
// parenthesis and cannot absorb whatever line happens to follow it.
#define CREATE_BENCHMARK(ST, SK)                \
    benchmark::RegisterBenchmark(               \
        bench_naming::format_name(              \
            "{lvl:na"                           \
            ",algo:" #ST                        \
            ",cfg:default_config}"              \
        ).c_str(),                              \
        &BM_host_target_arch,                   \
        SK                                      \
    )


int main(int argc, char** argv)
{
    cli::Parser parser(argc, argv);
    parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional<int>("trials", "trials", 100, "number of iterations");
    parser.set_optional<std::string>("name_format",
                                    "name_format",
                                    "human",
                                    "either: json,human,txt");
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const int    trials = parser.get<int>("trials");
    bench_naming::set_format(parser.get<std::string>("name_format"));

    // HIP

    std::vector<benchmark::internal::Benchmark*> benchmarks{
        CREATE_BENCHMARK(default_stream, stream_kind::default_stream),
        CREATE_BENCHMARK(per_thread_stream, stream_kind::per_thread_stream),
        CREATE_BENCHMARK(explicit_stream, stream_kind::explicit_stream),
        CREATE_BENCHMARK(async_stream, stream_kind::async_stream),
        benchmark::RegisterBenchmark(
            bench_naming::format_name("{lvl:na,algo:empty_kernel,cfg:default_config}").c_str(),
            BM_kernel_launch)};

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;

}