// Benchmarks rocprim::detail::host_target_arch on several kinds of HIP
// stream, with an empty kernel launch as a baseline.
#include "benchmark_utils.hpp"
#include "cmdparser.hpp"
#include <rocprim/device/config_types.hpp>
#include <benchmark/benchmark.h>
#include <hip/hip_runtime.h>
#include <iostream>
// Default value of the "size" command-line option, in bytes (128 MiB).
// The guard tests the same name it protects, so the default can be replaced
// at build time by defining DEFAULT_BYTES (e.g. -DDEFAULT_BYTES=...).
#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif
// Selects the HIP stream that BM_host_target_arch runs its query on.
enum class stream_kind
{
    // The null stream (handle value 0).
    default_stream,
    // The predefined hipStreamPerThread handle.
    per_thread_stream,
    // A stream created with hipStreamCreate.
    explicit_stream,
    // A stream created with hipStreamCreateWithFlags(..., hipStreamNonBlocking).
    async_stream
};
// Benchmarks rocprim::detail::host_target_arch on the stream selected by
// `stream_kind`.
//
// state:       Google Benchmark state that drives the measured loop.
// stream_kind: which stream to pass to host_target_arch. Streams created here
//              (explicit_stream, async_stream) are destroyed before returning.
//
// Note: the parameter hides the enum's name, but `stream_kind::enumerator`
// still resolves to the enum because only types and namespaces are considered
// on the left of `::`.
static void BM_host_target_arch(benchmark::State& state, const stream_kind stream_kind)
{
    // Only streams created by this function may be destroyed by it; the null
    // stream and hipStreamPerThread are not ours to release.
    const bool owns_stream = stream_kind == stream_kind::explicit_stream
                             || stream_kind == stream_kind::async_stream;

    const hipStream_t stream = [stream_kind]() -> hipStream_t
    {
        hipStream_t selected = 0;
        switch(stream_kind)
        {
            case stream_kind::default_stream: break;
            case stream_kind::per_thread_stream: selected = hipStreamPerThread; break;
            case stream_kind::explicit_stream: HIP_CHECK(hipStreamCreate(&selected)); break;
            case stream_kind::async_stream:
                HIP_CHECK(hipStreamCreateWithFlags(&selected, hipStreamNonBlocking));
                break;
        }
        // Single exit point: every path returns a value, so an out-of-range
        // stream_kind falls back to the null stream instead of flowing off the
        // end of a non-void lambda (undefined behaviour).
        return selected;
    }();

    for(auto _ : state)
    {
        rocprim::detail::target_arch target_arch;
        HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch));
        // Keep the queried value observable so the call is not optimized away.
        benchmark::DoNotOptimize(target_arch);
    }

    if(owns_stream)
    {
        HIP_CHECK(hipStreamDestroy(stream));
    }
}
// Kernel with an empty body, launched by BM_kernel_launch.
__global__ void empty_kernel()
{}
// Baseline benchmark: each iteration launches empty_kernel with one block of
// one thread on the null stream and checks the launch status.
static void BM_kernel_launch(benchmark::State& state)
{
    static constexpr hipStream_t null_stream = 0;
    const dim3                   grid_dim(1);
    const dim3                   block_dim(1);

    for(auto _ : state)
    {
        hipLaunchKernelGGL(empty_kernel, grid_dim, block_dim, 0, null_stream);
        HIP_CHECK(hipGetLastError());
    }

    // Synchronize once, after the measured loop.
    HIP_CHECK(hipStreamSynchronize(null_stream));
}
// Registers BM_host_target_arch under the name produced by
// bench_naming::format_name("{lvl:na,algo:<ST>,cfg:default_config}"), passing
// SK as the stream_kind argument. Expands to a single expression whose value
// is the result of benchmark::RegisterBenchmark.
//
// The final line deliberately has no line-continuation backslash, so the
// macro ends here and cannot absorb the line that follows it.
#define CREATE_BENCHMARK(ST, SK)       \
    benchmark::RegisterBenchmark(      \
        bench_naming::format_name(     \
            "{lvl:na"                  \
            ",algo:" #ST               \
            ",cfg:default_config}"     \
        ).c_str(),                     \
        &BM_host_target_arch,          \
        SK                             \
    )
// Entry point: parses the command line, registers the benchmarks and runs them.
//
// Command-line options handled by cli::Parser:
//   size        - number of bytes (accepted, but not read by this benchmark)
//   trials      - number of iterations per benchmark; values <= 0 leave the
//                 iteration count to Google Benchmark
//   name_format - benchmark name format: json, human or txt
//
// Returns 0 after the benchmarks have run.
int main(int argc, char** argv)
{
    cli::Parser parser(argc, argv);
    // "size" is kept so existing command lines stay valid; nothing below reads it.
    parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional<int>("trials", "trials", 100, "number of iterations");
    parser.set_optional<std::string>("name_format",
                                     "name_format",
                                     "human",
                                     "either: json,human,txt");
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const int trials = parser.get<int>("trials");
    bench_naming::set_format(parser.get<std::string>("name_format"));

    // HIP
    std::vector<benchmark::internal::Benchmark*> benchmarks{
        CREATE_BENCHMARK(default_stream, stream_kind::default_stream),
        CREATE_BENCHMARK(per_thread_stream, stream_kind::per_thread_stream),
        CREATE_BENCHMARK(explicit_stream, stream_kind::explicit_stream),
        CREATE_BENCHMARK(async_stream, stream_kind::async_stream),
        benchmark::RegisterBenchmark(
            bench_naming::format_name("{lvl:na,algo:empty_kernel,cfg:default_config}").c_str(),
            BM_kernel_launch)};

    for(auto* bench : benchmarks)
    {
        // Report wall-clock time. The benchmark functions in this file never
        // call state.SetIterationTime(), so with manual timing no time would
        // ever be recorded and the reported time would be zero.
        bench->UseRealTime();
        bench->Unit(benchmark::kMillisecond);

        // Force number of iterations
        if(trials > 0)
        {
            bench->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
|