// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <chrono>
#include <thread>
#include <common/logging/logging.h>
#include <libkineto.h>
#include "kineto/libkineto/sample_programs/kineto_playground.cuh"
using namespace kineto;
static const std::string kFileName = "/tmp/kineto_playground_trace.json";
// Sample driver: runs the playground helpers (memcpy to device, compute,
// memcpy from device) between startTrace() and stopTrace() with the
// CUDA_PROFILER_RANGE activity type enabled, then saves the trace to
// kFileName.
//
// Returns 0 on success, or EXIT_FAILURE if stopTrace() yields no trace.
int main() {
  warmup();

  // Kineto config: request only the CUDA_PROFILER_RANGE activity type.
  const std::set<libkineto::ActivityType> types_cupti_prof = {
      libkineto::ActivityType::CUDA_PROFILER_RANGE,
  };

  // Use the special kineto__cuda_core_flops metric, which counts individual
  // CUDA core floating point instructions by operation type
  // (fma, fadd, fmul, dadd, ...). You can also use kineto__tensor_core_insts
  // or any metric defined by the CUPTI Profiler:
  //   https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler
  const std::string profiler_config = "ACTIVITIES_WARMUP_PERIOD_SECS=0\n "
      "CUPTI_PROFILER_METRICS=kineto__cuda_core_flops\n "
      "CUPTI_PROFILER_ENABLE_PER_KERNEL=true";

  auto& profiler = libkineto::api().activityProfiler();
  profiler.prepareTrace(types_cupti_prof, profiler_config);

  // Good to warm up after prepareTrace to get cupti initialization to settle
  warmup();

  profiler.startTrace();
  basicMemcpyToDevice();
  compute();
  basicMemcpyFromDevice();
  auto trace = profiler.stopTrace();

  // stopTrace() hands back an owning pointer; do not dereference it blindly.
  if (!trace) {
    fprintf(
        stderr,
        "stopTrace() returned no trace; nothing written to %s\n",
        kFileName.c_str());
    return EXIT_FAILURE;
  }

  // activities() returns a pointer that the interface allows to be null,
  // so treat a missing list as zero activities instead of dereferencing it.
  const auto* activities = trace->activities();
  const size_t num_activities = activities != nullptr ? activities->size() : 0;
  LOG(INFO) << "Stopped and processed trace. Got " << num_activities << " activities.";

  trace->save(kFileName);
  return 0;
}
|