File: kineto_cupti_profiler.cpp

package info (click to toggle)

pytorch 1.13.1%2Bdfsg-4

links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44

file content (52 lines) | stat: -rw-r--r-- 1,541 bytes

// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <set>
#include <string>
#include <thread>

#include <common/logging/logging.h>
#include <libkineto.h>

#include "kineto/libkineto/sample_programs/kineto_playground.cuh"

using namespace kineto;

// Path of the JSON trace file that main() saves the collected trace to.
namespace {
const std::string kFileName{"/tmp/kineto_playground_trace.json"};
} // namespace

// Sample program: configures the Kineto activity profiler for CUPTI profiler
// ranges, traces the playground helpers (memcpy to device, compute, memcpy
// from device) and saves the resulting trace to kFileName.
//
// Returns EXIT_SUCCESS once the trace has been saved, or EXIT_FAILURE if the
// profiler handed back no trace / no activity list.
int main() {
  warmup();

  // Kineto config: request only the CUPTI profiler range activity type.
  const std::set<libkineto::ActivityType> types_cupti_prof = {
    libkineto::ActivityType::CUDA_PROFILER_RANGE,
  };

  // Use the special kineto__cuda_core_flops metric, which counts individual
  // CUDA core floating point instructions by operation type (fma,fadd,fmul,dadd ...).
  // You can also use kineto__tensor_core_insts, or any metric defined by the
  // CUPTI Profiler:
  //   https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler
  const std::string profiler_config = "ACTIVITIES_WARMUP_PERIOD_SECS=0\n "
    "CUPTI_PROFILER_METRICS=kineto__cuda_core_flops\n "
    "CUPTI_PROFILER_ENABLE_PER_KERNEL=true";

  auto& profiler = libkineto::api().activityProfiler();
  profiler.prepareTrace(types_cupti_prof, profiler_config);

  // Good to warm up after prepareTrace to get cupti initialization to settle
  warmup();

  profiler.startTrace();
  basicMemcpyToDevice();
  compute();
  basicMemcpyFromDevice();

  const auto trace = profiler.stopTrace();
  // Never dereference a missing trace: report the failure and exit non-zero
  // instead of crashing on a null pointer.
  if (!trace) {
    fprintf(stderr, "Profiler did not return a trace; nothing to save.\n");
    return EXIT_FAILURE;
  }

  const auto activities = trace->activities();
  if (!activities) {
    fprintf(stderr, "Trace has no activity list; nothing to save.\n");
    return EXIT_FAILURE;
  }

  LOG(INFO) << "Stopped and processed trace. Got " << activities->size() << " activities.";
  trace->save(kFileName);
  return EXIT_SUCCESS;
}