File: kineto_stress_test.cpp

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (179 lines) | stat: -rw-r--r-- 5,136 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <set>
#include <string>
#include <thread>
#include <vector>

#include <libkineto.h>
#include <cuda.h>
#include <cupti_activity.h>
#include <sys/types.h>
#include <unistd.h>

#include "kineto/libkineto/sample_programs/random_ops_stress_test.cuh"

// Makes the stress-test helpers used below (e.g. run_stress_test) available
// without qualification.
// NOTE(review): presumably these are declared in random_ops_stress_test.cuh
// inside namespace kineto_stress_test -- confirm.
using namespace kineto_stress_test;

// Path that trace_collection_thread() passes to trace->save(); the collected
// trace is overwritten on every run because the name is fixed.
static const std::string kFileName = "/tmp/kineto_stress_test_trace.json";

/**
 * Thread entry point that records one Kineto trace and saves it to kFileName.
 *
 * The function prepares a trace for a fixed set of GPU/runtime activity
 * types, starts it, sleeps for the trace window (2 seconds) while other
 * threads generate load, stops the trace and writes it to disk.
 *
 * If no profiler is registered, or stopping the trace yields no trace
 * object, an error is printed to stderr and the function returns without
 * writing a file instead of dereferencing a null object.
 */
void trace_collection_thread() {
  // Length of the tracing window, in microseconds (passed to usleep below).
  const useconds_t t_trace_length_us = 2000000;

  // Configure CUPTI buffer sizes
  // size_t attrValue = 0, attrValueSize = sizeof(size_t);
  // attrValue = 3 * 1024 * 1024;
  // cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE,
  //    &attrValueSize, &attrValue);

  // Config Kineto: the activity types requested for this trace.
  const std::set<libkineto::ActivityType> types = {
    libkineto::ActivityType::CONCURRENT_KERNEL,
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::CUDA_RUNTIME,
    libkineto::ActivityType::EXTERNAL_CORRELATION,
    libkineto::ActivityType::OVERHEAD
  };

  // NOTE(review): activityProfiler() is understood to dereference the
  // registered profiler, which would be invalid if none was registered --
  // confirm against libkineto.h. Guard before touching it.
  if (!libkineto::api().isProfilerRegistered()) {
    std::cerr << "Kineto profiler is not registered; skipping trace collection."
              << std::endl;
    return;
  }
  libkineto::api().initProfilerIfRegistered();
  auto& profiler = libkineto::api().activityProfiler();
  profiler.prepareTrace(types);

  // Collect the trace
  profiler.startTrace();
  usleep(t_trace_length_us);
  auto trace = profiler.stopTrace();

  // A failed collection must not crash the whole stress test from this thread.
  if (!trace) {
    std::cerr << "Kineto returned no trace; nothing written to " << kFileName
              << std::endl;
    return;
  }

  // Save the trace
  trace->save(kFileName);
}

int main() {
  tensor_cache_args cache_args;

  // Sets GPU memory utilization
  cache_args.sz_cache_KB = 16000000;

  // If small, density is higher due to shorter kernel times
  cache_args.sz_min_tensor_KB = 64;

  // If large, we will have kernels with high duration thus smaller
  // event density. That's because kernels will have to run on larger
  // buffer sizes.
  cache_args.sz_max_tensor_KB = 256;

  // Simulates the chance of uploading a batch to the GPU.
  // It reduces event density if it's set too high
  cache_args.prob_h2d = 0.0005;

  // Simulates the chance of downloading results from the GPU.
  // It reduces event density if it's set too high
  cache_args.prob_d2h = 0.0001;

  stress_test_args test_args;

  // Number of operations per stress test
  test_args.num_operations = 10000000;

  // Can improve event density
  test_args.num_cuda_streams = 10;

  // Simulates cuda mallocs happening in the PT Cuda Cache Allocator
  test_args.prob_cuda_malloc = 0.001;

  // The min number of compute iterations in the cuda kernel. If high
  // this reduces event density.
  test_args.min_iters_kernel = 1;

  // The max number of compute iterations in the cuda kernel. If high
  // this reduces event density.
  test_args.max_iters_kernel = 5;

  // The min idle time between kernel launches in microseconds
  test_args.min_idle_us = 0;

  // The max idle time between kernel launches in microseconds
  test_args.max_idle_us = 1;

  // If true, we randomly sleep a number of microseconds between kernel
  // launches.
  test_args.simulate_host_time = false;

  // Number of threads that run the stress test
  uint32_t num_workers = 1;

  // Create more workers
  std::vector<std::thread> v_workers;
  v_workers.reserve(num_workers);

  // Generate for non-kineto tests
  generate_tensor_cache(cache_args);

  // Warmup run
  uint32_t num_test_operations = test_args.num_operations;
  test_args.num_operations = 200;
  run_stress_test(0, 1, false, test_args);
  test_args.num_operations = num_test_operations;

  // Run without kineto tracing
  clock_t t_start = clock();
  for (int i = 0; i < num_workers; ++i) {
    v_workers.push_back(std::thread(run_stress_test, i, num_workers, false, test_args));
  }
  for (auto& t : v_workers) {
    t.join();
  }
  clock_t t_stop = clock();
  v_workers.clear();
  double t_no_trace = (double)(t_stop - t_start) / 1e+3;

  // Re-init the random values
  re_initialize_buffer_values();

  // Tracing thread
  std::thread kineto_thread;

  // Run with kineto tracing
  t_start = clock();
  for (int i = 0; i < num_workers; ++i) {
    if (i == 0) {
      // Will run kineto on a different thread
      kineto_thread = std::thread(trace_collection_thread);
    }

    // Run the iterations
    v_workers.push_back(std::thread(run_stress_test, i, num_workers, true, test_args));
  }
  for (auto& t : v_workers) {
    t.join();
  }
  kineto_thread.join();
  v_workers.clear();

  t_stop = clock();
  double t_with_trace = (double)(t_stop - t_start) / 1e+3;

  // Run again after kineto tracing
  t_start = clock();
  for (int i = 0; i < num_workers; ++i) {
    v_workers.push_back(std::thread(run_stress_test, i, num_workers, false, test_args));
  }
  for (auto& t : v_workers) {
    t.join();
  }
  t_stop = clock();
  v_workers.clear();
  double t_after_trace = (double)(t_stop - t_start) / 1e+3;

  std::cout << "Execution time before tracing (ms): " << t_no_trace << std::endl;
  std::cout << "Execution time with tracing enabled (ms): " << t_with_trace << std::endl;
  std::cout << "Execution time after tracing (ms): " << t_after_trace << std::endl;

  // Free the tensor cache on the GPU
  free_tensor_cache();

  return 0;
}