File: perf_observer.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (330 lines) | stat: -rw-r--r-- 9,526 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#include "observers/perf_observer.h"
#include "observers/observer_config.h"
#ifndef C10_MOBILE
#include "caffe2/core/flags.h"
#include "observers/net_observer_reporter_print.h"
#endif

#include <random>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <time.h>
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"

#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#define _APPLE 1
#endif

#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#endif

#ifdef _APPLE
#include <mach/mach_time.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif

#ifndef C10_MOBILE
C10_DEFINE_int64(
    aiBench_netInitSampleRate,
    0,
    "One in N sampling rate for net delay");

C10_DEFINE_int64(
    aiBench_netFollowupSampleRate,
    0,
    "One in N sampling rate for net delay");

C10_DEFINE_int64(
    aiBench_netFollowupSampleCount,
    0,
    "control the following c logs");

C10_DEFINE_int64(
    aiBench_operatorNetSampleRatio,
    0,
    "One in N sampling rate for operator delay");

C10_DEFINE_int64(
    aiBench_skipIters,
    0,
    "skip the first N iterations of the net run");
#endif

namespace caffe2 {
namespace {

bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
  AddGlobalNetObserverCreator([](NetBase* subject) {
    return std::make_unique<PerfNetObserver>(subject);
  });

#if !defined(C10_MOBILE)
  // for aibench usage
  caffe2::ObserverConfig::setReporter(
      std::make_unique<caffe2::NetObserverReporterPrint>());

  caffe2::ObserverConfig::initSampleRate(
      FLAGS_aiBench_netInitSampleRate,
      FLAGS_aiBench_netFollowupSampleRate,
      FLAGS_aiBench_netFollowupSampleCount,
      FLAGS_aiBench_operatorNetSampleRatio,
      FLAGS_aiBench_skipIters);
#endif

  return true;
}
} // namespace

#ifdef _WIN32
double getTicksPerMillisecond() {
  static LARGE_INTEGER ticks_per_sec;
  if (!ticks_per_sec.QuadPart) {
    QueryPerformanceFrequency(&ticks_per_sec);
    if (!ticks_per_sec.QuadPart) {
      return 0.0;
    }
  }

  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#elif !defined _APPLE
double getClockTimeMilliseconds(clockid_t clk_id) {
  int result;
  struct timespec tp;
  result = clock_gettime(clk_id, &tp);
  if (result == -1) {
    return 0.0;
  } else {
    return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
  }
}
#endif

double getWallClockTimeMilliseconds() {
#ifdef _WIN32
  double ticks_per_ms = getTicksPerMillisecond();
  if (ticks_per_ms) {
    LARGE_INTEGER ticks;
    if (QueryPerformanceCounter(&ticks)) {
      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
    }
  }

  return 0.0;
#elif defined _APPLE
  static mach_timebase_info_data_t info;
  if (info.denom == 0) {
    mach_timebase_info(&info);
  }

  uint64_t now = mach_absolute_time();
  now = now * info.numer / info.denom; // convert to nanoseconds
  return now / 1000000.0;
#else
  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}

double getCpuTimeMilliseconds() {
#ifdef _WIN32
  FILETIME creation_time;
  FILETIME exit_time;
  FILETIME kernel_time;
  FILETIME user_time;
  if (GetProcessTimes(
      GetCurrentProcess(),
      &creation_time,
      &exit_time,
      &kernel_time,
      &user_time)) {
    ULARGE_INTEGER kernel;
    ULARGE_INTEGER user;
    kernel.HighPart = kernel_time.dwHighDateTime;
    kernel.LowPart = kernel_time.dwLowDateTime;
    user.HighPart = user_time.dwHighDateTime;
    user.LowPart = user_time.dwLowDateTime;
    return (static_cast<double>(kernel.QuadPart) +
        static_cast<double>(user.QuadPart)) / 10000.0;
  }

  return 0.0;
#elif defined _APPLE
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru)) {
    return 0.0;
  }

  return ru.ru_utime.tv_sec * 1000.0
      + ru.ru_utime.tv_usec / 1000.0
      + ru.ru_stime.tv_sec * 1000.0
      + ru.ru_stime.tv_usec / 1000.0;
#else
  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}

REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
    registerGlobalPerfNetObserverCreator,
    &registerGlobalPerfNetObserverCreator,
    "Caffe2 net global observer creator");

// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
PerfNetObserver::PerfNetObserver(NetBase* subject_)
    : NetObserver(subject_), numRuns_(0) {}

// NOLINTNEXTLINE(modernize-use-equals-default)
PerfNetObserver::~PerfNetObserver() {}

void PerfNetObserver::Start() {
  static int visitCount = 0;
  // Select whether to log the operator or the net.
  // We have one sample rate for the entire app.
  int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
  int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
  int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
  int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
  int skipIters = ObserverConfig::getSkipIters();
  int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
  // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
  if (skipIters <= static_cast<int>(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) {
    visitCount++;
    if (visitCount == netFollowupSampleCount) {
      visitCount = 0;
    }
    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
    if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
      logType_ = PerfNetObserver::OPERATOR_DELAY;
    } else {
      logType_ = PerfNetObserver::NET_DELAY;
    }
  } else {
    logType_ = PerfNetObserver::NONE;
  }
  numRuns_++;

  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    /* Always recreate new operator  observers
       whenever we measure operator delay */
    const auto& operators = subject_->GetOperators();
    for (auto* op : operators) {
      observerMap_[op] = op->AttachObserver(
          std::make_unique<PerfOperatorObserver>(op, this));
    }
  }

  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}

void PerfNetObserver::Stop() {
  if (logType_ == PerfNetObserver::NONE) {
    return;
  }
  std::map<std::string, PerformanceInformation> info;
  PerformanceInformation net_perf;
  net_perf.cpuMilliseconds =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  net_perf.latency =
      getWallClockTimeMilliseconds() - wallMilliseconds_;

  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    const auto& operators = subject_->GetOperators();
    for (unsigned idx = 0; idx < operators.size(); ++idx) {
      const auto* op = operators[idx];
      auto name = getObserverName(op, static_cast<int>(idx));
      PerformanceInformation p;
      const PerfOperatorObserver* opObserver =
          static_cast<const PerfOperatorObserver*>(observerMap_[op]);
      p.latency = opObserver->getWallMilliseconds();
      p.cpuMilliseconds = opObserver->getCpuMilliseconds();
      p.engine = op->engine();
      p.type = op->type();
      p.tensor_shapes =
          static_cast<const PerfOperatorObserver*>(observerMap_[op])
              ->getTensorShapes();

      if (op->has_debug_def()) {
        // NOLINTNEXTLINE(performance-for-range-copy)
        for (auto arg : op->debug_def().arg()) {
          p.args.emplace_back(arg);
        }
      }

      info.insert({name, p});
    }

    /* clear all operator delay after use so that we don't spent time
       collecting the operator delay info in later runs */
    for (auto* op : operators) {
      op->DetachObserver(observerMap_[op]);
    }
    observerMap_.clear();
  }
  info.insert({"NET_DELAY", net_perf});
  ObserverConfig::getReporter()->report(subject_, info);
}

caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
    const {
  string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE";
  string displayName =
      (op->has_debug_def() ? op->debug_def().name().size()
               ? op->debug_def().name()
               : (op->debug_def().output_size() ? op->debug_def().output(0)
                                                : "NO_OUTPUT")
                           : "NO_DEF");
  caffe2::string name =
      "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
  return name;
}

PerfOperatorObserver::PerfOperatorObserver(
    OperatorBase* op,
    PerfNetObserver* netObserver)
    : ObserverBase<OperatorBase>(op),
      netObserver_(netObserver),
      wallMilliseconds_(0),
      cpuMilliseconds_(0) {
  CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}

// NOLINTNEXTLINE(modernize-use-equals-default)
PerfOperatorObserver::~PerfOperatorObserver() {}

void PerfOperatorObserver::Start() {
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}

void PerfOperatorObserver::Stop() {
  /* Time from the start of the net minus the time spent on all other
     operators is the time spent on this operator */
  cpuMilliseconds_ =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  wallMilliseconds_ =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  tensor_shapes_ = subject_->InputTensorShapes();
}

double PerfOperatorObserver::getWallMilliseconds() const {
  return wallMilliseconds_;
}

double PerfOperatorObserver::getCpuMilliseconds() const {
  return cpuMilliseconds_;
}

std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
  return tensor_shapes_;
}

} // namespace caffe2