1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
|
#include <torch/csrc/distributed/c10d/reducer_timer.hpp>
#include <ATen/cuda/CUDAEvent.h>
#include <c10/core/DeviceGuard.h>
namespace c10d {
namespace {
// Number of nanoseconds in one millisecond; multiplying a duration expressed
// in milliseconds by this factor yields the same duration in nanoseconds.
constexpr int kMilliSecondToNanosSecond = 1000 * 1000;
class CudaTimer : public Timer {
private:
c10::Device device;
at::cuda::CUDAEvent forward_start = at::cuda::CUDAEvent(cudaEventDefault);
at::cuda::CUDAEvent backward_compute_start =
at::cuda::CUDAEvent(cudaEventDefault);
at::cuda::CUDAEvent backward_compute_end =
at::cuda::CUDAEvent(cudaEventDefault);
at::cuda::CUDAEvent backward_comm_start =
at::cuda::CUDAEvent(cudaEventDefault);
at::cuda::CUDAEvent backward_comm_end = at::cuda::CUDAEvent(cudaEventDefault);
at::cuda::CUDAEvent& getEvent(Event event) {
switch (event) {
case Event::kForwardStart:
return forward_start;
case Event::kBackwardComputeStart:
return backward_compute_start;
case Event::kBackwardComputeEnd:
return backward_compute_end;
case Event::kBackwardCommStart:
return backward_comm_start;
case Event::kBackwardCommEnd:
return backward_comm_end;
default:
TORCH_INTERNAL_ASSERT(false);
}
}
public:
explicit CudaTimer(c10::Device dev) : device(dev) {}
void record(Event event) override {
// Parent class sets the host-side time
Timer::record(event);
c10::DeviceGuard g(device);
getEvent(event).record();
}
c10::optional<int64_t> measureDifference(Event start, Event end) override {
c10::DeviceGuard g(device);
at::cuda::CUDAEvent& start_event = getEvent(start);
at::cuda::CUDAEvent& end_event = getEvent(end);
// It is possible users did not call backward or run codes in
// no-sync mode, in this case, some cudaEvents like "backward_compute_end"
// or "backward_comm_start" or "backward_comm_end" will not be recorded.
// cudaEvent is created when it is first time to be recorded.
// If it is never recorded/created, skip synchronize and calculation.
// Otherwise it will throw cuda errors.
if (!start_event.isCreated() || !end_event.isCreated()) {
return c10::nullopt;
}
// set_runtime_stats_and_log is called at the beginning of forward call,
// when it is cheap to synchronize the cuda events of previous iteration,
// as mostly all cuda operations are finished in previous iteration.
start_event.synchronize();
end_event.synchronize();
float milliseconds = start_event.elapsed_time(end_event);
// If gpu_end is not recorded in this iteration,
// milliseconds will have invalid value.
// For some cases like DDP runs on non-sync mode,
// gpu_end can not be recorded in this iteration and thus can not
// calculate the valid avg_time.
// In this case, skip calculating the avg_time and return.
if (milliseconds < 0) {
return c10::nullopt;
}
return int64_t(milliseconds * kMilliSecondToNanosSecond);
}
};
// Registers CudaTimer in TimerRegistry under the c10::kCUDA key.
// NOTE(review): presumably this is how a Timer gets looked up/created for
// CUDA devices -- confirm against the TimerRegistry declaration.
C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer);
} // namespace
} // namespace c10d
|