File: logger.hpp

#include <c10/util/Logging.h>
#include <torch/csrc/distributed/c10d/reducer.hpp>

#include <utility>

namespace c10d {

class TORCH_API Logger {
 public:
  explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
  // Set logging data that can be gathered at DistributedDataParallel
  // construction time.
  void set_construction_data_and_log(
      const std::string& module_name,
      const std::vector<int>& device_ids,
      int output_device,
      bool broadcast_buffers,
      bool has_sync_bn,
      bool static_graph);
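  //
  // Illustrative usage sketch (not part of this header; the reducer and the
  // argument values below are hypothetical):
  //
  //   auto logger = std::make_shared<c10d::Logger>(reducer);
  //   logger->set_construction_data_and_log(
  //       /*module_name=*/"MyModel",
  //       /*device_ids=*/{0},
  //       /*output_device=*/0,
  //       /*broadcast_buffers=*/true,
  //       /*has_sync_bn=*/false,
  //       /*static_graph=*/false);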

  void set_static_graph();

  // An interface for users to get DDPLoggingData and log it in their
  // applications. The logging fields are explained in
  // "struct DDPLoggingData" in "c10/util/Logging.h".
  at::DDPLoggingData get_ddp_logging_data();
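  //
  // Illustrative sketch of consuming the returned data (assumes a constructed
  // `logger`; strs_map/ints_map are the maps this class fills in below):
  //
  //   at::DDPLoggingData data = logger->get_ddp_logging_data();
  //   for (const auto& kv : data.strs_map) {
  //     std::cout << kv.first << ": " << kv.second << "\n";
  //   }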

  // Stream insertion operator that writes the logging data to the given
  // stream when TORCH_DISTRIBUTED_DEBUG is enabled.
  friend std::ostream& operator<<(std::ostream& output, const Logger& logger);

  ~Logger() noexcept(false) {
    // Log whether the DDP graph is static in the Logger dtor rather than the
    // Reducer dtor, since the Logger is deleted before the Reducer.
    log_if_graph_static(reducer_->ddp_graph_static());
  }

  // Set environment variables.
  void set_env_variables();
  // Set parameters stats.
  void set_parameter_stats();
  // Get the size of each bucket (in bytes).
  std::vector<int64_t> get_bucket_sizes();
  // Get variable indices for each bucket.
  std::vector<std::vector<size_t>> get_per_bucket_variable_indices();
  // Set the communication hook, if one is used.
  void set_comm_hook(const std::string& hook);
  // Record that we are running with uneven input detection (model.join()
  // context manager).
  void set_uneven_input_join();

  // Reset performance stats at current iteration
  void reset_performance_stats();

  // Calculate average stats using the CPU and GPU timers
  // that have been recorded in the reducer.
  void calculate_avg_time(
      int64_t& avg_time,
      int64_t& time_duration,
      Timer& timer,
      Timer::Event start_event,
      Timer::Event end_event);

  // Set the absolute time of the event that has been recorded in the reducer.
  void set_event_time(int64_t& event_time, Timer& timer, Timer::Event event);
  // Set stats that can only be collected during the training loop.
  // Called at the beginning of the forward call to record the runtime
  // stats of sampled iterations that previously ran.
  // GPU performance stats are currently collected only for single-process,
  // single-device programs and single-device modules.
  // TODO: to support single-process multi-device programs and multi-device
  // modules, events need to be created and recorded on multiple devices.
  void set_runtime_stats_and_log();

  // Called when DDP/reducer fails with an error. Two fields of the logging
  // data structure will be filled: "has_error", indicating that this
  // iteration encountered an error (and that the other fields are not
  // valid), and "error", a string containing the error message that DDP
  // failed with.
  template <typename... Args>
  void set_error_and_log(const std::string& ddp_error, const Args&... args) {
    ddp_logging_data_->ints_map["has_error"] = 1;
    auto err = c10::str(ddp_error, args...);
    ddp_logging_data_->strs_map["error"] = err;
    // Report the iteration at which the error occurred so the user knows how
    // many examples were successfully processed before the error was hit.
    ddp_logging_data_->ints_map["iteration"] = reducer_->num_iterations_;
    at::LogPyTorchDDPUsage(*ddp_logging_data_);
  }
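
  // Illustrative call (hypothetical error text and bucket_index variable;
  // c10::str concatenates the variadic arguments into one message):
  //
  //   logger.set_error_and_log(
  //       "DDP failed in allreduce: ", "bucket ", bucket_index);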

  // When running without static graph, called when the reducer is destroyed
  // to log whether the graph was actually static and is a candidate for the
  // static graph optimization.
  void log_if_graph_static(bool is_static);

 private:
  // ddp_logging_data_ holds all the DDP-related logging data fields.
  std::unique_ptr<at::DDPLoggingData> ddp_logging_data_;
  std::shared_ptr<c10d::Reducer> reducer_;
  // Tracks the number of iterations for which runtime stats have been
  // collected so far.
  long num_iterations_stats_recorded_ = 0;
};

// A generic logging data struct that holds different types of logging data,
// starting with key-value pairs of strings and integers.
// It can be extended to more types as needed.
struct C10dLoggingData {
  // logging fields that are string types.
  std::map<std::string, std::string> strings;
  // logging fields that are int64_t types.
  std::map<std::string, int64_t> integers;
};
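
// Illustrative sketch of populating the struct (the key names here are
// hypothetical examples):
//
//   c10d::C10dLoggingData data;
//   data.strings["backend"] = "nccl";
//   data.integers["world_size"] = 8;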

class TORCH_API C10dLogger {
 public:
  C10dLogger(const C10dLogger&) = default;
  C10dLogger(C10dLogger&&) = delete;
  C10dLogger& operator=(const C10dLogger&) = default;
  C10dLogger& operator=(C10dLogger&&) = delete;
  virtual ~C10dLogger() = default;
  virtual void log(const C10dLoggingData& data);
  static C10dLogger* getLogger();
  static void registerLogger(std::unique_ptr<C10dLogger>);

 protected:
  // Singleton; hide the constructor from the public.
  C10dLogger(std::string logDestination)
      : logDestination_(std::move(logDestination)) {}

  // the name of the destination this logger should log to
  std::string logDestination_;

 private:
  static std::unique_ptr<C10dLogger> logger_;
  static std::atomic<bool> registered_;
};
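
// Illustrative sketch of registering and using a custom logger (the MyLogger
// subclass and its destination name are hypothetical; the constructor is
// protected, so a subclass is required):
//
//   class MyLogger : public c10d::C10dLogger {
//    public:
//     MyLogger() : C10dLogger("my_destination") {}
//     void log(const c10d::C10dLoggingData& data) override {
//       // Forward data.strings / data.integers to your logging backend.
//     }
//   };
//
//   c10d::C10dLogger::registerLogger(std::make_unique<MyLogger>());
//   if (auto* logger = c10d::C10dLogger::getLogger()) {
//     logger->log(data);
//   }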

} // namespace c10d