#pragma once

#include <c10/util/Logging.h>
#include <torch/csrc/distributed/c10d/reducer.hpp>

#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace c10d {
class TORCH_API Logger {
 public:
  explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
  // Set logging data that can be gathered during DistributedDataParallel
  // construction time.
  void set_construction_data_and_log(
      const std::string& module_name,
      const std::vector<int>& device_ids,
      int output_device,
      bool broadcast_buffers,
      bool has_sync_bn,
      bool static_graph);
  void set_static_graph();
  // An interface for users to get DDPLoggingData and log them in their
  // applications. Explanations of the logging fields can be found in
  // "struct DDPLoggingData" in "c10/util/Logging.h".
  at::DDPLoggingData get_ddp_logging_data();
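  //
  // Illustrative usage sketch (hedged; `logger` below is a hypothetical
  // instance, and only keys set elsewhere in this header, such as
  // "iteration" and "has_error", are shown):
  //
  //   at::DDPLoggingData data = logger.get_ddp_logging_data();
  //   int64_t iteration = data.ints_map["iteration"];
  //   bool had_error = data.ints_map["has_error"] == 1;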
  // Stream insertion operator for logging data to stream under
  // TORCH_DISTRIBUTED_DEBUG.
  friend std::ostream& operator<<(std::ostream& output, const Logger& logger);
  ~Logger() noexcept(false) {
    // Log if DDP graph is static in Logger dtor instead of Reducer dtor,
    // since Logger is deleted before Reducer.
    log_if_graph_static(reducer_->ddp_graph_static());
  }
  // Set environment variables.
  void set_env_variables();
  // Set parameter stats.
  void set_parameter_stats();
  // Get the size of each bucket (in bytes).
  std::vector<int64_t> get_bucket_sizes();
  // Get variable indices for each bucket.
  std::vector<std::vector<size_t>> get_per_bucket_variable_indices();
  // Set the communication hook, if one is used.
  void set_comm_hook(const std::string& hook);
  // Set running with uneven input detection (the model.join() context
  // manager).
  void set_uneven_input_join();
  // Reset performance stats at the current iteration.
  void reset_performance_stats();
  // Calculate average stats using the CPU timer and GPU timer
  // that have been recorded in the reducer.
  void calculate_avg_time(
      int64_t& avg_time,
      int64_t& time_duration,
      Timer& timer,
      Timer::Event start_event,
      Timer::Event end_event);
  // Set the absolute time of the event that has been recorded in the reducer.
  void set_event_time(int64_t& event_time, Timer& timer, Timer::Event event);
  // Set stats that can be collected only during the training loop. This is
  // called at the beginning of the forward call to record the runtime stats
  // of sampled iterations that previously ran.
  // GPU performance stats are currently collected only for single-process
  // single-device programs and single-device modules.
  // TODO: to support single-process multi-device and multi-device modules,
  // events need to be created and recorded on multiple devices.
  void set_runtime_stats_and_log();
  // Called when DDP/reducer is failing with an error. The logging data
  // structure will have two fields filled: "has_error", indicating that this
  // iteration encountered an error (and that the other fields are not
  // valid), and "error", a string containing the error message that DDP
  // failed with.
  template <typename... Args>
  void set_error_and_log(const std::string& ddp_error, const Args&... args) {
    ddp_logging_data_->ints_map["has_error"] = 1;
    auto err = c10::str(ddp_error, args...);
    ddp_logging_data_->strs_map["error"] = err;
    // Report the iteration we are erroring at so the user knows how many
    // examples were successfully processed before this error was hit.
    ddp_logging_data_->ints_map["iteration"] = reducer_->num_iterations_;
    at::LogPyTorchDDPUsage(*ddp_logging_data_);
  }
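  //
  // Illustrative usage sketch (hedged; the variables below are hypothetical,
  // and the variadic arguments are concatenated via c10::str as shown in the
  // body above):
  //
  //   logger.set_error_and_log(
  //       "DDP failed in iteration ", iteration, ": ", exception.what());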
  // When running without static graph, called when the reducer is destroyed
  // to log whether the graph was actually static and is a candidate for the
  // static graph optimization.
  void log_if_graph_static(bool is_static);

 private:
  // ddp_logging_data_ is used to hold all the DDP-related logging
  // data fields.
  std::unique_ptr<at::DDPLoggingData> ddp_logging_data_;
  std::shared_ptr<c10d::Reducer> reducer_;
  // Tracks the number of iterations for which runtime stats have been
  // collected so far.
  long num_iterations_stats_recorded_ = 0;
};
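
// Illustrative end-to-end sketch (hedged; the Reducer constructor arguments
// are elided since they are declared in reducer.hpp, not here):
//
//   auto reducer = std::make_shared<c10d::Reducer>(/* ... */);
//   auto logger = std::make_shared<c10d::Logger>(reducer);
//   logger->set_construction_data_and_log(
//       "MyModule",
//       /*device_ids=*/{0},
//       /*output_device=*/0,
//       /*broadcast_buffers=*/true,
//       /*has_sync_bn=*/false,
//       /*static_graph=*/false);
//   // Later, at the start of each sampled forward pass:
//   logger->set_runtime_stats_and_log();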

// A generic logging data struct that holds different types of logging data,
// starting with key-value pairs of strings and integers. It can be extended
// to more types as needed.
struct C10dLoggingData {
  // Logging fields that are string types.
  std::map<std::string, std::string> strings;
  // Logging fields that are int64_t types.
  std::map<std::string, int64_t> integers;
};
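
// Illustrative sketch of filling the struct (the keys below are hypothetical
// examples, not a fixed schema):
//
//   C10dLoggingData data;
//   data.strings["backend"] = "nccl";
//   data.integers["world_size"] = 8;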

class TORCH_API C10dLogger {
 public:
  C10dLogger(const C10dLogger&) = default;
  C10dLogger(C10dLogger&&) = delete;
  C10dLogger& operator=(const C10dLogger&) = default;
  C10dLogger& operator=(C10dLogger&&) = delete;
  virtual ~C10dLogger() = default;
  virtual void log(const C10dLoggingData& data);
  static C10dLogger* getLogger();
  static void registerLogger(std::unique_ptr<C10dLogger>);

 protected:
  // Singleton; hide the constructor from the public.
  C10dLogger(std::string logDestination)
      : logDestination_(std::move(logDestination)) {}

  // The name of the destination this logger should log to.
  std::string logDestination_;

 private:
  static std::unique_ptr<C10dLogger> logger_;
  static std::atomic<bool> registered_;
};
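
// Illustrative sketch of registering a custom logger (hedged; MyC10dLogger
// is a hypothetical subclass, and getLogger() is assumed to return the
// registered instance, or nullptr before registration):
//
//   class MyC10dLogger : public c10d::C10dLogger {
//    public:
//     MyC10dLogger() : C10dLogger("my_destination") {}
//     void log(const c10d::C10dLoggingData& data) override {
//       for (const auto& kv : data.strings) {
//         LOG(INFO) << kv.first << ": " << kv.second;
//       }
//     }
//   };
//
//   c10d::C10dLogger::registerLogger(std::make_unique<MyC10dLogger>());
//   if (auto* logger = c10d::C10dLogger::getLogger()) {
//     c10d::C10dLoggingData data;
//     data.strings["event"] = "init";
//     logger->log(data);
//   }
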
} // namespace c10d