1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
|
#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
#include <torch/csrc/distributed/c10d/RankLocal.hpp>
namespace {
// Each rank operates on a different `c10d::ProcessGroup` instance for the same
// logical process group. Use `RankLocal<GroupRegistry>::get()` to ensure each
// rank gets a unique registry.
class GroupRegistry {
public:
void register_group(
std::string group_name,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
c10::intrusive_ptr<c10d::ProcessGroup> group) {
std::unique_lock write_lock(lock_);
auto [_, inserted] =
registry_.try_emplace(std::move(group_name), std::move(group));
TORCH_CHECK(
inserted,
"A process group is already registered under the name",
group_name);
}
c10::intrusive_ptr<c10d::ProcessGroup> resolve_group(
const std::string& group_name) {
std::shared_lock read_lock(lock_);
auto it = registry_.find(group_name);
TORCH_CHECK(
it != registry_.end(),
"Could not resolve the process group registered under the name ",
group_name);
auto group = it->second.lock();
TORCH_CHECK(
group != nullptr,
"Process group registered under the name ",
group_name,
" has already been destroyed.");
return group;
}
void unregister_group(const std::string& group_name) {
std::unique_lock write_lock(lock_);
registry_.erase(group_name);
}
void unregister_all_groups() {
std::unique_lock write_lock(lock_);
registry_.clear();
}
private:
std::map<std::string, c10::weak_intrusive_ptr<c10d::ProcessGroup>> registry_;
std::shared_mutex lock_;
};
} // namespace
namespace c10d {
static bool thread_isolation_mode = false;
static GroupRegistry process_registry;
void set_thread_isolation_mode(bool enable) {
thread_isolation_mode = enable;
}
bool get_thread_isolation_mode() {
return thread_isolation_mode;
}
void register_process_group(
const std::string& group_name,
const c10::intrusive_ptr<c10d::ProcessGroup>& group) {
if (thread_isolation_mode) {
RankLocal<::GroupRegistry>::get().register_group(group_name, group);
} else {
process_registry.register_group(group_name, group);
}
}
c10::intrusive_ptr<c10d::ProcessGroup> resolve_process_group(
const std::string& group_name) {
if (thread_isolation_mode) {
return RankLocal<::GroupRegistry>::get().resolve_group(group_name);
} else {
return process_registry.resolve_group(group_name);
}
}
void unregister_process_group(const std::string& group_name) {
if (thread_isolation_mode) {
RankLocal<::GroupRegistry>::get().unregister_group(group_name);
} else {
process_registry.unregister_group(group_name);
}
}
void unregister_all_process_groups() {
if (thread_isolation_mode) {
RankLocal<::GroupRegistry>::get().unregister_all_groups();
} else {
process_registry.unregister_all_groups();
}
}
} // namespace c10d
|