1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#pragma once
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include <gloo/algorithm.h>
#include <gloo/barrier_all_to_one.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
// Operator that runs a gloo barrier over the context supplied as input 0.
//
// Arguments:
//   status_blob (string, default ""): name of a workspace blob. When it is
//     non-empty, the blob is created at construction time and a gloo IO
//     failure during the barrier is passed to signalFailure() together with
//     that blob instead of being propagated to the caller.
template <class Context>
class BarrierOp final : public Operator<Context> {
 public:
  BarrierOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        ws_(ws),
        status_blob_(
            OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
    // Create the status blob eagerly so that it is already present in the
    // workspace when RunOnDevice() later looks it up on failure.
    if (!status_blob_.empty()) {
      ws_->CreateBlob(status_blob_);
    }
  }

  virtual ~BarrierOp() {}

  // Runs the barrier once.
  //
  // Returns true when the barrier completed. Returns false when a
  // ::gloo::IoException was caught and a status blob is configured (the
  // exception is handed to signalFailure()). Without a status blob the
  // exception is rethrown unchanged.
  bool RunOnDevice() override {
    auto currentContext =
        OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);

    // The algorithm is built a single time, bound to the context that was
    // seen on the first run. It is an all-to-one barrier that synchronizes
    // against rank 0.
    std::call_once(once_, [this, &currentContext] {
      initContext_ = currentContext;
      algorithm_.reset(new ::gloo::BarrierAllToOne(initContext_, 0));
    });

    // The cached algorithm is tied to initContext_; a different context on a
    // later run means it can no longer be used.
    CAFFE_ENFORCE(currentContext == initContext_, "Context has changed");

    try {
      algorithm_->run();
    } catch (::gloo::IoException& ioError) {
      LOG(ERROR) << "Caught gloo IO exception: " << ioError.what();
      // No status blob configured: let the caller deal with the exception.
      if (status_blob_.empty()) {
        throw;
      }
      signalFailure(ws_->GetBlob(status_blob_), ioError);
      return false;
    }
    return true;
  }

 protected:
  // Guards the one-time construction of algorithm_.
  std::once_flag once_;
  // Context captured on the first run; later runs must pass the same one.
  std::shared_ptr<::gloo::Context> initContext_;
  // Barrier algorithm created lazily on the first run.
  std::unique_ptr<::gloo::Algorithm> algorithm_;
  // Workspace passed to the constructor (not owned here).
  Workspace* ws_;
  // Value of the "status_blob" argument; empty when none was given.
  std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2
|