// Copyright 2004-present Facebook. All Rights Reserved.
#ifndef CAFFE2_CORE_MIOPEN_WRAPPERS_H_
#define CAFFE2_CORE_MIOPEN_WRAPPERS_H_

#include "caffe2/core/hip/common_miopen.h"
#include "caffe2/core/hip/context_gpu.h"

#include <c10/hip/HIPGuard.h>

namespace caffe2 {

class MIOPENWrapper;

/**
 * MIOPENWorkspace is a wrapper around a raw HIP pointer that holds the miopen
 * scratch space. This struct is meant to be used only in MIOPENWrapper to
 * provide a program-wide scratch space for MIOPEN. The rationale is that
 * miopen function calls are usually very efficient, so one rarely needs to
 * run multiple miopen calls at the same time. As a result, one should not
 * need more than one miopen workspace per device.
 */
struct MIOPENWorkspace
{
    ~MIOPENWorkspace() noexcept {}

    // Returns a scratch buffer of at least nbytes, growing the underlying
    // device allocation only when the current one is too small.
    void* get(size_t nbytes)
    {
        if(nbytes_ < nbytes)
        {
            reset();
            data_ = HIPContext::New(nbytes);
            nbytes_ = nbytes;
        }
        CAFFE_ENFORCE_GE(nbytes_, nbytes);
        return data_.get();
    }

    // Releases the scratch allocation.
    void reset()
    {
        data_.clear();
        nbytes_ = 0;
    }

    private:
    at::DataPtr data_;
    size_t nbytes_{0};
};
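
// A hedged usage sketch of the workspace (illustrative only; the sizes and
// the `ws` name below are hypothetical). get() grows the device allocation
// monotonically and reuses it for any request that already fits:
//
//   MIOPENWorkspace ws;
//   void* big   = ws.get(1 << 20); // allocates 1 MiB of device scratch
//   void* small = ws.get(512);     // reuses the same 1 MiB buffer
//   ws.reset();                    // releases the scratch allocation
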
// MIOPENState is the owner of the MIOPENWorkspace, and serializes all
// executions of operations that use the state onto its own stream
// (so multiple Net workers can reuse the same workspace from
// different threads and HIP streams).
class MIOPENState
{
    public:
    explicit MIOPENState(size_t gpu_id) : gpu_id_(gpu_id)
    {
        HIPGuard g(gpu_id_);
        MIOPEN_ENFORCE(miopenCreate(&miopen_handle_));
        HIP_ENFORCE(hipEventCreate(&before_));
        HIP_ENFORCE(hipEventCreate(&after_));
        HIP_ENFORCE(hipStreamCreate(&stream_));
        MIOPEN_ENFORCE(miopenSetStream(miopen_handle_, stream_));
    }

    ~MIOPENState() noexcept
    {
        HIPGuard g(gpu_id_);
        MIOPEN_CHECK(miopenDestroy(miopen_handle_));
        HIP_CHECK(hipStreamDestroy(stream_));
        HIP_CHECK(hipEventDestroy(after_));
        HIP_CHECK(hipEventDestroy(before_));
    }

    miopenHandle_t& miopen_handle() { return miopen_handle_; }

    MIOPENWorkspace& workspace() { return workspace_; }

    // Runs f on this state's private stream, fenced by events so that it is
    // ordered after prior work on `stream`, and later work on `stream` waits
    // for it to finish.
    template <typename F>
    void execute(hipStream_t stream, F&& f)
    {
        HIP_ENFORCE(hipEventRecord(before_, stream));
        HIP_ENFORCE(hipStreamWaitEvent(stream_, before_, 0));
        f(this);
        HIP_ENFORCE(hipEventRecord(after_, stream_));
        HIP_ENFORCE(hipStreamWaitEvent(stream, after_, 0));
    }

    private:
    miopenHandle_t miopen_handle_{nullptr};
    hipEvent_t before_{nullptr};
    hipEvent_t after_{nullptr};
    hipStream_t stream_{nullptr};
    MIOPENWorkspace workspace_;
    size_t gpu_id_{0};

    C10_DISABLE_COPY_AND_ASSIGN(MIOPENState);
};
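
// A minimal sketch of driving MIOPENState::execute() directly (normally this
// is done through MIOPENWrapper below; `caller_stream` and the gpu id here
// are assumed, not part of this header):
//
//   MIOPENState state(/*gpu_id=*/0);
//   state.execute(caller_stream, [&](MIOPENState* s) {
//       // miopen calls issued here via s->miopen_handle() run on the state's
//       // private stream; the before_/after_ events order them after prior
//       // work on caller_stream and before later work on caller_stream.
//   });
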
/**
 * MIOPENWrapper is a class that wraps the miopen handles and miopen workspaces.
 *
 * The wrapper ensures that for each thread and each gpu there is exactly one
 * miopen handle, which is also associated with the thread-local per-device
 * hip stream. The wrapper also hosts the device-specific miopen workspace
 * (scratch space for some miopen functions).
 *
 */
class MIOPENWrapper
{
    public:
    /**
     * Creates a miopen wrapper associated with a HIPContext object. Note that
     * the HIPContext object should outlive the MIOPENWrapper.
     */
    explicit MIOPENWrapper(HIPContext* context) : context_(context) {}

    /**
     * Returns the inline miopen handle that executes on the current
     * thread's hip_stream.
     */
    miopenHandle_t inline_miopen_handle() { return context_->miopen_handle(); }

    // Executes the closure F on the MIOPENState associated with state_idx
    template <typename F>
    void with_miopen_state(size_t state_idx, F&& f)
    {
        CAFFE_ENFORCE(state_idx < CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES, "Invalid state_idx");
        auto& sync_state = miopen_states()[context_->device_id()][state_idx];

        HIPGuard dg(context_->device_id());

        // We need to serialize execution on the MIOPENState as we can't
        // allow multiple threads to race through the hipEventRecord
        // calls (so a worker thread might wait on another worker thread's
        // execution)
        std::lock_guard<std::mutex> g(sync_state.mutex);
        if(!sync_state.state.get())
        {
            sync_state.state.reset(new MIOPENState(context_->device_id()));
        }
        TORCH_CHECK_NOTNULL(sync_state.state.get())->execute(context_->hip_stream(), f);
    }

    protected:
    // Pointer to an external HIP context that the miopen wrapper will use.
    HIPContext* context_;

    static constexpr size_t CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES = 4;

    // A miopen state together with the mutex that serializes access to it.
    struct SyncedMIOPENState
    {
        std::mutex mutex;
        std::unique_ptr<MIOPENState> state;
    };

    using PerGPUMIOPENStates = std::array<
        std::array<SyncedMIOPENState, CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES>,
        C10_COMPILE_TIME_MAX_GPUS>;
    static PerGPUMIOPENStates& miopen_states();

    C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper);
};
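
// A hedged sketch of typical use from inside an operator, assuming `context_`
// is the operator's HIPContext and `required_bytes` is a hypothetical
// workspace size queried from miopen beforehand:
//
//   MIOPENWrapper wrapper(&context_);
//   wrapper.with_miopen_state(0, [&](MIOPENState* state) {
//       void* scratch = state->workspace().get(required_bytes);
//       // Run miopen calls on state->miopen_handle(); the per-state mutex
//       // and execute()'s event fences keep concurrent workers ordered.
//   });
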
} // namespace caffe2

#endif // CAFFE2_CORE_MIOPEN_WRAPPERS_H_