// Note(jiayq): the import_array() call is made inside caffe2_python.cc. See
// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
// for more details.
#define NO_IMPORT_ARRAY
#include "pybind_state.h"
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN
#include <c10/cuda/CUDAGuard.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/python/pybind_state_registry.h"
#ifdef CAFFE2_USE_TRT
#include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"
#endif // CAFFE2_USE_TRT
namespace caffe2 {
namespace python {
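// Python ops have no native CUDA kernels; GPUFallbackOp runs the CPU
// implementation and copies inputs/outputs between host and device.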
REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);
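// Feeding a blob with a CUDA device option goes through
// TensorFeeder<CUDAContext>, which copies the host data onto the device.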
REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);
namespace py = pybind11;
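// Registers CUDA-specific module-level functions: device/version queries,
// cuDNN constants, and the TensorRT entry points (which throw unless the
// build defines CAFFE2_USE_TRT).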
void addCUDAGlobalMethods(py::module& m) {
m.def("num_cuda_devices", &NumCudaDevices);
m.def("get_cuda_version", &CudaVersion);
#ifdef CAFFE2_USE_CUDNN
m.def("get_cudnn_version", &cudnnCompiledVersion);
m.attr("cudnn_convolution_fwd_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
m.attr("cudnn_convolution_bwd_data_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
m.attr("cudnn_convolution_bwd_filter_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
#else
m.def("get_cudnn_version", []() { return static_cast<size_t>(0); });
m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0);
m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0);
m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0);
#endif
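// Returns a (free, total) pair of device memory sizes in bytes.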
m.def("get_gpu_memory_info", [](int device_id) {
CUDAGuard guard(device_id);
size_t device_free, device_total;
CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
return std::pair<size_t, size_t>{device_free, device_total};
});
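// Returns an n x n boolean matrix where entry [i][j] indicates whether
// device i can directly access the memory of device j.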
m.def("get_cuda_peer_access_pattern", []() {
std::vector<std::vector<bool>> pattern;
CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
return pattern;
});
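// Exposes a small subset of cudaDeviceProp fields as a Python dict.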
m.def("get_device_properties", [](int deviceid) {
auto& prop = GetDeviceProperty(deviceid);
std::map<std::string, py::object> obj;
obj["name"] = py::cast(prop.name);
obj["major"] = py::cast(prop.major);
obj["minor"] = py::cast(prop.minor);
obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem);
return obj;
});
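// Compiles a serialized ONNX model into a single TensorRT engine op and
// returns the serialized Caffe2 OperatorDef.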
m.def(
"onnx_to_trt_op",
[](const py::bytes& onnx_model_str,
const std::unordered_map<std::string, std::vector<int>>&
output_size_hints,
int max_batch_size,
int max_workspace_size,
int verbosity,
bool debug_builder) -> py::bytes {
#ifdef CAFFE2_USE_TRT
TensorRTTransformer t(
max_batch_size, max_workspace_size, verbosity, debug_builder);
auto op_def =
t.BuildTrtOp(onnx_model_str.cast<std::string>(), output_size_hints);
std::string out;
op_def.SerializeToString(&out);
return py::bytes(out);
#else
CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
#endif // CAFFE2_USE_TRT
});
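// Transforms a serialized predict net: TensorRT-compatible subgraphs are
// replaced with TensorRT ops, using the given per-blob shape hints (assumed
// FLOAT), and the rewritten, serialized NetDef is returned.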
m.def(
"transform_trt",
[](const py::bytes& pred_net_str,
const std::unordered_map<std::string, std::vector<int>>& shapes,
int max_batch_size,
int max_workspace_size,
int verbosity,
bool debug_builder,
bool build_serializable_op) -> py::bytes {
#ifdef CAFFE2_USE_TRT
caffe2::NetDef pred_net;
if (!ParseProtoFromLargeString(
    pred_net_str.cast<std::string>(), &pred_net)) {
  // Bail out: transforming a default-constructed NetDef would silently
  // produce an empty net.
  CAFFE_THROW("broken pred_net protobuf");
}
std::unordered_map<std::string, TensorShape> tensor_shapes;
for (const auto& it : shapes) {
tensor_shapes.emplace(
it.first, CreateTensorShape(it.second, TensorProto::FLOAT));
}
TensorRTTransformer ts(
max_batch_size,
max_workspace_size,
verbosity,
debug_builder,
build_serializable_op);
ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes);
std::string pred_net_str2;
pred_net.SerializeToString(&pred_net_str2);
return py::bytes(pred_net_str2);
#else
CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
#endif // CAFFE2_USE_TRT
});
}
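// Binds DLPackWrapper<CUDAContext> as DLPackTensorCUDA, allowing CUDA
// tensors to be exchanged with other frameworks via the DLPack protocol.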
void addCUDAObjectMethods(py::module& m) {
py::class_<DLPackWrapper<CUDAContext>>(m, "DLPackTensorCUDA")
.def_property_readonly(
"data",
[](DLPackWrapper<CUDAContext>* t) -> py::object {
CAFFE_ENFORCE_EQ(
t->device_option.device_type(),
PROTO_CUDA,
"Expected CUDA device option for CUDA tensor");
return t->data();
},
"Return DLPack tensor with tensor's data.")
.def(
"feed",
[](DLPackWrapper<CUDAContext>* t, py::object obj) {
CAFFE_ENFORCE_EQ(
t->device_option.device_type(),
PROTO_CUDA,
"Expected CUDA device option for CUDA tensor");
t->feed(obj);
},
"Copy data from given DLPack tensor into this tensor.")
.def_property_readonly(
"_shape",
[](const DLPackWrapper<CUDAContext>& t) { return t.tensor->sizes(); })
.def(
"_reshape",
[](DLPackWrapper<CUDAContext>* t, std::vector<int64_t> dims) {
t->tensor->Resize(dims);
});
}
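// Module entry point: install the shared (CPU) bindings first, then the
// CUDA-specific ones, then any extensions from the pybind addition registry.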
PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) {
m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition";
addGlobalMethods(m);
addCUDAGlobalMethods(m);
addObjectMethods(m);
addCUDAObjectMethods(m);
for (const auto& addition : PybindAdditionRegistry()->Keys()) {
PybindAdditionRegistry()->Create(addition, m);
}
}
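// Illustrative usage from Python (a sketch; the import path below assumes
// the extension is importable as caffe2.python.caffe2_pybind11_state_gpu):
//
//   from caffe2.python import caffe2_pybind11_state_gpu as C
//   print(C.num_cuda_devices(), C.get_cuda_version())
//   free_bytes, total_bytes = C.get_gpu_memory_info(0)
//   props = C.get_device_properties(0)  # name, major, minor, totalGlobalMem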
} // namespace python
} // namespace caffe2