1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
|
#include "caffe2/core/operator.h"
#include "mpscnn.h"
#include "mpscnn_context.h"
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#import <UIKit/UIDevice.h>
namespace caffe2 {
// Blob-version bookkeeping produced by analyzeNet(): every blob name gets a
// version number that is bumped each time an operator (re)writes it.
struct Analysis {
  // Versions touched by a single operator.
  struct SSA {
    // Blob name -> version number.
    typedef std::unordered_map<std::string, size_t> BlobVersions;
    // Version of each input blob at the time the operator reads it.
    BlobVersions inVersions;
    // Version of each output blob after the operator has written it.
    BlobVersions outVersions;
  };

  // Indices of operators, in the order they were recorded.
  typedef std::vector<size_t> OpIndices;
  // Blob version -> operators that read exactly that version.
  typedef std::unordered_map<size_t, OpIndices> VersionUsages;

  // One SSA record per analysed operator, appended in visiting order.
  std::vector<SSA> ssa;
  // Blob name -> (version -> reading operators).
  std::unordered_map<std::string, VersionUsages> inUsages;
};
// Walks the operators of `net` in order and assigns SSA-style version numbers
// to every blob.
//
// A blob that is read before anything has written it is at version 0. Writing
// a blob that is already known (read or written earlier) bumps its version by
// one; writing a previously unseen blob leaves it at version 0.
//
// Returns, per operator, the versions it reads and writes, plus an index from
// (blob name, version) to the operators that read that version.
Analysis analyzeNet(const NetDef& net) {
  Analysis result;
  // Most recent version of each blob seen so far.
  Analysis::SSA::BlobVersions latest;

  for (int n = 0; n < net.op_size(); ++n) {
    const size_t opIdx = n;
    const OperatorDef& op = net.op(n);

    // Nothing else appends to result.ssa inside this iteration, so the
    // reference stays valid until the end of the loop body.
    result.ssa.emplace_back();
    Analysis::SSA& versions = result.ssa.back();

    for (const auto& name : op.input()) {
      // operator[] registers an unseen blob at version 0.
      const size_t version = latest[name];
      versions.inVersions[name] = version;
      result.inUsages[name][version].push_back(opIdx);
    }

    for (const auto& name : op.output()) {
      auto known = latest.find(name);
      if (known == latest.end()) {
        known = latest.emplace(name, 0).first;
      } else {
        ++known->second;
      }
      versions.outVersions[name] = known->second;
    }
  }
  return result;
}
// Renames the i-th input of `op` in place by appending the "_M" suffix.
static void rewriteInput(OperatorDef* op, int i) {
  const std::string renamed = op->input(i) + "_M";
  op->set_input(i, renamed);
}
// Renames the i-th output of `op` in place by appending the "_M" suffix.
static void rewriteOutput(OperatorDef* op, int i) {
  const std::string renamed = op->output(i) + "_M";
  op->set_output(i, renamed);
}
// Appends one "CopyFromMPSCNN" operator to `predictNet`. For every name in
// `cpu_blobs` the operator gets the input "<name>_M" and the output "<name>",
// in the same order as `cpu_blobs`.
static void insertOutputCopyFromMPSCNNOp(
    NetDef& predictNet,
    const std::vector<std::string>& cpu_blobs) {
  OperatorDef* copyOp = predictNet.add_op();
  copyOp->set_type("CopyFromMPSCNN");
  for (const std::string& blob : cpu_blobs) {
    copyOp->add_input(blob + "_M");
    copyOp->add_output(blob);
  }
}
// Returns a copy of `def` wrapped with CPU <-> Metal copy operators; `def`
// itself is not modified.
//
// The operator list of the returned net is:
//   1. a "CopyToMPSCNN" op reading external_input(0) and writing
//      "__METAL_INPUT_COPY__";
//   2. every operator of `def`, in order, with blob names adjusted as
//      described in the loop below;
//   3. one "CopyFromMPSCNN" op that reads "<name>_M" and writes "<name>" for
//      every entry of external_output().
//
// Fails a CAFFE_ENFORCE check when `def` violates one of the requirements
// listed at the top of the body.
NetDef insertInputOutputCopyOps(const NetDef& def) {
// Do some validation of the net. For this version, we require:
// - at least one external input and at least one external output;
// - at least one operator;
// - version 0 of external_input(0) is read by def.op(0) exactly once, and by
//   no other operator;
// - external_output(0) is written by the last operator of the net;
// - input(0) of def.op(0) is external_input(0) (checked inside the loop).
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
auto analysis = analyzeNet(def);
// The net must contain at least one operator (ssa.back() is used below).
CAFFE_ENFORCE_GE(def.op_size(), 1);
const auto& inputBlob = def.external_input(0);
// Enforce that the input blob has a single usage - in the first operator.
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
// Enforce that the external_output(0) blob is produced by the last operator
// in this sequence.
const auto& outputBlob = def.external_output(0);
CAFFE_ENFORCE(
analysis.ssa.back().outVersions.find(outputBlob) !=
analysis.ssa.back().outVersions.end());
const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
// The version written by the last operator must not be read by any operator.
// This should hold true by definition of the SSA analysis.
CAFFE_ENFORCE(
analysis.inUsages[outputBlob].find(outputBlobVersion) ==
analysis.inUsages[outputBlob].end());
// Start from a full copy of `def` (so non-operator fields are preserved) and
// rebuild the operator list from scratch.
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
{
// First operator of the new net: copy the external input to Metal.
auto& op = *(mdef.add_op());
op.set_type("CopyToMPSCNN");
op.add_input(def.external_input(0));
op.add_output("__METAL_INPUT_COPY__");
}
// Names of external outputs that have been written by an operator visited so
// far (stored without the "_M" suffix).
std::unordered_set<std::string> output_set;
for (auto i = 0; i < def.op_size(); ++i) {
const auto& ogOp = def.op(i);
auto op = mdef.add_op();
op->CopyFrom(ogOp);
if (i == 0) {
// The first operator reads the Metal copy of the input instead of the
// original CPU blob.
CAFFE_ENFORCE_EQ(op->input(0), def.external_input(0));
op->set_input(0, "__METAL_INPUT_COPY__");
}
/*
* Let's say we have a Blob called "X" that is both the external output
* and will be used in the later operators. And it's on Metal. First, we'll
* rename the output of the operator to "X_M", therefore all the following
* operators that referenced this blob will need to change the input name
* and then we will copy "X_M" to CPU as "X" in the end.
*
*/
// Inputs are rewritten before this operator's own outputs are added to
// output_set, so only blobs produced by earlier operators are renamed here.
for (auto j = 0; j < op->input_size(); ++j) {
if (output_set.find(op->input(j)) != output_set.end()) {
rewriteInput(op, j);
// we'll add one CopyFromMPSCNN operator in the end
// to copy all the output blobs from MPSCNN to CPU
}
}
// if the output is in external output, copy from metal when necessary
for (auto j = 0; j < op->output_size(); ++j) {
for (auto k = 0; k < def.external_output_size(); ++k) {
// Assuming external output blob has unique name, e.g. only version 0
// of the blob is used as the output
if (op->output(j) == def.external_output(k)) {
output_set.insert(op->output(j));
// rewrite output to output_M for the operator
rewriteOutput(op, j);
}
}
}
}
// We copy all the output from Metal to CPU at once in the end
// Note that every external output is listed here, whether or not an operator
// above produced it.
std::vector<std::string> external_outputs;
for (int i = 0; i < def.external_output_size(); ++i) {
external_outputs.push_back(def.external_output(i));
}
insertOutputCopyFromMPSCNNOp(mdef, external_outputs);
return mdef;
}
// Reports whether the blob version written by `currentOp` (the operator at
// index `currentIdx`) is read exactly once, by the operator at index
// `currentIdx + 1`, and by nothing else.
//
// Preconditions (enforced): `currentOp` has exactly one output, `nextOp` has at
// least one input, and that output is `nextOp`'s first input.
// Throws std::out_of_range if `analysis` has no record for `currentIdx` or for
// the output blob of that operator.
bool nextIsOnlyUserOfCurrent(
    const Analysis& analysis,
    size_t currentIdx,
    const OperatorDef& currentOp,
    const OperatorDef& nextOp) {
  CAFFE_ENFORCE_EQ(currentOp.output_size(), 1);
  CAFFE_ENFORCE_GE(nextOp.input_size(), 1);
  CAFFE_ENFORCE_EQ(currentOp.output(0), nextOp.input(0));

  const std::string& blob = currentOp.output(0);
  // Version of `blob` as written by the current operator.
  const size_t version = analysis.ssa.at(currentIdx).outVersions.at(blob);
  VLOG(2) << "Blob: " << blob << ", idx: " << version;

  // No operator ever reads a blob with this name.
  const auto usagesOfBlob = analysis.inUsages.find(blob);
  if (usagesOfBlob == analysis.inUsages.end()) {
    return false;
  }

  // No operator ever reads this particular version.
  const auto usagesOfVersion = usagesOfBlob->second.find(version);
  if (usagesOfVersion == usagesOfBlob->second.end()) {
    return false;
  }

  const std::vector<size_t>& readers = usagesOfVersion->second;
  VLOG(2) << "Blob: " << blob << ", idx: " << version
          << ", usages[0]: " << readers[0];
  return readers.size() == 1 && readers.front() == currentIdx + 1;
}
bool tryFuseAdjacentOps(
const Analysis& analysis,
size_t currentIdx,
const OperatorDef& currentOp,
const OperatorDef& nextOp,
OperatorDef* fusedOp) {
// Check for possible invalid opportunities.
// Must be identical outputs, with either in-place usage for nextOp, *or* the
// only use of the output of currentOp is the consumption by nextOp.
if (currentOp.output_size() != 1 || !nextOp.input_size() ||
nextOp.output_size() != 1) {
return false;
}
if (currentOp.output(0) != nextOp.input(0)) {
return false;
}
if (!nextIsOnlyUserOfCurrent(analysis, currentIdx, currentOp, nextOp)) {
return false;
}
// Can we autogenerate this at registration time instead?
static const std::map<std::pair<std::string, std::string>, std::string>
fusionOpportunities = {{
{{"MPSCNNConv", "MPSCNNRelu"}, "MPSCNNConvRelu"},
{{"MPSCNNConv", "MPSCNNSigmoid"}, "MPSCNNConvSigmoid"},
{{"MPSCNNFC", "MPSCNNRelu"}, "MPSCNNFCRelu"},
{{"MPSCNNInstanceNorm", "MPSCNNPRelu"}, "MPSCNNInstanceNormPRelu"},
}};
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
if (it == fusionOpportunities.end()) {
return false;
}
// MPSCNNConvRelu and MPSCNNConvSigmoid cannot be in-place
if (currentOp.type() == "MPSCNNConv" &&
currentOp.input(0) == nextOp.output(0)) {
return false;
}
LOG(INFO) << "Found a fusion between adjacent ops: (" << currentOp.type()
<< ", " << nextOp.type() << ") -> " << it->second;
fusedOp->CopyFrom(currentOp);
fusedOp->set_type(it->second);
for (auto i = 1; i < nextOp.input_size(); ++i) {
fusedOp->add_input(nextOp.input(i));
}
fusedOp->set_output(0, nextOp.output(0));
return true;
}
// Returns a copy of `def` in which every fusable pair of adjacent operators
// (as decided by tryFuseAdjacentOps) has been replaced by a single fused
// operator. Operators are scanned left to right; an operator consumed by a
// fusion is not considered again. `def` must contain at least one operator.
NetDef runMPSCNNFusion(const NetDef& def) {
  CAFFE_ENFORCE_GE(def.op_size(), 1);

  // Keep all non-operator fields of `def`; the operator list is rebuilt.
  NetDef fusedNet;
  fusedNet.CopyFrom(def);
  fusedNet.clear_op();

  auto analysis = analyzeNet(def);
  const int opCount = def.op_size();
  int idx = 0;
  while (idx < opCount) {
    const auto& currentOp = def.op(idx);

    // The final operator has no successor to fuse with.
    if (idx + 1 == opCount) {
      VLOG(2) << "Last operator, skipping";
      fusedNet.add_op()->CopyFrom(currentOp);
      idx += 1;
      continue;
    }

    OperatorDef fusedOp;
    if (tryFuseAdjacentOps(
            analysis, idx, currentOp, def.op(idx + 1), &fusedOp)) {
      VLOG(2) << "Found an adjacent fusion at: " << idx;
      // Emit the fused operator and skip over both originals.
      fusedNet.add_op()->CopyFrom(fusedOp);
      idx += 2;
    } else {
      VLOG(2) << "No fusion available";
      // Emit the current operator unchanged.
      fusedNet.add_op()->CopyFrom(currentOp);
      idx += 1;
    }
  }
  return fusedNet;
}
// Converts a CPU NetDef into its MPSCNN (Metal) equivalent and returns it;
// `def` is not modified.
//
// Steps:
//   1. Every operator type "T" is renamed to "MPSCNNT". The renamed type must
//      be present in the CPU operator registry, otherwise an enforce failure
//      naming the offending operator is raised.
//   2. Adjacent operators are fused where possible (runMPSCNNFusion).
//   3. If the net neither starts with a Metal input operator nor ends with a
//      Metal output operator, copy operators are inserted around it
//      (insertInputOutputCopyOps).
//
// Postcondition (enforced): the result has at least two operators, starts with
// one of the Metal input operators and ends with one of the Metal output
// operators.
NetDef rewriteForMetal(const NetDef& def) {
  NetDef mdef;
  mdef.CopyFrom(def);

  const auto& opKeyList = CPUOperatorRegistry()->Keys();
  const std::set<std::string> opKeySet(opKeyList.begin(), opKeyList.end());
  for (auto i = 0; i < mdef.op_size(); ++i) {
    auto* op = mdef.mutable_op(i);
    const auto mpscnnOp = std::string("MPSCNN") + op->type();
    // Name the operator in the failure message so that an unsupported model
    // can be diagnosed from the log alone.
    CAFFE_ENFORCE(
        opKeySet.find(mpscnnOp) != opKeySet.end(),
        "No MPSCNN operator registered for type '",
        op->type(),
        "' (looked up as '",
        mpscnnOp,
        "')");
    op->set_type(mpscnnOp);
  }

  mdef = runMPSCNNFusion(mdef);

  // These lookup tables are never modified, so keep them const.
  static const std::set<std::string> mpscnnInputOps = {
      "CopyToMPSCNN", "MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocess"};
  static const std::set<std::string> mpscnnOutputOps = {
      "CopyFromMPSCNN", "MPSCNNBRGNCHWCToPackedInt8BGRAStylizerDeprocess"};

  // Only wrap the net when it has neither an explicit Metal input op at the
  // front nor an explicit Metal output op at the back. A net that has just one
  // of the two is left as is and rejected by the checks below.
  if (mpscnnInputOps.find(mdef.op(0).type()) == mpscnnInputOps.end() &&
      mpscnnOutputOps.find(mdef.op(mdef.op_size() - 1).type()) ==
          mpscnnOutputOps.end()) {
    mdef = insertInputOutputCopyOps(mdef);
  }

  CAFFE_ENFORCE_GE(mdef.op_size(), 2);
  CAFFE_ENFORCE(mpscnnInputOps.find(mdef.op(0).type()) != mpscnnInputOps.end());
  CAFFE_ENFORCE(
      mpscnnOutputOps.find(mdef.op(mdef.op_size() - 1).type()) !=
      mpscnnOutputOps.end());
  return mdef;
}
// Logs one INFO line per operator of `d` in the form
// "<first input> -> <type> -> <first output>".
//
// Operators without inputs (or without outputs) are logged with "<none>" in
// the corresponding position instead of indexing past the end of the
// repeated field.
void dumpDef(const NetDef& d) {
  static const std::string kNone = "<none>";
  for (const auto& op : d.op()) {
    const std::string& firstInput = op.input_size() > 0 ? op.input(0) : kNone;
    const std::string& firstOutput =
        op.output_size() > 0 ? op.output(0) : kNone;
    LOG(INFO) << firstInput << " -> " << op.type() << " -> " << firstOutput;
  }
}
// Returns a copy of `net` in which every operator whose first output is read
// by more than one operator carries an integer argument named
// kMPSCNNReadCountArg holding that number of reads.
//
// The read count is taken per blob *version*: only reads of the version
// written by the operator in question are counted. Operators whose first
// output is read at most once, and operators without outputs, are copied
// unchanged.
NetDef annotateDefWithReadCounts(const NetDef& net) {
  const auto analysis = analyzeNet(net);

  // Number of operators reading `version` of `blob`; 0 if it is never read.
  // Uses find() so the analysis is not mutated and no containers are copied.
  const auto readCountOf = [&analysis](
                               const std::string& blob,
                               size_t version) -> size_t {
    const auto usagesOfBlob = analysis.inUsages.find(blob);
    if (usagesOfBlob == analysis.inUsages.end()) {
      return 0;
    }
    const auto usagesOfVersion = usagesOfBlob->second.find(version);
    if (usagesOfVersion == usagesOfBlob->second.end()) {
      return 0;
    }
    return usagesOfVersion->second.size();
  };

  NetDef annotatedNet;
  annotatedNet.CopyFrom(net);
  for (auto i = 0; i < annotatedNet.op_size(); ++i) {
    auto* op = annotatedNet.mutable_op(i);
    // Nothing to annotate (and nothing safe to index) without an output.
    if (op->output_size() == 0) {
      continue;
    }
    // TODO - relax this? CAFFE_ENFORCE_EQ(op->output_size(), 1);
    const auto& blob = op->output(0);
    // analyzeNet records one entry per operator, and every output of the
    // operator appears in its outVersions, so both lookups succeed.
    const size_t readCount =
        readCountOf(blob, analysis.ssa.at(i).outVersions.at(blob));
    if (readCount > 1) {
      auto* arg = op->add_arg();
      arg->set_name(kMPSCNNReadCountArg);
      arg->set_i(readCount);
      LOG(INFO) << "Op: " << i << ", ty: " << op->type() << ", blob: " << blob
                << ", read count: " << readCount;
    }
  }
  return annotatedNet;
}
// Tries to produce an MPSCNN (Metal) version of `predictNet`.
//
// Returns true and stores the converted net in `*metalPredictNet` when
//   - the reported system version is at least 11.0,
//   - the default Metal device supports MTLFeatureSet_iOS_GPUFamily3_v2, and
//   - the init net runs and the converted net can be created in a scratch
//     workspace without throwing.
// Otherwise logs an error and returns false. When the failure happens after
// the rewrite step, `*metalPredictNet` may already have been overwritten.
bool tryConvertToMPSCNN(
    const NetDef& initNet,
    const NetDef& predictNet,
    NetDef* metalPredictNet) {
  // OS gate: numeric comparison of the system version string against 11.0.
  NSString* const systemVersion = [[UIDevice currentDevice] systemVersion];
  const bool osIsRecentEnough =
      [systemVersion compare:@"11.0"
                     options:NSNumericSearch] != NSOrderedAscending;
  if (!osIsRecentEnough) {
    LOG(ERROR) << "MPSCNN is only supported for ios version above 11.0.";
    return false;
  }

  // GPU gate: the iOS GPU Family 3 v2 feature set, introduced with the Apple
  // A9 GPU. The MPSCNNContext is deliberately not instantiated here, as that
  // compiles the kernel source.
  const bool gpuIsCapable = [MTLCreateSystemDefaultDevice()
      supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v2];
  if (!gpuIsCapable) {
    LOG(ERROR) << "The iOS GPU is less than an A9, so MPSCNN is not available";
    return false;
  }

  try {
    // Dry run in a throwaway workspace: any exception below means the
    // conversion cannot be used.
    Workspace scratch;
    scratch.RunNetOnce(initNet);
    // Throws if unsupported operators are found.
    *metalPredictNet = rewriteForMetal(predictNet);
    *metalPredictNet = annotateDefWithReadCounts(*metalPredictNet);
    // Throws if unsupported parameters are found.
    scratch.CreateNet(*metalPredictNet);
    LOG(INFO) << "MPSCNN is successfully enabled";
    return true;
  } catch (const std::exception& e) {
    LOG(ERROR) << "Caught exception trying to convert NetDef to MPSCNN: "
               << e.what();
    return false;
  }
}
// Inserts a debug capture boundary on the command queue of the shared MPSCNN
// context.
void mpscnnRecordExecutionFinish() {
  auto queue = getMPSCNNContext().commandQueue;
  [queue insertDebugCaptureBoundary];
}
}
|