1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
|
import copy
from caffe2.proto import caffe2_pb2
from caffe2.python import core
def rewrite_init_net_simple(net):
    """Pin every operator of an init net to the IDEEP device.

    Mutates *net* (a caffe2 NetDef proto) in place.
    """
    ideep_device = caffe2_pb2.IDEEP
    for operator in net.op:
        operator.device_option.device_type = ideep_device
def last_producer(ops, blob):
    """Return the index of the last op in *ops* that produces *blob*.

    Scans the op list from the end so the most recent producer wins.

    Raises:
        ValueError: if no op in *ops* lists *blob* among its outputs.
    """
    for (i, op) in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # BUG FIX: the original passed "%s" and blob as two separate exception
    # args, so the blob name was never interpolated into the message.
    raise ValueError("Failed to find last producer of blob, {}".format(blob))
def fix_BoxWithNMSLimit(net):
    """Downgrade IDEEP->CPU copies of BoxWithNMSLimit outputs to plain Copy.

    BoxWithNMSLimit outputs (scores, boxes, classes) already live on CPU,
    so a CopyIDEEPToCPU consuming them is replaced by a CPU-device 'Copy'.
    Mutates *net* (a caffe2 NetDef proto) in place.
    """
    # Collect the first three outputs of every BoxWithNMSLimit op.
    outputs = set()
    for op in net.op:
        if op.type == 'BoxWithNMSLimit':
            outputs.update(op.output[:3])
    for op in net.op:
        if op.type == 'CopyIDEEPToCPU':
            if op.input[0] in outputs:
                # BUG FIX: message typo "Chaning" -> "Changing".
                print("Changing CopyIDEEPToCPU to Copy for {}".format(op.input[0]))
                op.type = 'Copy'
                op.device_option.device_type = caffe2_pb2.CPU
def rewrite_run_net_simple(net):
    """Rewrite a predict net so its whole graph runs on IDEEP/MKL.

    Inserts a CopyCPUToIDEEP op in front of the first op, a CopyIDEEPToCPU
    op behind every external output, renames the blobs at those boundaries
    to "<name>__MKL__", and stamps every op with the IDEEP device option.
    Mutates *net* (a caffe2 NetDef proto) in place.

    Raises:
        Exception: if the first external input is not consumed by the
            first op (the simple rewrite assumes a single entry point).
    """
    # Simple rewrite for now - assume entire graph can be executed
    # with MKL, so just insert copy ops for external_input[0] and
    # external_output[0]
    def mkl_tmp(name):
        # Name for the IDEEP-side twin of a boundary blob.
        return "{}__MKL__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    # The first op now reads the IDEEP copy instead of the raw input.
    net.op[0].input[0] = mkl_tmp(input_blob)

    # One IDEEP->CPU copy per external output, reading the renamed blob.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in net.external_output]

    for output_blob in net.external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        # The producer now writes the __MKL__ blob; the copy op above
        # restores the original external name on CPU.
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs
        # Rename any subsequent consumers of an output blob.
        for op in net.op[last_producer_idx + 1:]:
            renamed_input = [blob if blob != output_blob else mkl_tmp(blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # Rebuild the op list: input copy, original ops, output copies.
    ops = [copy_input_op] + net.op[:] + copy_output_ops
    del net.op[:]
    net.op.extend(ops)
    # Stamp every op (including the new copies) with the IDEEP device
    # and clear any engine hint that would conflict with it.
    device = caffe2_pb2.IDEEP
    for op in net.op:
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite the xrayocr LSTM predict net: MKL before the first Shape op,
    CPU from there on.

    The portion of the net before the first 'Shape' op is moved to
    IDEEP/MKL (boundary blobs renamed to "<name>__MKL__" with copy ops
    inserted); the LSTM tail starting at that Shape op stays on CPU, with
    its external inputs copied to "<name>__CPU__" twins. Mutates *net*
    (a caffe2 NetDef proto) in place.

    Raises:
        Exception: if the first external input is not consumed by the
            first op.
        AssertionError: if an external input is produced inside the net,
            or no 'Shape' op (the expected break point) is found.
    """
    # For xrayocr model with lstm, only rewrite the non-lstm part of the net to
    # enable mkl, then copy the temporary output blob at the break point
    # and all external inputs for lstm part to cpu, and execute rest of the net
    # (two lstm) on cpu
    # This only works for the xrayocr lstm model which uses the first 'Shape' op
    # to decide the break point, and after two lstm it's external_output
    # directly so there's no need to copy back to ideep/mkl
    def mkl_tmp(name):
        # Name for the IDEEP-side twin of a boundary blob.
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        # Name for the CPU-side twin of an LSTM external input.
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # the net may contain some external_inputs falsely added during ONNX->Caffe2
    # This should be taken care of in early steps during pytorch_to_caffe2,
    # but if not it can cause issue in follow up steps, so check here to confirm
    for input_blob in net.external_input:
        for op in net.op:
            # look for if the external_input blob is output of any op in the net
            assert input_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # the first Shape op marks the starting point of LSTM chunk of the net
        if not find_first_shape_op:
            if op.type == 'Shape':
                # The Shape op's inputs become the MKL part's outputs.
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # any external input in the LSTM part need to be copied to CPU
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # make sure we found the expected break point of the net
    assert external_output is not None

    # create op to copy external input blobs used in LSTM part from IDEEP to CPU
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob,
                                                        cpu_tmp(in_blob)))
        # rename input blobs in LSTM part to use the CPU copy
        # NOTE(review): indentation was lost in this copy of the file; this
        # rename loop is nested per-in_blob as in upstream caffe2 - confirm.
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # IDEEP->CPU copies for the break-point blobs feeding the LSTM part.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        # Producer now writes the __MKL__ twin; the copy restores the name.
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # rearrange all ops in correct order
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
        + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]

    del net.op[:]
    net.op.extend(ops)
    # IDEEP for the MKL prefix; flips to CPU at the first Shape op.
    device = caffe2_pb2.IDEEP
    for op in net.op:
        # the first Shape op marks the starting point of LSTM chunk of the net
        if op.type == 'Shape':
            # all LSTM ops should run on CPU
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # set device to CPU
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # rename inputs in op of nested net
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(blob
                                if blob not in external_inputs_to_cpu
                                else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # rename external inputs of nested net
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(blob
                            if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
def rewrite_model_helper_simple(model):
    """Return a deep-copied model helper rewritten to run on IDEEP/MKL.

    The input *model* is left untouched; both its init net and its run
    net are rewritten on the copy.
    """
    rewritten = copy.deepcopy(model)
    # All parameter initialization should run on MKL
    rewrite_init_net_simple(rewritten.param_init_net.Proto())
    rewrite_run_net_simple(rewritten.net.Proto())
    return rewritten
|