File: rewrite_graph.py

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (215 lines) | stat: -rw-r--r-- 8,433 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215





import copy
from caffe2.proto import caffe2_pb2
from caffe2.python import core


def rewrite_init_net_simple(net):
    """Assign every operator of ``net`` to the IDEEP device, in place.

    Args:
        net: a net proto whose ``op`` field is iterated; each op's
            ``device_option.device_type`` is overwritten.
    """
    for operator in net.op:
        device_option = operator.device_option
        device_option.device_type = caffe2_pb2.IDEEP

def last_producer(ops, blob):
    """Return the index of the last op in ``ops`` that outputs ``blob``.

    Args:
        ops: iterable of ops, each exposing an ``output`` collection of
            blob names.
        blob: name of the blob to look for.

    Returns:
        The position in ``ops`` of the last op whose ``output`` contains
        ``blob``.

    Raises:
        ValueError: if no op in ``ops`` lists ``blob`` among its outputs.
    """
    # Walk from the end so the first match is the last producer.
    for (i, op) in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # Format the blob name into the message; passing it as a second
    # exception argument would leave a literal "%s" in the error text.
    raise ValueError(
        "Failed to find last producer of blob, {}".format(blob))


def fix_BoxWithNMSLimit(net):
    """Turn IDEEP-to-CPU copies of BoxWithNMSLimit results into plain copies.

    The first three outputs of every 'BoxWithNMSLimit' op are collected.
    Any 'CopyIDEEPToCPU' op whose first input is one of those blobs is
    changed, in place, to a 'Copy' op with its device type set to CPU.

    Args:
        net: a net proto whose ``op`` field is scanned and mutated.
    """
    nms_outputs = {
        blob
        for op in net.op
        if op.type == 'BoxWithNMSLimit'
        for blob in (op.output[0], op.output[1], op.output[2])
    }

    for op in net.op:
        if op.type != 'CopyIDEEPToCPU':
            continue
        source = op.input[0]
        if source not in nms_outputs:
            continue
        print("Chaning CopyIDEEPToCPU to Copy for {}".format(source))
        op.type = 'Copy'
        op.device_option.device_type = caffe2_pb2.CPU


def rewrite_run_net_simple(net):
    """Rewrite ``net`` in place so that all of its ops run on IDEEP.

    Simple rewrite for now: the entire graph is assumed to be executable
    with MKL. A CopyCPUToIDEEP op is put in front for ``external_input[0]``
    and one CopyIDEEPToCPU op is appended for each external output; the
    ops in between are re-pointed at ``<blob>__MKL__`` temporaries.

    Args:
        net: the run net proto; its ``op`` list is replaced and every op's
            device option and engine are overwritten.

    Raises:
        Exception: if ``external_input[0]`` is not the first input of the
            first op.
    """
    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    def substitute(blobs, target):
        # Copy of `blobs` with each occurrence of `target` swapped for its
        # MKL temporary name.
        return [mkl_tmp(b) if b == target else b for b in blobs]

    input_blob = net.external_input[0]
    first_op = net.op[0]
    if first_op.input[0] != input_blob:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, first_op))

    # Feed the first op from the IDEEP copy of the input.
    copy_input_op = core.CreateOperator(
        "CopyCPUToIDEEP", input_blob, mkl_tmp(input_blob))
    first_op.input[0] = mkl_tmp(input_blob)

    # One copy-back op per external output, built before any renaming.
    copy_output_ops = []
    for output_blob in net.external_output:
        copy_output_ops.append(
            core.CreateOperator(
                "CopyIDEEPToCPU", mkl_tmp(output_blob), output_blob))

    for output_blob in net.external_output:
        producer_idx = last_producer(net.op, output_blob)
        producer = net.op[producer_idx]
        producer.output[:] = substitute(producer.output, output_blob)
        # Ops after the producer that read the output must follow the rename.
        for consumer in net.op[producer_idx + 1:]:
            consumer.input[:] = substitute(consumer.input, output_blob)

    rewritten_ops = [copy_input_op]
    rewritten_ops.extend(net.op[:])
    rewritten_ops.extend(copy_output_ops)
    del net.op[:]
    net.op.extend(rewritten_ops)

    ideep_option = core.DeviceOption(device_type=caffe2_pb2.IDEEP)
    for op in net.op:
        op.device_option.MergeFrom(ideep_option)
        op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)


def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite ``net`` in place so only its non-LSTM prefix runs on IDEEP.

    For the xrayocr model with LSTM, only the non-LSTM part of the net is
    rewritten to enable MKL. The temporary output blob at the break point
    and all external inputs used by the LSTM part are copied to CPU, and
    the rest of the net (two LSTMs) is executed on CPU.

    This only works for the xrayocr LSTM model, which uses the first
    'Shape' op to decide the break point; after the two LSTMs the result is
    an external_output directly, so there is no need to copy back to
    IDEEP/MKL.

    Resulting op order: CopyCPUToIDEEP for ``external_input[0]``, the ops
    before the first 'Shape' op, CopyIDEEPToCPU ops for the Shape op's
    inputs, CopyIDEEPToCPU ops for external inputs read after the Shape op,
    then the ops from the Shape op onward.

    Args:
        net: the run net proto; its ops, their inputs/outputs, device
            options and engines are mutated.

    Raises:
        Exception: if ``external_input[0]`` is not the first input of the
            first op.
        AssertionError: if an external input is also produced by an op, or
            if the net contains no 'Shape' op. NOTE: these are plain
            ``assert`` statements, so they are skipped under ``python -O``.
    """

    def mkl_tmp(name):
        # Name of the IDEEP-side temporary for blob `name`.
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        # Name of the CPU-side copy of blob `name`.
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # the net may contain some external_inputs falsely added during ONNX->Caffe2
    # This should be taken care of in early steps during pytorch_to_caffe2,
    # but if not it can cause issue in follow up steps, so check here to confirm
    # NOTE: this loop rebinds `input_blob`; the original value is not needed
    # again below.
    for input_blob in net.external_input:
        for op in net.op:
            # look for if the external_input blob is output of any op in the net
            assert input_blob not in op.output

    # Inputs of the first Shape op; these are the blobs handed from the
    # IDEEP part to the CPU part.
    external_output = None
    # External inputs read by ops located after the first Shape op.
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    # Index (in the original op list) of the first Shape op.
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # the first Shape op mark the starting point of LSTM chunk of the net
        if not find_first_shape_op:
            if op.type == 'Shape':
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # any external input in the LSTM part need to be copied to CPU
            # (only ops after the Shape op are inspected here, not the
            # Shape op itself)
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # make sure we found the expected break point of the net
    assert external_output is not None

    # create op to copy external input blobs used in LSTM part from IDEEP to CPU
    # NOTE: iterating a set, so the order of these copy ops is not fixed.
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob,
                                                        cpu_tmp(in_blob)))
        # rename input blobs in LSTM part to use the CPU copy
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # Copy each break-point blob from its MKL temporary back to its original
    # name, so the CPU part keeps reading the original blob name.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    # Make the last producer of each break-point blob write the MKL temporary
    # instead of the original name.
    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # rearrange all ops in correct order
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
          + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]
    del net.op[:]
    net.op.extend(ops)

    # Ops are assigned IDEEP until the first Shape op is reached; that op and
    # everything after it get CPU. The inserted copy ops sit before the Shape
    # op and therefore stay on IDEEP.
    device = caffe2_pb2.IDEEP
    for op in net.op:
        # the first Shape op mark the starting point of LSTM chunk of the net
        if op.type == 'Shape':
            # all LSTM ops should run on CPU
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # set device to the enclosing op's device (CPU once
                        # the Shape op has been passed)
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # rename inputs in op of nested net
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(blob
                                if blob not in external_inputs_to_cpu
                                else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # rename external inputs of nested net
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(blob
                            if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)


def rewrite_model_helper_simple(model):
    """Return a deep copy of ``model`` with both of its nets rewritten.

    The caller's ``model`` is left untouched. On the copy, the parameter
    init net goes through ``rewrite_init_net_simple`` (all parameter
    initialization should run on MKL) and the main net goes through
    ``rewrite_run_net_simple``.

    Args:
        model: object exposing ``param_init_net`` and ``net``, each with a
            ``Proto()`` method.

    Returns:
        The rewritten copy of ``model``.
    """
    rewritten = copy.deepcopy(model)
    init_proto = rewritten.param_init_net.Proto()
    rewrite_init_net_simple(init_proto)
    run_proto = rewritten.net.Proto()
    rewrite_run_net_simple(run_proto)
    return rewritten