File: onnxifi_transformer.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (1469 lines) | stat: -rw-r--r-- 52,143 bytes
#include "caffe2/opt/onnxifi_transformer.h"

#include <iostream>
#include <unordered_set>

#include "onnx/proto_utils.h"

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/onnx/onnx_exporter.h"
#include "caffe2/opt/backend_cutting.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"

namespace caffe2 {

namespace {
// NOTE(review): presumably the name of the blob carrying the real batch size;
// it is not referenced in the part of the file reviewed here -- confirm usage.
const std::string kRealBatchSizeBlob = "real_batch_size";
// Name of the argument under which the initializer (weight) blob names are
// recorded, both on the Onnxifi op and on the serialized C2 net.
const std::string kInitializers = "initializers";
// NOTE(review): not referenced in the part of the file reviewed here -- confirm
// whether this is still needed.
constexpr size_t kBufferSize = 64;

// Convert ShapeInfo map to TensorShape map
std::unordered_map<std::string, TensorShape> stripShapeInfoMap(
    const ShapeInfoMap& info_map) {
  std::unordered_map<std::string, TensorShape> shape_map;
  for (const auto& kv : info_map) {
    shape_map.emplace(kv.first, kv.second.shape);
  }
  return shape_map;
}

std::vector<::ONNX_NAMESPACE::ValueInfoProto> convertToValueInfo(
    const std::vector<std::string>& names,
    const std::unordered_map<std::string, TensorShape>& shape_hints,
    const std::unordered_map<std::string, ::ONNX_NAMESPACE::TypeProto>&
        extra_shape_hints) {
  std::vector<::ONNX_NAMESPACE::ValueInfoProto> r;
  for (const auto& s : names) {
    r.emplace_back();
    auto& value_info = r.back();
    value_info.set_name(s);
    const auto it = shape_hints.find(s);
    if (it == shape_hints.end()) {
      const auto eit = extra_shape_hints.find(s);
      if (eit == extra_shape_hints.end()) {
        LOG(WARNING) << "Cannot get shape of " << s;
      } else {
        value_info.mutable_type()->CopyFrom(eit->second);
      }
    } else {
      auto* tensor_type = value_info.mutable_type()->mutable_tensor_type();
      tensor_type->set_elem_type(
          onnx::Caffe2TypeToOnnxType(it->second.data_type()));
      auto* shape = tensor_type->mutable_shape();
      for (int i = 0; i < it->second.dims().size(); ++i) {
        shape->add_dim()->set_dim_value(it->second.dims(i));
      }
    }
  }
  return r;
}

// Determines which blobs a cut-off subnet needs from the outside.
//
// Every extra weight is reported as an input and as an initializer. After
// that, each distinct operator input of `net` is classified once: if it is a
// workspace weight it is reported as an input and as an initializer; if it is
// one of the net's external inputs it is reported as an input only; anything
// else is ignored.
//
// \param net [in] c2 net (cut off from a bigger net)
// \param weights_in_ws [in] all the weights in the workspace
// \param extra_weights [in] extra weights possibly generated during ONNX
//        conversion
// \param initialization_list [out] weights that need to be offloaded to the
//        backend
// \param total_inputs_vec [out] inputs of the net collected as described
//        above, in first-seen order and without duplicates
void getWeightsAndInputs(
    const NetDef& net,
    const std::unordered_set<std::string>& weights_in_ws,
    const std::vector<std::string>& extra_weights,
    std::unordered_set<std::string>* initialization_list,
    std::vector<std::string>* total_inputs_vec) {
  // Names that were already examined; each name is handled at most once
  std::unordered_set<std::string> seen;

  // Extra weights are unconditionally both inputs and initializers
  for (const auto& weight_name : extra_weights) {
    const bool first_time = seen.insert(weight_name).second;
    if (first_time) {
      total_inputs_vec->push_back(weight_name);
    }
    initialization_list->insert(weight_name);
  }

  // External inputs of the net; these are inputs but not weights
  const std::unordered_set<std::string> boundary(
      net.external_input().begin(), net.external_input().end());

  for (const auto& op : net.op()) {
    for (const auto& name : op.input()) {
      if (!seen.insert(name).second) {
        continue;
      }
      const bool is_weight = weights_in_ws.count(name) != 0;
      if (is_weight) {
        // Weights are fed as inputs too
        total_inputs_vec->push_back(name);
        initialization_list->insert(name);
        VLOG(2) << "Add weights: " << name;
      } else if (boundary.count(name) != 0) {
        VLOG(2) << "Adding boundary input: " << name;
        total_inputs_vec->push_back(name);
      }
    }
  }
}

// Adds every input blob name of `op` to `inputs` and every output blob name of
// `op` to `outputs`. Existing entries of both sets are kept.
void collectInputsAndOutputs(
    const OperatorDef& op,
    std::set<std::string>* inputs,
    std::set<std::string>* outputs) {
  inputs->insert(op.input().begin(), op.input().end());
  outputs->insert(op.output().begin(), op.output().end());
}

// Rewrites `net` so that every "If"/"AsyncIf" op additionally lists, as its
// own inputs, the blobs that the ops of its then_net/else_net arguments read
// but none of those ops write. All other ops are copied unchanged and the op
// order is preserved.
void fetchInputsToIfOpsSubnet(NetDef* net) {
  NetDef rewritten(*net);
  rewritten.clear_op();
  for (const auto& op : net->op()) {
    auto* copied = rewritten.add_op();
    copied->CopyFrom(op);
    if (op.type() != "If" && op.type() != "AsyncIf") {
      continue;
    }

    ArgumentHelper helper(op);
    std::set<std::string> consumed;
    std::set<std::string> produced;
    // Gathers the blobs read/written by the ops of one branch subnet, if the
    // op carries that branch as a NetDef argument
    const auto scan_branch = [&](const std::string& arg_name) {
      if (!helper.HasSingleArgumentOfType<NetDef>(arg_name)) {
        return;
      }
      const auto branch = helper.GetSingleArgument<NetDef>(arg_name, NetDef());
      for (const auto& nested_op : branch.op()) {
        collectInputsAndOutputs(nested_op, &consumed, &produced);
      }
    };
    scan_branch("then_net");
    scan_branch("else_net");

    // Only blobs not produced inside the branches must come from outside
    for (const std::string& blob : consumed) {
      if (produced.count(blob) != 0) {
        continue;
      }
      copied->add_input(blob);
    }
  }
  net->Swap(&rewritten);
}

// Stamps the model header: the ONNX IR version, "caffe2" as the producer and a
// single opset import for the default ("") domain at version 7.
void fillModelInfo(::ONNX_NAMESPACE::ModelProto* model) {
  constexpr int kDefaultDomainOpsetVersion = 7;

  model->set_producer_name("caffe2");
  model->set_ir_version(::ONNX_NAMESPACE::Version::IR_VERSION);

  auto& opset = *model->add_opset_import();
  opset.set_version(kDefaultDomainOpsetVersion);
  opset.set_domain("");
}

// Returns the size of the first dimension of the shape, or 0 when the shape
// has no dimensions at all.
int64_t getBlob1stDimSize(const ShapeInfo& shape_info) {
  const auto& shape = shape_info.shape;
  return shape.dims_size() != 0 ? shape.dims(0) : 0;
}

// Wraps the given Onnxifi op into a fresh net that contains just that op.
NetDef composeResultNet(const OperatorDef& onnxifi_op) {
  NetDef result;
  *result.add_op() = onnxifi_op;
  return result;
}

void enforceFp32InputsToFp16(
    const std::unordered_set<std::string>& weights,
    NetDef* pred_net,
    ShapeInfoMap* shape_hints) {
  std::unordered_map<std::string, ShapeInfo> user_input_map;
  for (const auto& i : pred_net->external_input()) {
    if (weights.count(i)) {
      continue;
    }
    auto it = shape_hints->find(i);
    if (it == shape_hints->end() ||
        it->second.shape.data_type() != TensorProto_DataType_FLOAT) {
      continue;
    }
    auto& shape_info = it->second;
    user_input_map[i] = shape_info;
    shape_info.shape.set_data_type(TensorProto_DataType_FLOAT16);
  }

  if (user_input_map.empty()) {
    return;
  }

  std::vector<OperatorDef> ops;
  for (const auto& op : pred_net->op()) {
    ops.emplace_back(op);
  }
  pred_net->clear_op();
  int current_pos = ops.size();

  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  const char kBridgeTensorSuffix[] = "_to_float_bridge";
  std::vector<OperatorDef> converts;
  for (const auto& elem : user_input_map) {
    const auto& name = elem.first;
    const auto& shape_info = elem.second;
    std::string new_name = name + kBridgeTensorSuffix;
    shape_hints->emplace(new_name, shape_info);
    converts.emplace_back(CreateOperatorDef(
        "HalfToFloat",
        "",
        {name},
        {new_name},
        {MakeArgument<int>(kNetPos, current_pos++)}));
  }
  for (const auto& op : converts) {
    pred_net->add_op()->CopyFrom(op);
  }

  for (auto& op : ops) {
    for (auto& input : *op.mutable_input()) {
      if (user_input_map.count(input)) {
        input += kBridgeTensorSuffix;
      }
    }
  }

  for (const auto& op : ops) {
    pred_net->add_op()->CopyFrom(op);
  }
}

// Funnels qualifying fp32 user inputs through a single fp16 conversion per
// partition.
//
// An external input qualifies when it is not a weight and its shape hint is
// FLOAT, 2-dimensional and has `batch_size` as first dimension. Qualifying
// inputs are grouped by the partition (device_option().node_name(), or
// "default" when empty) of the ops that read them. For each partition the
// following ops are emitted in front of the original ops:
//   Concat (axis 1) -> FloatToHalf (clip) -> Split (axis 1) -> one HalfToFloat
//   per input
// and the ops of that partition are rewired to read
// "<partition>_<input>_split" instead of the original input. Shape hints are
// added for the fp16 concat blob and for every fp16/float split blob.
// Nothing is changed when no input qualifies.
void mergeFp32InputsAndConvertToFp16(
    size_t batch_size,
    const std::unordered_set<std::string>& weights,
    NetDef* pred_net,
    ShapeInfoMap* shape_hints) {
  // Qualifying inputs, mapped to a copy of their shape info retyped to fp16
  std::unordered_map<std::string, ShapeInfo> user_input_map;
  for (const auto& i : pred_net->external_input()) {
    if (weights.count(i)) {
      continue;
    }
    const auto it = shape_hints->find(i);
    // Heuristic: the input has to be of float type, 2-dimensional and the first
    // dimension has to be of batch size
    if (it == shape_hints->end() ||
        it->second.shape.data_type() != TensorProto_DataType_FLOAT) {
      continue;
    }
    // Work on a copy: the hint stored for the original input stays untouched
    auto shape_info = it->second;
    if (shape_info.shape.dims_size() != 2 ||
        // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
        shape_info.shape.dims(0) != batch_size) {
      continue;
    }
    shape_info.shape.set_data_type(TensorProto_DataType_FLOAT16);

    user_input_map[i] = shape_info;
  }

  if (user_input_map.empty()) {
    return;
  }
  // Per partition: the qualifying inputs in first-use order (vector) plus a set
  // used for de-duplication and membership tests
  std::unordered_map<std::string, std::vector<std::string>>
      user_inputs_by_partition;
  std::unordered_map<std::string, std::unordered_set<std::string>>
      user_input_set_by_partition;
  for (const auto& op : pred_net->op()) {
    for (const auto& i : op.input()) {
      if (user_input_map.find(i) != user_input_map.end()) {
        const auto& partition = op.device_option().node_name().empty()
            ? "default"
            : op.device_option().node_name();
        if (user_input_set_by_partition[partition].find(i) ==
            user_input_set_by_partition[partition].end()) {
          user_inputs_by_partition[partition].emplace_back(i);
          user_input_set_by_partition[partition].insert(i);
        }
      }
    }
  }

  // Detach the original ops; they are re-attached after the generated ones
  std::vector<OperatorDef> ops;
  for (const auto& op : pred_net->op()) {
    ops.emplace_back(op);
  }
  pred_net->clear_op();
  // Generated ops are numbered (kNetPos) starting after the original op count
  int current_pos = ops.size();

  for (const auto& elem : user_inputs_by_partition) {
    const auto& partition = elem.first;
    const auto& user_inputs = elem.second;
    const auto& user_input_set = user_input_set_by_partition[partition];

    // Step 1: concatenate all qualifying inputs of this partition along axis 1
    OperatorDef op1;
    op1.set_type("Concat");
    for (const auto& i : user_inputs) {
      op1.add_input(i);
    }
    op1.add_output(partition + "_fp32_input_concated");
    op1.add_output(partition + "_fp32_input_concated_split_info");
    // Shape of the concatenated blob: first input's (fp16) shape with the
    // second dimension replaced by the sum over all inputs
    auto shape_info = user_input_map[user_inputs.front()];
    int total = 0;
    for (const auto& u : user_inputs) {
      total += user_input_map[u].shape.dims(1);
    }
    shape_info.shape.set_dims(1, total);
    AddArgument("axis", 1, &op1);
    AddArgument(kNetPos, current_pos++, &op1);
    pred_net->add_op()->CopyFrom(op1);

    // Step 2: convert the concatenated blob to fp16
    // TODO: a possible optimization is to fuse the fp16 conversion into Concat
    OperatorDef op2;
    op2.set_type("FloatToHalf");
    op2.add_input(partition + "_fp32_input_concated");
    op2.add_output(partition + "_fp16_input_concated");
    AddArgument("clip", 1, &op2);
    AddArgument(kNetPos, current_pos++, &op2);
    shape_hints->emplace(partition + "_fp16_input_concated", shape_info);
    pred_net->add_op()->CopyFrom(op2);

    // Step 3: split the fp16 blob back into one piece per original input.
    // Unlike Concat/FloatToHalf, the Split and HalfToFloat ops are tagged with
    // the partition's node name.
    OperatorDef op3;
    op3.set_type("Split");
    op3.add_input(partition + "_fp16_input_concated");
    op3.mutable_device_option()->set_node_name(partition);

    // Step 4 (prepared here, emitted after the Split): convert each fp16 piece
    // back to float. Note that the HalfToFloat ops take their kNetPos before
    // the Split op does.
    std::vector<OperatorDef> converts;
    for (const auto& i : user_inputs) {
      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
      std::string new_name = partition + "_" + i + "_split_fp16";
      op3.add_output(new_name);
      shape_hints->emplace(new_name, user_input_map[i]);
      converts.emplace_back(CreateOperatorDef(
          "HalfToFloat",
          "",
          // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
          {partition + "_" + i + "_split_fp16"},
          // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
          {partition + "_" + i + "_split"},
          {MakeArgument<int>(kNetPos, current_pos++)}));
      converts.back().mutable_device_option()->set_node_name(partition);

      // The float piece has the input's shape with the data type set to FLOAT
      auto converted_shape = user_input_map[i];
      converted_shape.shape.set_data_type(TensorProto_DataType_FLOAT);
      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
      shape_hints->emplace(partition + "_" + i + "_split", converted_shape);
    }
    AddArgument("axis", 1, &op3);
    AddArgument(kNetPos, current_pos++, &op3);
    // Split sizes: the second dimension of every input, in concat order
    auto* arg = op3.add_arg();
    arg->set_name("split");
    for (const auto& u : user_inputs) {
      arg->add_ints(user_input_map[u].shape.dims(1));
    }
    pred_net->add_op()->CopyFrom(op3);
    for (const auto& op : converts) {
      pred_net->add_op()->CopyFrom(op);
    }

    // Rewire the original ops of this partition to read the float pieces
    for (auto& op : ops) {
      if ((!op.device_option().node_name().empty() &&
           op.device_option().node_name() == partition) ||
          (op.device_option().node_name().empty() && partition == "default")) {
        for (auto& i : *op.mutable_input()) {
          if (user_input_set.count(i)) {
            // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
            i = partition + "_" + i + "_split";
          }
        }
      }
    }
  }

  // Re-attach the (rewired) original ops after all generated ops
  for (const auto& op : ops) {
    pred_net->add_op()->CopyFrom(op);
  }
}

} // namespace

// Replaces every *RowwiseSparse SparseLengths(Weighted)Sum op of `net` with
// ops that do not take the compressed-mapping input.
//
// The compressed mapping is the last input of such an op (input 3, or input 4
// for the weighted variants). If the workspace holds it as a one-element
// tensor whose int32 value is 0, the op is simply retyped to its fused
// counterpart and the mapping input is dropped. Otherwise a
// SparseLengthsSumSparseLookup op is inserted that turns indices/lengths (and
// weights, for the weighted variants) into "<name>_decomp" blobs, followed by
// the fused op reading those blobs. All other ops are copied unchanged.
void splitSparseLengthsSumSparse(NetDef* net, const Workspace& ws) {
  const static std::unordered_map<string, string> slss = {
      {"SparseLengthsSum4BitRowwiseSparse", "SparseLengthsSumFused4BitRowwise"},
      {"SparseLengthsWeightedSum4BitRowwiseSparse",
       "SparseLengthsWeightedSumFused4BitRowwise"},
      {"SparseLengthsSum8BitRowwiseSparse", "SparseLengthsSumFused8BitRowwise"},
      {"SparseLengthsWeightedSum8BitRowwiseSparse",
       "SparseLengthsWeightedSumFused8BitRowwise"},
      {"SparseLengthsSum2BitRowwiseSparse", "SparseLengthsSumFused2BitRowwise"},
      {"SparseLengthsWeightedSum2BitRowwiseSparse",
       "SparseLengthsWeightedSumFused2BitRowwise"}};
  NetDef rewritten;
  rewritten.CopyFrom(*net);
  rewritten.mutable_op()->Clear();
  for (const auto& op : net->op()) {
    const auto mapped = slss.find(op.type());
    if (mapped == slss.end()) {
      rewritten.add_op()->CopyFrom(op);
      continue;
    }

    const bool is_weighted = op.type().find("Weighted") != std::string::npos;
    // Weighted variants carry the weights at input 1, which shifts the
    // remaining inputs by one position
    const int shift = is_weighted ? 1 : 0;
    const int indices_idx = 1 + shift;
    const int lengths_idx = 2 + shift;
    const int mapping_idx = 3 + shift;

    const auto& mapping_name = op.input(mapping_idx);
    const auto* blob = ws.GetBlob(mapping_name);
    bool fallback = false;
    if (blob != nullptr && blob->IsType<Tensor>()) {
      const auto& tensor = BlobGetTensor(*blob, CPU);
      fallback = tensor.numel() == 1 && tensor.data<int32_t>()[0] == 0;
    }

    // The fused op is needed in both cases: same op, new type, mapping input
    // (the last one) removed
    OperatorDef sls_op;
    sls_op.CopyFrom(op);
    sls_op.set_type(mapped->second);
    sls_op.mutable_input()->RemoveLast();

    if (!fallback) {
      // Emit the lookup op first and make the fused op consume its outputs
      const auto& indices_in = op.input(indices_idx);
      const auto& lengths_in = op.input(lengths_idx);
      const std::string weights_in =
          is_weighted ? op.input(1) : std::string();
      const std::string indices_out = indices_in + "_decomp";
      const std::string lengths_out = lengths_in + "_decomp";
      const std::string weights_out = weights_in + "_decomp";

      OperatorDef lookup_op;
      lookup_op.CopyFrom(op);
      lookup_op.set_type("SparseLengthsSumSparseLookup");
      lookup_op.clear_input();
      lookup_op.clear_output();
      lookup_op.add_input(indices_in);
      lookup_op.add_input(lengths_in);
      lookup_op.add_input(mapping_name);
      lookup_op.add_output(indices_out);
      lookup_op.add_output(lengths_out);
      if (is_weighted) {
        lookup_op.add_input(weights_in);
        lookup_op.add_output(weights_out);
      }
      rewritten.add_op()->CopyFrom(lookup_op);

      sls_op.set_input(indices_idx, indices_out);
      sls_op.set_input(lengths_idx, lengths_out);
      if (is_weighted) {
        sls_op.set_input(1, weights_out);
      }
    }
    rewritten.add_op()->CopyFrom(sls_op);
  }

  rewritten.Swap(net);
}

// Obtains the ONNXIFI library handle that the option setter/getter of this
// class call through. Construction fails the CAFFE_ENFORCE check when
// onnx::initOnnxifiLibrary() does not yield a usable handle.
OnnxifiOptionHelper::OnnxifiOptionHelper() {
  lib_ = onnx::initOnnxifiLibrary();
  CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
}

// Sets a backend option through the "onnxSetOptionFunction" ONNXIFI extension.
//
// \param option name of the option
// \param value value to assign to the option
// \return true only when the extension function was found and reported
//         ONNXIFI_STATUS_SUCCESS. Always false when the file is built without
//         ONNXIFI_ENABLE_EXT.
bool OnnxifiOptionHelper::setOnnxifiOption(
    const std::string& option,
    const std::string& value) {
#ifdef ONNXIFI_ENABLE_EXT
  onnxStatus (*onnxSetOptionFunctionPointer)(
      const char* optionName, const char* optionValue) = nullptr;
  // The lookup hands back a generic extension function pointer; the union is
  // used to read it back with the concrete setter signature.
  union {
    onnxExtensionFunctionPointer p;
    decltype(onnxSetOptionFunctionPointer) set;
  } u{};
  // The extension is looked up with a null backend id
  onnxBackendID backend_id = nullptr;
  if (lib_->onnxGetExtensionFunctionAddress(
          backend_id, "onnxSetOptionFunction", &u.p) !=
      ONNXIFI_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot find onnxSetOptionFunction";
    return false;
  } else {
    onnxSetOptionFunctionPointer = u.set;
  }
  if (onnxSetOptionFunctionPointer != nullptr &&
      (*onnxSetOptionFunctionPointer)(option.c_str(), value.c_str()) ==
          ONNXIFI_STATUS_SUCCESS) {
    return true;
  }
#endif
  return false;
}

// Reads a backend option through the "onnxGetOptionFunction" ONNXIFI
// extension.
//
// \param option name of the option
// \return the option value as reported by the extension, or an empty string
//         when the extension function is missing, the call does not report
//         ONNXIFI_STATUS_SUCCESS, or the file is built without
//         ONNXIFI_ENABLE_EXT.
std::string OnnxifiOptionHelper::getOnnxifiOption(const std::string& option) {
#ifdef ONNXIFI_ENABLE_EXT
  onnxStatus (*onnxGetOptionFunctionPointer)(
      const char* optionName, char* optionValue, size_t* optionValueLength) =
      nullptr;
  // The lookup hands back a generic extension function pointer; the union is
  // used to read it back with the concrete getter signature.
  union {
    onnxExtensionFunctionPointer p;
    decltype(onnxGetOptionFunctionPointer) get;
  } u{};
  // The extension is looked up with a null backend id
  onnxBackendID backend_id = nullptr;
  if (lib_->onnxGetExtensionFunctionAddress(
          backend_id, "onnxGetOptionFunction", &u.p) !=
      ONNXIFI_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot find onnxGetOptionFunction";
    return "";
  } else {
    onnxGetOptionFunctionPointer = u.get;
  }

  // Fixed-size receive buffer; `len` starts as its capacity and is passed by
  // pointer so the callee can update it.
  // NOTE(review): the returned string trusts `len` as written by the backend;
  // this assumes a successful call never reports more than `ll` bytes --
  // confirm against the extension's contract.
  constexpr size_t ll = 1024;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  char buf[ll];
  size_t len = ll;
  if (onnxGetOptionFunctionPointer != nullptr &&
      (*onnxGetOptionFunctionPointer)(option.c_str(), buf, &len) ==
          ONNXIFI_STATUS_SUCCESS) {
    return std::string(buf, len);
  }
#endif

  return "";
}

// Stores the options, loads the ONNXIFI library and fetches the IDs of all
// available backends into backend_ids_. Every step is guarded by a
// CAFFE_ENFORCE check: the library must load, at least one backend must be
// reported, and the second onnxGetBackendIDs call must succeed.
// NOLINTNEXTLINE(modernize-pass-by-value)
OnnxifiTransformer::OnnxifiTransformer(const OnnxifiTransformerOptions& opts)
    : BackendTransformerBase(), opts_(opts) {
  lib_ = onnx::initOnnxifiLibrary();
  CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
  // First call passes no buffer and is only used to learn the backend count;
  // ONNXIFI_STATUS_FALLBACK is the expected status for it
  CAFFE_ENFORCE_EQ(
      lib_->onnxGetBackendIDs(nullptr, &num_backends_),
      ONNXIFI_STATUS_FALLBACK);
  CAFFE_ENFORCE_GT(
      num_backends_, 0, "At least 1 onnxifi backend should be available");
  // Second call fills the buffer sized from the count obtained above
  backend_ids_.resize(num_backends_);
  CAFFE_ENFORCE_EQ(
      lib_->onnxGetBackendIDs(backend_ids_.data(), &num_backends_),
      ONNXIFI_STATUS_SUCCESS);
}

// Releases every backend ID held in backend_ids_. A failed release is logged
// and does not stop the remaining IDs from being released.
OnnxifiTransformer::~OnnxifiTransformer() {
  for (unsigned idx = 0; idx < num_backends_; ++idx) {
    const auto status = lib_->onnxReleaseBackendID(backend_ids_[idx]);
    if (status == ONNXIFI_STATUS_SUCCESS) {
      continue;
    }
    LOG(ERROR) << "Error when calling onnxReleaseBackendID";
  }
}

// Tells whether per-batch-size output shape hints can be attached to `op`:
// `shape_hints_per_bs` must be non-empty and, for every batch size in
// [1, max_batch_size), must contain a hint for every output of `op`.
bool OnnxifiTransformer::canPassOutputShapeHintsPerBs(
    const OperatorDef& op,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) const {
  if (shape_hints_per_bs.empty()) {
    return false;
  }

  const auto max_bs = opts_.bound_shape_spec.max_batch_size;
  for (int bs = 1; bs < max_bs; ++bs) {
    const auto per_bs = shape_hints_per_bs.find(bs);
    if (per_bs == shape_hints_per_bs.end()) {
      return false;
    }

    const auto& hints = per_bs->second;
    for (const auto& output_name : op.output()) {
      if (hints.find(output_name) == hints.end()) {
        return false;
      }
    }
  }

  return true;
}

// Assembles the "Onnxifi" operator that stands in for an offloaded subnet.
//
// \param onnx_model_str serialized model (ONNX or C2 proto, see "use_onnx")
// \param initialization_list names of the weight blobs to fetch from the
//        workspace later; stored under the kInitializers argument
// \param external_inputs inputs of the subnet; those that are not
//        initializers become inputs of the op
// \param external_outputs outputs of the subnet; all become outputs of the op
// \param shape_hints_max_bs shape info at max batch size; must contain every
//        non-initializer input that is visited while looking for the nominal
//        batch input (enforced)
// \param shape_hints_per_bs optional shape info per batch size; only used
//        when it covers every output for every batch size in
//        [1, max_batch_size)
// \return the fully populated operator definition. As a side effect
//         onnxifi_op_id_ is incremented.
OperatorDef OnnxifiTransformer::buildOnnxifiOp(
    const std::string& onnx_model_str,
    const std::unordered_set<std::string>& initialization_list,
    const std::vector<std::string>& external_inputs,
    const std::vector<std::string>& external_outputs,
    const ShapeInfoMap& shape_hints_max_bs,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) {
  OperatorDef op;
  op.set_type("Onnxifi");
  auto* onnx_model_arg = op.add_arg();
  onnx_model_arg->set_name("onnx_model");
  onnx_model_arg->set_s(onnx_model_str);

  // Add the names of the initializer blobs that we want to fetch from the
  // workspace later
  auto* initializers_arg = op.add_arg();
  initializers_arg->set_name(kInitializers);
  for (const auto& s : initialization_list) {
    initializers_arg->add_strings(s);
  }

  // Add the input/output
  auto* input_names = op.add_arg();
  input_names->set_name("input_names");
  for (const auto& input : external_inputs) {
    if (!initialization_list.count(input)) {
      op.add_input(input);
      input_names->add_strings(input);
    }
  }
  auto* output_names = op.add_arg();
  output_names->set_name("output_names");
  for (const auto& output : external_outputs) {
    op.add_output(output);
    output_names->add_strings(output);
  }

  // Find out the index of input that has a nominal batch size. The index
  // counts non-initializer inputs only, i.e. it indexes into the op's inputs.
  const auto max_batch_size = opts_.bound_shape_spec.max_batch_size;
  int idx = 0;
  int nominal_batch_idx{0};
  for (const auto& input : external_inputs) {
    if (!initialization_list.count(input)) {
      const auto it = shape_hints_max_bs.find(input);
      CAFFE_ENFORCE(
          it != shape_hints_max_bs.end(),
          "Input shape for ",
          input,
          " not found");
      const auto& info = it->second;
      if (info.getDimType(0) == TensorBoundShape_DimType_BATCH &&
          getBlob1stDimSize(info) == max_batch_size) {
        nominal_batch_idx = idx;
        break;
      }
      ++idx;
    }
  }

  // Add output size hints for max batch size
  auto* output_shape_info_arg = op.add_arg();
  output_shape_info_arg->set_name("output_shape_info");
  auto* output_qshape_info_arg = op.add_arg();
  output_qshape_info_arg->set_name("output_qshape_info");
  for (int i = 0; i < op.output_size(); ++i) {
    const auto& o = op.output(i);
    const auto it = shape_hints_max_bs.find(o);
    if (it != shape_hints_max_bs.end()) {
      if (!it->second.is_quantized) {
        output_shape_info_arg->mutable_tensors()->Add()->CopyFrom(
            wrapShapeInfoIntoTensorProto(o, it->second));
      } else {
        output_qshape_info_arg->mutable_qtensors()->Add()->CopyFrom(
            wrapShapeInfoIntoQTensorProto(o, it->second));
      }
      VLOG(2) << "Adding output hint: " << o;
    }
  }

  // Add output size hints per batch size
  if (canPassOutputShapeHintsPerBs(op, shape_hints_per_bs)) {
    VLOG(2) << "Passing in output shape hints for batch sizes in [1, "
            << opts_.bound_shape_spec.max_batch_size << ")";
    AddArgument("use_passed_output_shapes", 1, &op);

    for (int bs = 1; bs < opts_.bound_shape_spec.max_batch_size; ++bs) {
      auto* output_shape_arg = op.add_arg();
      output_shape_arg->set_name("output_shapes_bs_" + caffe2::to_string(bs));
      auto* output_qshape_arg = op.add_arg();
      output_qshape_arg->set_name("output_qshapes_bs_" + caffe2::to_string(bs));

      // Both lookups below are guaranteed to succeed by
      // canPassOutputShapeHintsPerBs
      const auto& shape_hints = shape_hints_per_bs.find(bs)->second;

      for (int output_idx = 0; output_idx < op.output_size(); ++output_idx) {
        const auto& output_name = op.output(output_idx);
        const auto& shape_hint = shape_hints.find(output_name)->second;
        if (!shape_hint.is_quantized) {
          output_shape_arg->mutable_tensors()->Add()->CopyFrom(
              wrapShapeInfoIntoTensorProto(output_name, shape_hint));
        } else {
          // Quantized hints belong to the dedicated qshapes argument, in line
          // with the max-batch-size hints above
          output_qshape_arg->mutable_qtensors()->Add()->CopyFrom(
              wrapShapeInfoIntoQTensorProto(output_name, shape_hint));
        }
      }
    }
  } else {
    AddArgument("use_passed_output_shapes", 0, &op);
  }

  // Tell Onnxifi op that the model is in onnx or c2 proto format
  AddArgument("use_onnx", opts_.use_onnx ? 1 : 0, &op);

  // Tell Onnxifi op which backend id to use
  AddArgument("backend_id", idx_, &op);

  // Add model_id and net_pos to the onnxifi model
  AddArgument(kModelId, model_id_, &op);
  AddArgument(kNetPos, c10::to_string(onnxifi_op_id_++), &op);

  // Add output resizing hints
  if (opts_.adjust_batch) {
    AddArgument("adjust_output_batch", 1, &op);
  } else {
    AddArgument("adjust_output_batch", 0, &op);
  }
  AddArgument("max_batch_size", opts_.bound_shape_spec.max_batch_size, &op);
  AddArgument("max_seq_size", opts_.bound_shape_spec.max_seq_size, &op);
  AddArgument("timeout", opts_.timeout, &op);
  AddArgument("nominal_batch_idx", nominal_batch_idx, &op);
  AddArgument("use_onnxifi_batch_size", opts_.use_onnxifi_batch_size, &op);

  return op;
}

// Converts a subnet into a net holding a single Onnxifi op whose payload is
// the subnet itself, serialized as a C2 NetDef.
//
// \param net the subnet to offload
// \param weights_in_ws names of all weights present in the workspace
// \param shape_hints_max_bs shape info at max batch size; must contain every
//        input of the subnet and the indices/lengths inputs of the
//        SparseLengths* ops handled below (missing entries make
//        std::unordered_map::at throw)
// \param shape_hints_per_bs optional per-batch-size shape info, forwarded to
//        buildOnnxifiOp
// \return `net` unchanged when it has fewer ops than opts_.min_ops, otherwise
//         the net wrapping the Onnxifi op. With opts_.debug set, the original
//         net, the serialized net and the result are also written as text
//         protos into the working directory.
NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2(
    const caffe2::NetDef& net,
    const std::unordered_set<std::string>& weights_in_ws,
    const ShapeInfoMap& shape_hints_max_bs,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) {
  // Captured up front because buildOnnxifiOp increments the member counter
  int onnxifi_op_id = onnxifi_op_id_;
  if (opts_.debug) {
    WriteProtoToTextFile(
        net,
        "debug_original_net_" + c10::to_string(onnxifi_op_id) + ".pb_txt",
        false);
  }
  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  if (opts_.min_ops > net.op_size()) {
    return net;
  }
  // We already have all the ops and external inputs and outputs!
  NetDef onnxifi_net(net);

  // Remove the second output of Concat/Reshape from external_output. Remove
  // rest of the outputs of LayerNorm too. In addition, we remove those outputs
  // from the Onnxifi op too.
  // TODO: This approach is a bit hacky as we assume that the second output is
  // never used. A more appropriate approach can be learned from the ONNX path,
  // where we statically computes the split_info given input shape and insert a
  // GivenTensorIntFill op
  std::unordered_set<std::string> split_infos;
  for (auto& op : *onnxifi_net.mutable_op()) {
    if ((op.type() == "Concat" || op.type() == "Reshape") &&
        op.output_size() == 2) {
      split_infos.emplace(op.output(1));
    } else if (
        op.type() == "SparseLengthsSum" ||
        op.type() == "SparseLengthsSumFused8BitRowwise" ||
        op.type() == "SparseLengthsWeightedSum" ||
        op.type() == "SparseLengthsWeightedSumFused8BitRowwise" ||
        op.type() == "SparseLengthsSumFused4BitRowwise" ||
        op.type() == "SparseLengthsWeightedSumFused4BitRowwise") {
      // Weighted variants have one extra input ahead of indices/lengths
      int weighted = (op.type() == "SparseLengthsWeightedSum" ||
                      op.type() == "SparseLengthsWeightedSumFused8BitRowwise" ||
                      op.type() == "SparseLengthsWeightedSumFused4BitRowwise")
          ? 1
          : 0;
      const auto& indices_hint = shape_hints_max_bs.at(op.input(1 + weighted));
      const auto& lengths_hint = shape_hints_max_bs.at(op.input(2 + weighted));
      const auto& indices_shape = indices_hint.shape;
      const auto& lengths_shape = lengths_hint.shape;
      // Tag the op with "length1" when indices and lengths are 1-D, equally
      // long, and the indices' first dim is of BATCH_OF_FEATURE_MAX(_DEFAULT)
      // type
      if ((indices_hint.getDimType(0) ==
               TensorBoundShape_DimType_BATCH_OF_FEATURE_MAX ||
           indices_hint.getDimType(0) ==
               TensorBoundShape_DimType_BATCH_OF_FEATURE_MAX_DEFAULT) &&
          indices_shape.dims_size() == 1 && lengths_shape.dims_size() == 1 &&
          indices_shape.dims(0) == lengths_shape.dims(0)) {
        op.add_arg()->CopyFrom(MakeArgument<int>("length1", 1));
      }
    } else if (op.type() == "LayerNorm" && op.output_size() > 1) {
      for (int i = 1; i < op.output_size(); ++i) {
        split_infos.emplace(op.output(i));
      }
    }
  }
  onnxifi_net.clear_external_output();
  for (const auto& o : net.external_output()) {
    if (!split_infos.count(o)) {
      onnxifi_net.add_external_output(o);
    }
  }

  // Figure out weights and add it to external_inputs too
  std::unordered_set<std::string> initialization_list;
  std::vector<std::string> total_inputs_vec;
  getWeightsAndInputs(
      net,
      weights_in_ws,
      std::vector<std::string>(),
      &initialization_list,
      &total_inputs_vec);
  auto* shape_arg = onnxifi_net.add_arg();
  auto* qshape_arg = onnxifi_net.add_arg();
  shape_arg->set_name("input_shape_info");
  qshape_arg->set_name("input_qshape_info");
  std::sort(total_inputs_vec.begin(), total_inputs_vec.end());
  onnxifi_net.clear_external_input();
  for (const auto& i : total_inputs_vec) {
    onnxifi_net.add_external_input(i);
    // One lookup, bound by reference: avoids deep-copying the ShapeInfo and
    // repeating the same map lookup for each use
    const auto& info = shape_hints_max_bs.at(i);
    if (!info.is_quantized) {
      shape_arg->mutable_tensors()->Add()->CopyFrom(
          wrapShapeInfoIntoTensorProto(i, info));
    } else {
      qshape_arg->mutable_qtensors()->Add()->CopyFrom(
          wrapShapeInfoIntoQTensorProto(i, info));
    }
  }

  // Add partition info
  for (const auto& p : partition_infos_) {
    onnxifi_net.add_partition_info()->CopyFrom(p);
  }

  // Add initializers (weights) list to the net as an arg
  auto* w_arg = onnxifi_net.add_arg();
  w_arg->set_name(kInitializers);
  for (const auto& i : initialization_list) {
    w_arg->add_strings(i);
  }

  // Build ONNXIFI Op
  std::string model_str;
  onnxifi_net.SerializeToString(&model_str);
  std::vector<std::string> onnxifi_net_inputs(
      onnxifi_net.external_input().begin(), onnxifi_net.external_input().end());
  std::vector<std::string> onnxifi_net_outputs(
      onnxifi_net.external_output().begin(),
      onnxifi_net.external_output().end());
  auto onnxifi_op = buildOnnxifiOp(
      model_str,
      initialization_list,
      onnxifi_net_inputs,
      onnxifi_net_outputs,
      shape_hints_max_bs,
      shape_hints_per_bs);
  NetDef net_opt = composeResultNet(onnxifi_op);

  // Debugging stuff
  if (opts_.debug) {
    WriteProtoToTextFile(
        onnxifi_net,
        "debug_onnxifi_net_" + c10::to_string(onnxifi_op_id) + ".pb_txt",
        false);
    WriteProtoToTextFile(
        net_opt,
        "debug_optimized_net_" + c10::to_string(onnxifi_op_id) + ".pb_txt",
        false);
  }
  return net_opt;
}

// Converts a subnet into a net holding a single Onnxifi op whose payload is
// the subnet exported as an ONNX model.
//
// \param net the subnet to offload
// \param weights_in_ws names of all weights present in the workspace
// \param ws workspace that receives the extra constant tensors produced by
//        the export (as CPU tensors)
// \param exporter converter from Caffe2 ops to ONNX nodes
// \param shape_hints_max_bs [in/out] shape info at max batch size; gains an
//        entry (all dims of CONSTANT type) for every extra tensor
// \param shape_hints_per_bs optional per-batch-size shape info, forwarded to
//        buildOnnxifiOp
// \return `net` unchanged when it has fewer ops than opts_.min_ops, otherwise
//         the net wrapping the Onnxifi op. Throws (CAFFE_THROW) when an extra
//         tensor is neither FLOAT nor INT64. With opts_.debug set, the ONNX
//         model and the result are written as text protos into the working
//         directory.
NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx(
    const caffe2::NetDef& net,
    const std::unordered_set<std::string>& weights_in_ws,
    Workspace* ws,
    onnx::OnnxExporter* exporter,
    ShapeInfoMap* shape_hints_max_bs,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) {
  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  if (opts_.min_ops > net.op_size()) {
    return net;
  }
  ::ONNX_NAMESPACE::ModelProto onnx_model;
  fillModelInfo(&onnx_model);

  // Convert c2 ops to onnx ops, add const weights if there are any
  DeviceOption option;
  CPUContext context(option);
  context.SwitchToDevice();
  std::vector<std::string> extra_weights;
  for (const auto& op : net.op()) {
    const auto results = exporter->Caffe2OpToOnnxNodes(op, shape_hints_onnx_);
    for (const auto& n : results.first) {
      onnx_model.mutable_graph()->add_node()->CopyFrom(n);
    }
    for (const auto& t : results.second) {
      VLOG(2) << "Adding extra init tensor: " << t.name();
      TensorShape shape;
      shape.mutable_dims()->CopyFrom(t.dims());
      auto ret = shape_hints_onnx_.emplace(t.name(), std::move(shape));
      // `shape` must not be read past this point: it has been moved from.
      // Take the rank from the entry that actually lives in the map, which is
      // also the shape handed to the ShapeInfo below.
      const int num_dims = ret.first->second.dims_size();
      shape_hints_max_bs->emplace(
          std::piecewise_construct,
          std::forward_as_tuple(ret.first->first),
          std::forward_as_tuple(
              std::vector<TensorBoundShape::DimType>(
                  num_dims, TensorBoundShape_DimType_CONSTANT),
              ret.first->second));

      // Feed into workspace as CPU Tensors
      auto* blob = ws->CreateBlob(t.name());
      auto* cpu_tensor = BlobGetMutableTensor(blob, CPU);
      std::vector<int64_t> dims;
      for (const auto& d : t.dims()) {
        dims.push_back(d);
      }
      cpu_tensor->Resize(dims);
      // The payload is taken from the tensor's raw_data field
      if (t.data_type() == ::ONNX_NAMESPACE::TensorProto::FLOAT) {
        context.CopyBytesSameDevice(
            cpu_tensor->numel() * sizeof(float),
            static_cast<const void*>(t.raw_data().data()),
            cpu_tensor->raw_mutable_data(TypeMeta::Make<float>()));
      } else if (t.data_type() == ::ONNX_NAMESPACE::TensorProto::INT64) {
        context.CopyBytesSameDevice(
            cpu_tensor->numel() * sizeof(int64_t),
            static_cast<const void*>(t.raw_data().data()),
            cpu_tensor->raw_mutable_data(TypeMeta::Make<int64_t>()));
      } else {
        CAFFE_THROW(
            "Unsupported tensor data type for conversion: ", t.data_type());
      }
      context.FinishDeviceComputation();

      // Add mappings
      extra_weights.emplace_back(t.name());
    }
  }

  // Convert outputs and compute output shape hints
  std::vector<std::string> onnxifi_net_outputs;
  for (const auto& o : net.external_output()) {
    onnxifi_net_outputs.emplace_back(o);
  }
  auto io_vec = convertToValueInfo(
      onnxifi_net_outputs,
      shape_hints_onnx_,
      std::unordered_map<std::string, ::ONNX_NAMESPACE::TypeProto>());
  for (const auto& i : io_vec) {
    onnx_model.mutable_graph()->add_output()->CopyFrom(i);
  }

  // Convert inputs and figure out weights
  std::unordered_set<std::string> initialization_list;
  std::vector<std::string> onnxifi_net_inputs;
  getWeightsAndInputs(
      net,
      weights_in_ws,
      extra_weights,
      &initialization_list,
      &onnxifi_net_inputs);
  io_vec = convertToValueInfo(
      onnxifi_net_inputs,
      shape_hints_onnx_,
      std::unordered_map<std::string, ::ONNX_NAMESPACE::TypeProto>());
  for (const auto& i : io_vec) {
    onnx_model.mutable_graph()->add_input()->CopyFrom(i);
  }

  // Onnx model is ready. Build ONNXIFI Op
  std::string model_str;
  onnx_model.SerializeToString(&model_str);
  auto onnxifi_op = buildOnnxifiOp(
      model_str,
      initialization_list,
      onnxifi_net_inputs,
      onnxifi_net_outputs,
      *shape_hints_max_bs,
      shape_hints_per_bs);
  NetDef net_opt = composeResultNet(onnxifi_op);

  // Debugging stuff
  if (opts_.debug) {
    WriteProtoToTextFile(onnx_model, "debug_onnxifi_net.onnx_txt", false);
    WriteProtoToTextFile(net_opt, "debug_optimized_net.pb_txt", false);
  }
  return net_opt;
}

// Reports whether the backend `backend_id` accepts `op` once it has been
// exported to ONNX as a standalone model.
//
// Returns false when:
//   - the op's kNetPos argument is listed in `blocklisted_ops`,
//   - no schema (or no ONNX schema) is registered for the op type,
//   - onnxGetBackendCompatibility() reports anything but success,
//   - a std::exception is thrown while building the model (it is logged).
bool OnnxifiTransformer::supportOpOnnx(
    const caffe2::OperatorDef& op,
    onnx::OnnxExporter* exporter,
    const std::unordered_set<int>& blocklisted_ops,
    onnxBackendID backend_id) const {
  try {
    const int net_pos =
        ArgumentHelper::GetSingleArgument<OperatorDef, int>(op, kNetPos, -1);
    if (blocklisted_ops.find(net_pos) != blocklisted_ops.end()) {
      LOG(INFO) << "Skipping blocklisted op " << op.type() << " at pos "
                << net_pos;
      return false;
    }

    // NB: this might not be a hard constraint as we can just export C2
    // domain specific ops to ONNX
    const OpSchema* op_schema = OpSchemaRegistry::Schema(op.type());
    const bool has_onnx_schema =
        op_schema != nullptr && !op_schema->onnx_schema().empty();
    if (!has_onnx_schema) {
      LOG(INFO) << "Cannot export c2 op " << op.type()
                << " to onnx as there is no corresponding ONNX schema.";
      return false;
    }

    ::ONNX_NAMESPACE::ModelProto model;
    fillModelInfo(&model);
    const auto exported = exporter->Caffe2OpToOnnxNodes(op, shape_hints_onnx_);
    const auto& nodes = exported.first;
    const auto& extra_tensors = exported.second;

    // First pass over the nodes (already in topological order): copy them
    // into the graph and collect the boundary inputs, i.e. names consumed for
    // the first time that no earlier node has produced.
    std::unordered_set<std::string> consumed;
    std::unordered_set<std::string> produced;
    std::vector<std::string> graph_inputs;
    std::unordered_set<std::string> reshape_shape_blobs;
    for (const auto& node : nodes) {
      model.mutable_graph()->add_node()->CopyFrom(node);
      for (const auto& in : node.input()) {
        const bool first_use = consumed.insert(in).second;
        if (first_use && produced.count(in) == 0) {
          graph_inputs.push_back(in);
        }
      }
      produced.insert(node.output().begin(), node.output().end());

      // A Reshape node with more than one input carries its target shape in
      // the second input, which has to be fed as an initializer below.
      if (node.op_type() == "Reshape" && node.input_size() > 1) {
        reshape_shape_blobs.insert(node.input(1));
      }
    }

    // Second pass: boundary outputs are the outputs, taken at their first
    // occurrence, that no node consumes as an input.
    std::vector<std::string> graph_outputs;
    std::unordered_set<std::string> visited_outputs;
    for (const auto& node : nodes) {
      for (const auto& out : node.output()) {
        const bool first_seen = visited_outputs.insert(out).second;
        if (first_seen && consumed.count(out) == 0) {
          graph_outputs.push_back(out);
        }
      }
    }

    // Extra tensors emitted by the exporter provide additional type hints;
    // those feeding a Reshape shape input also become initializers.
    std::unordered_map<std::string, ::ONNX_NAMESPACE::TypeProto> extra_hints;
    for (const auto& tensor : extra_tensors) {
      extra_hints.emplace(tensor.name(), onnx::ExtraTypeProto(tensor));
      if (reshape_shape_blobs.count(tensor.name()) != 0) {
        model.mutable_graph()->add_initializer()->CopyFrom(tensor);
      }
    }

    // Add input/output shape info
    const auto input_infos =
        convertToValueInfo(graph_inputs, shape_hints_onnx_, extra_hints);
    for (const auto& info : input_infos) {
      model.mutable_graph()->add_input()->CopyFrom(info);
    }
    const auto output_infos =
        convertToValueInfo(graph_outputs, shape_hints_onnx_, extra_hints);
    for (const auto& info : output_infos) {
      model.mutable_graph()->add_output()->CopyFrom(info);
    }

    std::string serialized;
    model.SerializeToString(&serialized);
    const auto status = lib_->onnxGetBackendCompatibility(
        backend_id, serialized.size(), serialized.c_str());
    if (status == ONNXIFI_STATUS_SUCCESS) {
      return true;
    }
    LOG(INFO) << "Don't support onnx for " << op.type() << " c2 op (" << status
              << ")";
    return false;
  } catch (const std::exception& ex) {
    LOG(ERROR) << "Caught exception when converting op " << op.type()
               << ", what: " << ex.what();
    return false;
  }
}

// Reports whether the backend `backend_id` accepts `op` when handed a
// serialized single-op Caffe2 net annotated with shape info and initializers.
//
// Returns false when:
//   - the op's kNetPos argument is listed in `blocklisted_ops`,
//   - `shape_hints` lacks an entry for any input or output of the op,
//   - onnxGetBackendCompatibility() reports anything but success,
//   - a std::exception is thrown along the way (it is logged).
bool OnnxifiTransformer::supportOpC2(
    const caffe2::OperatorDef& op,
    const ShapeInfoMap& shape_hints,
    const std::unordered_set<std::string>& weights,
    const std::unordered_set<int>& blocklisted_ops,
    onnxBackendID backend_id) const {
  try {
    const int net_pos =
        ArgumentHelper::GetSingleArgument<OperatorDef, int>(op, kNetPos, -1);
    if (blocklisted_ops.find(net_pos) != blocklisted_ops.end()) {
      LOG(INFO) << "Skipping blocklisted op " << op.type() << " at pos "
                << net_pos;
      return false;
    }

    // Build a c2 net with one op; each distinct input is listed once.
    NetDef single_op_net;
    single_op_net.add_op()->CopyFrom(op);
    std::unordered_set<std::string> listed_inputs;
    for (const auto& input : op.input()) {
      if (listed_inputs.insert(input).second) {
        single_op_net.add_external_input(input);
      }
    }

    // Trailing outputs that are not exposed as external outputs: the second
    // output of a two-output Concat/Reshape, and everything but the first
    // output of LayerNorm.
    const int total_outputs = op.output_size();
    int hidden_outputs = 0;
    if ((op.type() == "Concat" || op.type() == "Reshape") &&
        total_outputs == 2) {
      hidden_outputs = 1;
    } else if (op.type() == "LayerNorm" && total_outputs > 1) {
      hidden_outputs = total_outputs - 1;
    }
    const int exposed_outputs = total_outputs - hidden_outputs;
    for (int k = 0; k < exposed_outputs; ++k) {
      single_op_net.add_external_output(op.output(k));
    }

    // Encode the input shapes into a pair of net arguments, one for regular
    // tensors and one for quantized tensors.
    auto* in_shapes = single_op_net.add_arg();
    auto* in_qshapes = single_op_net.add_arg();
    in_shapes->set_name("input_shape_info");
    in_qshapes->set_name("input_qshape_info");
    std::unordered_set<std::string> encoded_inputs;
    for (const auto& input : op.input()) {
      if (!encoded_inputs.insert(input).second) {
        continue;
      }
      const auto hint = shape_hints.find(input);
      if (hint == shape_hints.end()) {
        VLOG(1) << "Skipping " << op.type() << " (" << net_pos
                << ") due to missing shape info for input " << input;
        return false;
      }
      if (hint->second.is_quantized) {
        in_qshapes->mutable_qtensors()->Add()->CopyFrom(
            wrapShapeInfoIntoQTensorProto(input, hint->second));
      } else {
        in_shapes->mutable_tensors()->Add()->CopyFrom(
            wrapShapeInfoIntoTensorProto(input, hint->second));
      }
    }

    // Same for the outputs. Note that the quantized argument is appended to
    // the net before the regular one here.
    auto* out_qshapes = single_op_net.add_arg();
    auto* out_shapes = single_op_net.add_arg();
    out_shapes->set_name("output_shape_info");
    out_qshapes->set_name("output_qshape_info");
    for (const auto& output : op.output()) {
      const auto hint = shape_hints.find(output);
      if (hint == shape_hints.end()) {
        VLOG(1) << "Skipping " << op.type() << " (" << net_pos
                << ") due to missing shape info for output " << output;
        return false;
      }
      if (hint->second.is_quantized) {
        out_qshapes->mutable_qtensors()->Add()->CopyFrom(
            wrapShapeInfoIntoQTensorProto(output, hint->second));
      } else {
        out_shapes->mutable_tensors()->Add()->CopyFrom(
            wrapShapeInfoIntoTensorProto(output, hint->second));
      }
    }

    // Annotate the inputs that are weights
    auto* initializers = single_op_net.add_arg();
    initializers->set_name(kInitializers);
    for (const auto& input : op.input()) {
      if (weights.find(input) != weights.end()) {
        initializers->add_strings(input);
      }
    }

    std::string serialized;
    single_op_net.SerializeToString(&serialized);
    const auto status = lib_->onnxGetBackendCompatibility(
        backend_id, serialized.size(), serialized.c_str());
    if (status == ONNXIFI_STATUS_SUCCESS) {
      return true;
    }
    LOG(INFO) << "Don't support c2 op " << op.type() << " at pos " << net_pos
              << " (" << status << ")";
    return false;
  } catch (const std::exception& ex) {
    LOG(ERROR) << "Caught exception when converting op " << op.type()
               << ", what: " << ex.what();
    return false;
  }
}

void OnnxifiTransformer::tieGatherAndSparseLengthsWeightedSumOps(
    const NetDef& net,
    const ShapeInfoMap& shape_hints,
    const std::unordered_set<std::string>& weights,
    std::unordered_set<int>* blocklisted_ops) const {
  std::unordered_map<std::string, int> output_pos;
  onnx::OnnxExporter exporter(nullptr);
  onnxBackendID backend_id = backend_ids_[idx_];

  for (const auto& op : net.op()) {
    std::string check;
    if (op.type() == "Gather") {
      int pos =
          ArgumentHelper::GetSingleArgument<OperatorDef, int>(op, kNetPos, -1);
      for (const auto& output : op.output()) {
        output_pos.emplace(output, pos);
      }
    } else if (StartsWith(op.type(), "SparseLengthsWeighted")) {
      auto supported = opts_.use_onnx
          ? supportOpOnnx(op, &exporter, *blocklisted_ops, backend_id)
          : supportOpC2(op, shape_hints, weights, *blocklisted_ops, backend_id);
      if (!supported && op.input_size() > 1) {
        check = op.input(1);
      }
    } else if (
        op.type() == "SparseLengthsSumSparseLookup" && op.input_size() > 3) {
      check = op.input(3);
    }
    if (!check.empty()) {
      const auto it = output_pos.find(check);
      if (it == output_pos.end()) {
        continue;
      }
      blocklisted_ops->emplace(it->second);
      // We know that current op is not going to be supported. Might as well
      // blocklist it too
      blocklisted_ops->emplace(
          ArgumentHelper::GetSingleArgument<OperatorDef, int>(op, kNetPos, -1));
    }
  }
}

// Blocklists every op assigned to a partition that lists no device ids.
// Partitions come from `partition_infos_`; an op is matched to a partition
// through the node_name of its device option.
void OnnxifiTransformer::blocklistCpuPartition(
    const NetDef& net,
    std::unordered_set<int>* blocklisted_ops) const {
  // Names of the partitions that have an empty device id list.
  std::unordered_set<std::string> deviceless_partitions;
  for (const auto& partition : partition_infos_) {
    if (partition.device_id_size() != 0) {
      continue;
    }
    deviceless_partitions.emplace(partition.name());
  }

  for (const auto& op : net.op()) {
    const auto& partition_name = op.device_option().node_name();
    if (deviceless_partitions.find(partition_name) ==
        deviceless_partitions.end()) {
      continue;
    }
    const int net_pos =
        ArgumentHelper::GetSingleArgument<OperatorDef, int>(op, kNetPos, -1);
    blocklisted_ops->emplace(net_pos);
  }
}

// Runs the blocklisting rules over `net`, in this order:
//   1. tieGatherAndSparseLengthsWeightedSumOps
//   2. blocklistCpuPartition
// Both rules receive `blocklisted_ops` as a mutable pointer and may add net
// positions to it.
void OnnxifiTransformer::applyFilteringRules(
    const NetDef& net,
    const ShapeInfoMap& shape_hints,
    const std::unordered_set<std::string>& weights,
    std::unordered_set<int>* blocklisted_ops) const {
  tieGatherAndSparseLengthsWeightedSumOps(
      net, shape_hints, weights, blocklisted_ops);
  blocklistCpuPartition(net, blocklisted_ops);
}

// Selects the backend to use and returns a copy of all known backend ids.
//
// `idx_` is reset to 0. When ONNX is in use (opts_.use_onnx) nothing else
// happens. Otherwise the backends are scanned in order and `idx_` is set to
// the first one whose ONNXIFI_BACKEND_DEVICE info string contains "Caffe2";
// if none matches, `idx_` stays 0.
std::vector<onnxBackendID> OnnxifiTransformer::getBackendId() {
  idx_ = 0;

  if (opts_.use_onnx) {
    return backend_ids_;
  }
  // Try to find a backend that support Caffe2 proto. Note that this is quite
  // opportunistic as we don't officially support Caffe2 proto.
  for (size_t i = 0; i < backend_ids_.size(); ++i) {
    // A fresh zero-filled buffer per query, so that a backend which reports
    // success but writes little or nothing cannot be matched against bytes
    // left behind by a previous backend (or against uninitialized memory).
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    char buf[kBufferSize] = {};
    size_t len = kBufferSize;
    const auto ret = lib_->onnxGetBackendInfo(
        backend_ids_[i], ONNXIFI_BACKEND_DEVICE, buf, &len);
    if (ret != ONNXIFI_STATUS_SUCCESS) {
      continue;
    }
    // Guarantee termination before the C-string search, even if the backend
    // filled the whole buffer without a trailing NUL.
    buf[kBufferSize - 1] = '\0';
    if (strstr(buf, "Caffe2")) {
      LOG(INFO) << "Using backend with Caffe2 Proto, ID: " << i;
      idx_ = static_cast<decltype(idx_)>(i);
      break;
    }
  }
  return backend_ids_;
}

// Cuts `pred_net` for the currently selected backend using the Caffe2-proto
// path: ops are vetted with supportOpC2() and each runnable subnet is
// converted with SubnetToOnnxifiOpViaC2(). Returns the result of
// opt::OptimizeForBackend().
opt::CutResult OnnxifiTransformer::TransformViaC2(
    NetDef* pred_net,
    const std::unordered_set<std::string>& weights,
    const std::unordered_set<int>& blocklisted_ops,
    const ShapeInfoMap& shape_hints_max_bs,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) {
  const onnxBackendID backend_id = backend_ids_[idx_];

  // Predicate: does the backend support this single C2 op?
  auto is_supported =
      [this, backend_id, &weights, &blocklisted_ops, &shape_hints_max_bs](
          const caffe2::OperatorDef& candidate) {
        return supportOpC2(
            candidate,
            shape_hints_max_bs,
            weights,
            blocklisted_ops,
            backend_id);
      };

  // Converter: turn a runnable subnet into an onnxifi op.
  auto to_onnxifi_op =
      [this, &weights, &shape_hints_max_bs, &shape_hints_per_bs](
          const caffe2::NetDef& subnet) {
        return SubnetToOnnxifiOpViaC2(
            subnet, weights, shape_hints_max_bs, shape_hints_per_bs);
      };

  return opt::OptimizeForBackend(
      *pred_net, is_supported, to_onnxifi_op, opts_.debug);
}

// Cuts `pred_net` for the currently selected backend using the ONNX path: ops
// are vetted with supportOpOnnx() and each runnable subnet is converted with
// SubnetToOnnxifiOpViaOnnx(). Returns the result of
// opt::OptimizeForBackend().
opt::CutResult OnnxifiTransformer::TransformViaOnnx(
    Workspace* ws,
    NetDef* pred_net,
    const std::unordered_set<std::string>& weights,
    const std::unordered_set<int>& blocklisted_ops,
    ShapeInfoMap* shape_hints_max_bs,
    const std::unordered_map<int, ShapeInfoMap>& shape_hints_per_bs) {
  const onnxBackendID backend_id = backend_ids_[idx_];

  // Predicate telling whether the ONNXIFI backend supports a given C2 op. It
  // uses its own exporter instance.
  onnx::OnnxExporter support_exporter(nullptr);
  auto is_supported = [this, backend_id, &blocklisted_ops, &support_exporter](
                          const caffe2::OperatorDef& candidate) {
    return supportOpOnnx(
        candidate, &support_exporter, blocklisted_ops, backend_id);
  };

  // Converter turning a runnable subgraph into an onnxifi op. We need to keep
  // the same exporter throughout the process to avoid duplicated dummy name
  // generation
  onnx::OnnxExporter convert_exporter(nullptr);
  auto to_onnxifi_op = [this,
                        ws,
                        shape_hints_max_bs,
                        &weights,
                        &shape_hints_per_bs,
                        &convert_exporter](
                           const caffe2::NetDef& subnet) mutable {
    return SubnetToOnnxifiOpViaOnnx(
        subnet,
        weights,
        ws,
        &convert_exporter,
        shape_hints_max_bs,
        shape_hints_per_bs);
  };

  return opt::OptimizeForBackend(
      *pred_net, is_supported, to_onnxifi_op, opts_.debug);
}

// Replaces the contents of `partition_infos_` with the partition_info
// entries of `net`, in the order they appear in the net.
void OnnxifiTransformer::extractPartitionInfo(const NetDef& net) {
  partition_infos_.clear();
  for (const auto& partition : net.partition_info()) {
    partition_infos_.emplace_back(partition);
  }
}

// Cuts off the runnable part of `pred_net` and replaces it with ONNXIFI ops.
// Assumes the net is topologically sorted.
//
// `ws` and `pred_net` must be non-null (enforced). On return `pred_net` holds
// the optimized net, keeping its original device option and type, with shape
// info attached. When opts_.debug is set, intermediate nets are dumped to
// text files. Throws when opts_.verify_only_single_subnet is set and more
// than one subnet was produced.
void OnnxifiTransformer::transform(
    Workspace* ws,
    NetDef* pred_net,
    const std::vector<std::string>& weight_names,
    const ShapeInfoMap& input_shape_hints,
    const std::unordered_set<int>& blocklisted_ops) {
  CAFFE_ENFORCE(ws);
  CAFFE_ENFORCE(pred_net, "Predict net cannot be nullptr");

  if (opts_.debug) {
    WriteProtoToTextFile(*pred_net, "debug_pre_ssa_net.pb_txt", false);
  }

  // Get model id and reset Onnxifi op id to 0
  model_id_ = getModelId(*pred_net);
  onnxifi_op_id_ = 0;

  // Unroll If ops
  fetchInputsToIfOpsSubnet(pred_net);

  std::unordered_set<std::string> weight_set(
      weight_names.begin(), weight_names.end());

  // SSA Rewrite the net if it has not been rewritten
  ShapeInfoMap mapped_hints;
  if (!opts_.predictor_net_ssa_rewritten) {
    mapped_hints = ssaRewriteAndMapNames(ws, pred_net, input_shape_hints);
  } else {
    LOG(INFO) << "predictor net has been ssaRewritten, skip rewritting here";
    annotateOpIndex(pred_net);
    mapped_hints = input_shape_hints;
    // Every weight maps to itself.
    for (const auto& weight : weight_set) {
      input_mapping_.emplace(weight, weight);
    }
  }

  // Populate shape info
  // TODO(yingz): We should not need to create mapped_ws since we did not change
  // any input mappings during ssarewrite. However this is here for the
  // following reason: BlackBoxPredictor calls RunNetOnce before onnxifi to
  // populate dimension info. However during this, it was observed, that new
  // blob for output is created. This causes problem if inferShape uses original
  // ws since it does not expect the output blob to be present.
  Workspace mapped_ws(ws, input_mapping_);
  ShapeInfoMap max_bs_hints =
      inferShapes(&mapped_ws, pred_net, mapped_hints, opts_.bound_shape_spec);
  if (opts_.use_onnx) {
    shape_hints_onnx_ = stripShapeInfoMap(max_bs_hints);
  }
  if (opts_.enforce_fp32_inputs_into_fp16) {
    enforceFp32InputsToFp16(weight_set, pred_net, &max_bs_hints);
  }
  if (opts_.merge_fp32_inputs_into_fp16) {
    mergeFp32InputsAndConvertToFp16(
        opts_.bound_shape_spec.max_batch_size,
        weight_set,
        pred_net,
        &max_bs_hints);
  }

  if (opts_.debug) {
    // Dump a copy of the net with the weights attached as an extra argument.
    caffe2::NetDef annotated_net;
    annotated_net.CopyFrom(*pred_net);
    auto* initializers = annotated_net.add_arg();
    initializers->set_name(kInitializers);
    for (const auto& weight : weight_set) {
      initializers->add_strings(weight);
    }
    dumpNet(annotated_net, max_bs_hints, "debug_ssa_net.pb_txt");
  }
  extractPartitionInfo(*pred_net);

  // Get backend id
  getBackendId();

  // Apply some filtering rules on a private copy of the caller's blocklist
  std::unordered_set<int> effective_blocklist(
      blocklisted_ops.begin(), blocklisted_ops.end());
  applyFilteringRules(
      *pred_net, max_bs_hints, weight_set, &effective_blocklist);

  // Transform the net
  opt::CutResult cut = !opts_.use_onnx
      ? TransformViaC2(
            pred_net,
            weight_set,
            effective_blocklist,
            max_bs_hints,
            opts_.shape_hints_per_bs)
      : TransformViaOnnx(
            ws,
            pred_net,
            weight_set,
            effective_blocklist,
            &max_bs_hints,
            opts_.shape_hints_per_bs);

  auto optimized_net = std::move(cut.net);
  // Need to figure out a proper place to handle device option
  optimized_net.mutable_device_option()->CopyFrom(pred_net->device_option());
  optimized_net.set_type(pred_net->type());

  pred_net->Swap(&optimized_net);

  addShapeToNet(*pred_net, max_bs_hints);
  if (opts_.debug) {
    WriteProtoToTextFile(*pred_net, "debug_full_opt_net.pb_txt", false);
  }
  if (opts_.verify_only_single_subnet && cut.numberOfSubnets > 1) {
    CAFFE_THROW(
        "Multiple Onnxifi ops were created: ",
        cut.numberOfSubnets,
        " subnets were found. There may be unsupported operators in the model.");
  }
}

} // namespace caffe2