File: darkmode_classifier.cc

package info (click to toggle)
chromium 139.0.7258.138-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 6,120,676 kB
sloc: cpp: 35,100,869; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (1188 lines) | stat: -rw-r--r-- 51,561 bytes
parent folder | download | duplicates (6)
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

// This file is automatically generated using tfNative from a neural network,
// trained by TensorFlow. Please do not edit.

#include "darkmode_classifier.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <tuple>
#if USE_EIGEN
#include "third_party/eigen3/Eigen/Core"
#endif
namespace darkmode_tfnative_model {
namespace {

// -----------------------------------------------------------------------------
// OP LIBRARY
// Copied here to make sure that the inference code always stays in sync with
// the lib that it was generated for.
// -----------------------------------------------------------------------------

// Default to using std::copy and std::fill over memcpy and memset as they
// are usually faster, thanks to the compiler getting stricter alignment
// guarantees.
// The default below only applies when USE_TYPED_MEMSETMEMCPY has not already
// been defined (e.g. on the compiler command line).
#ifndef USE_TYPED_MEMSETMEMCPY
#define USE_TYPED_MEMSETMEMCPY 1
#endif
// NOTE(review): USE_EIGEN is defined unconditionally here, which means
//   * the #ifndef/#error check right below can never fire, and
//   * the `#if USE_EIGEN` that guards the Eigen include earlier in this file
//     is evaluated before this definition is seen.
// All `#if USE_EIGEN` sections after this point therefore select their
// non-Eigen branch.
#define USE_EIGEN 0
#ifndef USE_EIGEN
#error Please define USE_EIGEN to either 0 or 1
#endif

// Helper to reinterpret memory as Eigen matrices.
// These aliases wrap existing buffers in Eigen::Map views; they are only
// compiled when USE_EIGEN is non-zero.
#if USE_EIGEN
// Read-only view of a buffer as a matrix with dynamic rows and columns.
template <typename Scalar>
using ConstMatrixMap = typename Eigen::Map<
    const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
// Read-only view of a buffer as a one-dimensional vector.
// NOTE(review): the template arguments are (Dynamic rows, 1 column), so
// despite the name this looks like an Eigen column vector - confirm.
template <typename Scalar>
using ConstRowVectorMap =
    typename Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
// Writable counterpart of ConstRowVectorMap (same Dynamic x 1 layout).
template <typename Scalar>
using RowVectorMap =
    typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
// Writable view of a buffer as a matrix with dynamic rows and columns.
template <typename Scalar>
using MatrixMap =
    typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Every op below starts with BENCHMARK_TIMER("<op name>"). The macro is
// defined with an empty replacement here, so those statements expand to
// nothing.
#define BENCHMARK_TIMER(...)

// Returns the number of coefficients described by a shape, i.e. the product
// of the first `rank` entries of `shape`. For a rank of zero (or less) the
// result is 1 and `shape` is never read.
inline int ShapeSize(const int32_t rank, const int32_t* shape) {
  int num_coeffs = 1;
  for (int32_t remaining = rank; remaining > 0; --remaining) {
    num_coeffs *= *shape++;
  }
  return num_coeffs;
}

// Computes the inner loop size of a reduction whose reduced axes are given by
// an index tensor: the product of the input extents along every listed axis.
//
// The index tensor must be rank 0 or rank 1. A rank-0 index tensor is treated
// as holding exactly one axis (and `index_shape` is not read); for rank 1 the
// number of axes is `index_shape[0]`. `input_tensor_rank` is not used by this
// implementation.
template <typename Tidx>
int32_t GetReduceInnerSize(int32_t input_tensor_rank,
                           const int32_t* __restrict input_shape,
                           int32_t index_tensor_rank,
                           const int32_t* __restrict index_shape,
                           const Tidx* __restrict index_values) {
  assert(index_tensor_rank <= 1);
  int32_t axis_count = 1;
  if (index_tensor_rank > 0) {
    axis_count = index_shape[0];
  }

  int32_t product = 1;
  const Tidx* axis = index_values;
  for (int32_t remaining = axis_count; remaining > 0; --remaining, ++axis) {
    product *= input_shape[*axis];
  }
  return product;
}

// Concatenates two tensors along the axis stored in `axis_value[0]`.
//
// The dimensions in front of the axis (taken from `arg0_shape`) give the
// number of outer "lines". For every line, the contiguous run of arg0
// coefficients (axis and all following dimensions) is written first, followed
// by the corresponding run of arg1.
template <typename T>
void ConcatV2Args2(int32_t arg0_rank,
                   const int32_t* __restrict arg0_shape,
                   const T* __restrict arg0_values,
                   int32_t arg1_rank,
                   const int32_t* __restrict arg1_shape,
                   const T* __restrict arg1_values,
                   const int32_t* __restrict axis_value,
                   T* __restrict output_values) {
  BENCHMARK_TIMER("ConcatV2Args2");
  const int concat_axis = axis_value[0];
  const int outer_count = ShapeSize(concat_axis, arg0_shape);
  const int lhs_run =
      ShapeSize(arg0_rank - concat_axis, arg0_shape + concat_axis);
  const int rhs_run =
      ShapeSize(arg1_rank - concat_axis, arg1_shape + concat_axis);

  const T* lhs = arg0_values;
  const T* rhs = arg1_values;
  T* out = output_values;
  for (int outer = 0; outer < outer_count; ++outer) {
    // std::copy returns the position just past the last element written.
    out = std::copy(lhs, lhs + lhs_run, out);
    lhs += lhs_run;
    out = std::copy(rhs, rhs + rhs_run, out);
    rhs += rhs_run;
  }
}

// Evaluates a 2D convolution as a single matrix product.
//
// in_shape:      {dot length, number of output rows}. `in_values` stores one
//                run of in_shape[0] coefficients per output row.
// filter_shape:  the non-Eigen path only reads filter_shape[3], the number of
//                output columns; `filter_values` is indexed as
//                [dot index][output column].
// output_values: receives in_shape[1] * filter_shape[3] coefficients, written
//                row by row.
template <typename T>
void Conv2DAsGemm(const int32_t* __restrict in_shape,
                  const T* __restrict in_values,
                  const int32_t* __restrict filter_shape,
                  const T* __restrict filter_values,
                  T* __restrict output_values) {
  BENCHMARK_TIMER("Conv2DAsGemm");
#if USE_EIGEN
  const auto in = ConstMatrixMap<T>(in_values, in_shape[0], in_shape[1]);
  const auto filter =
      ConstMatrixMap<T>(filter_values, filter_shape[3],
                        filter_shape[0] * filter_shape[1] * filter_shape[2]);
  auto result = MatrixMap<T>(output_values, filter_shape[3], in_shape[1]);
  result.noalias() = filter * in;
#else
  const int32_t patch_len = in_shape[0];
  const int32_t num_patches = in_shape[1];
  const int32_t num_filters = filter_shape[3];
  for (int32_t p = 0; p < num_patches; ++p) {
    const T* const patch = in_values + p * patch_len;
    T* const out_row = output_values + p * num_filters;
    for (int32_t f = 0; f < num_filters; ++f) {
      // Dot product of one input run with one filter column.
      T acc = 0;
      for (int32_t k = 0; k < patch_len; ++k) {
        acc += patch[k] * filter_values[k * num_filters + f];
      }
      out_row[f] = acc;
    }
  }
#endif
}

// Depthwise 2D convolution over an input indexed as [batch][y][x][channel].
//
// Every input channel is filtered on its own with kernel_shape[3] filters
// (the channel multiplier), so each output position holds
// kernel_shape[2] * kernel_shape[3] values ordered [channel][multiplier].
// The kernel is indexed as [y][x][channel][multiplier].
//
// The padding is derived from the requested output size. Kernel taps that
// would land outside the input are skipped, i.e. they contribute nothing to
// the sum.
template <typename T>
void DepthwiseConv2dNative(const int32_t* __restrict input_shape,
                           const T* __restrict input_values,
                           const int32_t* __restrict kernel_shape,
                           const T* __restrict kernel_values,
                           int32_t stride_y,
                           int32_t stride_x,
                           int32_t out_height,
                           int32_t out_width,
                           T* __restrict output_values) {
  BENCHMARK_TIMER("DepthwiseConv2dNative");
  assert(input_shape[3] == kernel_shape[2]);

  // Input extents.
  const int num_batches = input_shape[0];
  const int in_height = input_shape[1];
  const int in_width = input_shape[2];
  // Kernel extents.
  const int k_rows = kernel_shape[0];
  const int k_cols = kernel_shape[1];
  const int channels = kernel_shape[2];
  const int multiplier = kernel_shape[3];

  // Padding in front of the first row / column that yields the requested
  // output size.
  const int pad_top = ((out_height - 1) * stride_y + k_rows - in_height) / 2;
  const int pad_left = ((out_width - 1) * stride_x + k_cols - in_width) / 2;

  // Element steps used for address computations.
  const int in_batch_step = input_shape[1] * input_shape[2] * input_shape[3];
  const int in_y_step = input_shape[2] * input_shape[3];
  const int in_x_step = input_shape[3];
  const int k_y_step = kernel_shape[1] * kernel_shape[2] * kernel_shape[3];
  const int k_x_step = kernel_shape[2] * kernel_shape[3];
  const int k_chan_step = kernel_shape[3];

  T* out = output_values;
  for (int b = 0; b < num_batches; ++b) {
    const int batch_offset = b * in_batch_step;
    for (int oy = 0; oy < out_height; ++oy) {
      // Input row hit by kernel row 0, and the kernel rows that stay inside
      // the input.
      const int y_origin = oy * stride_y - pad_top;
      const int ky_first = std::max(0, -y_origin);
      const int ky_last = std::min(k_rows, in_height - y_origin);
      for (int ox = 0; ox < out_width; ++ox) {
        // Same for columns.
        const int x_origin = ox * stride_x - pad_left;
        const int kx_first = std::max(0, -x_origin);
        const int kx_last = std::min(k_cols, in_width - x_origin);

        for (int c = 0; c < channels; ++c) {
          for (int m = 0; m < multiplier; ++m) {
            T acc = 0;
            for (int ky = ky_first; ky < ky_last; ++ky) {
              const int in_y = y_origin + ky;
              assert(in_y >= 0 && in_y < in_height);
              for (int kx = kx_first; kx < kx_last; ++kx) {
                const int in_x = x_origin + kx;
                assert(in_x >= 0 && in_x < in_width);
                const int in_index =
                    batch_offset + in_y * in_y_step + in_x * in_x_step + c;
                const int k_index =
                    ky * k_y_step + kx * k_x_step + c * k_chan_step + m;
                acc += input_values[in_index] * kernel_values[k_index];
              }
            }
            *out++ = acc;
          }
        }
      }
    }
  }
}

// Fully connected layer: output = input * weight + bias.
//
// input_shape:  {batch size, number of inputs}
// weight_shape: {number of inputs, number of outputs}; `weight_values` is
//               indexed as [input][output].
// bias_shape:   {number of outputs}
// The output holds one run of `number of outputs` values per batch entry.
template <typename T>
void FullyConnected(const int32_t* __restrict input_shape,
                    const T* __restrict input_values,
                    const int32_t* __restrict weight_shape,
                    const T* __restrict weight_values,
                    const int32_t* __restrict bias_shape,
                    const T* __restrict bias_values,
                    T* __restrict output_values) {
  BENCHMARK_TIMER("FullyConnected");
#if USE_EIGEN
  const auto in =
      ConstMatrixMap<T>(input_values, input_shape[1], input_shape[0]);
  const auto weight =
      ConstMatrixMap<T>(weight_values, weight_shape[1], weight_shape[0]);
  const auto bias = ConstRowVectorMap<T>(bias_values, bias_shape[0]);
  auto result = MatrixMap<T>(output_values, weight_shape[1], input_shape[0]);
  result.noalias() = (weight * in).colwise() + bias;
#else
  const int num_inputs = weight_shape[0];
  const int num_outputs = weight_shape[1];
  assert(input_shape[1] == num_inputs);
  assert(bias_shape[0] == num_outputs);

  const int num_rows = input_shape[0];
  for (int r = 0; r < num_rows; ++r) {
    const T* const activations = input_values + r * num_inputs;
    T* const result_row = output_values + r * num_outputs;
    for (int o = 0; o < num_outputs; ++o) {
      // Weighted sum over all inputs first, bias added last.
      T acc = 0;
      for (int k = 0; k < num_inputs; ++k) {
        acc += activations[k] * weight_values[k * num_outputs + o];
      }
      acc += bias_values[o];
      result_row[o] = acc;
    }
  }
#endif
}

// Gathers slices of `params` along its first dimension.
//
// One slice (all dimensions of `params` after the first) is written to the
// output for every entry of `indices_values`, in order. An index that is
// negative or not smaller than params_shape[0] yields a zero-filled slice
// instead of a read.
template <typename T, typename TIndex>
void Gather(int params_rank,
            const int32_t* __restrict params_shape,
            const T* __restrict params_values,
            int indices_rank,
            const int32_t* __restrict indices_shape,
            const TIndex* __restrict indices_values,
            T* __restrict output_values) {
  BENCHMARK_TIMER("Gather");
  const int slice_len = ShapeSize(params_rank - 1, params_shape + 1);
  const int row_count = params_shape[0];
  const int lookup_count = ShapeSize(indices_rank, indices_shape);

  T* out = output_values;
  for (int n = 0; n < lookup_count; ++n, out += slice_len) {
    const int row = indices_values[n];
    const bool in_range = row >= 0 && row < row_count;
    if (in_range) {
      const T* const src = params_values + row * slice_len;
      std::copy(src, src + slice_len, out);
    } else {
      std::fill(out, out + slice_len, 0);
    }
  }
}

// Unrolls the kernel-sized input patch of every output position into one
// contiguous run of kernel_height * kernel_width * depth coefficients.
//
// The input is indexed as [batch][y][x][channel]. Patches are written in
// [batch][out_y][out_x] order; inside a patch the order is
// [kernel y][kernel x][channel]. Parts of a patch that lie outside the input
// are written as zeros. The padding is derived from the requested output
// size.
//
// output_shape receives {patch length, batch * out_width * out_height}.
template <typename T, typename TIndex>
void Im2Col(const int32_t* __restrict input_shape,
            const T* __restrict input_values,
            const int32_t* __restrict kernel_shape,
            int32_t stride_y,
            int32_t stride_x,
            int32_t out_height,
            int32_t out_width,
            TIndex* output_shape,
            T* __restrict output_values) {
  BENCHMARK_TIMER("Im2Col");
  assert(input_shape[3] == kernel_shape[2]);

  const int num_batches = input_shape[0];
  const int in_height = input_shape[1];
  const int in_width = input_shape[2];
  const int k_rows = kernel_shape[0];
  const int k_cols = kernel_shape[1];
  const int channels = kernel_shape[2];

  // Padding in front of the first row / column that yields the requested
  // output size.
  const int pad_top = ((out_height - 1) * stride_y + k_rows - in_height) / 2;
  const int pad_left = ((out_width - 1) * stride_x + k_cols - in_width) / 2;

  // Element steps used for address computations.
  const int x_step = input_shape[3];
  const int y_step = input_shape[2] * x_step;
  const int batch_step = input_shape[1] * y_step;

  output_shape[0] = k_rows * k_cols * channels;
  output_shape[1] = input_shape[0] * out_width * out_height;

  // Appends `count` zero coefficients to the output.
  const auto emit_zeros = [&output_values](int count) {
#if USE_TYPED_MEMSETMEMCPY
    std::fill(output_values, output_values + count, 0);
#else
    std::memset(output_values, 0, count * sizeof(T));
#endif
    output_values += count;
  };

  for (int b = 0; b < num_batches; ++b) {
    for (int oy = 0; oy < out_height; ++oy) {
      // Input row hit by kernel row 0, and the kernel rows that stay inside
      // the input.
      const int y_origin = oy * stride_y - pad_top;
      const int ky_first = std::max(0, -y_origin);
      const int ky_last = std::min(k_rows, in_height - y_origin);
      for (int ox = 0; ox < out_width; ++ox) {
        // Same for columns.
        const int x_origin = ox * stride_x - pad_left;
        const int kx_first = std::max(0, -x_origin);
        const int kx_last = std::min(k_cols, in_width - x_origin);

        // Kernel rows above the input.
        if (ky_first > 0) {
          emit_zeros(ky_first * k_cols * channels);
        }
        for (int ky = ky_first; ky < ky_last; ++ky) {
          // Kernel columns left of the input.
          if (kx_first > 0) {
            emit_zeros(kx_first * channels);
          }
          // The part of this kernel row that overlaps the input is contiguous
          // in memory and copied in one go.
          const int in_y = y_origin + ky;
          const int in_x = x_origin + kx_first;
          const int run = (kx_last - kx_first) * channels;
          const T* const src =
              input_values + (b * batch_step + in_y * y_step + in_x * x_step);
#if USE_TYPED_MEMSETMEMCPY
          std::copy(src, src + run, output_values);
#else
          std::memcpy(output_values, src, run * sizeof(T));
#endif
          output_values += run;
          // Kernel columns right of the input.
          if (kx_last < k_cols) {
            emit_zeros((k_cols - kx_last) * channels);
          }
        }
        // Kernel rows below the input.
        if (ky_last < k_rows) {
          emit_zeros((k_rows - ky_last) * k_cols * channels);
        }
      }
    }
  }
}

// Max pooling over an input indexed as [batch][y][x][channel].
//
// For every output position and channel the maximum over a
// kernel_height x kernel_width window is written. The padding is derived from
// the requested output size; window positions outside the input are skipped.
// A window without any valid position yields std::numeric_limits<T>::lowest().
template <typename T>
void MaxPool(const int32_t* __restrict input_shape,
             const T* __restrict input_values,
             int32_t stride_y,
             int32_t stride_x,
             int32_t kernel_height,
             int32_t kernel_width,
             int32_t out_height,
             int32_t out_width,
             T* __restrict output_values) {
  BENCHMARK_TIMER("MaxPool");
  const int num_batches = input_shape[0];
  const int in_height = input_shape[1];
  const int in_width = input_shape[2];
  const int channels = input_shape[3];

  // Padding in front of the first row / column that yields the requested
  // output size.
  const int pad_top =
      ((out_height - 1) * stride_y + kernel_height - in_height) / 2;
  const int pad_left =
      ((out_width - 1) * stride_x + kernel_width - in_width) / 2;

  // Element steps used for address computations.
  const int batch_step = input_shape[1] * input_shape[2] * input_shape[3];
  const int y_step = input_shape[2] * input_shape[3];
  const int x_step = input_shape[3];

  T* out = output_values;
  for (int b = 0; b < num_batches; ++b) {
    const int batch_offset = b * batch_step;
    for (int oy = 0; oy < out_height; ++oy) {
      // Input row hit by window row 0, and the window rows that stay inside
      // the input.
      const int y_origin = oy * stride_y - pad_top;
      const int ky_first = std::max(0, -y_origin);
      const int ky_last = std::min(kernel_height, in_height - y_origin);
      for (int ox = 0; ox < out_width; ++ox) {
        // Same for columns.
        const int x_origin = ox * stride_x - pad_left;
        const int kx_first = std::max(0, -x_origin);
        const int kx_last = std::min(kernel_width, in_width - x_origin);

        for (int c = 0; c < channels; ++c) {
          T best = std::numeric_limits<T>::lowest();
          for (int ky = ky_first; ky < ky_last; ++ky) {
            const int in_y = y_origin + ky;
            assert(in_y >= 0 && in_y < in_height);
            for (int kx = kx_first; kx < kx_last; ++kx) {
              const int in_x = x_origin + kx;
              assert(in_x >= 0 && in_x < in_width);
              const T candidate =
                  input_values[batch_offset + in_y * y_step + in_x * x_step +
                               c];
              best = std::max(best, candidate);
            }
          }
          *out++ = best;
        }
      }
    }
  }
}

// Copies every coefficient of a tensor with the given shape from
// `input_values` to `output_values`.
template <typename T>
void Memcpy(const int32_t rank,
            const int32_t* __restrict input_shape,
            const T* __restrict input_values,
            T* __restrict output_values) {
  BENCHMARK_TIMER("Memcpy");
  const int num_coeffs = ShapeSize(rank, input_shape);
  // std::copy_n does nothing for a non-positive count.
  std::copy_n(input_values, num_coeffs, output_values);
}

// Softmax over the last dimension of a rank-2 tensor.
//
// rank / input_shape: shape of the input; only rank == 2 is supported.
// reduce_dim:         dimension to normalize over; only 1 is supported.
// output_values:      receives one normalized row per input row.
// scratch_values:     work buffer with room for as many coefficients as the
//                     input; it holds the exponentials on return.
//
// Any other rank / reduce_dim combination trips an assert and leaves the
// output untouched when asserts are compiled out.
//
// The maximum that is subtracted before exponentiating is taken per row.
// Subtracting a maximum taken over the whole tensor instead lets every
// exponential of a row that lies far below that maximum underflow to zero,
// so the row sum becomes zero and the division yields inf/NaN.
template <typename T>
void Softmax(const int32_t rank,
             const int32_t* __restrict input_shape,
             const T* __restrict input_values,
             const int32_t reduce_dim,
             T* __restrict output_values,
             T* __restrict scratch_values) {
  BENCHMARK_TIMER("Softmax");
  const int size = ShapeSize(rank, input_shape);
  if (rank == 2 && reduce_dim == 1) {
    const int row_len = input_shape[1];
    for (int offset = 0; offset < size; offset += row_len) {
      const T* const logits = input_values + offset;
      T* const exps = scratch_values + offset;
      T* const probs = output_values + offset;

      // Max of this row; the largest exponent below is therefore exactly 0.
      T logits_max = std::numeric_limits<T>::lowest();
      for (int i = 0; i < row_len; ++i) {
        logits_max = std::max(logits_max, logits[i]);
      }

      // Exponentials and their sum.
      T sum = 0;
      for (int i = 0; i < row_len; ++i) {
        exps[i] = std::exp(logits[i] - logits_max);
        sum += exps[i];
      }

      // Normalize and write out.
      const T rcp_denom = static_cast<T>(1) / sum;
      for (int i = 0; i < row_len; ++i) {
        probs[i] = exps[i] * rcp_denom;
      }
    }
  } else {
    assert(false && "Generic Softmax not yet supported.");
  }
}

// Returns the start position for a slice in a single dimension.
//
// If bit `dim` of `range_mask` is clear, the explicit value
// `range_values[dim]` is returned. If the bit is set, the explicit value is
// ignored: the slice starts at index 0 for a non-negative stride and at the
// last index (input_shape[dim] - 1) for a negative stride.
template <typename T>
int StridedSliceBegin(int range_mask,
                      const T* __restrict range_values,
                      const T* __restrict strides,
                      const int32_t* __restrict input_shape,
                      int dim) {
  const bool masked = (range_mask & (1 << dim)) != 0;
  if (!masked) {
    return range_values[dim];
  }
  return strides[dim] < 0 ? input_shape[dim] - 1 : 0;
}

// Returns the end position for a slice in a single dimension.
//
// If bit `dim` of `range_mask` is clear, the explicit value
// `range_values[dim]` is returned. If the bit is set, the explicit value is
// ignored: the slice ends at input_shape[dim] for a non-negative stride and
// at -1 (one step in front of index 0) for a negative stride.
template <typename T>
int StridedSliceEnd(int range_mask,
                    const T* __restrict range_values,
                    const T* __restrict strides,
                    const int32_t* __restrict input_shape,
                    int dim) {
  const bool masked = (range_mask & (1 << dim)) != 0;
  if (!masked) {
    return range_values[dim];
  }
  return strides[dim] < 0 ? -1 : input_shape[dim];
}

// Extracts a strided slice from a tensor.
//
// input_rank / input_shape: shape of the input; the rank must be at least 1
//                           and smaller than MAX_RANK (8).
// begin / end / strides:    one entry per dimension. A negative stride walks
//                           the dimension backwards.
// begin_mask / end_mask:    if bit `dim` is set, begin[dim] / end[dim] is
//                           ignored and the slice runs from / to the edge of
//                           that dimension (see StridedSliceBegin/End).
// output_values:            receives the selected coefficients in row-major
//                           order.
//
// Trailing dimensions that are taken in full (begin 0, end == extent,
// stride 1) are contiguous in memory and copied as one block per step.
template <typename T, typename TIdx>
void StridedSlice(const int32_t input_rank,
                  const int32_t* __restrict input_shape,
                  const T* __restrict input_values,
                  const TIdx* __restrict begin,
                  const TIdx* __restrict end,
                  const TIdx* __restrict strides,
                  int32_t begin_mask,
                  int32_t end_mask,
                  T* __restrict output_values) {
  BENCHMARK_TIMER("StridedSlice");
  const int MAX_RANK = 8;
  // Rank 0 would index dim_addr_strides[-1] below.
  assert(input_rank > 0 && input_rank < MAX_RANK);

  // Compute the address strides for each dimension.
  std::array<int, MAX_RANK> dim_addr_strides = {};
  dim_addr_strides[input_rank - 1] = 1;
  for (int dim = input_rank - 2; dim >= 0; --dim) {
    dim_addr_strides[dim] = dim_addr_strides[dim + 1] * input_shape[dim + 1];
  }

  // Resolve the masks and get explicit ranges for each dimension.
  std::array<int, MAX_RANK> dim_begin = {};
  std::array<int, MAX_RANK> dim_end = {};
  std::array<bool, MAX_RANK> dim_is_full_range = {};
  for (int dim = 0; dim < input_rank; ++dim) {
    const int stride = strides[dim];
    dim_begin[dim] =
        StridedSliceBegin(begin_mask, begin, strides, input_shape, dim);
    dim_end[dim] = StridedSliceEnd(end_mask, end, strides, input_shape, dim);
    dim_is_full_range[dim] =
        dim_begin[dim] == 0 && dim_end[dim] == input_shape[dim] && stride == 1;

    // Our termination criteria for loops is that we hit the end exactly, so
    // we need to ensure that we don't step over the end with stride != 1.
    const int length_mod = (dim_end[dim] - dim_begin[dim]) % stride;
    if (length_mod != 0) {
      dim_end[dim] += stride - length_mod;
    }
  }

  // Find out how large the blocks are that we can copy contiguously. (All
  // dimensions on the right for which we fetch the full range)
  int last_sliced_dim = input_rank - 1;
  int block_size = 1;
  for (int dim = input_rank - 1; dim >= 0 && dim_is_full_range[dim]; --dim) {
    block_size *= input_shape[dim];
    last_sliced_dim--;
  }

  // Copies one contiguous block starting at `read_offset` and advances the
  // write position.
  const auto copy_block = [&](int32_t read_offset) {
#if USE_TYPED_MEMSETMEMCPY
    std::copy(input_values + read_offset,
              input_values + read_offset + block_size, output_values);
#else
    std::memcpy(output_values, input_values + read_offset,
                block_size * sizeof(T));
#endif
    output_values += block_size;
  };

  // Every dimension is taken in full: the slice is the whole tensor. There is
  // no sliced dimension left to advance, so the loop below would never reach
  // its termination condition and would keep copying past the end of the
  // output. Copy the single block and stop.
  if (last_sliced_dim < 0) {
    copy_block(0);
    return;
  }

  // Initialize the read pos for each dimension according to the begin offsets.
  std::array<int, MAX_RANK> read_pos = {};
  for (int dim = 0; dim < input_rank; ++dim) {
    read_pos[dim] = dim_begin[dim];
  }

  while (read_pos[0] != dim_end[0]) {
    // Compute the read offset for the current position. Negative positions
    // count from the end of the dimension.
    int32_t read_offset = 0;
    for (int dim = 0; dim <= last_sliced_dim; ++dim) {
      const int addr_stride = dim_addr_strides[dim];
      if (read_pos[dim] < 0) {
        read_offset += (input_shape[dim] + read_pos[dim]) * addr_stride;
      } else {
        read_offset += read_pos[dim] * addr_stride;
      }
    }

    copy_block(read_offset);

    // Advance the read position: step the innermost sliced dimension and
    // carry into the next outer one whenever a dimension hits its end.
    // Dimension 0 is never reset; reaching its end terminates the outer loop.
    for (int dim = last_sliced_dim; dim >= 0; --dim) {
      read_pos[dim] += strides[dim];
      if (dim == 0 || read_pos[dim] != dim_end[dim])
        break;
      read_pos[dim] = dim_begin[dim];
    }
  }
}

// Transposes a rank-3 tensor. Output dimension k corresponds to input
// dimension perm[k]; the output is written densely in row-major order.
template <typename T>
void TransposeRank3(const int32_t* __restrict input_shape,
                    const T* __restrict input_values,
                    const int32_t* __restrict perm,
                    T* __restrict output_values) {
  BENCHMARK_TIMER("TransposeRank3");
  // Element step of every input dimension.
  const std::array<int32_t, 3> src_step = {
      input_shape[1] * input_shape[2],
      input_shape[2],
      1,
  };
  // Extent of every output dimension and the input step it walks.
  const int32_t extent0 = input_shape[perm[0]];
  const int32_t extent1 = input_shape[perm[1]];
  const int32_t extent2 = input_shape[perm[2]];
  const int32_t step0 = src_step[perm[0]];
  const int32_t step1 = src_step[perm[1]];
  const int32_t step2 = src_step[perm[2]];

  T* out = output_values;
  for (int32_t i0 = 0; i0 < extent0; ++i0) {
    for (int32_t i1 = 0; i1 < extent1; ++i1) {
      const int32_t base = i0 * step0 + i1 * step1;
      for (int32_t i2 = 0; i2 < extent2; ++i2) {
        *out++ = input_values[base + i2 * step2];
      }
    }
  }
}

// Transposes a rank-4 tensor. Output dimension k corresponds to input
// dimension perm[k]; the output is written densely in row-major order.
template <typename T>
void TransposeRank4(const int32_t* __restrict input_shape,
                    const T* __restrict input_values,
                    const int32_t* __restrict perm,
                    T* __restrict output_values) {
  BENCHMARK_TIMER("TransposeRank4");
  // Element step of every input dimension.
  const std::array<int32_t, 4> src_step = {
      input_shape[1] * input_shape[2] * input_shape[3],
      input_shape[2] * input_shape[3],
      input_shape[3],
      1,
  };
  // Extent of every output dimension and the input step it walks.
  const int32_t extent0 = input_shape[perm[0]];
  const int32_t extent1 = input_shape[perm[1]];
  const int32_t extent2 = input_shape[perm[2]];
  const int32_t extent3 = input_shape[perm[3]];
  const int32_t step0 = src_step[perm[0]];
  const int32_t step1 = src_step[perm[1]];
  const int32_t step2 = src_step[perm[2]];
  const int32_t step3 = src_step[perm[3]];

  T* out = output_values;
  for (int32_t i0 = 0; i0 < extent0; ++i0) {
    for (int32_t i1 = 0; i1 < extent1; ++i1) {
      for (int32_t i2 = 0; i2 < extent2; ++i2) {
        const int32_t base = i0 * step0 + i1 * step1 + i2 * step2;
        for (int32_t i3 = 0; i3 < extent3; ++i3) {
          *out++ = input_values[base + i3 * step3];
        }
      }
    }
  }
}

// One-hot encodes `input_values`, inserting the new dimension of size
// `*depth` at position `axis` of the input shape.
//
// For every input element, the output holds `*on_value` at the depth position
// equal to the element's value and `*off_value` at every other depth
// position. The output is indexed as [dims before axis][depth][dims from axis
// on].
//
// Empty inputs (any dimension of extent 0) produce no output.
template <typename T, typename TIdx, typename TDepth>
void OneHot(const int32_t input_rank,
            const int32_t* __restrict input_shape,
            const TIdx* __restrict input_values,
            const TDepth* __restrict depth,
            const T* __restrict on_value,
            const T* __restrict off_value,
            const int32_t axis,
            T* __restrict output_values) {
  BENCHMARK_TIMER("OneHot");
  // We can assume axis >= 0 in this implementation.
  const int32_t prefix_dim_size = ShapeSize(axis, input_shape);
  // Computed from the trailing dimensions directly rather than as
  // (number of elements) / prefix_dim_size, which divides by zero when a
  // dimension in front of the axis has extent 0.
  const int32_t suffix_dim_size =
      ShapeSize(input_rank - axis, input_shape + axis);
  int32_t write_offset = 0;
  for (int32_t i = 0; i < prefix_dim_size; i++) {
    int32_t read_offset_pre = i * suffix_dim_size;
    for (TDepth d = 0; d < *depth; d++) {
      for (int32_t j = 0; j < suffix_dim_size; j++, write_offset++) {
        const int32_t read_offset = read_offset_pre + j;
        output_values[write_offset] =
            (input_values[read_offset] == d) ? *on_value : *off_value;
      }
    }
  }
}

// One-hot encodes `input_values` with the new dimension of size `*depth`
// appended as the last dimension.
//
// Each input element expands to `*depth` consecutive outputs: `*on_value` at
// the position equal to the element's value, `*off_value` everywhere else.
template <typename T, typename TIdx, typename TDepth>
void OneHotLastDim(const int32_t input_rank,
                   const int32_t* __restrict input_shape,
                   const TIdx* __restrict input_values,
                   const TDepth* __restrict depth,
                   const T* __restrict on_value,
                   const T* __restrict off_value,
                   T* __restrict output_values) {
  BENCHMARK_TIMER("OneHotLastDim");
  const int32_t label_count = ShapeSize(input_rank, input_shape);
  T* out = output_values;
  for (int32_t n = 0; n < label_count; ++n) {
    for (TDepth cls = 0; cls < *depth; ++cls) {
      *out++ = (input_values[n] == cls) ? *on_value : *off_value;
    }
  }
}

// -----------------------------------------------------------------------------
// Simple unary ops
// -----------------------------------------------------------------------------

// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.

// SIMPLE_UNARY_OP(OP_NAME, EXPR, EXPR_EIGEN) defines a function template
//   template <typename T>
//   void OP_NAME(rank, input_shape, input_values, output_values)
// that applies an element-wise expression to all ShapeSize(rank, input_shape)
// coefficients of the input and stores the results in `output_values`.
// Exactly one of the two expression arguments is used, depending on
// USE_EIGEN; the other one is bound to the placeholder parameter `_` and
// dropped.
// Note: comments must stay outside the macro bodies, as a `//` comment in
// front of a line continuation would swallow the following line.
#if USE_EIGEN

// Eigen variant: EXPR_EIGEN is an array expression that may refer to
// `values`, the array view of the whole input.
#define SIMPLE_UNARY_OP(OP_NAME, _, EXPR_EIGEN)                           \
  template <typename T>                                                   \
  void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
               const T* __restrict input_values,                          \
               T* __restrict output_values) {                             \
    BENCHMARK_TIMER(#OP_NAME);                                            \
    const int size = ShapeSize(rank, input_shape);                        \
    auto values = ConstRowVectorMap<T>(input_values, size).array();       \
    auto output = RowVectorMap<T>(output_values, size).array();           \
    output = EXPR_EIGEN;                                                  \
  }

#else

// Scalar variant: EXPR is evaluated once per coefficient and may refer to
// `value`, the current input coefficient (of type const T).
#define SIMPLE_UNARY_OP(OP_NAME, EXPR, _)                                 \
  template <typename T>                                                   \
  void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
               const T* __restrict input_values,                          \
               T* __restrict output_values) {                             \
    BENCHMARK_TIMER(#OP_NAME);                                            \
    const int size = ShapeSize(rank, input_shape);                        \
    for (int i = 0; i < size; ++i) {                                      \
      const T value = input_values[i];                                    \
      output_values[i] = EXPR;                                            \
    }                                                                     \
  }

#endif

// Second macro param is value expression, third entry is Eigen vector
// expression.
// Element-wise unary kernels. Each line generates one function template named
// after the first argument.
SIMPLE_UNARY_OP(Abs, std::abs(value), values.abs())
SIMPLE_UNARY_OP(Acos, std::acos(value), values.acos())
SIMPLE_UNARY_OP(Asin, std::asin(value), values.asin())
SIMPLE_UNARY_OP(Atan, std::atan(value), values.atan())
SIMPLE_UNARY_OP(Cos, std::cos(value), values.cos())
SIMPLE_UNARY_OP(Cosh, std::cosh(value), values.cosh())
SIMPLE_UNARY_OP(Exp, std::exp(value), values.exp())
SIMPLE_UNARY_OP(Elu,
                value < 0 ? std::expm1(value) : value,
                // Use branchless version of Elu: min(ReLU, e^x - 1)
                values.max(0).min(values.exp() - 1))
SIMPLE_UNARY_OP(Log, std::log(value), values.log())
SIMPLE_UNARY_OP(Log1p, std::log1p(value), values.log1p())
SIMPLE_UNARY_OP(Neg, -value, -values)
SIMPLE_UNARY_OP(Reciprocal, static_cast<T>(1) / value, values.cwiseInverse())
SIMPLE_UNARY_OP(Relu, std::max(value, static_cast<T>(0)), values.max(0))
SIMPLE_UNARY_OP(Relu6,
                std::min(std::max(value, static_cast<T>(0)), static_cast<T>(6)),
                values.max(0).min(6))
SIMPLE_UNARY_OP(Rsqrt, static_cast<T>(1) / std::sqrt(value), values.rsqrt())
SIMPLE_UNARY_OP(Sigmoid,
                static_cast<T>(1) / (1 + std::exp(-value)),
                ((-values).exp() + 1).cwiseInverse())
SIMPLE_UNARY_OP(Sin, std::sin(value), values.sin())
SIMPLE_UNARY_OP(Sinh, std::sinh(value), values.sinh())
SIMPLE_UNARY_OP(Sqrt, std::sqrt(value), values.sqrt())
SIMPLE_UNARY_OP(Square, value * value, values.square())
SIMPLE_UNARY_OP(Tan, std::tan(value), values.tan())
SIMPLE_UNARY_OP(Tanh, std::tanh(value), values.tanh())

// The generator macro is not used past this point; drop it so it cannot leak
// into the rest of the translation unit (mirrors the handling of REDUCE_OP).
#undef SIMPLE_UNARY_OP

// -----------------------------------------------------------------------------
// Broadcasting binary ops
// -----------------------------------------------------------------------------

// Applies the binary functor `op` element by element to two tensors and
// stores the results in `output_values`. The number of elements processed is
// derived from the left shape only; the right rank and shape are not consulted.
//   op: functor exposing `name` (used for the benchmark timer), a scalar
//       `operator()(lhs, rhs)` and, for the Eigen build, `apply(lhs, rhs, out)`.
template <typename T, typename OP>
void OpNoBroadcast(const int32_t left_rank,
                   const int32_t* __restrict left_shape,
                   const T* __restrict left_values,
                   const int32_t right_rank,
                   const int32_t* __restrict right_shape,
                   const T* __restrict right_values,
                   T* __restrict output_values,
                   OP op) {
  BENCHMARK_TIMER(op.name, "NoBroadcast");
  const int32_t element_count = ShapeSize(left_rank, left_shape);
#if USE_EIGEN
  // View the flat buffers as Eigen arrays and let the functor evaluate the
  // whole expression at once.
  auto left_array = ConstRowVectorMap<T>(left_values, element_count).array();
  auto right_array = ConstRowVectorMap<T>(right_values, element_count).array();
  auto result_array = RowVectorMap<T>(output_values, element_count).array();
  op.apply(left_array, right_array, result_array);
#else
  for (int32_t idx = 0; idx < element_count; ++idx) {
    const T left_item = left_values[idx];
    const T right_item = right_values[idx];
    output_values[idx] = op(left_item, right_item);
  }
#endif
}

// Applies the binary functor `op` to a left tensor and a smaller right tensor
// that is repeated along the outer part of the left tensor: the left buffer is
// treated as `outer_size` consecutive runs of `inner_size` elements, and each
// run is combined element-wise with the complete right buffer.
//   op: functor exposing `name` (used for the benchmark timer), a scalar
//       `operator()(lhs, rhs)` and, for the Eigen build, `apply(lhs, rhs, out)`.
template <typename T, typename OP>
void OpInnerBroadcast(int32_t left_rank,
                      const int32_t* __restrict left_shape,
                      const T* __restrict left_values,
                      int32_t right_rank,
                      const int32_t* __restrict right_shape,
                      const T* __restrict right_values,
                      T* __restrict output_values,
                      OP op) {
  BENCHMARK_TIMER(op.name, "InnerBroadcast");
  const int32_t output_size = ShapeSize(left_rank, left_shape);
  const int32_t inner_size = ShapeSize(right_rank, right_shape);
  if (inner_size <= 0) {
    // An empty right operand leaves nothing to compute; returning here also
    // avoids dividing by zero below.
    return;
  }
  const int32_t outer_size = output_size / inner_size;
#if USE_EIGEN
  if (inner_size == 1) {
    // Apply the same value to all elements.
    auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
    auto output = MatrixMap<T>(output_values, inner_size, outer_size);
    op.apply(left.array(), right_values[0], output.array());
  } else {
    // Each column holds one run of `inner_size` elements; combine every
    // column with the right vector.
    auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
    auto right = ConstRowVectorMap<T>(right_values, inner_size);
    auto output = MatrixMap<T>(output_values, inner_size, outer_size);
    for (int32_t col = 0; col < outer_size; col++) {
      op.apply(left.col(col).array(), right.array(), output.col(col).array());
    }
  }
#else
  for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) {
    for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) {
      const int32_t offset = idx_out * inner_size + idx_in;
      output_values[offset] = op(left_values[offset], right_values[idx_in]);
    }
  }
#endif
}

// Generates, for a binary operation OP_NAME:
//   * a functor struct Op<OP_NAME> carrying the op name, a scalar call
//     operator and an `apply` member used by the Eigen code path;
//   * <OP_NAME>NoBroadcast, which forwards to OpNoBroadcast;
//   * <OP_NAME>InnerBroadcast, which forwards to OpInnerBroadcast.
// EXPR is the scalar expression and EXPR_EIGEN the Eigen expression; both are
// written in terms of `lhs` and `rhs`.
#define BROADCAST_BINARY_OP(OP_NAME, EXPR, EXPR_EIGEN)                         \
  template <typename T>                                                        \
  struct Op##OP_NAME {                                                         \
    const char* name = #OP_NAME;                                               \
    T operator()(const T lhs, const T rhs) { return EXPR; }                    \
    template <typename X, typename Y, typename Z>                              \
    void apply(const X& lhs, const Y& rhs, Z result) {                         \
      result = EXPR_EIGEN;                                                     \
    }                                                                          \
  };                                                                           \
  template <typename T>                                                        \
  void OP_NAME##NoBroadcast(                                                   \
      const int32_t left_rank, const int32_t* __restrict left_shape,           \
      const T* __restrict left_values, const int32_t right_rank,               \
      const int32_t* __restrict right_shape, const T* __restrict right_values, \
      T* __restrict output_values) {                                           \
    Op##OP_NAME<T> functor;                                                    \
    OpNoBroadcast(left_rank, left_shape, left_values, right_rank, right_shape, \
                  right_values, output_values, functor);                       \
  }                                                                            \
  template <typename T>                                                        \
  void OP_NAME##InnerBroadcast(                                                \
      const int32_t left_rank, const int32_t* __restrict left_shape,           \
      const T* __restrict left_values, const int32_t right_rank,               \
      const int32_t* __restrict right_shape, const T* __restrict right_values, \
      T* __restrict output_values) {                                           \
    Op##OP_NAME<T> functor;                                                    \
    OpInnerBroadcast(left_rank, left_shape, left_values, right_rank,           \
                     right_shape, right_values, output_values, functor);       \
  }

// Second macro param is value expression, third entry is Eigen vector
// expression.
// Binary kernels. Each line generates the functor Op<Name> plus the
// <Name>NoBroadcast and <Name>InnerBroadcast function templates.
BROADCAST_BINARY_OP(Add, lhs + rhs, lhs + rhs)
BROADCAST_BINARY_OP(Maximum, std::max(lhs, rhs), lhs.max(rhs))
BROADCAST_BINARY_OP(Minimum, std::min(lhs, rhs), lhs.min(rhs))
BROADCAST_BINARY_OP(Mul, lhs * rhs, lhs * rhs)
BROADCAST_BINARY_OP(Sub, lhs - rhs, lhs - rhs)
BROADCAST_BINARY_OP(SquaredDifference,
                    (lhs - rhs) * (lhs - rhs),
                    (lhs - rhs).square())

// The generator macro is not used past this point; drop it so it cannot leak
// into the rest of the translation unit (mirrors the handling of REDUCE_OP).
#undef BROADCAST_BINARY_OP

// -----------------------------------------------------------------------------
// Reduce ops
// -----------------------------------------------------------------------------

// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.
// Generates two reduction kernels for the reduce operation OP_NAME:
//   * <OP_NAME>InnerReduce: treats the input as `outer_size` consecutive runs
//     of `inner_size` elements (inner_size comes from GetReduceInnerSize) and
//     writes one reduced value per run.
//   * <OP_NAME>GenericReduceRank4: reduces a rank-4 input over the axes listed
//     in `index_values`. Negative axes count from the last dimension.
// Macro arguments:
//   DEFAULT_VALUE: initial value of the accumulator.
//   UPDATE_EXPR:   new accumulator value, written in terms of `prev` (current
//                  accumulator) and `next` (current input element).
//   RESULT_EXPR:   final output value, written in terms of `value` (the
//                  accumulator) and `count` (number of reduced elements).
#define REDUCE_OP(OP_NAME, DEFAULT_VALUE, UPDATE_EXPR, RESULT_EXPR)           \
  template <typename T, typename Tidx>                                        \
  void OP_NAME##InnerReduce(                                                  \
      int32_t input_rank, const int32_t* __restrict input_shape,              \
      const T* __restrict input_values, int32_t index_tensor_rank,            \
      const int32_t* __restrict index_shape,                                  \
      const Tidx* __restrict index_values, T* __restrict output_values) {     \
    BENCHMARK_TIMER(#OP_NAME, "InnerReduce");                                 \
    const int32_t inner_size =                                                \
        GetReduceInnerSize(input_rank, input_shape, index_tensor_rank,        \
                           index_shape, index_values);                        \
    const int32_t input_size = ShapeSize(input_rank, input_shape);            \
    const int32_t outer_size = input_size / inner_size;                       \
    for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) {              \
      T value = DEFAULT_VALUE;                                                \
      for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) {               \
        const T prev = value;                                                 \
        const T next = input_values[idx_out * inner_size + idx_in];           \
        value = UPDATE_EXPR;                                                  \
      }                                                                       \
      const T count = inner_size;                                             \
      /* Not every RESULT_EXPR uses `count`; silence unused warnings. */      \
      (void)sizeof(count);                                                    \
      output_values[idx_out] = RESULT_EXPR;                                   \
    }                                                                         \
  }                                                                           \
  template <typename T, typename Tidx>                                        \
  void OP_NAME##GenericReduceRank4(                                           \
      int32_t input_rank, const int32_t* __restrict input_shape,              \
      const T* __restrict input_values, int32_t index_tensor_rank,            \
      const int32_t* __restrict index_shape,                                  \
      const Tidx* __restrict index_values, T* __restrict output_values) {     \
    assert(input_rank == 4);                                                  \
    assert(index_tensor_rank <= 1);                                           \
    BENCHMARK_TIMER(#OP_NAME, "GenericReduceRank4");                          \
    int out_shape[4] = {input_shape[0], input_shape[1], input_shape[2],       \
                        input_shape[3]};                                      \
    bool reduce_mask[4] = {false, false, false, false};                       \
    const int num_indices = index_tensor_rank > 0 ? index_shape[0] : 1;       \
    for (int i = 0; i < num_indices; ++i) {                                   \
      /* Map negative axes onto [0, 4) and reject anything out of range */    \
      /* before it is used to index the fixed-size arrays. */                 \
      int axis = static_cast<int>(index_values[i]);                           \
      if (axis < 0) {                                                         \
        axis += 4;                                                            \
      }                                                                       \
      assert(axis >= 0 && axis < 4);                                          \
      reduce_mask[axis] = true;                                               \
      out_shape[axis] = 1;                                                    \
    }                                                                         \
    /* A stride of 0 makes every index along a reduced axis hit the same */   \
    /* output element. */                                                     \
    const int out_strides[4] = {                                              \
        reduce_mask[0] ? 0 : out_shape[1] * out_shape[2] * out_shape[3],      \
        reduce_mask[1] ? 0 : out_shape[2] * out_shape[3],                     \
        reduce_mask[2] ? 0 : out_shape[3],                                    \
        reduce_mask[3] ? 0 : 1,                                               \
    };                                                                        \
    const int output_size = ShapeSize(input_rank, out_shape);                 \
    std::fill_n(output_values, output_size, DEFAULT_VALUE);                   \
    for (int dim0 = 0; dim0 < input_shape[0]; ++dim0) {                       \
      for (int dim1 = 0; dim1 < input_shape[1]; ++dim1) {                     \
        for (int dim2 = 0; dim2 < input_shape[2]; ++dim2) {                   \
          for (int dim3 = 0; dim3 < input_shape[3]; ++dim3, ++input_values) { \
            T* out_ptr = output_values + out_strides[0] * dim0 +              \
                         out_strides[1] * dim1 + out_strides[2] * dim2 +      \
                         out_strides[3] * dim3;                               \
            const T prev = *out_ptr;                                          \
            const T next = *input_values;                                     \
            *out_ptr = UPDATE_EXPR;                                           \
          }                                                                   \
        }                                                                     \
      }                                                                       \
    }                                                                         \
    const T count = (reduce_mask[0] ? input_shape[0] : 1) *                   \
                    (reduce_mask[1] ? input_shape[1] : 1) *                   \
                    (reduce_mask[2] ? input_shape[2] : 1) *                   \
                    (reduce_mask[3] ? input_shape[3] : 1);                    \
    /* Not every RESULT_EXPR uses `count`; silence unused warnings. */        \
    (void)sizeof(count);                                                      \
    for (int i = 0; i < output_size; ++i) {                                   \
      const T value = output_values[i];                                       \
      output_values[i] = RESULT_EXPR;                                         \
    }                                                                         \
  }

// Instantiate the reduce kernels. Arguments are: op name, initial accumulator
// value, update expression over `prev`/`next`, and result expression over
// `value`/`count`.
// Max starts from the lowest representable value and keeps the larger element.
REDUCE_OP(Max, std::numeric_limits<T>::lowest(), std::max(prev, next), value)
// Sum accumulates from zero and returns the accumulator unchanged.
REDUCE_OP(Sum, 0, prev + next, value)
// Mean accumulates a sum and divides it by the number of reduced elements.
REDUCE_OP(Mean, 0, prev + next, value / count)

// The generator macro is only needed for the instantiations above.
#undef REDUCE_OP

// -----------------------------------------------------------------------------
// Dequantize ops
// -----------------------------------------------------------------------------

// Converts quantized integer values to floats:
//   output = (input + offset) * range_scale + min_range[0]
// where range_scale spreads [min_range[0], max_range[0]] over the full value
// range of T. For signed T, offset is half the number of representable values,
// so the smallest value of T maps to min_range[0]; for unsigned T it is 0.
template <typename T>
void DequantizeMinCombined(const int32_t rank,
                           const int32_t* __restrict input_shape,
                           const T* __restrict input_values,
                           const float* __restrict min_range,
                           const float* __restrict max_range,
                           float* __restrict output_values) {
  BENCHMARK_TIMER("DequantizeMinCombined");
  using Limits = std::numeric_limits<T>;
  const int num_elements = ShapeSize(rank, input_shape);

  float offset = 0.0f;
  if (std::is_signed<T>::value) {
    offset = (static_cast<float>(Limits::max()) - Limits::min() + 1) / 2.0f;
  }

  const float range_scale =
      (max_range[0] - min_range[0]) /
      (static_cast<float>(Limits::max()) - Limits::min());

  for (int idx = 0; idx < num_elements; ++idx) {
    const float shifted = static_cast<int32_t>(input_values[idx]) + offset;
    output_values[idx] = (shifted * range_scale) + min_range[0];
  }
}

// Converts quantized integer values to floats:
//   output = (input - lowest value of T) * range_scale + range_min_rounded
// where range_scale spreads [min_range[0], max_range[0]] over the full value
// range of T, and range_min_rounded is min_range[0] rounded to a whole multiple
// of range_scale (or min_range[0] itself when the range is empty).
template <typename T>
void DequantizeMinFirst(const int32_t rank,
                        const int32_t* __restrict input_shape,
                        const T* __restrict input_values,
                        const float* __restrict min_range,
                        const float* __restrict max_range,
                        float* __restrict output_values) {
  BENCHMARK_TIMER("DequantizeMinFirst");
  const int size = ShapeSize(rank, input_shape);
  const float range_scale = (max_range[0] - min_range[0]) /
                            (static_cast<float>(std::numeric_limits<T>::max()) -
                             std::numeric_limits<T>::min());
  // When the range is empty, range_scale is 0 and the division below would be
  // invalid, so min_range[0] is used directly.
  const float range_min_rounded =
      (max_range[0] == min_range[0]
           ? min_range[0]
           : std::round(min_range[0] / range_scale) * range_scale);
  // Use 64-bit arithmetic for the distance from the lowest value of T: for a
  // 32-bit T the difference does not fit into int32_t and would overflow.
  const int64_t type_min = static_cast<int64_t>(std::numeric_limits<T>::min());
  for (int i = 0; i < size; i++) {
    const int64_t steps = static_cast<int64_t>(input_values[i]) - type_min;
    output_values[i] =
        (static_cast<float>(steps) * range_scale) + range_min_rounded;
  }
}

// -----------------------------------------------------------------------------
// CONSTANTS
// Note that for now, endianness of the target machine needs to match that of
// the one training was performed on.
// -----------------------------------------------------------------------------
// Weight tensor of shape {4, 10} used by the first FullyConnected call in
// Inference(). It is stored as 160 raw bytes and read back as 40 floats
// through the `values` member of the union.
const int32_t dnn_hiddenlayer_0_weights_part_0_shape[2] = {4, 10};
const union {
  uint8_t bytes[160];
  float values[40];
} dnn_hiddenlayer_0_weights_part_0 = {{
    0xbc, 0x22, 0x0a, 0xbf, 0xb4, 0x46, 0x8c, 0x3f, 0xba, 0x31, 0x34, 0xbe,
    0x4c, 0x65, 0xdb, 0xbe, 0xf0, 0x54, 0x5e, 0xbe, 0xc1, 0x5d, 0xb3, 0x3f,
    0xf4, 0xe6, 0x15, 0xbf, 0x05, 0xc6, 0x34, 0xbf, 0xc0, 0x37, 0x7e, 0xbd,
    0x6c, 0x35, 0x0b, 0xbf, 0xca, 0x53, 0x26, 0xbf, 0x58, 0xb4, 0x87, 0x3f,
    0x37, 0xee, 0x39, 0xbf, 0xda, 0xfa, 0xf9, 0xbe, 0x97, 0xc1, 0x06, 0xbf,
    0xf9, 0x4e, 0x81, 0x3f, 0xb2, 0x44, 0x85, 0xbf, 0x7f, 0x98, 0x7c, 0x3d,
    0x15, 0x26, 0xbc, 0xbe, 0x5c, 0x48, 0x05, 0x3f, 0xc8, 0xaa, 0xa1, 0xbd,
    0x35, 0xb3, 0x43, 0xbe, 0xeb, 0x46, 0x91, 0x3f, 0x80, 0x71, 0xe3, 0x3c,
    0xd1, 0x98, 0x79, 0x3f, 0x3c, 0xd0, 0x0d, 0xbf, 0x1e, 0x02, 0xd3, 0x3e,
    0x5d, 0x4b, 0xa2, 0xbf, 0x68, 0xac, 0xaa, 0xbd, 0xf8, 0xe1, 0x75, 0x3e,
    0x4a, 0x9c, 0x27, 0xbe, 0xf8, 0xae, 0xb2, 0xbe, 0x7f, 0x9d, 0x91, 0x3f,
    0x1e, 0x8b, 0xa8, 0xbe, 0x35, 0x7e, 0xb2, 0x3f, 0xbe, 0x8c, 0xd3, 0xbe,
    0xf9, 0xcd, 0xb5, 0x3f, 0xa1, 0x50, 0xaa, 0x3f, 0xe4, 0x6d, 0xdd, 0xbe,
    0x0d, 0xce, 0xd3, 0xbe,
}};
// Bias vector of shape {10} used by the first FullyConnected call in
// Inference(). It is stored as 40 raw bytes and read back as 10 floats through
// the `values` member of the union.
const int32_t dnn_hiddenlayer_0_biases_part_0_shape[1] = {10};
const union {
  uint8_t bytes[40];
  float values[10];
} dnn_hiddenlayer_0_biases_part_0 = {{
    0x00, 0x00, 0x00, 0x00, 0xbf, 0x6a, 0x53, 0x3e, 0xd3, 0xc1,
    0xd0, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd8, 0xc0, 0x3e,
    0xca, 0xe7, 0x35, 0x3e, 0x23, 0xa5, 0x44, 0x3f, 0x61, 0xfd,
    0xd2, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xe0, 0x43, 0x3c,
}};
// Bias vector of shape {1} used by the second FullyConnected call in
// Inference(). It is stored as 4 raw bytes and read back as a single float
// through the `values` member of the union.
const int32_t dnn_logits_biases_part_0_shape[1] = {1};
const union {
  uint8_t bytes[4];
  float values[1];
} dnn_logits_biases_part_0 = {{
    0x75,
    0xca,
    0xd7,
    0xbe,
}};
// Weight tensor of shape {10, 1} used by the second FullyConnected call in
// Inference(). It is stored as 40 raw bytes and read back as 10 floats through
// the `values` member of the union.
const int32_t dnn_logits_weights_part_0_shape[2] = {10, 1};
const union {
  uint8_t bytes[40];
  float values[10];
} dnn_logits_weights_part_0 = {{
    0x13, 0x12, 0x39, 0x3f, 0xf3, 0xa5, 0xc2, 0xbf, 0x81, 0x7f,
    0xbe, 0x3f, 0xf8, 0x17, 0x26, 0x3e, 0xa4, 0x19, 0xa6, 0x3f,
    0xf0, 0xc9, 0xb7, 0xbf, 0x6a, 0x99, 0xd2, 0x3f, 0x8a, 0x7d,
    0xe9, 0x3f, 0x83, 0x9a, 0x3a, 0xbf, 0xf1, 0x6c, 0x08, 0x3e,
}};

}  // anonymous namespace

// -----------------------------------------------------------------------------
// INFERENCE
// -----------------------------------------------------------------------------

// Shapes of the input and output tensors of Inference(). These are defined
// outside the anonymous namespace, so they are visible to other translation
// units (presumably declared in darkmode_classifier.h; verify there).
int32_t input0Shape[2] = {1, 4};
int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0Shape[2] = {1, 1};

// Runs the network on one input and writes the resulting logit.
// The computation is FullyConnected -> Relu -> FullyConnected, using the
// constant weight and bias tensors defined above.
//   input0: input values, shape {1, 4}.
//   logits_MatMul_merged_with_dnn_logits_BiasAdd0: receives the output of the
//       final FullyConnected call, shape {1, 1}.
//   fixed:  scratch storage; `alloc0`/`alloc1` hold the intermediate
//       activations and `alloc0_shape`/`alloc1_shape` their shapes.
void Inference(
    const float* __restrict input0 /* shape: 1,4 */,
    float* __restrict logits_MatMul_merged_with_dnn_logits_BiasAdd0 /* shape:
                                                                       1,1 */
    ,
    FixedAllocations* __restrict fixed) {
  const int32_t input0_shape[] = {1, 4};

  // dnn/hiddenlayer_0/MatMul_merged_with_dnn/hiddenlayer_0/BiasAdd
  FullyConnected<float>(input0_shape, input0,
                        dnn_hiddenlayer_0_weights_part_0_shape,
                        dnn_hiddenlayer_0_weights_part_0.values,
                        dnn_hiddenlayer_0_biases_part_0_shape,
                        dnn_hiddenlayer_0_biases_part_0.values, fixed->alloc0);
  // Record the shape of the result; the next op reads it.
  fixed->alloc0_shape[0] = 1;
  fixed->alloc0_shape[1] = 10;

  // dnn/hiddenlayer_0/hiddenlayer_0/Relu
  Relu<float>(2,  // rank
              fixed->alloc0_shape, fixed->alloc0, fixed->alloc1);
  fixed->alloc1_shape[0] = 1;
  fixed->alloc1_shape[1] = 10;

  // dnn/logits/MatMul_merged_with_dnn/logits/BiasAdd
  // The output shape is fixed at {1, 1}; it is not tracked in a local because
  // nothing reads it after this call.
  FullyConnected<float>(
      fixed->alloc1_shape, fixed->alloc1, dnn_logits_weights_part_0_shape,
      dnn_logits_weights_part_0.values, dnn_logits_biases_part_0_shape,
      dnn_logits_biases_part_0.values,
      logits_MatMul_merged_with_dnn_logits_BiasAdd0);
}

}  // namespace darkmode_tfnative_model