File: lengths_reducer_ops.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (725 lines) | stat: -rw-r--r-- 23,469 bytes
#pragma once

#include <c10/util/irange.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/perfkernels/embedding_lookup.h"
#ifdef USE_FBGEMM
#include "fbgemm/Fbgemm.h"
#endif
#include <algorithm>
#include <functional>

namespace caffe2 {

// A templated class that implements SparseLengths[Sum,WeightedSum,Mean].
//
// Inputs (positions are given by the enum at the bottom of the class):
//   DATA    - the table whose rows are gathered,
//   WEIGHT  - per-index (or per-position) weights, only when USE_WEIGHT,
//   INDICES - row numbers into DATA (int32_t or int64_t),
//   LENGTHS - number of consecutive INDICES entries that belong to each
//             output row (int).
// Output 0 has the shape of DATA with its first dimension replaced by the
// number of LENGTHS entries. The reduction itself is delegated to a
// runtime-generated FBGEMM kernel when USE_FBGEMM is defined, and to the
// EmbeddingLookup perfkernel otherwise.
template <
    typename T, // output type
    class InputTypes, // supported input types, such as TensorTypes<float>
    bool USE_WEIGHT = false, // Whether it is SparseLengthsWeightedSum
    bool USE_MEAN = false, // Whether this is SparseLengthsMean
    bool USE_POSITIONAL_WEIGHT = false
    // USE_WEIGHT = true and USE_POSITIONAL_WEIGHT = true
    // -> SparseLengthsPositionalWeightedSum
    >
class CPUSparseLengthsReductionOp : public Operator<CPUContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  template <class... Args>
  explicit CPUSparseLengthsReductionOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...) {
    static_assert(
        !(USE_WEIGHT && USE_MEAN), "Cannot both specify weight and mean.");
  }

  ~CPUSparseLengthsReductionOp() {}

  // Currently, we support float and at::Half inputs for input data type, and
  // int32_t and int64_t for the index type.

  // First dispatch level: selects InputType from the dtype of DATA.
  bool RunOnDevice() override {
    return DispatchHelper<InputTypes>::call(this, Input(DATA));
  }

  // Second dispatch level: selects IndexType from the dtype of INDICES.
  template <typename InputType>
  bool DoRunWithType() {
    return DispatchHelper<TensorTypes2<int32_t, int64_t>, InputType>::call(
        this, Input(INDICES));
  }

  // Performs the reduction for one concrete (InputType, IndexType) pair.
  // Returns true on success. In FBGEMM builds a kernel failure is turned into
  // a descriptive CAFFE_ENFORCE error when the inputs are inconsistent, and
  // into a plain `false` otherwise.
  template <typename InputType, typename IndexType>
  bool DoRunWithType2() {
    auto& dataInput = Input(DATA);
    auto& indicesInput = Input(INDICES);
    auto& lengthsInput = Input(LENGTHS);

    const int64_t M = lengthsInput.size(0);
    const int64_t indices_size = indicesInput.numel();

    // The output keeps every trailing dimension of DATA; only the leading
    // dimension becomes the number of segments.
    auto shape = dataInput.sizes().vec();
    shape[0] = M;
    auto* output = Output(0, shape, at::dtype<T>());
    T* out_data = output->template mutable_data<T>();

    // Nothing to gather: every output row is all zeros.
    if (indices_size == 0) {
      if (M > 0) {
        memset(out_data, 0, output->numel() * sizeof(T));
      }
      return true;
    }

    CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
    CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
    const int64_t N = dataInput.size(0);
    // Kept 64-bit: it is compared against the int64 kernel cache key below and
    // handed to kernels as the block size, so it must not be narrowed to int.
    const int64_t D = dataInput.size_from_dim(1);

    const InputType* in_data = dataInput.template data<InputType>();
    const IndexType* indices = indicesInput.template data<IndexType>();
    const int* lengths = lengthsInput.template data<int>();
    const T* in_weight = nullptr;

    if (USE_WEIGHT) {
      // static if
      auto& weightInput = Input(WEIGHT);
      CAFFE_ENFORCE_EQ(1, weightInput.dim(), "WEIGHT must be a vector");
      // Positional weights are not required to have one entry per index, so
      // the length check only applies to the non-positional variant.
      if (!USE_POSITIONAL_WEIGHT) {
        CAFFE_ENFORCE_EQ(
            weightInput.numel(),
            indices_size,
            "Weight should have the same length as indices.");
      }
      in_weight = weightInput.template data<T>();
    }

#ifdef USE_FBGEMM
    // If this is the first call or block size has changed (should never
    // happen actually), generate a kernel.
    if (D != last_block_size) {
      last_block_size = D;
      if (std::is_same<InputType, float>::value) {
        if (std::is_same<IndexType, std::int32_t>::value) {
          kernel_fp32_i32_ =
              fbgemm::GenerateEmbeddingSpMDM<float, std::int32_t>(
                  D,
                  USE_WEIGHT,
                  USE_MEAN,
                  /*prefetch distance*/ 16,
                  USE_POSITIONAL_WEIGHT,
                  /*use_offsets*/ false);
        } else {
          CAFFE_ENFORCE((std::is_same<IndexType, std::int64_t>::value));
          kernel_fp32_i64_ =
              fbgemm::GenerateEmbeddingSpMDM<float, std::int64_t>(
                  D,
                  USE_WEIGHT,
                  USE_MEAN,
                  /*prefetch distance*/ 16,
                  USE_POSITIONAL_WEIGHT,
                  /*use_offsets*/ false);
        }
      } else {
        CAFFE_ENFORCE((std::is_same<InputType, at::Half>::value));
        if (std::is_same<IndexType, std::int32_t>::value) {
          kernel_fp16_i32_ =
              fbgemm::GenerateEmbeddingSpMDM<fbgemm::float16, std::int32_t>(
                  D,
                  USE_WEIGHT,
                  USE_MEAN,
                  /*prefetch distance*/ 16,
                  USE_POSITIONAL_WEIGHT,
                  /*use_offsets*/ false);
        } else {
          CAFFE_ENFORCE((std::is_same<IndexType, std::int64_t>::value));
          kernel_fp16_i64_ =
              fbgemm::GenerateEmbeddingSpMDM<fbgemm::float16, std::int64_t>(
                  D,
                  USE_WEIGHT,
                  USE_MEAN,
                  /*prefetch distance*/ 16,
                  USE_POSITIONAL_WEIGHT,
                  /*use_offsets*/ false);
        }
      }
    }

    // Run the kernel matching (InputType, IndexType). All four branches are
    // compiled for every instantiation, hence the reinterpret_casts and the
    // explicitly typed data<>() calls.
    bool success;
    if (std::is_same<InputType, float>::value) {
      if (std::is_same<IndexType, std::int32_t>::value) {
        success = kernel_fp32_i32_(
            M,
            indices_size,
            N,
            reinterpret_cast<const float*>(in_data),
            indicesInput.template data<std::int32_t>(),
            lengths,
            in_weight,
            out_data);
      } else {
        success = kernel_fp32_i64_(
            M,
            indices_size,
            N,
            reinterpret_cast<const float*>(in_data),
            indicesInput.template data<std::int64_t>(),
            lengths,
            in_weight,
            out_data);
      }
    } else {
      if (std::is_same<IndexType, std::int32_t>::value) {
        success = kernel_fp16_i32_(
            M,
            indices_size,
            N,
            reinterpret_cast<const fbgemm::float16*>(in_data),
            indicesInput.template data<std::int32_t>(),
            lengths,
            in_weight,
            out_data);
      } else {
        success = kernel_fp16_i64_(
            M,
            indices_size,
            N,
            reinterpret_cast<const fbgemm::float16*>(in_data),
            indicesInput.template data<std::int64_t>(),
            lengths,
            in_weight,
            out_data);
      }
    }

    if (success) {
      return true;
    }

    // The kernel reported failure: re-walk the inputs so the caller gets a
    // precise error (lengths overrunning INDICES, or an index outside
    // [0, N)) instead of a bare `false`.
    int64_t current = 0;
    for (const auto m : c10::irange(M)) {
      for (int i = 0; i < lengths[m]; ++i) {
        CAFFE_ENFORCE_LT(
            current,
            indices_size,
            "Your input seems to be incorrect: the sum of lengths values "
            "should be the size of the indices tensor, but it appears not.");
        IndexType idx = indices[current];
        CAFFE_ENFORCE(
            0 <= idx && idx < N,
            "Index ",
            current,
            " is out of bounds: ",
            idx,
            ", range 0 to ",
            N,
            ", actual batch length is ",
            M);
        ++current;
      }
    }
    CAFFE_ENFORCE_EQ(
        current,
        indices_size,
        "Your input seems to be incorrect: the sum of lengths values should be "
        "the size of the indices tensor, but it appears not.");

    return false;
#endif

    // Not reached in FBGEMM builds: every path above has already returned.
    // delegate work to perfkernel that branches based on architecture
    EmbeddingLookup<IndexType, InputType, T, USE_POSITIONAL_WEIGHT>(
        D,
        M,
        indices_size,
        N,
        in_data,
        indices,
        lengths,
        in_weight,
        nullptr, // scale_bias field is only used in SparseLengths8BitsRowwiseOp
        USE_MEAN,
        out_data);
    return true;
  }

  // Input positions. WEIGHT is only meaningful when USE_WEIGHT is true, in
  // which case it shifts INDICES and LENGTHS by one.
  enum {
    DATA = 0, // Data input.
    WEIGHT = 1, // Weight input used in SparseLengthsWeightedSum
    INDICES = 1 + USE_WEIGHT, // 1 in SparseLengths[Sum,Mean] and
                              // 2 in SparseLengthsWeightedSum
    LENGTHS = 2 + USE_WEIGHT, // 2 in SparseLengths[Sum, Mean],
                              // 3 in SparseLengthsWeightedSum
  };

#ifdef USE_FBGEMM
 private:
  // Block size the cached kernels were generated for; -1 means "none yet".
  std::int64_t last_block_size{-1};
  // One cached kernel per (input type, index type) combination.
  fbgemm::EmbeddingSpMDMKernelSignature<float, std::int32_t>::Type
      kernel_fp32_i32_;
  fbgemm::EmbeddingSpMDMKernelSignature<float, std::int64_t>::Type
      kernel_fp32_i64_;
  fbgemm::EmbeddingSpMDMKernelSignature<fbgemm::float16, std::int32_t>::Type
      kernel_fp16_i32_;
  fbgemm::EmbeddingSpMDMKernelSignature<fbgemm::float16, std::int64_t>::Type
      kernel_fp16_i64_;
#endif
};

// SparseLengthsSum over an embedding table stored as three tensor-train (TT)
// cores.
//
// Inputs:  0-2 the TT cores, 3 INDICES (int64 vector), 4 LENGTHS (int vector).
// Outputs: 0 the {number of LENGTHS entries, emb_size} result,
//          1-2 the partial products after the first and second core (saved
//              for the backward pass),
//          3 the per-core sub-index of every entry of INDICES.
// Arguments: factor_i, factor_j, ranks (repeated ints) and emb_size.
template <typename T, class Context, class Engine = DefaultEngine>
class TTSparseLengthsSumOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
  explicit TTSparseLengthsSumOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        factor_i(this->template GetRepeatedArgument<int>(
            "factor_i",
            vector<int>{1, 1, 1})),
        factor_j(this->template GetRepeatedArgument<int>(
            "factor_j",
            vector<int>{1, 1, 1})),
        ranks(this->template GetRepeatedArgument<int>(
            "ranks",
            vector<int>{1, 1, 1, 1})),
        emb_size(this->template GetSingleArgument<int>("emb_size", 64)) {
    // cumprod of i, used for index slice
    l_cumprod.push_back(1);
    for (const auto i : c10::irange(1, factor_i.size())) {
      l_cumprod.push_back(l_cumprod[i - 1] * factor_i[i - 1]);
    }
  }

  ~TTSparseLengthsSumOp() {}

  // Splits each of the `len` flat indices into factor_i.size() sub-indices,
  // one per core, written contiguously to out_factor_index (row j holds the
  // sub-indices of indices[j]). The split peels off the quotient by
  // l_cumprod from the last core down to the first.
  void Ind2Sub(int64_t* out_factor_index, const int64_t* indices, int len) {
    // TODO: vectorization
    auto N = factor_i.size();
    for (const auto j : c10::irange(len)) {
      auto idx = indices[j];
      for (int i = N; i > 0; i--) {
        out_factor_index[j * N + i - 1] = idx / l_cumprod[i - 1];
        idx = idx % l_cumprod[i - 1];
      }
    }
  }

  // Copies, for each of the `bs` batch entries, the slice of core `idx`
  // selected by ind_slice[i] into tgt_slice[i]. Every tgt_slice[i] must
  // already hold at least ranks[idx] * factor_j[idx] * ranks[idx + 1]
  // elements. Always returns true.
  bool GetSlice(
      std::vector<std::vector<T>>& tgt_slice,
      const T* core,
      const vector<int64_t>& ind_slice,
      int bs,
      int idx) {
    // implement the functionality index_select(core, 1, ind_slice)
    auto num_of_elements = ranks[idx] * factor_j[idx] * ranks[idx + 1];
    for (const auto i : c10::irange(bs)) {
      memcpy(
          tgt_slice[i].data(),
          core + ind_slice[i] * num_of_elements,
          num_of_elements * sizeof(T));
    }
    return true;
  }

  // ind: it stores the index to each tensor core
  // bs: the number of indices
  // x_len: the number of cores (sub-indices per index)
  // segments / lengths: the bags; lengths must be non-negative and sum to bs
  // GatherAllRows uses two steps to calculate the lengthsum functionality: 1) it uses tensor train
  // to calculate the embedding for each index. 2) it sums the embedding for each bag.
  // In Step 1), it batches all the indices together. Specifically, for every index, it uses the pre-computed
  // ind of each tensor core to extract the corresponding slice of the core. Then it does gemm operation
  // sequentially on the slices to produce the embedding result for each index.
  // In Step 2), it takes the embedding computed in step 1) and apply the sum operation for each bag.
  bool GatherAllRows(
      int64_t* ind,
      int bs,
      int x_len,
      const vector<const T*>& cores,
      int segments,
      const int* lengths,
      T* out_data) {
    // Core number i < x_len indexes cores[i], factor_j[i] and ranks[i + 1]
    // below; reject configurations that would read past the end of any of
    // them instead of invoking undefined behaviour.
    CAFFE_ENFORCE(
        x_len <= static_cast<int>(cores.size()) &&
            x_len <= static_cast<int>(factor_j.size()) &&
            x_len < static_cast<int>(ranks.size()),
        "factor_i has ",
        x_len,
        " entries, which needs at least that many cores and factor_j entries "
        "and one more rank, but got ",
        cores.size(),
        " cores, ",
        factor_j.size(),
        " factor_j entries and ",
        ranks.size(),
        " ranks.");

    // Running totals of lengths; cum_lengths[s] is one past the last index
    // that belongs to bag s. Validated up front so that the reduction at the
    // end is guaranteed to write every output row and consume every index.
    vector<int64_t> cum_lengths(segments);
    int64_t total_length = 0;
    for (const auto seg : c10::irange(segments)) {
      CAFFE_ENFORCE(
          lengths[seg] >= 0,
          "LENGTHS must be non-negative, but entry ",
          seg,
          " is ",
          lengths[seg]);
      total_length += lengths[seg];
      cum_lengths[seg] = total_length;
    }
    CAFFE_ENFORCE_EQ(
        total_length,
        static_cast<int64_t>(bs),
        "Your input seems to be incorrect: the sum of lengths values should be "
        "the size of the indices tensor, but it appears not.");

    // compute the largest memory consumption of intermediate result
    // TODO: dynamic allocation size: cur_rows*factor_j[i]*ranks[i+1]
    // and also explore the contiguous memory storage for res and int_res
    int max_rank = *max_element(ranks.begin(), ranks.end());
    std::vector<std::vector<T>> res(bs, std::vector<T>(emb_size * max_rank, 0));
    std::vector<std::vector<T>> int_res(
        bs, std::vector<T>(emb_size * max_rank, 0));

    // Store the matrix A
    vector<T*> Y_ptr(bs);
    // Store the intermediate result in each layer
    vector<T*> Z_ptr(bs);

    for (const auto b : c10::irange(bs)) {
      Y_ptr[b] = res[b].data();
      Z_ptr[b] = int_res[b].data();
    }

    vector<int64_t> ind_slice(bs);
    int rows = 0;
    for (const auto i : c10::irange(x_len)) {
      // slice cur
      for (const auto j : c10::irange(bs)) {
        ind_slice[j] = ind[x_len * j + i];
      }
      if (i == 0) {
        // The first core's slice is the starting product.
        GetSlice(res, cores[i], ind_slice, bs, i);
        rows = factor_j[0];
      } else {
        std::vector<std::vector<T>> slice(
            bs, std::vector<T>(ranks[i] * factor_j[i] * ranks[i + 1], 0));
        vector<const T*> X_ptr(bs);
        for (const auto b : c10::irange(bs)) {
          X_ptr[b] = slice[b].data();
        }
        GetSlice(slice, cores[i], ind_slice, bs, i);

        // Z[b] = Y[b] (rows x ranks[i]) * X[b] (ranks[i] x factor_j[i] *
        // ranks[i + 1]) for every batch entry.
        math::GemmBatched<T, CPUContext>(
            CblasNoTrans,
            CblasNoTrans,
            bs,
            rows,
            factor_j[i] * ranks[i + 1],
            ranks[i],
            1.0f,
            const_cast<const T**>(Y_ptr.data()),
            X_ptr.data(),
            0.0f,
            Z_ptr.data(),
            &context_);
        // The product becomes the left operand of the next core.
        for (const auto b : c10::irange(bs)) {
          std::memcpy(Y_ptr[b], Z_ptr[b], (emb_size * max_rank) * sizeof(T));
        }
        rows *= factor_j[i];
      }
      // save the intermediate output for backward path
      // shape for the core
      auto shape = vector<int64_t>({bs, rows, ranks[i + 1]});
      if (i < 2) {
        auto* core_data = Output(i + 1, shape, at::dtype<T>());
        T* out_core = core_data->template mutable_data<T>();
        for (const auto b : c10::irange(bs)) {
          std::memcpy(
              out_core + b * rows * ranks[i + 1],
              Y_ptr[b],
              rows * ranks[i + 1] * sizeof(T));
        }
      }
    }

    // reduction and store back to output
    int length_idx = 0;
    vector<T> tmp_sum(emb_size, 0.0f);
    for (int i = 0; i <= bs; i++) {
      // Flush every bag that ends at position i; the loop (rather than an
      // if) also emits all-zero rows for empty bags.
      while ((length_idx < segments) && (i == cum_lengths[length_idx])) {
        // store the tmp_sum into output
        memcpy(
            &out_data[length_idx * emb_size],
            tmp_sum.data(),
            emb_size * sizeof(T));
        length_idx++;
        fill(tmp_sum.begin(), tmp_sum.end(), 0.0f);
      }
      // i == bs exists only to flush the trailing bag(s).
      if (i == bs) {
        break;
      }
      transform(
          res[i].begin(),
          res[i].begin() + emb_size,
          tmp_sum.begin(),
          tmp_sum.begin(),
          std::plus<T>());
    }
    return true;
  }

  // Allocates the outputs, factorizes INDICES into per-core sub-indices
  // (output 3) and delegates the actual computation to GatherAllRows.
  bool RunOnDevice() override {
    const auto& dataInput0 = Input(0);
    const auto& dataInput1 = Input(1);
    const auto& dataInput2 = Input(2);
    const auto& indicesInput = Input(3);
    const auto& lengthsInput = Input(4);

    CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
    CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");

    int N = factor_i.size();
    const int64_t M = lengthsInput.size(0);
    const int64_t num_indices = indicesInput.numel();

    auto shape = vector<int64_t>({M, emb_size});
    auto* output = Output(0, shape, at::dtype<T>());
    T* out_data = output->template mutable_data<T>();

    const T* core0 = dataInput0.template data<T>();
    const T* core1 = dataInput1.template data<T>();
    const T* core2 = dataInput2.template data<T>();

    const int* lengths = lengthsInput.template data<int>();

    vector<const T*> cores = {core0, core1, core2};

    const int64_t* indices = indicesInput.template data<int64_t>();

    // Store the factor index for backward path
    auto index_shape = vector<int64_t>({num_indices, N});
    auto* index_data = Output(3, index_shape, at::dtype<int64_t>());
    int64_t* out_factor_index = index_data->template mutable_data<int64_t>();

    // Store the factorized index for each core
    Ind2Sub(out_factor_index, indices, num_indices);

    return GatherAllRows(
        out_factor_index, num_indices, N, cores, M, lengths, out_data);
  }

 protected:
  vector<int> factor_i; // "factor_i" argument; one entry per core
  vector<int> factor_j; // "factor_j" argument; one entry per core
  vector<int> ranks; // "ranks" argument; one more entry than cores
  vector<int> l_cumprod; // running product of factor_i, starting at 1
  int emb_size; // "emb_size" argument; width of one output row
};

// Gradient operator for TTSparseLengthsSum. Only the declaration lives in the
// class; RunOnDevice is defined out of line.
template <typename T, class Context>
class TTSparseLengthsSumGradientOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  // Forwards every constructor argument unchanged to the Operator base.
  template <class... CtorArgs>
  explicit TTSparseLengthsSumGradientOp(CtorArgs&&... ctor_args)
      : Operator<Context>(std::forward<CtorArgs>(ctor_args)...) {}

  ~TTSparseLengthsSumGradientOp() {}

  bool RunOnDevice() override;
};

// Implements the gradient op for TTSparseLengthsSum.
//
// Inputs:  0-2 the three TT cores, 3 LENGTHS, 4-5 the partial products saved
//          by the forward pass after the first and second core, 6 the
//          per-core sub-indices saved by the forward pass, 7 dY.
// Outputs: 0-2 the gradients of the three cores, each shaped like its core.
//
// The gradient of every bag is first broadcast to each index of that bag and
// then pushed backwards through the chain of batched matrix products, one
// core at a time (core2, core1, core0). Contributions of indices that select
// the same core slice are accumulated.
template <typename T, class Context>
bool TTSparseLengthsSumGradientOp<T, Context>::RunOnDevice() {
  const auto& core0 = Input(0);
  const auto& core1 = Input(1);
  const auto& core2 = Input(2);
  const auto& lengths = Input(3);
  const auto& core0_out = Input(4);
  const auto& core1_out = Input(5);
  const auto& index_out = Input(6);
  const auto& dY = Input(7);

  const int* lengths_data = lengths.template data<int>();
  const T* dY_data = dY.template data<T>();

  // restore the arguments from shape
  const int64_t bs = index_out.size(0);
  const int64_t emb_size = dY.size(1);
  const int64_t num_segments = lengths.size(0);

  auto core0_shape = core0.sizes().vec();
  auto core1_shape = core1.sizes().vec();
  auto core2_shape = core2.sizes().vec();
  auto core0_out_shape = core0_out.sizes().vec();
  auto core1_out_shape = core1_out.sizes().vec();

  auto* dCore0 = Output(0, core0_shape, at::dtype<T>());
  auto* dCore1 = Output(1, core1_shape, at::dtype<T>());
  auto* dCore2 = Output(2, core2_shape, at::dtype<T>());

  T* dCore0_data = dCore0->template mutable_data<T>();
  T* dCore1_data = dCore1->template mutable_data<T>();
  T* dCore2_data = dCore2->template mutable_data<T>();

  // Gradients are accumulated with += below, so start from zero. The byte
  // count comes from the tensor's own 64-bit element count; multiplying the
  // shape in T with an int accumulator would round or overflow for large
  // cores.
  memset(dCore0_data, 0, sizeof(T) * dCore0->numel());
  memset(dCore1_data, 0, sizeof(T) * dCore1->numel());
  memset(dCore2_data, 0, sizeof(T) * dCore2->numel());

  // Inputs are only read, so use the const accessor.
  const int64_t* index_out_data = index_out.template data<int64_t>();

  // index_slice[b][k] is the slice of core k selected by batch entry b.
  // NOTE(review): this reads three sub-indices per entry, i.e. it assumes
  // index_out is laid out as (bs, 3) - confirm against the forward op.
  vector<vector<int64_t>> index_slice(bs, vector<int64_t>(3, 0));
  for (const auto b : c10::irange(bs)) {
    memcpy(index_slice[b].data(), index_out_data + b * 3, 3 * sizeof(int64_t));
  }

  // Operand and result pointers reused by every batched GEMM below.
  vector<const T*> A_ptr(bs);
  vector<T*> B_ptr(bs);
  vector<T*> C_ptr(bs);
  // size of each batch
  int64_t num_of_elements = 0;

  // construct the ranks
  // expand the gradient into all indices
  vector<vector<T>> core2_out_grad(bs, vector<T>(emb_size, 0));
  int64_t data_index = 0;
  for (const auto range_index : c10::irange(num_segments)) {
    for (int64_t start = data_index;
         data_index < start + lengths_data[range_index];
         ++data_index) {
      // Guards the write below: without it a LENGTHS sum larger than bs
      // would run past the end of core2_out_grad.
      CAFFE_ENFORCE_LT(
          data_index,
          bs,
          "Your input seems to be incorrect: the sum of lengths values "
          "should be the size of the indices tensor, but it appears not.");
      memcpy(
          core2_out_grad[data_index].data(),
          dY_data + range_index * emb_size,
          emb_size * sizeof(T));
    }
  }
  CAFFE_ENFORCE_EQ(
      data_index,
      bs,
      "Your input seems to be incorrect: the sum of lengths values should be "
      "the size of the indices tensor, but it appears not.");

  // =======================================================
  // Calculate dCore2_data:
  // 1) Transpose core1_out and multiply with core2_out_grad
  // 2)  add to dCore2_data
  vector<vector<T>> dCore2_data_slice_grad(
      bs, vector<T>(core2_shape[1] * core2_shape[2] * core2_shape[3], 0));
  const T* core1_out_data = core1_out.template data<T>();
  // const T* core1_out_p[bs];
  for (const auto b : c10::irange(bs)) {
    A_ptr[b] = core1_out_data + b * core1_out.size(1) * core1_out.size(2);
    B_ptr[b] = core2_out_grad[b].data();
    C_ptr[b] = dCore2_data_slice_grad[b].data();
  }

  math::GemmBatched<T, CPUContext>(
      CblasTrans,
      CblasNoTrans,
      bs,
      core2.size(1), // M
      core2.size(2) * core2.size(3), // N
      core1_out.size(1), // K
      1.0f,
      const_cast<const T**>(A_ptr.data()),
      const_cast<const T**>(B_ptr.data()),
      0.0f,
      C_ptr.data(),
      &context_);

  // update the corresponding slice
  num_of_elements = core2_shape[1] * core2_shape[2] * core2_shape[3];

  const T* core2_data = core2.template data<T>();
  vector<vector<T>> core2_slice(
      bs, vector<T>(core2_shape[1] * core2_shape[2] * core2_shape[3], 0));

  for (const auto b : c10::irange(bs)) {
    // Accumulate into the slice this entry selected ...
    for (const auto i : c10::irange(num_of_elements)) {
      dCore2_data[index_slice[b][2] * num_of_elements + i] += C_ptr[b][i];
    }
    // ... and keep a copy of that slice of core2 for the next GEMM.
    memcpy(
        core2_slice[b].data(),
        core2_data + index_slice[b][2] * num_of_elements,
        sizeof(T) * num_of_elements);
  }

  // Calculate core1_out_grad
  vector<vector<T>> core1_out_grad(
      bs, vector<T>(core1_out_shape[1] * core1_out_shape[2], 0));

  for (const auto b : c10::irange(bs)) {
    A_ptr[b] = core2_out_grad[b].data();
    B_ptr[b] = core2_slice[b].data();
    C_ptr[b] = core1_out_grad[b].data();
  }

  math::GemmBatched<T, CPUContext>(
      CblasNoTrans,
      CblasTrans,
      bs,
      core1_out.size(1), // M
      core2_shape[1], // N
      core2_shape[2] * core2_shape[3], // K
      1.0f,
      const_cast<const T**>(A_ptr.data()),
      const_cast<const T**>(B_ptr.data()),
      0.0f,
      C_ptr.data(),
      &context_);

  // =======================================================
  // Calculate dCore1_data:
  // 1) Transpose core1_out_grad and multiply with core0_out
  // 2) Transpose the result and then add to dCore1_data
  vector<vector<T>> dCore1_data_slice_grad(
      bs, vector<T>(core1_shape[1] * core1_shape[2] * core1_shape[3], 0));
  const T* core0_out_data = core0_out.template data<T>();
  for (const auto b : c10::irange(bs)) {
    A_ptr[b] = core0_out_data + b * core0_out.size(1) * core0_out.size(2);
    B_ptr[b] = core1_out_grad[b].data();
    C_ptr[b] = dCore1_data_slice_grad[b].data();
  }

  math::GemmBatched<T, CPUContext>(
      CblasTrans,
      CblasNoTrans,
      bs,
      core1.size(1), // M
      core1.size(2) * core1.size(3), // N
      core0_out.size(1), // K
      1.0f,
      const_cast<const T**>(A_ptr.data()),
      const_cast<const T**>(B_ptr.data()),
      0.0f,
      C_ptr.data(),
      &context_);

  // update the corresponding slice
  num_of_elements = core1_shape[1] * core1_shape[2] * core1_shape[3];
  const T* core1_data = core1.template data<T>();
  vector<vector<T>> core1_slice(
      bs, vector<T>(core1_shape[1] * core1_shape[2] * core1_shape[3], 0));

  for (const auto b : c10::irange(bs)) {
    for (const auto i : c10::irange(num_of_elements)) {
      dCore1_data[index_slice[b][1] * num_of_elements + i] += C_ptr[b][i];
    }
    memcpy(
        core1_slice[b].data(),
        core1_data + index_slice[b][1] * num_of_elements,
        sizeof(T) * num_of_elements);
  }

  // Calculate core0_out_grad
  vector<vector<T>> core0_out_grad(
      bs, vector<T>(core0_out_shape[1] * core0_out_shape[2], 0));

  for (const auto b : c10::irange(bs)) {
    A_ptr[b] = core1_out_grad[b].data();
    B_ptr[b] = core1_slice[b].data();
    C_ptr[b] = core0_out_grad[b].data();
  }

  math::GemmBatched<T, CPUContext>(
      CblasNoTrans,
      CblasTrans,
      bs,
      core0_out.size(1), // M
      core1_shape[1], // N
      core1_shape[2] * core1_shape[3], // K
      1.0f,
      const_cast<const T**>(A_ptr.data()),
      const_cast<const T**>(B_ptr.data()),
      0.0f,
      C_ptr.data(),
      &context_);

  // The gradient of the first partial product is the gradient of the selected
  // core0 slice; C_ptr still points at core0_out_grad here.
  num_of_elements = core0_shape[1] * core0_shape[2] * core0_shape[3];

  for (const auto b : c10::irange(bs)) {
    for (const auto i : c10::irange(num_of_elements)) {
      dCore0_data[index_slice[b][0] * num_of_elements + i] += C_ptr[b][i];
    }
  }
  return true;
}

} // namespace caffe2