1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
|
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.
#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#include <cassert>
#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"
#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <string>
#include <unordered_map>
#endif
namespace gemmlowp {
// Context object handed to SingleThreadGemm. It bundles an Allocator with
// the cache-size settings that SingleThreadGemm forwards to
// BlockParams::Init.
class SingleThreadGemmContext {
 public:
  // Returns a pointer to the Allocator stored in this context.
  Allocator* allocator() { return &allocator_; }

  // Read access to the cache configuration.
  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }

  // Write access to the cache configuration. Values are stored as given;
  // no validation is performed here.
  void set_l1_bytes_to_use(int bytes) { l1_bytes_to_use_ = bytes; }
  void set_l2_bytes_to_use(int bytes) { l2_bytes_to_use_ = bytes; }
  void set_l2_rhs_factor(float factor) { l2_rhs_factor_ = factor; }

 protected:
  Allocator allocator_;

  // The cache configuration to use. Each field starts at the corresponding
  // kDefault* constant until overridden through the setters above.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};
// Single-threaded GEMM entry point.
//
// Walks the result in blocks of at most block_params.l2_rows by
// block_params.l2_cols. For every block it calls PackLhs / PackRhs on the
// matching slices of lhs and rhs, calls Compute to fill a PackedResult, and
// calls UnpackResult to write that block of *result.
//
// Arguments:
//   context         - supplies the Allocator and the L1/L2 cache settings.
//   kernel          - forwarded unchanged to Compute.
//   lhs, rhs        - input matrices; lhs.cols() must equal rhs.rows().
//   result          - destination; its rows()/cols() define the problem size.
//   lhs_offset,
//   rhs_offset      - their block(start, size) slices are forwarded to
//                     UnpackResult.
//   output_pipeline - forwarded unchanged to UnpackResult.
//
// Preconditions are checked with assert only: rows, cols and depth are all
// positive, and rows >= cols.
//
// BitDepthParams is not referenced anywhere in this function's body.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel profiling_label("gemmlowp::SingleThreadGemm");

  // The shared dimension of the two operands has to agree.
  assert(lhs.cols() == rhs.rows());

  const int rows = result->rows();
  const int cols = result->cols();
  const int depth = lhs.cols();

  // Zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);
  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  // Derive the block sizes from the problem size and the cache settings
  // stored in the context.
  // NOTE(review): the literal 1 is presumably the thread count -- confirm
  // against BlockParams::Init.
  const int l1_bytes = context->l1_bytes_to_use();
  const int l2_bytes = context->l2_bytes_to_use();
  const float l2_rhs_factor = context->l2_rhs_factor();
  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, 1, l1_bytes, l2_bytes,
                                  l2_rhs_factor);

#ifdef GEMMLOWP_PROFILING_SIZES
  // One label string per distinct sizes_hash value, cached in a static map so
  // the c_str() handed to ScopedProfilingLabel stays valid.
  // Not reentrant at all!
  static std::unordered_map<std::uint64_t, std::string> labels_map;
  const std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                                   (static_cast<std::uint64_t>(depth) << 16) ^
                                   (static_cast<std::uint64_t>(cols) << 32);
  auto label_it = labels_map.find(sizes_hash);
  if (label_it == labels_map.end()) {
    // First time this key is seen: format its label and cache it.
    char text[256];
    snprintf(text, sizeof(text),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    label_it = labels_map.emplace(sizes_hash, text).first;
  }
  ScopedProfilingLabel size_label(label_it->second.c_str());
#endif

  // All three packed buffers are constructed against the allocator before
  // Commit() is called, and Decommit() is called once all blocks are done.
  // NOTE(review): presumably construction only reserves space and Commit()
  // performs the actual allocation -- confirm against allocator.h.
  Allocator* const allocator = context->allocator();
  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);
  PackedResult packed_result(allocator, block_params);
  allocator->Commit();

  // When a single column block covers every column, the whole rhs is packed
  // once here instead of once per (row block, column block) pair.
  const bool rhs_packed_up_front = cols <= block_params.l2_cols;
  if (rhs_packed_up_front) {
    PackRhs(&packed_rhs, rhs);
  }

  for (int row_begin = 0; row_begin < rows;
       row_begin += block_params.l2_rows) {
    // The last row block may be shorter than l2_rows.
    const int row_count = std::min(block_params.l2_rows, rows - row_begin);
    PackLhs(&packed_lhs, lhs.block(row_begin, 0, row_count, depth));

    for (int col_begin = 0; col_begin < cols;
         col_begin += block_params.l2_cols) {
      // Likewise the last column block may be narrower than l2_cols.
      const int col_count = std::min(block_params.l2_cols, cols - col_begin);
      if (!rhs_packed_up_front) {
        PackRhs(&packed_rhs, rhs.block(0, col_begin, depth, col_count));
      }

      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(row_begin, col_begin, row_count, col_count),
          packed_result, depth, packed_lhs.sums_of_each_slice(),
          packed_rhs.sums_of_each_slice(),
          lhs_offset.block(row_begin, row_count),
          rhs_offset.block(col_begin, col_count), output_pipeline);
    }
  }

  allocator->Decommit();
}
} // namespace gemmlowp
#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
|