1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
|
//===-- runtime/matmul-transpose.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Implements a fused matmul-transpose operation
//
// There are two main entry points; one establishes a descriptor for the
// result and allocates it, and the other expects a result descriptor that
// points to existing storage.
//
// This implementation must handle all combinations of numeric types and
// kinds (100 - 165 cases depending on the target), plus all combinations
// of logical kinds (16). A single template undergoes many instantiations
// to cover all of the valid possibilities.
//
// The usefulness of this optimization should be reviewed once Matmul is swapped
// to use the faster BLAS routines.
#include "flang/Runtime/matmul-transpose.h"
#include "terminator.h"
#include "tools.h"
#include "flang/Runtime/c-or-cpp.h"
#include "flang/Runtime/cpp-type.h"
#include "flang/Runtime/descriptor.h"
#include <cstring>
namespace {
using namespace Fortran::runtime;
// Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
// TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
// matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols)
// The transpose is implemented by swapping the indices of accesses into the LHS
//
// Straightforward algorithm:
// DO 1 I = 1, NROWS
// DO 1 J = 1, NCOLS
// RES(I,J) = 0
// DO 1 K = 1, N
// 1 RES(I,J) = RES(I,J) + X(K,I)*Y(K,J)
//
// With loop distribution and transposition to avoid the inner sum
// reduction and to avoid non-unit strides:
// DO 1 I = 1, NROWS
// DO 1 J = 1, NCOLS
// 1 RES(I,J) = 0
// DO 2 J = 1, NCOLS
// DO 2 I = 1, NROWS
// DO 2 K = 1, N
// 2 RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
inline static void MatrixTransposedTimesMatrix(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
SubscriptValue n) {
using ResultType = CppTypeFor<RCAT, RKIND>;
std::memset(product, 0, rows * cols * sizeof *product);
for (SubscriptValue j{0}; j < cols; ++j) {
for (SubscriptValue i{0}; i < rows; ++i) {
for (SubscriptValue k{0}; k < n; ++k) {
ResultType x_ki = static_cast<ResultType>(x[i * n + k]);
ResultType y_kj = static_cast<ResultType>(y[j * n + k]);
product[j * rows + i] += x_ki * y_kj;
}
}
}
}
// Contiguous numeric matrix*vector multiplication
// matrix(rows,n) * column vector(n) -> column vector(rows)
// Straightforward algorithm:
// DO 1 I = 1, NROWS
// RES(I) = 0
// DO 1 K = 1, N
// 1 RES(I) = RES(I) + X(K,I)*Y(K)
// With loop distribution and transposition to avoid the inner
// sum reduction and to avoid non-unit strides:
// DO 1 I = 1, NROWS
// 1 RES(I) = 0
// DO 2 I = 1, NROWS
// DO 2 K = 1, N
// 2 RES(I) = RES(I) + X(K,I)*Y(K)
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
inline static void MatrixTransposedTimesVector(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y) {
using ResultType = CppTypeFor<RCAT, RKIND>;
std::memset(product, 0, rows * sizeof *product);
for (SubscriptValue i{0}; i < rows; ++i) {
for (SubscriptValue k{0}; k < n; ++k) {
ResultType x_ki = static_cast<ResultType>(x[i * n + k]);
ResultType y_k = static_cast<ResultType>(y[k]);
product[i] += x_ki * y_k;
}
}
}
// Implements an instance of MATMUL for given argument types.
template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
typename YT>
inline static void DoMatmulTranspose(
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
const Descriptor &x, const Descriptor &y, Terminator &terminator) {
int xRank{x.rank()};
int yRank{y.rank()};
int resRank{xRank + yRank - 2};
if (xRank * yRank != 2 * resRank) {
terminator.Crash(
"MATMUL-TRANSPOSE: bad argument ranks (%d * %d)", xRank, yRank);
}
SubscriptValue extent[2]{x.GetDimension(1).Extent(),
resRank == 2 ? y.GetDimension(1).Extent() : 0};
if constexpr (IS_ALLOCATING) {
result.Establish(
RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
for (int j{0}; j < resRank; ++j) {
result.GetDimension(j).SetBounds(1, extent[j]);
}
if (int stat{result.Allocate()}) {
terminator.Crash(
"MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
stat);
}
} else {
RUNTIME_CHECK(terminator, resRank == result.rank());
RUNTIME_CHECK(
terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
RUNTIME_CHECK(terminator,
resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
}
SubscriptValue n{x.GetDimension(0).Extent()};
if (n != y.GetDimension(0).Extent()) {
terminator.Crash(
"MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
static_cast<std::intmax_t>(x.GetDimension(1).Extent()),
static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
}
using WriteResult =
CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
RKIND>;
const SubscriptValue rows{extent[0]};
const SubscriptValue cols{extent[1]};
if constexpr (RCAT != TypeCategory::Logical) {
if (x.IsContiguous() && y.IsContiguous() &&
(IS_ALLOCATING || result.IsContiguous())) {
// Contiguous numeric matrices
if (resRank == 2) { // M*M -> M
MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT>(
result.template OffsetElement<WriteResult>(), rows, cols,
x.OffsetElement<XT>(), y.OffsetElement<YT>(), n);
return;
}
if (xRank == 2) { // M*V -> V
MatrixTransposedTimesVector<RCAT, RKIND, XT, YT>(
result.template OffsetElement<WriteResult>(), rows, n,
x.OffsetElement<XT>(), y.OffsetElement<YT>());
return;
}
// else V*M -> V (not allowed because TRANSPOSE() is only defined for rank
// 1 matrices
terminator.Crash(
"MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
static_cast<std::intmax_t>(n),
static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
return;
}
}
// General algorithms for LOGICAL and noncontiguity
SubscriptValue xLB[2], yLB[2], resLB[2];
x.GetLowerBounds(xLB);
y.GetLowerBounds(yLB);
result.GetLowerBounds(resLB);
using ResultType = CppTypeFor<RCAT, RKIND>;
if (resRank == 2) { // M*M -> M
for (SubscriptValue i{0}; i < rows; ++i) {
for (SubscriptValue j{0}; j < cols; ++j) {
ResultType res_ij;
if constexpr (RCAT == TypeCategory::Logical) {
res_ij = false;
} else {
res_ij = 0;
}
for (SubscriptValue k{0}; k < n; ++k) {
SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
SubscriptValue yAt[2]{k + yLB[0], j + yLB[1]};
if constexpr (RCAT == TypeCategory::Logical) {
ResultType x_ki = IsLogicalElementTrue(x, xAt);
ResultType y_kj = IsLogicalElementTrue(y, yAt);
res_ij = res_ij || (x_ki && y_kj);
} else {
ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
ResultType y_kj = static_cast<ResultType>(*y.Element<YT>(yAt));
res_ij += x_ki * y_kj;
}
}
SubscriptValue resAt[2]{i + resLB[0], j + resLB[1]};
*result.template Element<WriteResult>(resAt) = res_ij;
}
}
} else if (xRank == 2) { // M*V -> V
for (SubscriptValue i{0}; i < rows; ++i) {
ResultType res_i;
if constexpr (RCAT == TypeCategory::Logical) {
res_i = false;
} else {
res_i = 0;
}
for (SubscriptValue k{0}; k < n; ++k) {
SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
SubscriptValue yAt[1]{k + yLB[0]};
if constexpr (RCAT == TypeCategory::Logical) {
ResultType x_ki = IsLogicalElementTrue(x, xAt);
ResultType y_k = IsLogicalElementTrue(y, yAt);
res_i = res_i || (x_ki && y_k);
} else {
ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
ResultType y_k = static_cast<ResultType>(*y.Element<YT>(yAt));
res_i += x_ki * y_k;
}
}
SubscriptValue resAt[1]{i + resLB[0]};
*result.template Element<WriteResult>(resAt) = res_i;
}
} else { // V*M -> V
// TRANSPOSE(V) not allowed by fortran standard
terminator.Crash(
"MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
static_cast<std::intmax_t>(n),
static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
}
}
// Maps the dynamic type information from the arguments' descriptors
// to the right instantiation of DoMatmul() for valid combinations of
// types.
template <bool IS_ALLOCATING> struct MatmulTranspose {
using ResultDescriptor =
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
template <TypeCategory XCAT, int XKIND> struct MM1 {
template <TypeCategory YCAT, int YKIND> struct MM2 {
void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, Terminator &terminator) const {
if constexpr (constexpr auto resultType{
GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
if constexpr (Fortran::common::IsNumericTypeCategory(
resultType->first) ||
resultType->first == TypeCategory::Logical) {
return DoMatmulTranspose<IS_ALLOCATING, resultType->first,
resultType->second, CppTypeFor<XCAT, XKIND>,
CppTypeFor<YCAT, YKIND>>(result, x, y, terminator);
}
}
terminator.Crash("MATMUL-TRANSPOSE: bad operand types (%d(%d), %d(%d))",
static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
}
};
void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, Terminator &terminator, TypeCategory yCat,
int yKind) const {
ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
}
};
void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) const {
Terminator terminator{sourceFile, line};
auto xCatKind{x.type().GetCategoryAndKind()};
auto yCatKind{y.type().GetCategoryAndKind()};
RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
ApplyType<MM1, void>(xCatKind->first, xCatKind->second, terminator, result,
x, y, terminator, yCatKind->first, yCatKind->second);
}
};
} // namespace
namespace Fortran::runtime {
extern "C" {
void RTNAME(MatmulTranspose)(Descriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) {
MatmulTranspose<true>{}(result, x, y, sourceFile, line);
}
void RTNAME(MatmulTransposeDirect)(const Descriptor &result,
const Descriptor &x, const Descriptor &y, const char *sourceFile,
int line) {
MatmulTranspose<false>{}(result, x, y, sourceFile, line);
}
} // extern "C"
} // namespace Fortran::runtime
|