| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 
//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "ParallelSnippetGenerator.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.
// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g. :
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
namespace llvm {
namespace exegesis {
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}
// Out-of-line defaulted destructor (keeps the definition in this TU).
ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}
// Strategy used when assigning registers to the explicit operands of the
// generated instructions (see getDescription() for human-readable names).
enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,            // A fresh random register for every position.
  SingleStaticRegPerOperand, // One fixed register per operand position.
  SingleStaticReg,           // A single fixed register for all positions.
  FIRST = PickRandomRegs, // FIRST/LAST bound the range for enum iteration
  LAST = SingleStaticReg, // (see the enum_iteration_traits specialization).
};
} // namespace exegesis
// Marks RegRandomizationStrategy as iterable so all strategies can be
// enumerated; specialized in namespace llvm (hence outside exegesis).
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};
namespace exegesis {
/// Returns a human-readable description of the given register randomization
/// strategy, used when composing the snippet's Info string.
const char *getDescription(RegRandomizationStrategy S) {
  if (S == RegRandomizationStrategy::PickRandomRegs)
    return "randomizing registers";
  if (S == RegRandomizationStrategy::SingleStaticRegPerOperand)
    return "one unique register for each position";
  if (S == RegRandomizationStrategy::SingleStaticReg)
    return "reusing the same register for all positions";
  llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
}
// Picks a value for one explicit register operand \p Op of \p IT such that
// defs and uses across the snippet do not overlap (which would serialize the
// generated instructions). Returns one of:
//  - std::nullopt_t when no viable register remains,
//  - an MCOperand copied from a previously generated instruction (static
//    strategies), or
//  - a freshly chosen Register.
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");
  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      // Static strategies: reuse whatever register the first generated
      // instruction already picked for this operand position.
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      // SingleStaticReg only: prefer a register already recorded in Uses so
      // that one register can serve every use position.
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }
  // Start from every register this operand may take, minus forbidden ones.
  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);
  if (Op.isDef()) {
    // A def must not alias an implicit use or any explicit use already
    // chosen elsewhere in the snippet.
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }
  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using same reg for multiple Use's is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      // ...but this strategy wants a distinct register per position, so also
      // exclude registers already picked for other uses.
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }
  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, important bit: if we have used some register for def,
    // then we can not use that same register for *any* use,
    // be it either an untied use, or an use tied to a def.
    // But def-ing same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }
  if (!PossibleRegisters.any())
    return std::nullopt;
  return randomBit(PossibleRegisters);
}
// Fills in every unset explicit register operand of \p IT via
// generateSingleRegisterForInstrAvoidingDefUseOverlap(), recording each
// freshly picked register into \p Uses / \p Defs. Returns std::nullopt as
// soon as some operand cannot be assigned a register.
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    // Only explicit non-memory register operands that are still unset need a
    // value; everything else is left untouched.
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);
    // No viable register for this operand: give up on the instruction.
    if (std::holds_alternative<std::nullopt_t>(R))
      return {};
    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      // An operand copied from a prior instruction (static strategies).
      MCOp = std::get<MCOperand>(R);
    else {
      // A fresh register was picked: record it so later operand (and
      // instruction) choices avoid creating def/use overlaps with it.
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}
// Generates as many independent InstructionTemplates as possible for \p IT
// under strategy \p S, stopping when the register choices are exhausted, so
// that the resulting instructions have no def/use overlap between them.
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);
  // Registers already assigned as defs/uses across the whole snippet;
  // updated by each call below.
  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;
  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    // Stop once no further independent assignment is possible.
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    // Without tied operands a single instruction template is sufficient.
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}
// Produces the code templates for \p Variant: a single repeated instruction
// when it is trivially parallel or unavoidably serial, otherwise one
// template per applicable register-randomization strategy.
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  // Instructions with memory operands need a register pointing at scratch
  // memory; 0 means no scratch register is needed.
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  // No self-aliasing: any random instantiation already runs in parallel.
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // Implicit aliasing (e.g. through EFLAGS, per the header comment) cannot
  // be renamed away; fall back to repeating one random instantiation.
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  // Select which strategies are worth trying given the number of untied
  // explicit use registers; SingleStaticReg is always attempted.
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  // One CodeTemplate per strategy; failing to generate any snippet for a
  // strategy is reported as an error rather than silently skipped.
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}
// Out-of-line definition of the static constexpr member, providing storage
// for ODR uses (pre-C++17 `inline variable` semantics).
constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
} // namespace exegesis
} // namespace llvm
 |