llvm-project/llvm/tools/llvm-exegesis/lib/Uops.cpp

212 lines
8.1 KiB
C++

//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "Uops.h"
#include "BenchmarkResult.h"
#include "InstructionSnippetGenerator.h"
#include "PerfHelper.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Error.h"
#include <algorithm>
#include <random>
#include <unordered_map>
#include <unordered_set>
namespace exegesis {
// FIXME: Handle memory (see PR36906)
static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
switch (OpInfo.OperandType) {
default:
return true;
case llvm::MCOI::OPERAND_IMMEDIATE:
case llvm::MCOI::OPERAND_REGISTER:
return false;
}
}
static llvm::Error makeError(llvm::Twine Msg) {
return llvm::make_error<llvm::StringError>(Msg,
llvm::inconvertibleErrorCode());
}
static std::vector<llvm::MCInst> generateIndependentAssignments(
const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
for (const Variable &Var : Vars) {
if (Var.IsUse) {
IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
Var.PossibleRegisters.end());
}
}
std::vector<llvm::MCInst> Pattern;
for (int A = 0; A < MaxAssignments; ++A) {
// FIXME: This is a bit pessimistic. We should get away with an
// assignment that ensures that the set of assigned registers for uses and
// the set of assigned registers for defs do not intersect (registers
// for uses (resp defs) do not have to be all distinct).
const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
if (Regs.empty())
break;
// Remove all assigned registers defs that are used by at least one other
// variable from the list of possible variable registers. This ensures that
// we never create a RAW hazard that would lead to serialization.
for (size_t I = 0, E = Vars.size(); I < E; ++I) {
llvm::MCPhysReg Reg = Regs[I];
if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
Vars[I].PossibleRegisters.remove(Reg);
}
}
// Create an MCInst and check assembly.
llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
if (!State.canAssemble(Inst))
continue;
Pattern.push_back(std::move(Inst));
}
return Pattern;
}
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
const LLVMState &State, const unsigned OpcodeIndex,
const unsigned NumRepetitions, const JitFunctionContext &Context) const {
const auto &InstrInfo = State.getInstrInfo();
const auto &RegInfo = State.getRegInfo();
const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
if (isInvalidOperand(OpInfo))
return makeError("Only registers and immediates are supported");
}
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.
// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the variable is assigned a
// register which is different from AX:
// idiv bx
// idiv bx
// idiv bx
// idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
// inc rax
// inc rax
// inc rax
// inc rax
// This has a data dependency from each instruction to the next, limit the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g. :
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
// imul rax, rbx
// imul rcx, rbx
// imul rdx, rbx
// imul r8, rbx
//
// Some instruction even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
// Generate as many independent exclusive assignments as possible.
constexpr const int MaxStaticRenames = 20;
std::vector<llvm::MCInst> Pattern =
generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
if (Pattern.empty()) {
// We don't even have a single exclusive assignment, fallback to a greedy
// assignment.
// FIXME: Tell the user about this decision to help debugging.
const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
if (!Vars.empty() && Regs.empty())
return makeError("No feasible greedy assignment");
llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
if (!State.canAssemble(Inst))
return makeError("Cannot assemble greedy assignment");
Pattern.push_back(std::move(Inst));
}
// Generate repetitions of the pattern until benchmark_iterations is reached.
std::vector<llvm::MCInst> Result;
Result.reserve(NumRepetitions);
for (unsigned I = 0; I < NumRepetitions; ++I)
Result.push_back(Pattern[I % Pattern.size()]);
return Result;
}
std::vector<BenchmarkMeasure>
UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
const JitFunction &Function,
const unsigned NumRepetitions) const {
const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
std::vector<BenchmarkMeasure> Result;
for (unsigned ProcResIdx = 1;
ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
.PfmCounters.IssueCounters[ProcResIdx];
if (!PfmCounters)
continue;
// FIXME: Sum results when there are several counters for a single ProcRes
// (e.g. P23 on SandyBridge).
pfm::Counter Counter{pfm::PerfEvent(PfmCounters)};
Counter.start();
Function();
Counter.stop();
Result.push_back({llvm::itostr(ProcResIdx),
static_cast<double>(Counter.read()) / NumRepetitions,
SchedModel.getProcResource(ProcResIdx)->Name});
}
return Result;
}
} // namespace exegesis