//===------------------ AMDGPUCustomBehaviour.cpp --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
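// For example, "s_waitcnt vmcnt(0) lgkmcnt(0)" carries its counter values in a
// single packed immediate; unless that operand is re-attached here as an
// MCAOperand, handleWaitCnt() below has nothing to decode.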
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

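// The per-instruction waitcnt flags depend only on the instructions in the
// source region, so they are computed once here rather than on every call to
// checkCustomHazard().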
AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.

  // Start each counter at its maximum value; any counter that computeWaitCnt()
  // leaves untouched will then never force a wait below.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
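    // getSourceIndex() increases monotonically as llvm-mca replays the source
    // region over multiple iterations, so wrap it to index InstrWaitCntInfo.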
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
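    // In compiler output these waits normally use the null register, e.g.
    // "s_waitcnt_vscnt null, 0", in which case the immediate operand alone
    // determines the wait.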
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
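    // These variants pack vmcnt, expcnt, and lgkmcnt into one immediate;
    // decodeWaitcnt() unpacks the fields using this target's encoding.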
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effects of this are
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT so giving them an
  // extra CNT shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
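      // A few scalar instructions also count against lgkmcnt even though none
      // of the TSFlags checked above identify them.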
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
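// Returns true when the named modifier operand is present and non-zero, e.g.
// hasModifiersSet(Inst, AMDGPU::OpName::gds) checks the gds bit above.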
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to register this file's CustomBehaviour and
/// InstrPostProcess factories for both AMDGPU targets.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}