//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//
 | 
						|
 | 
						|
#include "AMDGPUResourceUsageAnalysis.h"
 | 
						|
#include "AMDGPU.h"
 | 
						|
#include "GCNSubtarget.h"
 | 
						|
#include "SIMachineFunctionInfo.h"
 | 
						|
#include "llvm/Analysis/CallGraph.h"
 | 
						|
#include "llvm/CodeGen/TargetPassConfig.h"
 | 
						|
#include "llvm/IR/GlobalAlias.h"
 | 
						|
#include "llvm/IR/GlobalValue.h"
 | 
						|
#include "llvm/Target/TargetMachine.h"
 | 
						|
 | 
						|
using namespace llvm;
 | 
						|
using namespace llvm::AMDGPU;
 | 
						|
 | 
						|
#define DEBUG_TYPE "amdgpu-resource-usage"
 | 
						|
 | 
						|
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
 | 
						|
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
 | 
						|
 | 
						|
// We need to tell the runtime some amount ahead of time if we don't know the
 | 
						|
// true stack size. Assume a smaller number if this is only due to dynamic /
 | 
						|
// non-entry block allocas.
 | 
						|
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
 | 
						|
    "amdgpu-assume-external-call-stack-size",
 | 
						|
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
 | 
						|
    cl::init(16384));
 | 
						|
 | 
						|
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
 | 
						|
    "amdgpu-assume-dynamic-stack-object-size",
 | 
						|
    cl::desc("Assumed extra stack use if there are any "
 | 
						|
             "variable sized objects (in bytes)"),
 | 
						|
    cl::Hidden, cl::init(4096));
 | 
						|
 | 
						|
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
 | 
						|
                "Function register usage analysis", true, true)
 | 
						|
 | 
						|
/// Resolve the IR function named by the callee operand of a call.
///
/// \param Op the callee operand; either the immediate 0 or a global address.
/// \returns the function being called, or nullptr when \p Op is the immediate
///          0 (no statically known callee).
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  // Look through aliases and pointer casts in one step. Taking only the first
  // operand of a GlobalAlias breaks when the aliasee is wrapped in a cast
  // constant expression or is itself another alias, because the result is
  // then not a Function and cast<> asserts.
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}
 | 
						|
 | 
						|
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
 | 
						|
                                  const SIInstrInfo &TII, unsigned Reg) {
 | 
						|
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
 | 
						|
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
 | 
						|
      return true;
 | 
						|
  }
 | 
						|
 | 
						|
  return false;
 | 
						|
}
 | 
						|
 | 
						|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
 | 
						|
    const GCNSubtarget &ST) const {
 | 
						|
  return NumExplicitSGPR +
 | 
						|
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
 | 
						|
                                   ST.getTargetID().isXnackOnOrAny());
 | 
						|
}
 | 
						|
 | 
						|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
 | 
						|
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
 | 
						|
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
 | 
						|
}
 | 
						|
 | 
						|
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
 | 
						|
    const GCNSubtarget &ST) const {
 | 
						|
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
 | 
						|
}
 | 
						|
 | 
						|
/// Analyze every defined function of \p M and record its resource usage in
/// CallGraphResourceInfo. If any function turns out to contain an indirect
/// call, the register usage of such functions is widened afterwards by
/// propagateIndirectCallRegisterUsage().
///
/// \returns false in all cases; the module itself is never modified. When no
///          TargetPassConfig is available nothing is analyzed.
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *PassConfig = getAnalysisIfAvailable<TargetPassConfig>();
  if (PassConfig == nullptr)
    return false;

  const TargetMachine &TM = PassConfig->getTM<TargetMachine>();
  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();

  bool SawIndirectCall = false;
  for (Function &Fn : M) {
    // Declarations have no machine code to inspect.
    if (Fn.isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(Fn);
    assert(MF && "function must have been generated already");

    // Reserve the map slot first, then fill it in with the analysis result.
    auto Inserted = CallGraphResourceInfo.insert(
        std::make_pair(&Fn, SIFunctionResourceInfo()));
    assert(Inserted.second && "should only be called once per function");

    SIFunctionResourceInfo &FnInfo = Inserted.first->second;
    FnInfo = analyzeResourceUsage(*MF, TM);
    if (FnInfo.HasIndirectCall)
      SawIndirectCall = true;
  }

  if (SawIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
 | 
						|
 | 
						|
/// Compute the resource usage of a single machine function.
///
/// The result records whether flat scratch and VCC are used, the private
/// segment (stack) size, and the number of SGPRs / VGPRs / AGPRs in use.
/// For functions without calls the register counts come straight from
/// MachineRegisterInfo. Otherwise every register operand is inspected and the
/// already-computed usage of known callees (looked up in
/// CallGraphResourceInfo) is folded in. Calls whose callee is unknown, is only
/// a declaration, or has not been analyzed yet are treated as indirect calls.
///
/// \param MF the machine function to inspect.
/// \param TM the target machine (not consulted by this implementation).
/// \returns the resource information for \p MF.
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  Info.UsesFlatScratch =
      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
      MRI.isLiveIn(
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // An implicit use of FLAT_SCRATCH has no effect unless flat instructions
  // really access the scratch buffer (inline assembly may still need it).
  // So when the only references are implicit operands on flat instructions
  // and no flat scratch init is present, treat flat scratch as unused.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit()) {
    const bool HasRealFlatScrUse =
        hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) ||
        hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) ||
        hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI);
    if (!HasRealFlatScrUse)
      Info.UsesFlatScratch = false;
  }

  // Stack size: the static frame, plus an assumed amount when there are
  // objects of unknown size, plus the maximum alignment when the stack is
  // realigned.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // Without calls, MachineRegisterInfo can tell us the used register count
  // directly. Tail calls have to be checked separately because
  // MachineFrameInfo does not count them as calls.
  const bool MakesCalls = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
  if (!MakesCalls) {
    // Walk the class from its last register downwards; the first used
    // register found is the highest one. Hardware indices start at 0, so the
    // register count is that index plus one (or 0 if nothing is used).
    auto CountUsedRegs = [&MRI,
                          &TRI](const TargetRegisterClass &RC) -> int32_t {
      for (MCPhysReg Reg : reverse(RC.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg))
          return TRI.getHWRegIndex(Reg) + 1;
      }
      return 0;
    };

    Info.NumVGPR = CountUsedRegs(AMDGPU::VGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = CountUsedRegs(AMDGPU::AGPR_32RegClass);
    Info.NumExplicitSGPR = CountUsedRegs(AMDGPU::SGPR_32RegClass);

    return Info;
  }

  // Register files tracked separately below.
  enum RegFile { ScalarFile, VectorFile, AccumFile };

  // One row per register class that a counted operand may belong to.
  struct RegClassDesc {
    const TargetRegisterClass *RC;     // class to test the register against
    const TargetRegisterClass *TrapRC; // trap-handler class that must NOT
                                       // contain the register, or nullptr
    RegFile File;                      // which counter the register feeds
    unsigned Width;                    // number of 32-bit registers covered
  };

  // Rows are tested top to bottom; the first matching class wins.
  static const RegClassDesc RegClassTable[] = {
      {&AMDGPU::SReg_32RegClass, &AMDGPU::TTMP_32RegClass, ScalarFile, 1},
      {&AMDGPU::SReg_LO16RegClass, &AMDGPU::TTMP_32RegClass, ScalarFile, 1},
      {&AMDGPU::SGPR_HI16RegClass, &AMDGPU::TTMP_32RegClass, ScalarFile, 1},
      {&AMDGPU::VGPR_32RegClass, nullptr, VectorFile, 1},
      {&AMDGPU::VGPR_LO16RegClass, nullptr, VectorFile, 1},
      {&AMDGPU::VGPR_HI16RegClass, nullptr, VectorFile, 1},
      {&AMDGPU::AGPR_32RegClass, nullptr, AccumFile, 1},
      {&AMDGPU::AGPR_LO16RegClass, nullptr, AccumFile, 1},
      {&AMDGPU::SReg_64RegClass, &AMDGPU::TTMP_64RegClass, ScalarFile, 2},
      {&AMDGPU::VReg_64RegClass, nullptr, VectorFile, 2},
      {&AMDGPU::AReg_64RegClass, nullptr, AccumFile, 2},
      {&AMDGPU::VReg_96RegClass, nullptr, VectorFile, 3},
      {&AMDGPU::SReg_96RegClass, nullptr, ScalarFile, 3},
      {&AMDGPU::AReg_96RegClass, nullptr, AccumFile, 3},
      {&AMDGPU::SReg_128RegClass, &AMDGPU::TTMP_128RegClass, ScalarFile, 4},
      {&AMDGPU::VReg_128RegClass, nullptr, VectorFile, 4},
      {&AMDGPU::AReg_128RegClass, nullptr, AccumFile, 4},
      {&AMDGPU::VReg_160RegClass, nullptr, VectorFile, 5},
      {&AMDGPU::SReg_160RegClass, nullptr, ScalarFile, 5},
      {&AMDGPU::AReg_160RegClass, nullptr, AccumFile, 5},
      {&AMDGPU::VReg_192RegClass, nullptr, VectorFile, 6},
      {&AMDGPU::SReg_192RegClass, nullptr, ScalarFile, 6},
      {&AMDGPU::AReg_192RegClass, nullptr, AccumFile, 6},
      {&AMDGPU::VReg_224RegClass, nullptr, VectorFile, 7},
      {&AMDGPU::SReg_224RegClass, nullptr, ScalarFile, 7},
      {&AMDGPU::AReg_224RegClass, nullptr, AccumFile, 7},
      {&AMDGPU::SReg_256RegClass, &AMDGPU::TTMP_256RegClass, ScalarFile, 8},
      {&AMDGPU::VReg_256RegClass, nullptr, VectorFile, 8},
      {&AMDGPU::AReg_256RegClass, nullptr, AccumFile, 8},
      {&AMDGPU::SReg_512RegClass, &AMDGPU::TTMP_512RegClass, ScalarFile, 16},
      {&AMDGPU::VReg_512RegClass, nullptr, VectorFile, 16},
      {&AMDGPU::AReg_512RegClass, nullptr, AccumFile, 16},
      {&AMDGPU::SReg_1024RegClass, nullptr, ScalarFile, 32},
      {&AMDGPU::VReg_1024RegClass, nullptr, VectorFile, 32},
      {&AMDGPU::AReg_1024RegClass, nullptr, AccumFile, 32},
  };

  // Highest hardware register index seen so far in each file; -1 means none.
  int32_t MaxSGPR = -1;
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();

        // Special registers are not counted as SGPRs/VGPRs/AGPRs. Some are
        // simply skipped, VCC is recorded as a flag, and several must never
        // show up here at all.
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        // Determine the register file and width from the first class in the
        // table that contains the register.
        const RegClassDesc *Match = nullptr;
        for (const RegClassDesc &Desc : RegClassTable) {
          if (Desc.RC->contains(Reg)) {
            Match = &Desc;
            break;
          }
        }
        if (!Match)
          llvm_unreachable("Unknown register class");
        assert((!Match->TrapRC || !Match->TrapRC->contains(Reg)) &&
               "trap handler registers should not be used");

        // A register of width W starting at hardware index H occupies the
        // indices H .. H + W - 1.
        const unsigned HWReg = TRI.getHWRegIndex(Reg);
        const int32_t LastIndex = HWReg + Match->Width - 1;
        int32_t &FileMax = Match->File == ScalarFile  ? MaxSGPR
                           : Match->File == AccumFile ? MaxAGPR
                                                      : MaxVGPR;
        FileMax = std::max(FileMax, LastIndex);
      }

      if (!MI.isCall())
        continue;

      // Pseudo used just to encode the underlying global. Is there a better
      // way to track this?
      const MachineOperand *CalleeOp =
          TII->getNamedOperand(MI, AMDGPU::OpName::callee);
      const Function *Callee = getCalleeFunction(*CalleeOp);

      // A callsite whose calling convention does not match the function's is
      // undefined behavior, and a matching one would have been rejected
      // earlier. Report a direct call to a kernel as a fatal error here
      // rather than crashing later.
      if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
        report_fatal_error("invalid call to entry function");

      const uint64_t ExternalCallStackSize =
          static_cast<uint64_t>(AssumedStackSizeForExternalCall);

      // Only look up callees that are known and defined in this module.
      DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator
          CalleeInfoIt = CallGraphResourceInfo.end();
      const bool IsIndirect = !Callee || Callee->isDeclaration();
      if (!IsIndirect)
        CalleeInfoIt = CallGraphResourceInfo.find(Callee);

      // FIXME: Call site could have norecurse on it
      const bool MayRecurse = !Callee || !Callee->doesNotRecurse();
      if (MayRecurse) {
        Info.HasRecursion = true;

        // TODO: If we happen to know there is no stack usage in the
        // callgraph, we don't need to assume an infinitely growing stack.
        //
        // Tail calls (the call is also a return) do not need an assumed
        // stack size.
        //
        // FIXME: This only benefits in the case where the kernel does not
        // directly call the tail called function. If a kernel directly
        // calls a tail recursive function, we'll assume maximum stack size
        // based on the regular call instruction.
        if (!MI.isReturn())
          CalleeFrameSize = std::max(CalleeFrameSize, ExternalCallStackSize);
      }

      const bool CalleeInfoKnown =
          !IsIndirect && CalleeInfoIt != CallGraphResourceInfo.end();
      if (!CalleeInfoKnown) {
        CalleeFrameSize = std::max(CalleeFrameSize, ExternalCallStackSize);

        // Register usage of indirect calls gets handled later
        Info.UsesVCC = true;
        Info.UsesFlatScratch = ST.hasFlatAddressSpace();
        Info.HasDynamicallySizedStack = true;
        Info.HasIndirectCall = true;
        continue;
      }

      // We force CodeGen to run in SCC order, so the callee's register usage
      // etc. should be the cumulative usage of all callees.
      const SIFunctionResourceInfo &CalleeInfo = CalleeInfoIt->second;
      MaxSGPR = std::max(CalleeInfo.NumExplicitSGPR - 1, MaxSGPR);
      MaxVGPR = std::max(CalleeInfo.NumVGPR - 1, MaxVGPR);
      MaxAGPR = std::max(CalleeInfo.NumAGPR - 1, MaxAGPR);
      CalleeFrameSize =
          std::max(CalleeInfo.PrivateSegmentSize, CalleeFrameSize);
      Info.UsesVCC |= CalleeInfo.UsesVCC;
      Info.UsesFlatScratch |= CalleeInfo.UsesFlatScratch;
      Info.HasDynamicallySizedStack |= CalleeInfo.HasDynamicallySizedStack;
      Info.HasRecursion |= CalleeInfo.HasRecursion;
      Info.HasIndirectCall |= CalleeInfo.HasIndirectCall;
    }
  }

  // Convert highest indices (starting at -1 for "none") into counts.
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
 | 
						|
 | 
						|
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
 | 
						|
  // Collect the maximum number of registers from non-hardware-entrypoints.
 | 
						|
  // All these functions are potential targets for indirect calls.
 | 
						|
  int32_t NonKernelMaxSGPRs = 0;
 | 
						|
  int32_t NonKernelMaxVGPRs = 0;
 | 
						|
  int32_t NonKernelMaxAGPRs = 0;
 | 
						|
 | 
						|
  for (const auto &I : CallGraphResourceInfo) {
 | 
						|
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
 | 
						|
      auto &Info = I.getSecond();
 | 
						|
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
 | 
						|
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
 | 
						|
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  // Add register usage for functions with indirect calls.
 | 
						|
  // For calls to unknown functions, we assume the maximum register usage of
 | 
						|
  // all non-hardware-entrypoints in the current module.
 | 
						|
  for (auto &I : CallGraphResourceInfo) {
 | 
						|
    auto &Info = I.getSecond();
 | 
						|
    if (Info.HasIndirectCall) {
 | 
						|
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
 | 
						|
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
 | 
						|
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 |