#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
// Make names from llvm and llvm::AMDGPU usable unqualified in this file.
using namespace llvm;
using namespace llvm::AMDGPU;
// Name string for this pass; it is passed to INITIALIZE_PASS below.
#define DEBUG_TYPE "amdgpu-resource-usage"
// Out-of-line definition of the pass's static ID member.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
// Reference alias to the ID above, so the pass can be named through
// AMDGPUResourceUsageAnalysisID.
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
// Hidden command-line option: number of stack bytes assumed for a call whose
// callee cannot be analyzed. Defaults to 16384. Consumed by
// analyzeResourceUsage() when it has no resource info for a callee and when a
// non-returning call may recurse.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
"amdgpu-assume-external-call-stack-size",
cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
cl::init(16384));
// Hidden command-line option: extra stack bytes added to a function's private
// segment size when its frame has variable sized objects. Defaults to 4096.
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
"amdgpu-assume-dynamic-stack-object-size",
cl::desc("Assumed extra stack use if there are any "
"variable sized objects (in bytes)"),
cl::Hidden, cl::init(4096));
// Register the pass under the DEBUG_TYPE name with a human-readable
// description.
// NOTE(review): the trailing `true, true` are presumably the macro's
// "CFG only" and "is analysis" flags -- confirm against llvm/PassSupport.h.
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
"Function register usage analysis", true, true)
/// Resolve the function a call's callee operand refers to.
///
/// \param Op The callee operand of a call instruction. It is expected to be
///           either the immediate 0 or a global address.
/// \returns nullptr when \p Op is the immediate 0 (no statically known
///          callee); otherwise the Function reached from the referenced
///          global.
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  // Look through every level of alias and pointer cast. Inspecting only the
  // alias' direct operand would make cast<Function> assert when the aliasee is
  // itself another alias or a constant-expression pointer cast of a function.
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}
/// Report whether \p Reg has any operand that is something other than an
/// implicit operand on a FLAT instruction.
///
/// \param MRI Register info whose operand list for \p Reg is scanned.
/// \param TII Used to classify the owning instruction as FLAT or not.
/// \param Reg Physical register to inspect.
/// \returns true as soon as one operand is explicit, or is attached to a
///          non-FLAT instruction; false if every operand is an implicit
///          operand of a FLAT instruction (or there are no operands).
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &Operand : MRI.reg_operands(Reg)) {
    // The instruction is only queried for implicit operands, exactly as the
    // short-circuit evaluation requires.
    const bool IsImplicitOnFlat =
        Operand.isImplicit() && TII.isFLAT(*Operand.getParent());
    if (!IsImplicitOnFlat)
      return true;
  }
  return false;
}
/// Total SGPR count for this function: the explicitly used SGPRs plus the
/// extra SGPRs reported by IsaInfo::getNumExtraSGPRs for the current VCC,
/// flat-scratch and XNACK settings.
///
/// \param ST Subtarget the count is computed for.
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  const bool XnackOnOrAny = ST.getTargetID().isXnackOnOrAny();
  const auto ExtraSGPRs =
      IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch, XnackOnOrAny);
  return NumExplicitSGPR + ExtraSGPRs;
}
/// Combine explicit AGPR and VGPR counts into a single total by delegating to
/// AMDGPU::getTotalNumVGPRs.
///
/// \param ST         Subtarget; only its hasGFX90AInsts() answer is used.
/// \param ArgNumAGPR Number of AGPRs to account for.
/// \param ArgNumVGPR Number of VGPRs to account for.
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  const bool HasGFX90A = ST.hasGFX90AInsts();
  return AMDGPU::getTotalNumVGPRs(HasGFX90A, ArgNumAGPR, ArgNumVGPR);
}
/// Convenience overload: total VGPR count computed from this record's own
/// NumAGPR and NumVGPR fields.
///
/// \param ST Subtarget forwarded to the three-argument overload.
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  const int32_t OwnAGPRs = NumAGPR;
  const int32_t OwnVGPRs = NumVGPR;
  return getTotalNumVGPRs(ST, OwnAGPRs, OwnVGPRs);
}
/// Compute resource usage for every defined function in \p M and store it in
/// CallGraphResourceInfo.
///
/// Functions are analyzed in call-graph post-order so that, where possible, a
/// callee's record exists before its callers are analyzed. Defined functions
/// the post-order walk does not reach are analyzed afterwards so that every
/// defined function ends up with a record. If any function contains an
/// indirect call, register usage is then propagated by
/// propagateIndirectCallRegisterUsage().
///
/// \returns false always; the module itself is not modified. Also returns
///          early (doing nothing) when no TargetPassConfig is available.
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // The post-order walk only reaches functions connected to the call graph's
  // entry node. Defined functions that are unreachable from it (for example
  // dead internal functions that only call each other) would otherwise have
  // no record at all. Their callees may not have been analyzed yet; in that
  // case analyzeResourceUsage() falls back to its conservative assumptions
  // for a callee without a record.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Already handled by the post-order walk.
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    SIFunctionResourceInfo &Info = CI.first->second;
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
/// Compute the resource usage record for a single machine function.
///
/// The record covers flat-scratch and VCC usage, the private segment (stack)
/// size, whether the stack is dynamically sized, recursion / indirect-call
/// flags, and the number of SGPRs, VGPRs and AGPRs in use.
///
/// Two strategies are used:
///  - If the frame reports neither calls nor tail calls, register counts are
///    taken from MachineRegisterInfo's "physical register used" queries.
///  - Otherwise every operand of every instruction is inspected, and for each
///    call the callee's already-computed record (looked up in
///    CallGraphResourceInfo) is merged in, or conservative assumptions are
///    applied when no record exists.
///
/// \param MF Function to analyze.
/// \param TM Target machine; not referenced by this body.
/// \returns The populated SIFunctionResourceInfo.
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
const MachineFunction &MF, const TargetMachine &TM) const {
SIFunctionResourceInfo Info;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
// Initial estimate: flat scratch counts as used if either half of FLAT_SCR is
// marked used, or the preloaded FLAT_SCRATCH_INIT register is a live-in.
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
MRI.isLiveIn(MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
// Retract the estimate when the function has no flat scratch init and every
// operand naming FLAT_SCR / FLAT_SCR_LO / FLAT_SCR_HI is merely an implicit
// operand on a FLAT instruction.
if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
Info.UsesFlatScratch = false;
}
// Start from the statically known stack size of this function's own frame.
Info.PrivateSegmentSize = FrameInfo.getStackSize();
// Variable sized objects have no static size, so add the assumed amount from
// the command-line option.
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
if (Info.HasDynamicallySizedStack)
Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
// A realigned stack is charged an additional maximum-alignment's worth of
// bytes.
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
// Fast path: with neither calls nor tail calls, the register counts come
// straight from MachineRegisterInfo.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
// Walk each 32-bit register class in reverse and stop at the first register
// that is marked used.
// NOTE(review): this presumably relies on getRegisters() being ordered by
// ascending hardware index, so the first hit is the highest -- confirm.
MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
HighestVGPRReg = Reg;
break;
}
}
// AGPRs are only counted on subtargets reporting MAI instructions;
// otherwise Info.NumAGPR keeps its default-constructed value.
if (ST.hasMAIInsts()) {
MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
HighestAGPRReg = Reg;
break;
}
}
Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
? 0
: TRI.getHWRegIndex(HighestAGPRReg) + 1;
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
HighestSGPRReg = Reg;
break;
}
}
// Hardware indices start at 0, so the count is the highest index plus one
// (or 0 when no register of the class is used).
Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
? 0
: TRI.getHWRegIndex(HighestVGPRReg) + 1;
Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
? 0
: TRI.getHWRegIndex(HighestSGPRReg) + 1;
return Info;
}
// Slow path: the function contains calls. Track the highest hardware index
// seen per register bank; -1 means "none seen", which becomes a count of 0
// after the final +1.
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
// Largest stack requirement among all callees; added to this function's own
// frame size at the end.
uint64_t CalleeFrameSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
for (const MachineOperand &MO : MI.operands()) {
// Width is the number of consecutive 32-bit hardware registers the
// operand covers; IsSGPR / IsAGPR select the bank (neither set => VGPR).
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
// First filter out special registers. `continue` skips the operand so it
// does not contribute to any of the three bank maxima.
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
llvm_unreachable("src_pops_exiting_wave_id should not be used");
case AMDGPU::NoRegister:
// A null register operand is only tolerated on debug instructions.
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
// VCC is recorded as a flag rather than counted as an explicit SGPR.
Info.UsesVCC = true;
continue;
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
// Flat scratch usage was already decided before the operand walk.
continue;
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");
case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");
case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");
case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");
default:
break;
}
// Classify the register by bank and width. The first matching class in
// this chain wins, so the order of the tests matters. 16-bit halves are
// treated as occupying one full 32-bit register.
if (AMDGPU::SReg_32RegClass.contains(Reg) ||
AMDGPU::SReg_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
llvm_unreachable("Unknown register class");
}
// Highest hardware index touched by this operand: its base index plus the
// number of additional 32-bit registers it spans.
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
// After the operands, account for what a call's callee may use.
if (MI.isCall()) {
const MachineOperand *CalleeOp =
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
// Callee is null when the operand does not name a function statically.
const Function *Callee = getCalleeFunction(*CalleeOp);
// I stays at end() unless a record for the callee is found below.
DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
CallGraphResourceInfo.end();
// Calling a function that has an entry-function calling convention is
// rejected outright.
if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
// Both an unknown callee and a callee that is only declared are handled as
// "indirect": there is no body whose usage could have been recorded.
bool IsIndirect = !Callee || Callee->isDeclaration();
if (!IsIndirect)
I = CallGraphResourceInfo.find(Callee);
// Unless the callee is known and marked as non-recursing, assume recursion
// is possible.
if (!Callee || !Callee->doesNotRecurse()) {
Info.HasRecursion = true;
// Only calls that are not also returns are charged the assumed external
// stack size here.
// NOTE(review): a call that is also a return is presumably a tail call --
// confirm.
if (!MI.isReturn()) {
CalleeFrameSize =
std::max(CalleeFrameSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
}
}
if (IsIndirect || I == CallGraphResourceInfo.end()) {
// No record for the callee: assume the configured external-call stack
// size, VCC use, flat scratch use (when the subtarget has a flat address
// space) and a dynamically sized stack. Register counts are not adjusted
// here; HasIndirectCall marks this function so that
// propagateIndirectCallRegisterUsage() can raise them afterwards.
CalleeFrameSize =
std::max(CalleeFrameSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Info.HasIndirectCall = true;
} else {
// Callee already analyzed: merge its record into ours. Counts are turned
// back into highest-index form (count - 1) to compare against the maxima.
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize =
std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
Info.UsesFlatScratch |= I->second.UsesFlatScratch;
Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
Info.HasRecursion |= I->second.HasRecursion;
Info.HasIndirectCall |= I->second.HasIndirectCall;
}
}
}
}
// Convert highest indices to counts, and stack the worst-case callee frame on
// top of this function's own frame.
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
int32_t NonKernelMaxSGPRs = 0;
int32_t NonKernelMaxVGPRs = 0;
int32_t NonKernelMaxAGPRs = 0;
for (const auto &I : CallGraphResourceInfo) {
if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
auto &Info = I.getSecond();
NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
}
}
for (auto &I : CallGraphResourceInfo) {
auto &Info = I.getSecond();
if (Info.HasIndirectCall) {
Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
}
}
}