#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
namespace {
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
// Pass identifier; handed to the MachineFunctionPass base class by the
// constructor below.
static char ID;
// Target hooks cached by runOnMachineFunction() from the subtarget of the
// function being processed. They are not initialised before that point.
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
// Scheduling model, initialised per function and queried for operand
// latencies when a register definition is recorded.
TargetSchedModel SchedModel;
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
// Declare the analysis requirements of this pass: it marks the CFG as
// preserved and then defers to the MachineFunctionPass defaults.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
// Return true if MI is one of the instructions this pass treats as waiting
// for outstanding VALU results. The caller discards all tracked delay state
// when this returns true.
static bool instructionWaitsForVALU(const MachineInstr &MI) {
  // Any instruction carrying one of these format flags qualifies.
  const uint64_t WaitingFormats = SIInstrFlags::DS | SIInstrFlags::EXP |
                                  SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                                  SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
  if ((MI.getDesc().TSFlags & WaitingFormats) != 0)
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG_RTN_B32:
  case AMDGPU::S_SENDMSG_RTN_B64:
    return true;
  case AMDGPU::S_WAITCNT_DEPCTR:
    // Only counts when bits [15:12] of the immediate are all zero.
    // NOTE(review): presumably that is the VA_VDST field - confirm against
    // the ISA encoding.
    return (MI.getOperand(0).getImm() & 0xf000) == 0;
  default:
    return false;
  }
}
// Classification of an instruction for delay tracking, as produced by
// getDelayType().
enum DelayType {
  VALU,  // has the VALU flag but not the TRANS flag
  TRANS, // has the TRANS flag
  SALU,  // has the SALU flag
  OTHER  // none of the above; neither tracked as a producer nor delayed
};
// Map an instruction's TSFlags to its DelayType. TRANS is tested first, so an
// instruction that has both the TRANS and the VALU flag is classified as
// TRANS; VALU likewise takes precedence over SALU.
static DelayType getDelayType(uint64_t TSFlags) {
  const bool IsTRANS = (TSFlags & SIInstrFlags::TRANS) != 0;
  if (IsTRANS)
    return TRANS;

  const bool IsVALU = (TSFlags & SIInstrFlags::VALU) != 0;
  if (IsVALU)
    return VALU;

  const bool IsSALU = (TSFlags & SIInstrFlags::SALU) != 0;
  return IsSALU ? SALU : OTHER;
}
// Delay bookkeeping for the most recent producer(s) of one register unit.
// Separate VALU, TRANS and SALU producers are tracked; merge() combines two
// records by keeping the worst case of each field.
struct DelayInfo {
  // VALUNum / TRANSNumVALU at or above this value mean "nothing tracked".
  static const unsigned VALU_MAX = 5;
  // TRANSNum at or above this value means "no TRANS producer tracked".
  static const unsigned TRANS_MAX = 4;
  // Largest SALU cycle count that is ever recorded. emitDelayAlu() encodes an
  // SALU wait as SALUCycles + 8 inside a 4-bit slot, so the count has to stay
  // small enough that the sum cannot spill into the neighbouring bits.
  static const unsigned SALU_CYCLES_MAX = 4;

  // VALU producer: cycles left until it completes, and the number of VALU
  // instructions counted by advance() since it was recorded.
  uint8_t VALUCycles = 0;
  uint8_t VALUNum = VALU_MAX;
  // TRANS producer: cycles left until it completes, and the number of TRANS
  // instructions counted by advance() since it was recorded.
  uint8_t TRANSCycles = 0;
  uint8_t TRANSNum = TRANS_MAX;
  // Number of VALU instructions counted since the TRANS producer was
  // recorded; emitDelayAlu() compares it with VALUNum to decide whether a
  // VALU wait is also needed.
  uint8_t TRANSNumVALU = VALU_MAX;
  // SALU producer: cycles left until it completes.
  uint8_t SALUCycles = 0;

  // An empty record: nothing to wait for.
  DelayInfo() = default;

  // Record a fresh producer of kind Type whose result is available after
  // Cycles cycles. Type must be VALU, TRANS or SALU.
  DelayInfo(DelayType Type, unsigned Cycles) {
    switch (Type) {
    default:
      llvm_unreachable("unexpected type");
    case VALU:
      VALUCycles = Cycles;
      VALUNum = 0;
      break;
    case TRANS:
      TRANSCycles = Cycles;
      TRANSNum = 0;
      TRANSNumVALU = 0;
      break;
    case SALU:
      // Clamp before narrowing to uint8_t. A large modelled latency (for
      // example a pseudo instruction flagged as SALU) would otherwise produce
      // an SALUCycles + 8 value that does not fit the 4-bit slot, or wrap
      // around when stored in the 8-bit field.
      if (Cycles > SALU_CYCLES_MAX)
        Cycles = SALU_CYCLES_MAX;
      SALUCycles = Cycles;
      break;
    }
  }

  bool operator==(const DelayInfo &RHS) const {
    return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
           TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
           TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
  }

  bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

  // Fold RHS into this record, keeping the worst case of every field: the
  // larger remaining cycle count and the smaller instruction distance.
  void merge(const DelayInfo &RHS) {
    VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
    VALUNum = std::min(VALUNum, RHS.VALUNum);
    TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
    TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
    TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
    SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
  }

  // Account for one issued instruction of kind Type that takes Cycles cycles.
  // Returns true when nothing remains to be tracked, i.e. the record can be
  // dropped by the caller.
  bool advance(DelayType Type, unsigned Cycles) {
    bool Erase = true;

    VALUNum += (Type == VALU);
    if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
      // The VALU producer is too far back or has finished: forget it.
      VALUNum = VALU_MAX;
      VALUCycles = 0;
    } else {
      VALUCycles -= Cycles;
      Erase = false;
    }

    TRANSNum += (Type == TRANS);
    TRANSNumVALU += (Type == VALU);
    if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
      // The TRANS producer is too far back or has finished: forget it.
      TRANSNum = TRANS_MAX;
      TRANSNumVALU = VALU_MAX;
      TRANSCycles = 0;
    } else {
      TRANSCycles -= Cycles;
      Erase = false;
    }

    if (SALUCycles <= Cycles) {
      SALUCycles = 0;
    } else {
      SALUCycles -= Cycles;
      Erase = false;
    }

    return Erase;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Print every field that differs from the "nothing tracked" state.
  void dump() const {
    if (VALUCycles)
      dbgs() << " VALUCycles=" << (int)VALUCycles;
    if (VALUNum < VALU_MAX)
      dbgs() << " VALUNum=" << (int)VALUNum;
    if (TRANSCycles)
      dbgs() << " TRANSCycles=" << (int)TRANSCycles;
    if (TRANSNum < TRANS_MAX)
      dbgs() << " TRANSNum=" << (int)TRANSNum;
    if (TRANSNumVALU < VALU_MAX)
      dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
    if (SALUCycles)
      dbgs() << " SALUCycles=" << (int)SALUCycles;
  }
#endif
};
// Map from register unit to the DelayInfo currently tracked for it.
struct DelayState : DenseMap<unsigned, DelayInfo> {
  // Fold RHS into this state: units only present in RHS are copied, units
  // present in both have their DelayInfo merged.
  void merge(const DelayState &RHS) {
    for (const auto &Entry : RHS) {
      auto Result = insert(Entry);
      if (Result.second)
        continue; // newly inserted, nothing to combine
      Result.first->second.merge(Entry.second);
    }
  }

  // Advance every tracked unit past one issued instruction and drop the
  // entries whose DelayInfo reports that nothing is left to track.
  void advance(DelayType Type, unsigned Cycles) {
    const iterator Stop = end();
    iterator Cursor = begin();
    while (Cursor != Stop) {
      // Step past the entry before it is possibly erased.
      iterator Current = Cursor;
      ++Cursor;
      if (Current->second.advance(Type, Cycles))
        erase(Current);
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Print the state, one register unit per line, ordered by unit number.
  void dump(const TargetRegisterInfo *TRI) const {
    if (empty()) {
      dbgs() << " empty\n";
      return;
    }

    // Collect iterators and sort them by key so the output order does not
    // depend on the map's internal layout.
    SmallVector<const_iterator, 8> Sorted;
    Sorted.reserve(size());
    for (const_iterator It = begin(), Stop = end(); It != Stop; ++It)
      Sorted.push_back(It);
    llvm::sort(Sorted, [](const const_iterator &L, const const_iterator &R) {
      return L->first < R->first;
    });

    for (const const_iterator &It : Sorted) {
      dbgs() << " " << printRegUnit(It->first, TRI);
      It->second.dump();
      dbgs() << "\n";
    }
  }
#endif
};
// Delay state recorded for each basic block by runOnMachineBasicBlock();
// a block's entry state is formed by merging the recorded states of its
// predecessors.
DenseMap<MachineBasicBlock *, DelayState> BlockState;
// Make sure the delays described by Delay are waited for before MI, either by
// inserting a new S_DELAY_ALU in front of MI or by folding a single delay
// into LastDelayAlu, a previously emitted S_DELAY_ALU that still has a free
// slot.
//
// Returns the S_DELAY_ALU that can still take one more delay: LastDelayAlu
// unchanged when nothing had to be emitted, the new instruction when it uses
// only one slot, and nullptr otherwise.
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                           MachineInstr *LastDelayAlu) {
  // Immediate layout as used below: first delay in bits [3:0], skip count
  // starting at bit 4, second delay in bits [10:7].
  const unsigned FirstSlotMask = 0xf;
  const unsigned SecondSlotMask = 0x780;
  const unsigned SecondSlotShift = 7;
  const unsigned SkipShift = 4;

  unsigned Encoded = 0;
  // Put Code into the first slot if it is free, else into the second one.
  // When both slots are taken the delay is dropped.
  auto PlaceDelay = [&](unsigned Code) {
    if (Encoded & SecondSlotMask)
      return;
    if (Encoded & FirstSlotMask)
      Encoded |= Code << SecondSlotShift;
    else
      Encoded |= Code;
  };

  // TRANS dependency.
  if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
    PlaceDelay(4 + Delay.TRANSNum);

  // VALU dependency, unless the TRANS producer is the more recent of the two.
  const bool NeedsVALUWait = Delay.VALUNum < DelayInfo::VALU_MAX &&
                             Delay.VALUNum <= Delay.TRANSNumVALU;
  if (NeedsVALUWait)
    PlaceDelay(Delay.VALUNum);

  // SALU dependency; only encoded while a slot is still free.
  if (Delay.SALUCycles != 0)
    PlaceDelay(Delay.SALUCycles + 8);

  // Nothing to wait for.
  if (Encoded == 0)
    return LastDelayAlu;

  const bool UsesBothSlots = (Encoded & SecondSlotMask) != 0;
  if (LastDelayAlu && !UsesBothSlots) {
    // Count the real instructions strictly between the previous S_DELAY_ALU
    // and MI; bundle headers and meta instructions do not count.
    unsigned Skip = 0;
    auto Cursor = std::next(MachineBasicBlock::instr_iterator(LastDelayAlu));
    const auto Stop = MachineBasicBlock::instr_iterator(MI);
    for (; Cursor != Stop; ++Cursor) {
      if (Cursor->isBundle() || Cursor->isMetaInstruction())
        continue;
      ++Skip;
    }

    // Close enough: store the delay in the second slot of the previous
    // instruction together with the skip count.
    if (Skip < 6) {
      MachineOperand &ImmOp = LastDelayAlu->getOperand(0);
      const unsigned Previous = ImmOp.getImm();
      assert((Previous & ~0xf) == 0 &&
             "Remembered an s_delay_alu with no room for another delay!");
      ImmOp.setImm(Previous | Encoded << SecondSlotShift | Skip << SkipShift);
      return nullptr;
    }
  }

  MachineBasicBlock &Parent = *MI.getParent();
  MachineInstr *NewDelayAlu =
      BuildMI(Parent, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU))
          .addImm(Encoded);
  // Only an instruction with a free second slot is worth remembering.
  return UsesBothSlots ? nullptr : NewDelayAlu;
}
// Walk MBB once, tracking per-register-unit delay state.
//
// With Emit == false this is an analysis step: the state at the end of the
// block is stored in BlockState and the return value tells whether it differs
// from what was stored before. With Emit == true the stored state is expected
// to be final; S_DELAY_ALU instructions are inserted and the return value
// tells whether the block was modified.
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
  // The entry state is the merge of the recorded states of all predecessors.
  DelayState State;
  for (auto *Pred : MBB.predecessors())
    State.merge(BlockState[Pred]);

  LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                    << "\n";
             State.dump(TRI););

  bool Changed = false;
  // Remembered so the emit pass can report insertions. Folding a delay into
  // an existing S_DELAY_ALU only happens after one was inserted during this
  // very call, so a grown instruction count covers every modification.
  const unsigned NumInstrsBefore = Emit ? MBB.size() : 0;
  MachineInstr *LastDelayAlu = nullptr;

  // Visit the instructions inside bundles as well, but skip the bundle
  // headers themselves and meta instructions.
  for (auto &MI : MBB.instrs()) {
    if (MI.isBundle() || MI.isMetaInstruction())
      continue;
    if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
      continue;

    DelayType Type = getDelayType(MI.getDesc().TSFlags);

    if (instructionWaitsForVALU(MI)) {
      // Forget every tracked delay (this also drops SALU delays).
      State = DelayState();
    } else if (Type != OTHER) {
      // Collect the delays of everything MI reads explicitly. A consumed
      // entry is removed so later readers do not wait for it again.
      DelayInfo Delay;
      for (const auto &Op : MI.explicit_uses()) {
        if (!Op.isReg())
          continue;
        // The tied use of a V_WRITELANE_B32 is skipped on purpose.
        if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
          continue;
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
          auto It = State.find(*UI);
          if (It != State.end()) {
            Delay.merge(It->second);
            State.erase(It);
          }
        }
      }
      // Nothing is emitted in front of instructions inside a bundle.
      if (Emit && !MI.isBundledWithPred())
        LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
    }

    if (Type != OTHER) {
      // Record MI as the new producer of every register unit it defines.
      for (const auto &Op : MI.defs()) {
        unsigned Latency = SchedModel.computeOperandLatency(
            &MI, MI.getOperandNo(&Op), nullptr, 0);
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
          State[*UI] = DelayInfo(Type, Latency);
      }
    }

    // Age all tracked delays by the time it takes to issue MI.
    unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
    State.advance(Type, Cycles);

    LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
  }

  if (Emit) {
    assert(State == BlockState[&MBB] &&
           "Basic block state should not have changed on final pass!");
    // Report inserted instructions so the pass does not claim the function
    // is untouched after modifying it.
    Changed = MBB.size() != NumInstrsBefore;
  } else if (State != BlockState[&MBB]) {
    BlockState[&MBB] = std::move(State);
    Changed = true;
  }
  return Changed;
}
// Pass entry point. Computes the delay state of every basic block to a fixed
// point, then makes one final pass over all blocks with emission enabled.
// Returns the combined result of the emitting runOnMachineBasicBlock() calls.
bool runOnMachineFunction(MachineFunction &MF) override {
  if (skipFunction(MF.getFunction()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                    << "\n");

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDelayAlu())
    return false;

  SII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();
  SchedModel.init(&ST);

  // The map is keyed by block pointers of the function being processed.
  // Entries left over from a previously processed function are stale: they
  // only grow the map and could be mistaken for the state of a block of this
  // function that happens to live at the same address.
  BlockState.clear();

  // Iterate until no block's recorded state changes. Blocks are inserted in
  // reverse order and popped from the back, so the first block of the
  // function is processed first; successors of a changed block are queued
  // again.
  SetVector<MachineBasicBlock *> WorkList;
  for (auto &MBB : reverse(MF))
    WorkList.insert(&MBB);
  while (!WorkList.empty()) {
    auto &MBB = *WorkList.pop_back_val();
    bool Changed = runOnMachineBasicBlock(MBB, false);
    if (Changed)
      WorkList.insert(MBB.succ_begin(), MBB.succ_end());
  }

  LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

  // Final pass with emission enabled.
  bool Changed = false;
  for (auto &MBB : MF)
    Changed |= runOnMachineBasicBlock(MBB, true);
  return Changed;
}
};
}
// Storage for the pass identifier declared in the class.
char AMDGPUInsertDelayAlu::ID = 0;
// Reference to the pass identifier under a name in namespace llvm, so code
// outside this file can refer to the pass.
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
// Register the pass under the DEBUG_TYPE argument string with a
// human-readable name.
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
false, false)