#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#define DEBUG_TYPE "hardware-loops"
#define HW_LOOPS_NAME "Hardware Loop Insertion"
using namespace llvm;
static cl::opt<bool>
ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
cl::desc("Force hardware loops intrinsics to be inserted"));
static cl::opt<bool>
ForceHardwareLoopPHI(
"force-hardware-loop-phi", cl::Hidden, cl::init(false),
cl::desc("Force hardware loop counter to be updated through a phi"));
static cl::opt<bool>
ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
cl::desc("Force allowance of nested hardware loops"));
static cl::opt<unsigned>
LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
cl::desc("Set the loop decrement value"));
static cl::opt<unsigned>
CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
cl::desc("Set the loop counter bitwidth"));
static cl::opt<bool>
ForceGuardLoopEntry(
"force-hardware-loop-guard", cl::Hidden, cl::init(false),
cl::desc("Force generation of loop guard intrinsic"));
STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
#ifndef NDEBUG
static void debugHWLoopFailure(const StringRef DebugMsg,
Instruction *I) {
dbgs() << "HWLoops: " << DebugMsg;
if (I)
dbgs() << ' ' << *I;
else
dbgs() << '.';
dbgs() << '\n';
}
#endif
static OptimizationRemarkAnalysis
createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
Value *CodeRegion = L->getHeader();
DebugLoc DL = L->getStartLoc();
if (I) {
CodeRegion = I->getParent();
if (I->getDebugLoc())
DL = I->getDebugLoc();
}
OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
R << "hardware-loop not created: ";
return R;
}
namespace {
void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
LLVM_DEBUG(debugHWLoopFailure(Msg, I));
ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
}
using TTI = TargetTransformInfo;
class HardwareLoops : public FunctionPass {
public:
static char ID;
HardwareLoops() : FunctionPass(ID) {
initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
bool TryConvertLoop(Loop *L);
bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);
private:
ScalarEvolution *SE = nullptr;
LoopInfo *LI = nullptr;
const DataLayout *DL = nullptr;
OptimizationRemarkEmitter *ORE = nullptr;
const TargetTransformInfo *TTI = nullptr;
DominatorTree *DT = nullptr;
bool PreserveLCSSA = false;
AssumptionCache *AC = nullptr;
TargetLibraryInfo *LibInfo = nullptr;
Module *M = nullptr;
bool MadeChange = false;
};
class HardwareLoop {
Value *InitLoopCount();
Value *InsertIterationSetup(Value *LoopCountInit);
void InsertLoopDec();
Instruction *InsertLoopRegDec(Value *EltsRem);
PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
void UpdateBranch(Value *EltsRem);
public:
HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
const DataLayout &DL,
OptimizationRemarkEmitter *ORE) :
SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
ExitCount(Info.ExitCount),
CountType(Info.CountType),
ExitBranch(Info.ExitBranch),
LoopDecrement(Info.LoopDecrement),
UsePHICounter(Info.CounterInReg),
UseLoopGuard(Info.PerformEntryTest) { }
void Create();
private:
ScalarEvolution &SE;
const DataLayout &DL;
OptimizationRemarkEmitter *ORE = nullptr;
Loop *L = nullptr;
Module *M = nullptr;
const SCEV *ExitCount = nullptr;
Type *CountType = nullptr;
BranchInst *ExitBranch = nullptr;
Value *LoopDecrement = nullptr;
bool UsePHICounter = false;
bool UseLoopGuard = false;
BasicBlock *BeginBB = nullptr;
};
}
char HardwareLoops::ID = 0;
bool HardwareLoops::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI(F) : nullptr;
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
M = F.getParent();
for (Loop *L : *LI)
if (L->isOutermost())
TryConvertLoop(L);
return MadeChange;
}
bool HardwareLoops::TryConvertLoop(Loop *L) {
bool AnyChanged = false;
for (Loop *SL : *L)
AnyChanged |= TryConvertLoop(SL);
if (AnyChanged) {
reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
ORE, L);
return true; }
LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
reportHWLoopFailure("cannot analyze loop, irreducible control flow",
"HWLoopCannotAnalyze", ORE, L);
return false;
}
if (!ForceHardwareLoops &&
!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo)) {
reportHWLoopFailure("it's not profitable to create a hardware-loop",
"HWLoopNotProfitable", ORE, L);
return false;
}
if (CounterBitWidth.getNumOccurrences())
HWLoopInfo.CountType =
IntegerType::get(M->getContext(), CounterBitWidth);
if (LoopDecrement.getNumOccurrences())
HWLoopInfo.LoopDecrement =
ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
MadeChange |= TryConvertLoop(HWLoopInfo);
return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
}
bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
Loop *L = HWLoopInfo.L;
LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT, ForceNestedLoop,
ForceHardwareLoopPHI)) {
reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
return false;
}
assert(
(HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
"Hardware Loop must have set exit info.");
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader)
Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
if (!Preheader)
return false;
HardwareLoop HWLoop(HWLoopInfo, *SE, *DL, ORE);
HWLoop.Create();
++NumHWLoops;
return true;
}
void HardwareLoop::Create() {
LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
Value *LoopCountInit = InitLoopCount();
if (!LoopCountInit) {
reportHWLoopFailure("could not safely create a loop count expression",
"HWLoopNotSafe", ORE, L);
return;
}
Value *Setup = InsertIterationSetup(LoopCountInit);
if (UsePHICounter || ForceHardwareLoopPHI) {
Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
Value *EltsRem = InsertPHICounter(Setup, LoopDec);
LoopDec->setOperand(0, EltsRem);
UpdateBranch(LoopDec);
} else
InsertLoopDec();
for (auto *I : L->blocks())
DeleteDeadPHIs(I);
}
static bool CanGenerateTest(Loop *L, Value *Count) {
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader->getSinglePredecessor())
return false;
BasicBlock *Pred = Preheader->getSinglePredecessor();
if (!isa<BranchInst>(Pred->getTerminator()))
return false;
auto *BI = cast<BranchInst>(Pred->getTerminator());
if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
return false;
auto ICmp = cast<ICmpInst>(BI->getCondition());
LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
if (!ICmp->isEquality())
return false;
auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
return false;
};
Value *CountBefZext =
isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
!IsCompareZero(ICmp, CountBefZext, 0) &&
!IsCompareZero(ICmp, CountBefZext, 1))
return false;
unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
if (BI->getSuccessor(SuccIdx) != Preheader)
return false;
return true;
}
Value *HardwareLoop::InitLoopCount() {
LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
SCEVExpander SCEVE(SE, DL, "loopcnt");
if (!ExitCount->getType()->isPointerTy() &&
ExitCount->getType() != CountType)
ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
SE.getZero(ExitCount->getType()))) {
LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
UseLoopGuard |= ForceGuardLoopEntry;
} else
UseLoopGuard = false;
BasicBlock *BB = L->getLoopPreheader();
if (UseLoopGuard && BB->getSinglePredecessor() &&
cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
BasicBlock *Predecessor = BB->getSinglePredecessor();
if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
UseLoopGuard = false;
else
BB = Predecessor;
}
if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
<< *ExitCount << "\n");
return nullptr;
}
Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
BB->getTerminator());
UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
<< " - Expanded Count in " << BB->getName() << "\n"
<< " - Will insert set counter intrinsic into: "
<< BeginBB->getName() << "\n");
return Count;
}
Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
IRBuilder<> Builder(BeginBB->getTerminator());
Type *Ty = LoopCountInit->getType();
bool UsePhi = UsePHICounter || ForceHardwareLoopPHI;
Intrinsic::ID ID = UseLoopGuard
? (UsePhi ? Intrinsic::test_start_loop_iterations
: Intrinsic::test_set_loop_iterations)
: (UsePhi ? Intrinsic::start_loop_iterations
: Intrinsic::set_loop_iterations);
Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
if (UseLoopGuard) {
assert((isa<BranchInst>(BeginBB->getTerminator()) &&
cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
"Expected conditional branch");
Value *SetCount =
UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
LoopGuard->setCondition(SetCount);
if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
LoopGuard->swapSuccessors();
}
LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
<< "\n");
if (UsePhi && UseLoopGuard)
LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
return !UsePhi ? LoopCountInit : LoopSetup;
}
void HardwareLoop::InsertLoopDec() {
IRBuilder<> CondBuilder(ExitBranch);
Function *DecFunc =
Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
LoopDecrement->getType());
Value *Ops[] = { LoopDecrement };
Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
Value *OldCond = ExitBranch->getCondition();
ExitBranch->setCondition(NewCond);
if (!L->contains(ExitBranch->getSuccessor(0)))
ExitBranch->swapSuccessors();
RecursivelyDeleteTriviallyDeadInstructions(OldCond);
LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
}
Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
IRBuilder<> CondBuilder(ExitBranch);
Function *DecFunc =
Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
{ EltsRem->getType() });
Value *Ops[] = { EltsRem, LoopDecrement };
Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
return cast<Instruction>(Call);
}
PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
BasicBlock *Preheader = L->getLoopPreheader();
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = ExitBranch->getParent();
IRBuilder<> Builder(Header->getFirstNonPHI());
PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
Index->addIncoming(NumElts, Preheader);
Index->addIncoming(EltsRem, Latch);
LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
return Index;
}
void HardwareLoop::UpdateBranch(Value *EltsRem) {
IRBuilder<> CondBuilder(ExitBranch);
Value *NewCond =
CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
Value *OldCond = ExitBranch->getCondition();
ExitBranch->setCondition(NewCond);
if (!L->contains(ExitBranch->getSuccessor(0)))
ExitBranch->swapSuccessors();
RecursivelyDeleteTriviallyDeadInstructions(OldCond);
}
INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }