#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "loop-idiom"
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
NumShiftUntilBitTest,
"Number of uncountable loops recognized as 'shift until bitttest' idiom");
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero' idiom");
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
cl::desc("Options to disable Loop Idiom Recognize Pass."),
cl::location(DisableLIRP::All), cl::init(false),
cl::ReallyHidden);
bool DisableLIRP::Memset;
static cl::opt<bool, true>
DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
cl::desc("Proceed with loop idiom recognize pass, but do "
"not convert loop(s) to memset."),
cl::location(DisableLIRP::Memset), cl::init(false),
cl::ReallyHidden);
bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
cl::desc("Proceed with loop idiom recognize pass, but do "
"not convert loop(s) to memcpy."),
cl::location(DisableLIRP::Memcpy), cl::init(false),
cl::ReallyHidden);
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
cl::desc("Use loop idiom recognition code size heuristics when compiling"
"with -Os/-Oz"),
cl::init(true), cl::Hidden);
namespace {
class LoopIdiomRecognize {
Loop *CurLoop = nullptr;
AliasAnalysis *AA;
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
const DataLayout *DL;
OptimizationRemarkEmitter &ORE;
bool ApplyCodeSizeHeuristics;
std::unique_ptr<MemorySSAUpdater> MSSAU;
public:
explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
LoopInfo *LI, ScalarEvolution *SE,
TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, MemorySSA *MSSA,
const DataLayout *DL,
OptimizationRemarkEmitter &ORE)
: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
if (MSSA)
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
}
bool runOnLoop(Loop *L);
private:
using StoreList = SmallVector<StoreInst *, 8>;
using StoreListMap = MapVector<Value *, StoreList>;
StoreListMap StoreRefsForMemset;
StoreListMap StoreRefsForMemsetPattern;
StoreList StoreRefsForMemcpy;
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
enum LegalStoreKind {
None = 0,
Memset,
MemsetPattern,
Memcpy,
UnorderedAtomicMemcpy,
DontUse };
bool runOnCountableLoop();
bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
LegalStoreKind isLegalStore(StoreInst *SI);
enum class ForMemset { No, Yes };
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
ForMemset For);
template <typename MemInst>
bool processLoopMemIntrinsic(
BasicBlock *BB,
bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
const SCEV *BECount);
bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
MaybeAlign StoreAlignment, Value *StoredVal,
Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores,
const SCEVAddRecExpr *Ev, const SCEV *BECount,
bool IsNegStride, bool IsLoopMemset = false);
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
const SCEV *StoreSize, MaybeAlign StoreAlign,
MaybeAlign LoadAlign, Instruction *TheStore,
Instruction *TheLoad,
const SCEVAddRecExpr *StoreEv,
const SCEVAddRecExpr *LoadEv,
const SCEV *BECount);
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
bool IsLoopMemset = false);
bool runOnNoncountableLoop();
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
bool recognizeAndInsertFFS(); void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
Instruction *CntInst, PHINode *CntPhi,
Value *Var, Instruction *DefX,
const DebugLoc &DL, bool ZeroCheck,
bool IsCntPhiUsedOutsideLoop);
bool recognizeShiftUntilBitTest();
bool recognizeShiftUntilZero();
};
class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
static char ID;
explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
initializeLoopIdiomRecognizeLegacyPassPass(
*PassRegistry::getPassRegistry());
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (DisableLIRP::All)
return false;
if (skipLoop(L))
return false;
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
*L->getHeader()->getParent());
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent());
const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
MemorySSA *MSSA = nullptr;
if (MSSAAnalysis)
MSSA = &MSSAAnalysis->getMSSA();
OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
return LIR.runOnLoop(L);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
}
char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
if (DisableLIRP::All)
return PreservedAnalyses::all();
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
AR.MSSA, DL, ORE);
if (!LIR.runOnLoop(&L))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
if (AR.MSSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
}
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
"Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
"Recognize loop idioms", false, false)
Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
static void deleteDeadInstruction(Instruction *I) {
I->replaceAllUsesWith(PoisonValue::get(I->getType()));
I->eraseFromParent();
}
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
CurLoop = L;
if (!L->getLoopPreheader())
return false;
StringRef Name = L->getHeader()->getParent()->getName();
if (Name == "memset" || Name == "memcpy")
return false;
ApplyCodeSizeHeuristics =
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
HasMemset = TLI->has(LibFunc_memset);
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
return runOnNoncountableLoop();
}
bool LoopIdiomRecognize::runOnCountableLoop() {
const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
assert(!isa<SCEVCouldNotCompute>(BECount) &&
"runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count");
if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
if (BECst->getAPInt() == 0)
return false;
SmallVector<BasicBlock *, 8> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName()
<< "] Countable Loop %" << CurLoop->getHeader()->getName()
<< "\n");
SimpleLoopSafetyInfo SafetyInfo;
SafetyInfo.computeLoopSafetyInfo(CurLoop);
if (SafetyInfo.anyBlockMayThrow())
return false;
bool MadeChange = false;
for (auto *BB : CurLoop->getBlocks()) {
if (LI->getLoopFor(BB) != CurLoop)
continue;
MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
}
return MadeChange;
}
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
return ConstStride->getAPInt();
}
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
Constant *C = dyn_cast<Constant>(V);
if (!C)
return nullptr;
uint64_t Size = DL->getTypeSizeInBits(V->getType());
if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
return nullptr;
if (DL->isBigEndian())
return nullptr;
Size /= 8;
if (Size > 16)
return nullptr;
if (Size == 16)
return C;
unsigned ArraySize = 16 / Size;
ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
if (SI->isVolatile())
return LegalStoreKind::None;
if (!SI->isUnordered())
return LegalStoreKind::None;
if (SI->getMetadata(LLVMContext::MD_nontemporal))
return LegalStoreKind::None;
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
return LegalStoreKind::None;
TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
(SizeInBits.getFixedSize() >> 32) != 0)
return LegalStoreKind::None;
const SCEVAddRecExpr *StoreEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
return LegalStoreKind::None;
if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
return LegalStoreKind::None;
Value *SplatValue = isBytewiseValue(StoredVal, *DL);
bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
CurLoop->isLoopInvariant(SplatValue)) {
return LegalStoreKind::Memset;
}
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
StorePtr->getType()->getPointerAddressSpace() == 0 &&
getMemSetPatternValue(StoredVal, DL)) {
return LegalStoreKind::MemsetPattern;
}
if (HasMemcpy && !DisableLIRP::Memcpy) {
APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
if (StoreSize != Stride && StoreSize != -Stride)
return LegalStoreKind::None;
LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
if (!LI || LI->isVolatile())
return LegalStoreKind::None;
if (!LI->isUnordered())
return LegalStoreKind::None;
const SCEVAddRecExpr *LoadEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
return LegalStoreKind::None;
if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
return LegalStoreKind::None;
UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
: LegalStoreKind::Memcpy;
}
return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
StoreRefsForMemset.clear();
StoreRefsForMemsetPattern.clear();
StoreRefsForMemcpy.clear();
for (Instruction &I : *BB) {
StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
switch (isLegalStore(SI)) {
case LegalStoreKind::None:
break;
case LegalStoreKind::Memset: {
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemset[Ptr].push_back(SI);
} break;
case LegalStoreKind::MemsetPattern: {
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemsetPattern[Ptr].push_back(SI);
} break;
case LegalStoreKind::Memcpy:
case LegalStoreKind::UnorderedAtomicMemcpy:
StoreRefsForMemcpy.push_back(SI);
break;
default:
assert(false && "unhandled return value");
break;
}
}
}
bool LoopIdiomRecognize::runOnLoopBlock(
BasicBlock *BB, const SCEV *BECount,
SmallVectorImpl<BasicBlock *> &ExitBlocks) {
for (BasicBlock *ExitBlock : ExitBlocks)
if (!DT->dominates(BB, ExitBlock))
return false;
bool MadeChange = false;
collectStores(BB);
for (auto &SL : StoreRefsForMemset)
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
for (auto &SL : StoreRefsForMemsetPattern)
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
for (auto &SI : StoreRefsForMemcpy)
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
MadeChange |= processLoopMemIntrinsic<MemSetInst>(
BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
return MadeChange;
}
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
const SCEV *BECount, ForMemset For) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = SL.size(); i < e; ++i) {
assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
Value *FirstStoredVal = SL[i]->getValueOperand();
Value *FirstStorePtr = SL[i]->getPointerOperand();
const SCEVAddRecExpr *FirstStoreEv =
cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
APInt FirstStride = getStoreStride(FirstStoreEv);
unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
Heads.insert(SL[i]);
continue;
}
Value *FirstSplatValue = nullptr;
Constant *FirstPatternValue = nullptr;
if (For == ForMemset::Yes)
FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
else
FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
assert((FirstSplatValue || FirstPatternValue) &&
"Expected either splat value or pattern value.");
IndexQueue.clear();
unsigned j = 0;
for (j = i + 1; j < e; ++j)
IndexQueue.push_back(j);
for (j = i; j > 0; --j)
IndexQueue.push_back(j - 1);
for (auto &k : IndexQueue) {
assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
Value *SecondStorePtr = SL[k]->getPointerOperand();
const SCEVAddRecExpr *SecondStoreEv =
cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
APInt SecondStride = getStoreStride(SecondStoreEv);
if (FirstStride != SecondStride)
continue;
Value *SecondStoredVal = SL[k]->getValueOperand();
Value *SecondSplatValue = nullptr;
Constant *SecondPatternValue = nullptr;
if (For == ForMemset::Yes)
SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
else
SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
assert((SecondSplatValue || SecondPatternValue) &&
"Expected either splat value or pattern value.");
if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
if (For == ForMemset::Yes) {
if (isa<UndefValue>(FirstSplatValue))
FirstSplatValue = SecondSplatValue;
if (FirstSplatValue != SecondSplatValue)
continue;
} else {
if (isa<UndefValue>(FirstPatternValue))
FirstPatternValue = SecondPatternValue;
if (FirstPatternValue != SecondPatternValue)
continue;
}
Tails.insert(SL[k]);
Heads.insert(SL[i]);
ConsecutiveChain[SL[i]] = SL[k];
break;
}
}
}
SmallPtrSet<Value *, 16> TransformedStores;
bool Changed = false;
for (StoreInst *I : Heads) {
if (Tails.count(I))
continue;
SmallPtrSet<Instruction *, 8> AdjacentStores;
StoreInst *HeadStore = I;
unsigned StoreSize = 0;
while (Tails.count(I) || Heads.count(I)) {
if (TransformedStores.count(I))
break;
AdjacentStores.insert(I);
StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
I = ConsecutiveChain[I];
}
Value *StoredVal = HeadStore->getValueOperand();
Value *StorePtr = HeadStore->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
APInt Stride = getStoreStride(StoreEv);
if (StoreSize != Stride && StoreSize != -Stride)
continue;
bool IsNegStride = StoreSize == -Stride;
Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
MaybeAlign(HeadStore->getAlign()), StoredVal,
HeadStore, AdjacentStores, StoreEv, BECount,
IsNegStride)) {
TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
Changed = true;
}
}
return Changed;
}
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
BasicBlock *BB,
bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
const SCEV *BECount) {
bool MadeChange = false;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Inst = &*I++;
if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
WeakTrackingVH InstPtr(&*I);
if (!(this->*Processor)(MI, BECount))
continue;
MadeChange = true;
if (!InstPtr)
I = BB->begin();
}
}
return MadeChange;
}
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
const SCEV *BECount) {
if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
return false;
if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
return false;
Value *Dest = MCI->getDest();
Value *Source = MCI->getSource();
if (!Dest || !Source)
return false;
const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
return false;
const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
return false;
uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
if ((SizeInBytes >> 32) != 0)
return false;
const SCEVConstant *ConstStoreStride =
dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
const SCEVConstant *ConstLoadStride =
dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
if (!ConstStoreStride || !ConstLoadStride)
return false;
APInt StoreStrideValue = ConstStoreStride->getAPInt();
APInt LoadStrideValue = ConstLoadStride->getAPInt();
if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
return false;
if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
<< ore::NV("Inst", "memcpy") << " in "
<< ore::NV("Function", MCI->getFunction())
<< " function will not be hoisted: "
<< ore::NV("Reason", "memcpy size is not equal to stride");
});
return false;
}
int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
if (StoreStrideInt != LoadStrideInt)
return false;
return processLoopStoreOfLoopLoad(
Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
BECount);
}
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {
if (MSI->isVolatile())
return false;
if (!HasMemset || DisableLIRP::Memset)
return false;
Value *Pointer = MSI->getDest();
const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
if (!Ev || Ev->getLoop() != CurLoop)
return false;
if (!Ev->isAffine()) {
LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
return false;
}
const SCEV *PointerStrideSCEV = Ev->getOperand(1);
const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
if (!PointerStrideSCEV || !MemsetSizeSCEV)
return false;
bool IsNegStride = false;
const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
if (IsConstantSize) {
LLVM_DEBUG(dbgs() << " memset size is constant\n");
uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
if (!ConstStride)
return false;
APInt Stride = ConstStride->getAPInt();
if (SizeInBytes != Stride && SizeInBytes != -Stride)
return false;
IsNegStride = SizeInBytes == -Stride;
} else {
LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
if (Pointer->getType()->getPointerAddressSpace() != 0) {
LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
<< "abort\n");
return false;
}
if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
<< "abort\n");
return false;
}
IsNegStride = PointerStrideSCEV->isNonConstantNegative();
const SCEV *PositiveStrideSCEV =
IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
: PointerStrideSCEV;
LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
<< " PositiveStrideSCEV: " << *PositiveStrideSCEV
<< "\n");
if (PositiveStrideSCEV != MemsetSizeSCEV) {
const SCEV *FoldedPositiveStride =
SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
const SCEV *FoldedMemsetSize =
SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
<< " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
<< " FoldedPositiveStride: " << *FoldedPositiveStride
<< "\n");
if (FoldedPositiveStride != FoldedMemsetSize) {
LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
return false;
}
}
}
Value *SplatValue = MSI->getValue();
if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
return false;
SmallPtrSet<Instruction *, 1> MSIs;
MSIs.insert(MSI);
return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
BECount, IsNegStride, true);
}
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
const SCEV *BECount, const SCEV *StoreSizeSCEV,
AliasAnalysis &AA,
SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
LocationSize AccessSize = LocationSize::afterPointer();
const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
if (BECst && ConstSize)
AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
ConstSize->getValue()->getZExtValue());
MemoryLocation StoreLoc(Ptr, AccessSize);
for (BasicBlock *B : L->blocks())
for (Instruction &I : *B)
if (!IgnoredInsts.contains(&I) &&
isModOrRefSet(
intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
return true;
return false;
}
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
Type *IntPtr, const SCEV *StoreSizeSCEV,
ScalarEvolution *SE) {
const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
if (!StoreSizeSCEV->isOne()) {
Index = SE->getMulExpr(Index,
SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
SCEV::FlagNUW);
}
return SE->getMinusSCEV(Start, Index);
}
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
Loop *CurLoop, const DataLayout *DL,
ScalarEvolution *SE) {
const SCEV *TripCountS = nullptr;
if (DL->getTypeSizeInBits(BECount->getType()) <
DL->getTypeSizeInBits(IntPtr) &&
SE->isLoopEntryGuardedByCond(
CurLoop, ICmpInst::ICMP_NE, BECount,
SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
TripCountS = SE->getZeroExtendExpr(
SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
IntPtr);
} else {
TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
SE->getOne(IntPtr), SCEV::FlagNUW);
}
return TripCountS;
}
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
const SCEV *StoreSizeSCEV, Loop *CurLoop,
const DataLayout *DL, ScalarEvolution *SE) {
const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
return SE->getMulExpr(TripCountSCEV,
SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
SCEV::FlagNUW);
}
bool LoopIdiomRecognize::processLoopStridedStore(
Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
Value *StoredVal, Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
Module *M = TheStore->getModule();
Value *SplatValue = isBytewiseValue(StoredVal, *DL);
Constant *PatternValue = nullptr;
if (!SplatValue)
PatternValue = getMemSetPatternValue(StoredVal, DL);
assert((SplatValue || PatternValue) &&
"Expected either splat value or pattern value.");
unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
SCEVExpanderCleaner ExpCleaner(Expander);
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
bool Changed = false;
const SCEV *Start = Ev->getStart();
if (IsNegStride)
Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
if (!Expander.isSafeToExpand(Start))
return Changed;
Value *BasePtr =
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
Changed = true;
if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSizeSCEV, *AA, Stores))
return Changed;
if (avoidLIRForMultiBlockLoop(true, IsLoopMemset))
return Changed;
const SCEV *NumBytesS =
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
if (!Expander.isSafeToExpand(NumBytesS))
return Changed;
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
CallInst *NewCall;
if (SplatValue) {
AAMDNodes AATags = TheStore->getAAMetadata();
for (Instruction *Store : Stores)
AATags = AATags.merge(Store->getAAMetadata());
if (auto CI = dyn_cast<ConstantInt>(NumBytes))
AATags = AATags.extendTo(CI->getZExtValue());
else
AATags = AATags.extendTo(-1);
NewCall = Builder.CreateMemSet(
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
} else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
Type *Int8PtrTy = DestInt8PtrTy;
StringRef FuncName = "memset_pattern16";
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(Align(16));
Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
} else
return Changed;
NewCall->setDebugLoc(TheStore->getDebugLoc());
if (MSSAU) {
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
}
LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
<< " from store to: " << *Ev << " at: " << *TheStore
<< "\n");
ORE.emit([&]() {
OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
NewCall->getDebugLoc(), Preheader);
R << "Transformed loop-strided store in "
<< ore::NV("Function", TheStore->getFunction())
<< " function into a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() intrinsic";
if (!Stores.empty())
R << ore::setExtraArgs();
for (auto *I : Stores) {
R << ore::NV("FromBlock", I->getParent()->getName())
<< ore::NV("ToBlock", Preheader->getName());
}
return R;
});
for (auto *I : Stores) {
if (MSSAU)
MSSAU->removeMemoryAccess(I, true);
deleteDeadInstruction(I);
}
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemSet;
ExpCleaner.markResultUsed();
return true;
}
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *BECount) {
assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
Value *LoadPtr = LI->getPointerOperand();
const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
SI->getAlign(), LI->getAlign(), SI, LI,
StoreEv, LoadEv, BECount);
}
class MemmoveVerifier {
public:
explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
const DataLayout &DL)
: DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
BP2(llvm::GetPointerBaseWithConstantOffset(
StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
IsSameObject(BP1 == BP2) {}
bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
const Instruction &TheLoad,
bool IsMemCpy) const {
if (IsMemCpy) {
if ((!IsNegStride && LoadOff <= StoreOff) ||
(IsNegStride && LoadOff >= StoreOff))
return false;
} else {
int64_t LoadSize =
DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
return false;
if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
(IsNegStride && LoadOff + LoadSize > StoreOff))
return false;
}
return true;
}
private:
const DataLayout &DL;
int64_t LoadOff = 0;
int64_t StoreOff = 0;
const Value *BP1;
const Value *BP2;
public:
const bool IsSameObject;
};
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
if (isa<MemCpyInlineInst>(TheStore))
return false;
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
SCEVExpanderCleaner ExpCleaner(Expander);
bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
APInt Stride = getStoreStride(StoreEv);
const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
assert(ConstStoreSize && "store size is expected to be a constant");
int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
bool IsNegStride = StoreSize == -Stride;
if (IsNegStride)
StrStart =
getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
Value *StoreBasePtr = Expander.expandCodeFor(
StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
Changed = true;
SmallPtrSet<Instruction *, 2> IgnoredInsts;
IgnoredInsts.insert(TheStore);
bool IsMemCpy = isa<MemCpyInst>(TheStore);
const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
bool LoopAccessStore =
mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSizeSCEV, *AA, IgnoredInsts);
if (LoopAccessStore) {
if (!TheLoad->hasOneUse())
return Changed;
IgnoredInsts.insert(TheLoad);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
TheStore)
<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access store location");
});
return Changed;
}
IgnoredInsts.erase(TheLoad);
}
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
if (IsNegStride)
LdStart =
getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
if (IsMemCpy && !Verifier.IsSameObject)
IgnoredInsts.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSizeSCEV, *AA, IgnoredInsts)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access load location");
});
return Changed;
}
bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
if (UseMemMove)
if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
IsMemCpy))
return Changed;
if (avoidLIRForMultiBlockLoop())
return Changed;
const SCEV *NumBytesS =
getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
AAMDNodes AATags = TheLoad->getAAMetadata();
AAMDNodes StoreAATags = TheStore->getAAMetadata();
AATags = AATags.merge(StoreAATags);
if (auto CI = dyn_cast<ConstantInt>(NumBytes))
AATags = AATags.extendTo(CI->getZExtValue());
else
AATags = AATags.extendTo(-1);
CallInst *NewCall = nullptr;
if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
if (UseMemMove)
NewCall = Builder.CreateMemMove(
StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
else
NewCall =
Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
NumBytes, false, AATags.TBAA,
AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
} else {
if (UseMemMove)
return Changed;
assert((StoreAlign && LoadAlign) &&
"Expect unordered load/store to have align.");
if (StoreAlign.value() < StoreSize || LoadAlign.value() < StoreSize)
return Changed;
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return Changed;
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
StoreBasePtr, StoreAlign.value(), LoadBasePtr, LoadAlign.value(),
NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
AATags.NoAlias);
}
NewCall->setDebugLoc(TheStore->getDebugLoc());
if (MSSAU) {
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
}
LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
<< " from load ptr=" << *LoadEv << " at: " << *TheLoad
<< "\n"
<< " from store ptr=" << *StoreEv << " at: " << *TheStore
<< "\n");
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() intrinsic from " << ore::NV("Inst", InstRemark)
<< " instruction in " << ore::NV("Function", TheStore->getFunction())
<< " function"
<< ore::setExtraArgs()
<< ore::NV("FromBlock", TheStore->getParent()->getName())
<< ore::NV("ToBlock", Preheader->getName());
});
if (MSSAU)
MSSAU->removeMemoryAccess(TheStore, true);
deleteDeadInstruction(TheStore);
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
if (UseMemMove)
++NumMemMove;
else
++NumMemCpy;
ExpCleaner.markResultUsed();
return true;
}
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")
<< " avoided: multi-block top-level loop\n");
return true;
}
}
return false;
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName()
<< "] Noncountable Loop %"
<< CurLoop->getHeader()->getName() << "\n");
return recognizePopcount() || recognizeAndInsertFFS() ||
recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
bool JmpOnZero = false) {
if (!BI || !BI->isConditional())
return nullptr;
ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
if (!Cond)
return nullptr;
ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
if (!CmpZero || !CmpZero->isZero())
return nullptr;
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
if (JmpOnZero)
std::swap(TrueSucc, FalseSucc);
ICmpInst::Predicate Pred = Cond->getPredicate();
if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
return Cond->getOperand(0);
return nullptr;
}
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
BasicBlock *LoopEntry) {
auto *PhiX = dyn_cast<PHINode>(VarX);
if (PhiX && PhiX->getParent() == LoopEntry &&
(PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
return PhiX;
return nullptr;
}
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
Instruction *&CntInst, PHINode *&CntPhi,
Value *&Var) {
BasicBlock *LoopEntry;
Instruction *DefX2, *CountInst;
Value *VarX1, *VarX0;
PHINode *PhiX, *CountPhi;
DefX2 = CountInst = nullptr;
VarX1 = VarX0 = nullptr;
PhiX = CountPhi = nullptr;
LoopEntry = *(CurLoop->block_begin());
{
if (Value *T = matchCondition(
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
DefX2 = dyn_cast<Instruction>(T);
else
return false;
}
{
if (!DefX2 || DefX2->getOpcode() != Instruction::And)
return false;
BinaryOperator *SubOneOp;
if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
VarX1 = DefX2->getOperand(1);
else {
VarX1 = DefX2->getOperand(0);
SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
}
if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
return false;
ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
if (!Dec ||
!((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
(SubOneOp->getOpcode() == Instruction::Add &&
Dec->isMinusOne()))) {
return false;
}
}
PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
if (!PhiX)
return false;
{
CountInst = nullptr;
for (Instruction &Inst : llvm::make_range(
LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
if (Inst.getOpcode() != Instruction::Add)
continue;
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
if (!Inc || !Inc->isOne())
continue;
PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
if (!Phi)
continue;
bool LiveOutLoop = false;
for (User *U : Inst.users()) {
if ((cast<Instruction>(U))->getParent() != LoopEntry) {
LiveOutLoop = true;
break;
}
}
if (LiveOutLoop) {
CountInst = &Inst;
CountPhi = Phi;
break;
}
}
if (!CountInst)
return false;
}
{
auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
return false;
CntInst = CountInst;
CntPhi = CountPhi;
Var = T;
}
return true;
}
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
Intrinsic::ID &IntrinID, Value *&InitX,
Instruction *&CntInst, PHINode *&CntPhi,
Instruction *&DefX) {
BasicBlock *LoopEntry;
Value *VarX = nullptr;
DefX = nullptr;
CntInst = nullptr;
CntPhi = nullptr;
LoopEntry = *(CurLoop->block_begin());
if (Value *T = matchCondition(
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
DefX = dyn_cast<Instruction>(T);
else
return false;
if (!DefX || !DefX->isShift())
return false;
IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
Intrinsic::ctlz;
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
if (!Shft || !Shft->isOne())
return false;
VarX = DefX->getOperand(0);
PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
if (!PhiX)
return false;
InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
return false;
for (Instruction &Inst : llvm::make_range(
LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
if (Inst.getOpcode() != Instruction::Add)
continue;
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
continue;
PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
if (!Phi)
continue;
CntInst = &Inst;
CntPhi = Phi;
break;
}
if (!CntInst)
return false;
return true;
}
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;
Intrinsic::ID IntrinID;
Value *InitX;
Instruction *DefX = nullptr;
PHINode *CntPhi = nullptr;
Instruction *CntInst = nullptr;
size_t IdiomCanonicalSize = 6;
if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
CntInst, CntPhi, DefX))
return false;
bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntPhiUsedOutsideLoop = true;
break;
}
bool IsCntInstUsedOutsideLoop = false;
for (User *U : CntInst->users())
if (!CurLoop->contains(cast<Instruction>(U))) {
IsCntInstUsedOutsideLoop = true;
break;
}
if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
return false;
bool ZeroCheck = false;
BasicBlock *PH = CurLoop->getLoopPreheader();
if (!IsCntPhiUsedOutsideLoop) {
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
if (!PreCondBI)
return false;
if (matchCondition(PreCondBI, PH) != InitX)
return false;
ZeroCheck = true;
}
const Value *Args[] = {InitX,
ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
uint32_t HeaderSize =
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
InstructionCost Cost =
TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
if (HeaderSize != IdiomCanonicalSize &&
Cost > TargetTransformInfo::TCC_Basic)
return false;
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
DefX->getDebugLoc(), ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}
bool LoopIdiomRecognize::recognizePopcount() {
if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
return false;
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;
BasicBlock *LoopBody = *(CurLoop->block_begin());
if (LoopBody->size() >= 20) {
return false;
}
BasicBlock *PH = CurLoop->getLoopPreheader();
if (!PH || &PH->front() != PH->getTerminator())
return false;
auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
if (!EntryBI || EntryBI->isConditional())
return false;
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
if (!PreCondBI || PreCondBI->isUnconditional())
return false;
Instruction *CntInst;
PHINode *CntPhi;
Value *Val;
if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
return false;
transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
return true;
}
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
const DebugLoc &DL) {
Value *Ops[] = {Val};
Type *Tys[] = {Val->getType()};
Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
CallInst *CI = IRBuilder.CreateCall(Func, Ops);
CI->setDebugLoc(DL);
return CI;
}
static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
const DebugLoc &DL, bool ZeroCheck,
Intrinsic::ID IID) {
Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
Type *Tys[] = {Val->getType()};
Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
CallInst *CI = IRBuilder.CreateCall(Func, Ops);
CI->setDebugLoc(DL);
return CI;
}
void LoopIdiomRecognize::transformLoopToCountable(
Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
IRBuilder<> Builder(PreheaderBr);
Builder.SetCurrentDebugLocation(DL);
Value *InitXNext;
if (IsCntPhiUsedOutsideLoop) {
if (DefX->getOpcode() == Instruction::AShr)
InitXNext = Builder.CreateAShr(InitX, 1);
else if (DefX->getOpcode() == Instruction::LShr)
InitXNext = Builder.CreateLShr(InitX, 1);
else if (DefX->getOpcode() == Instruction::Shl) InitXNext = Builder.CreateShl(InitX, 1);
else
llvm_unreachable("Unexpected opcode!");
} else
InitXNext = InitX;
Value *Count =
createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
Type *CountTy = Count->getType();
Count = Builder.CreateSub(
ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
Value *NewCount = Count;
if (IsCntPhiUsedOutsideLoop)
Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
if (!InitConst || !InitConst->isZero())
NewCount = Builder.CreateAdd(NewCount, CntInitVal);
} else {
NewCount = Builder.CreateSub(CntInitVal, NewCount);
}
BasicBlock *Body = *(CurLoop->block_begin());
auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
Builder.SetInsertPoint(LbCond);
Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
TcPhi->addIncoming(Count, Preheader);
TcPhi->addIncoming(TcDec, Body);
CmpInst::Predicate Pred =
(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
LbCond->setPredicate(Pred);
LbCond->setOperand(0, TcDec);
LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
if (IsCntPhiUsedOutsideLoop)
CntPhi->replaceUsesOutsideBlock(NewCount, Body);
else
CntInst->replaceUsesOutsideBlock(NewCount, Body);
SE->forgetLoop(CurLoop);
}
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
BasicBlock *PreHead = CurLoop->getLoopPreheader();
auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
const DebugLoc &DL = CntInst->getDebugLoc();
IRBuilder<> Builder(PreCondBr);
Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
{
PopCnt = createPopcntIntrinsic(Builder, Var, DL);
NewCount = PopCntZext =
Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
if (NewCount != PopCnt)
(cast<Instruction>(NewCount))->setDebugLoc(DL);
TripCnt = NewCount;
Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
if (!InitConst || !InitConst->isZero()) {
NewCount = Builder.CreateAdd(NewCount, CntInitVal);
(cast<Instruction>(NewCount))->setDebugLoc(DL);
}
}
{
ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
Value *Opnd0 = PopCntZext;
Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
if (PreCond->getOperand(0) != Var)
std::swap(Opnd0, Opnd1);
ICmpInst *NewPreCond = cast<ICmpInst>(
Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
PreCondBr->setCondition(NewPreCond);
RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
}
BasicBlock *Body = *(CurLoop->block_begin());
{
auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
Builder.SetInsertPoint(LbCond);
Instruction *TcDec = cast<Instruction>(
Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
"tcdec", false, true));
TcPhi->addIncoming(TripCnt, PreHead);
TcPhi->addIncoming(TcDec, Body);
CmpInst::Predicate Pred =
(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
LbCond->setPredicate(Pred);
LbCond->setOperand(0, TcDec);
LbCond->setOperand(1, ConstantInt::get(Ty, 0));
}
CntInst->replaceUsesOutsideBlock(NewCount, Body);
SE->forgetLoop(CurLoop);
}
template <typename SubPattern_t> struct match_LoopInvariant {
SubPattern_t SubPattern;
const Loop *L;
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
: SubPattern(SP), L(L) {}
template <typename ITy> bool match(ITy *V) {
return L->isLoopInvariant(V) && SubPattern.match(V);
}
};
template <typename Ty>
inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
return match_LoopInvariant<Ty>(M, L);
}
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
Value *&BitMask, Value *&BitPos,
Value *&CurrX, Instruction *&NextX) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Performing shift-until-bittest idiom detection.\n");
if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
return false;
}
BasicBlock *LoopHeaderBB = CurLoop->getHeader();
BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
assert(LoopPreheaderBB && "There is always a loop preheader.");
using namespace PatternMatch;
ICmpInst::Predicate Pred;
Value *CmpLHS, *CmpRHS;
BasicBlock *TrueBB, *FalseBB;
if (!match(LoopHeaderBB->getTerminator(),
m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
return false;
}
auto MatchVariableBitMask = [&]() {
return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
match(CmpLHS,
m_c_And(m_Value(CurrX),
m_CombineAnd(
m_Value(BitMask),
m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
CurLoop))));
};
auto MatchConstantBitMask = [&]() {
return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
match(CmpLHS, m_And(m_Value(CurrX),
m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
(BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
};
auto MatchDecomposableConstantBitMask = [&]() {
APInt Mask;
return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
(BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
(BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
};
if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
!MatchDecomposableConstantBitMask()) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
return false;
}
auto *CurrXPN = dyn_cast<PHINode>(CurrX);
if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
return false;
}
BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
NextX =
dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
assert(CurLoop->isLoopInvariant(BaseX) &&
"Expected BaseX to be avaliable in the preheader!");
if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
return false;
}
assert(ICmpInst::isEquality(Pred) &&
"Should only get equality predicates here.");
if (Pred != ICmpInst::Predicate::ICMP_EQ) {
Pred = ICmpInst::getInversePredicate(Pred);
std::swap(TrueBB, FalseBB);
}
if (TrueBB != LoopHeaderBB) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
return false;
}
return true;
}
bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
bool MadeChange = false;
Value *X, *BitMask, *BitPos, *XCurr;
Instruction *XNext;
if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
XNext)) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" shift-until-bittest idiom detection failed.\n");
return MadeChange;
}
LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
BasicBlock *LoopHeaderBB = CurLoop->getHeader();
BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
assert(LoopPreheaderBB && "There is always a loop preheader.");
BasicBlock *SuccessorBB = CurLoop->getExitBlock();
assert(SuccessorBB && "There is only a single successor.");
IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
Intrinsic::ID IntrID = Intrinsic::ctlz;
Type *Ty = X->getType();
unsigned Bitwidth = Ty->getScalarSizeInBits();
TargetTransformInfo::TargetCostKind CostKind =
TargetTransformInfo::TCK_SizeAndLatency;
IntrinsicCostAttributes Attrs(
IntrID, Ty, {UndefValue::get(Ty), Builder.getTrue()});
InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
if (Cost > TargetTransformInfo::TCC_Basic) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Intrinsic is too costly, not beneficial\n");
return MadeChange;
}
if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
TargetTransformInfo::TCC_Basic) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
return MadeChange;
}
MadeChange = true;
Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
BitPos->getName() + ".lowbitmask");
Value *Mask =
Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
IntrID, Ty, {XMasked, Builder.getTrue()},
nullptr, XMasked->getName() + ".numleadingzeros");
Value *XMaskedNumActiveBits = Builder.CreateSub(
ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
XMasked->getName() + ".numactivebits", true,
Bitwidth != 2);
Value *XMaskedLeadingOnePos =
Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
XMasked->getName() + ".leadingonepos", false,
Bitwidth > 2);
Value *LoopBackedgeTakenCount = Builder.CreateSub(
BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
true, true);
Value *LoopTripCount =
Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
CurLoop->getName() + ".tripcount", true,
Bitwidth != 2);
Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
NewX->takeName(XCurr);
if (auto *I = dyn_cast<Instruction>(NewX))
I->copyIRFlags(XNext, true);
Value *NewXNext;
if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
PatternMatch::match(
BitPos, PatternMatch::m_SpecificInt_ICMP(
ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
Ty->getScalarSizeInBits() - 1))))
NewXNext = Builder.CreateShl(X, LoopTripCount);
else {
NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
}
NewXNext->takeName(XNext);
if (auto *I = dyn_cast<Instruction>(NewXNext))
I->copyIRFlags(XNext, true);
XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
Builder.SetInsertPoint(&LoopHeaderBB->front());
auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
auto *IVNext =
Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
true, Bitwidth != 2);
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
LoopHeaderBB->getTerminator()->eraseFromParent();
IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
IV->addIncoming(IVNext, LoopHeaderBB);
SE->forgetLoop(CurLoop);
LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
++NumShiftUntilBitTest;
return MadeChange;
}
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
Instruction *&ValShiftedIsZero,
Intrinsic::ID &IntrinID, Instruction *&IV,
Value *&Start, Value *&Val,
const SCEV *&ExtraOffsetExpr,
bool &InvertedCond) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Performing shift-until-zero idiom detection.\n");
if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
return false;
}
Instruction *ValShifted, *NBits, *IVNext;
Value *ExtraOffset;
BasicBlock *LoopHeaderBB = CurLoop->getHeader();
BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
assert(LoopPreheaderBB && "There is always a loop preheader.");
using namespace PatternMatch;
ICmpInst::Predicate Pred;
BasicBlock *TrueBB, *FalseBB;
if (!match(LoopHeaderBB->getTerminator(),
m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
m_BasicBlock(FalseBB))) ||
!match(ValShiftedIsZero,
m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
!ICmpInst::isEquality(Pred)) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
return false;
}
if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
m_Instruction(NBits)))) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
return false;
}
IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
: Intrinsic::ctlz;
if (match(NBits, m_c_Add(m_Instruction(IV),
m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
(NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
else if (match(NBits,
m_Sub(m_Instruction(IV),
m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
NBits->hasNoSignedWrap())
ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
else {
IV = NBits;
ExtraOffsetExpr = SE->getZero(NBits->getType());
}
auto *IVPN = dyn_cast<PHINode>(IV);
if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
return false;
}
Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
return false;
}
assert(ICmpInst::isEquality(Pred) &&
"Should only get equality predicates here.");
InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
if (InvertedCond) {
Pred = ICmpInst::getInversePredicate(Pred);
std::swap(TrueBB, FalseBB);
}
if (FalseBB != LoopHeaderBB) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
return false;
}
if (ValShifted->getOpcode() == Instruction::AShr &&
!isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
return false;
}
return true;
}
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
bool MadeChange = false;
Instruction *ValShiftedIsZero;
Intrinsic::ID IntrID;
Instruction *IV;
Value *Start, *Val;
const SCEV *ExtraOffsetExpr;
bool InvertedCond;
if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
Start, Val, ExtraOffsetExpr, InvertedCond)) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" shift-until-zero idiom detection failed.\n");
return MadeChange;
}
LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
BasicBlock *LoopHeaderBB = CurLoop->getHeader();
BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
assert(LoopPreheaderBB && "There is always a loop preheader.");
BasicBlock *SuccessorBB = CurLoop->getExitBlock();
assert(SuccessorBB && "There is only a single successor.");
IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
Builder.SetCurrentDebugLocation(IV->getDebugLoc());
Type *Ty = Val->getType();
unsigned Bitwidth = Ty->getScalarSizeInBits();
TargetTransformInfo::TargetCostKind CostKind =
TargetTransformInfo::TCK_SizeAndLatency;
IntrinsicCostAttributes Attrs(
IntrID, Ty, {UndefValue::get(Ty), Builder.getFalse()});
InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
if (Cost > TargetTransformInfo::TCC_Basic) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Intrinsic is too costly, not beneficial\n");
return MadeChange;
}
MadeChange = true;
bool OffsetIsZero = false;
if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
OffsetIsZero = ExtraOffsetExprC->isZero();
CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
IntrID, Ty, {Val, Builder.getFalse()},
nullptr, Val->getName() + ".numleadingzeros");
Value *ValNumActiveBits = Builder.CreateSub(
ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
Val->getName() + ".numactivebits", true,
Bitwidth != 2);
SCEVExpander Expander(*SE, *DL, "loop-idiom");
Expander.setInsertPoint(&*Builder.GetInsertPoint());
Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
Value *ValNumActiveBitsOffset = Builder.CreateAdd(
ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
OffsetIsZero, true);
Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
{ValNumActiveBitsOffset, Start},
nullptr, "iv.final");
auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
OffsetIsZero, true));
Value *LoopTripCount =
Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
CurLoop->getName() + ".tripcount", true,
Bitwidth != 2);
IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
Builder.SetInsertPoint(&LoopHeaderBB->front());
auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
auto *CIVNext =
Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
true, Bitwidth != 2);
auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
auto *NewIVCheck = CIVCheck;
if (InvertedCond) {
NewIVCheck = Builder.CreateNot(CIVCheck);
NewIVCheck->takeName(ValShiftedIsZero);
}
auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", false,
true); IVDePHId->takeName(IV);
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
LoopHeaderBB->getTerminator()->eraseFromParent();
CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
CIV->addIncoming(CIVNext, LoopHeaderBB);
SE->forgetLoop(CurLoop);
IV->replaceAllUsesWith(IVDePHId);
IV->eraseFromParent();
ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
ValShiftedIsZero->eraseFromParent();
LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
++NumShiftUntilZero;
return MadeChange;
}