#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
static const unsigned StackAdjustedAlignment = 4;
namespace {
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;
class Vectorizer {
Function &F;
AliasAnalysis &AA;
AssumptionCache &AC;
DominatorTree &DT;
ScalarEvolution &SE;
TargetTransformInfo &TTI;
const DataLayout &DL;
IRBuilder<> Builder;
public:
Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
: F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
bool run();
private:
unsigned getPointerAddressSpace(Value *I);
static const unsigned MaxDepth = 3;
bool isConsecutiveAccess(Value *A, Value *B);
bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
unsigned Depth = 0) const;
bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
unsigned Depth) const;
bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
unsigned Depth) const;
void reorder(Instruction *I);
std::pair<BasicBlock::iterator, BasicBlock::iterator>
getBoundaryInstrs(ArrayRef<Instruction *> Chain);
void eraseInstructions(ArrayRef<Instruction *> Chain);
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
bool vectorizeChains(InstrListMap &Map);
bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
bool
vectorizeLoadChain(ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
bool
vectorizeStoreChain(ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
Align Alignment);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
static char ID;
LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
StringRef getPassName() const override {
return "GPU Load and Store Vectorizer";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.setPreservesCFG();
}
};
}
char LoadStoreVectorizerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
"Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
"Vectorize load and store instructions", false, false)
Pass *llvm::createLoadStoreVectorizerPass() {
return new LoadStoreVectorizerLegacyPass();
}
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
Vectorizer V(F, AA, AC, DT, SE, TTI);
return V.run();
}
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return PreservedAnalyses::all();
AliasAnalysis &AA = AM.getResult<AAManager>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
Vectorizer V(F, AA, AC, DT, SE, TTI);
bool Changed = V.run();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return Changed ? PA : PreservedAnalyses::all();
}
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
SmallVector<Value *, 8> VL(IL.begin(), IL.end());
propagateMetadata(I, VL);
}
bool Vectorizer::run() {
bool Changed = false;
for (BasicBlock *BB : post_order(&F)) {
InstrListMap LoadRefs, StoreRefs;
std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
Changed |= vectorizeChains(LoadRefs);
Changed |= vectorizeChains(StoreRefs);
}
return Changed;
}
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
if (StoreInst *S = dyn_cast<StoreInst>(I))
return S->getPointerAddressSpace();
return -1;
}
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
Value *PtrA = getLoadStorePointerOperand(A);
Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getPointerAddressSpace(A);
unsigned ASB = getPointerAddressSpace(B);
if (!PtrA || !PtrB || (ASA != ASB))
return false;
Type *PtrATy = getLoadStoreType(A);
Type *PtrBTy = getLoadStoreType(B);
if (PtrA == PtrB ||
PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
DL.getTypeStoreSize(PtrBTy->getScalarType()))
return false;
unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
return areConsecutivePointers(PtrA, PtrB, Size);
}
bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
APInt PtrDelta, unsigned Depth) const {
unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
APInt OffsetA(PtrBitWidth, 0);
APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
return false;
assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
OffsetB.getMinSignedBits() <= NewPtrBitWidth);
OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
APInt OffsetDelta = OffsetB - OffsetA;
if (PtrA == PtrB)
return OffsetDelta == PtrDelta;
APInt BaseDelta = PtrDelta - OffsetDelta;
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
const SCEV *C = SE.getConstant(BaseDelta);
const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
if (X == PtrSCEVB)
return true;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
if (C == Dist)
return true;
return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
BinaryOperator *BinOpI = cast<BinaryOperator>(I);
return (Signed && BinOpI->hasNoSignedWrap()) ||
(!Signed && BinOpI->hasNoUnsignedWrap());
}
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
unsigned MatchingOpIdxA, Instruction *AddOpB,
unsigned MatchingOpIdxB, bool Signed) {
assert(AddOpA->getOpcode() == Instruction::Add &&
AddOpB->getOpcode() == Instruction::Add &&
checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
if (AddOpA->getOperand(MatchingOpIdxA) ==
AddOpB->getOperand(MatchingOpIdxB)) {
Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
checkNoWrapFlags(OtherInstrB, Signed) &&
isa<ConstantInt>(OtherInstrB->getOperand(1))) {
int64_t CstVal =
cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
if (OtherInstrB->getOperand(0) == OtherOperandA &&
IdxDiff.getSExtValue() == CstVal)
return true;
}
if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
checkNoWrapFlags(OtherInstrA, Signed) &&
isa<ConstantInt>(OtherInstrA->getOperand(1))) {
int64_t CstVal =
cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
if (OtherInstrA->getOperand(0) == OtherOperandB &&
IdxDiff.getSExtValue() == -CstVal)
return true;
}
if (OtherInstrA && OtherInstrB &&
OtherInstrA->getOpcode() == Instruction::Add &&
OtherInstrB->getOpcode() == Instruction::Add &&
checkNoWrapFlags(OtherInstrA, Signed) &&
checkNoWrapFlags(OtherInstrB, Signed) &&
isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
isa<ConstantInt>(OtherInstrB->getOperand(1))) {
int64_t CstValA =
cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
int64_t CstValB =
cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
IdxDiff.getSExtValue() == (CstValB - CstValA))
return true;
}
}
return false;
}
bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
APInt PtrDelta,
unsigned Depth) const {
auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
if (!GEPA || !GEPB)
return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
GEPA->getPointerOperand() != GEPB->getPointerOperand())
return false;
gep_type_iterator GTIA = gep_type_begin(GEPA);
gep_type_iterator GTIB = gep_type_begin(GEPB);
for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
if (GTIA.getOperand() != GTIB.getOperand())
return false;
++GTIA;
++GTIB;
}
Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
return false;
if (PtrDelta.isNegative()) {
if (PtrDelta.isMinSignedValue())
return false;
PtrDelta.negate();
std::swap(OpA, OpB);
}
uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
if (PtrDelta.urem(Stride) != 0)
return false;
unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
return false;
bool Signed = isa<SExtInst>(OpA);
Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
if (!OpB || ValA->getType() != OpB->getType())
return false;
bool Safe = false;
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
checkNoWrapFlags(OpB, Signed))
Safe = true;
OpA = dyn_cast<Instruction>(ValA);
if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
checkNoWrapFlags(OpB, Signed)) {
for (unsigned MatchingOpIdxA : {0, 1})
for (unsigned MatchingOpIdxB : {0, 1})
if (!Safe)
Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
MatchingOpIdxB, Signed);
}
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
if (Signed)
BitsAllowedToBeSet.clearBit(BitWidth - 1);
if (BitsAllowedToBeSet.ult(IdxDiff))
return false;
}
const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
return X == OffsetSCEVB;
}
bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
const APInt &PtrDelta,
unsigned Depth) const {
if (Depth++ == MaxDepth)
return false;
if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
return SelectA->getCondition() == SelectB->getCondition() &&
areConsecutivePointers(SelectA->getTrueValue(),
SelectB->getTrueValue(), PtrDelta, Depth) &&
areConsecutivePointers(SelectA->getFalseValue(),
SelectB->getFalseValue(), PtrDelta, Depth);
}
}
return false;
}
void Vectorizer::reorder(Instruction *I) {
SmallPtrSet<Instruction *, 16> InstructionsToMove;
SmallVector<Instruction *, 16> Worklist;
Worklist.push_back(I);
while (!Worklist.empty()) {
Instruction *IW = Worklist.pop_back_val();
int NumOperands = IW->getNumOperands();
for (int i = 0; i < NumOperands; i++) {
Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
if (!IM || IM->getOpcode() == Instruction::PHI)
continue;
if (IM->getParent() != I->getParent())
continue;
if (!IM->comesBefore(I)) {
InstructionsToMove.insert(IM);
Worklist.push_back(IM);
}
}
}
for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
++BBI) {
if (!InstructionsToMove.count(&*BBI))
continue;
Instruction *IM = &*BBI;
--BBI;
IM->removeFromParent();
IM->insertBefore(I);
}
}
std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
Instruction *C0 = Chain[0];
BasicBlock::iterator FirstInstr = C0->getIterator();
BasicBlock::iterator LastInstr = C0->getIterator();
BasicBlock *BB = C0->getParent();
unsigned NumFound = 0;
for (Instruction &I : *BB) {
if (!is_contained(Chain, &I))
continue;
++NumFound;
if (NumFound == 1) {
FirstInstr = I.getIterator();
}
if (NumFound == Chain.size()) {
LastInstr = I.getIterator();
break;
}
}
return std::make_pair(FirstInstr, ++LastInstr);
}
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
for (Instruction *I : Chain) {
Value *PtrOperand = getLoadStorePointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
Instrs.push_back(GEP);
}
for (Instruction *I : Instrs)
if (I->use_empty())
I->eraseFromParent();
}
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBits) {
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
if (NumLeft == Chain.size()) {
if ((NumLeft & 1) == 0)
NumLeft /= 2; else
--NumLeft; } else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> MemoryInstrs;
SmallVector<Instruction *, 16> ChainInstrs;
bool IsLoadChain = isa<LoadInst>(Chain[0]);
LLVM_DEBUG({
for (Instruction *I : Chain) {
if (IsLoadChain)
assert(isa<LoadInst>(I) &&
"All elements of Chain must be loads, or all must be stores.");
else
assert(isa<StoreInst>(I) &&
"All elements of Chain must be loads, or all must be stores.");
}
});
for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
ChainInstrs.push_back(&I);
continue;
}
if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
<< I << '\n');
break;
}
if (I.mayReadOrWriteMemory())
MemoryInstrs.push_back(&I);
}
unsigned ChainInstrIdx = 0;
Instruction *BarrierMemoryInstr = nullptr;
for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
break;
for (Instruction *MemInstr : MemoryInstrs) {
if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
break;
auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
if (MemLoad && ChainLoad)
continue;
auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
return LI->hasMetadata(LLVMContext::MD_invariant_load);
};
if (IsLoadChain) {
if (ChainInstr->comesBefore(MemInstr) ||
(ChainLoad && IsInvariantLoad(ChainLoad)))
continue;
} else {
if (MemInstr->comesBefore(ChainInstr) ||
(MemLoad && IsInvariantLoad(MemLoad)))
continue;
}
ModRefInfo MR =
AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
" Aliasing instruction:\n"
<< " " << *MemInstr << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
<< " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
});
BarrierMemoryInstr = MemInstr;
break;
}
}
if (IsLoadChain && BarrierMemoryInstr) {
assert(BarrierMemoryInstr->comesBefore(ChainInstr));
break;
}
}
SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
unsigned ChainIdx = 0;
for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
break;
}
return Chain.slice(0, ChainIdx);
}
static ChainID getChainID(const Value *Ptr) {
const Value *ObjPtr = getUnderlyingObject(Ptr);
if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
return Sel->getCondition();
}
return ObjPtr;
}
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
InstrListMap LoadRefs;
InstrListMap StoreRefs;
for (Instruction &I : *BB) {
if (!I.mayReadOrWriteMemory())
continue;
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isSimple())
continue;
if (!TTI.isLegalToVectorizeLoad(LI))
continue;
Type *Ty = LI->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
unsigned TySize = DL.getTypeSizeInBits(Ty);
if ((TySize % 8) != 0)
continue;
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
continue;
Value *Ptr = LI->getPointerOperand();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / TySize;
VectorType *VecTy = dyn_cast<VectorType>(Ty);
if (TySize > VecRegSize / 2 ||
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
const ChainID ID = getChainID(Ptr);
LoadRefs[ID].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
if (!TTI.isLegalToVectorizeStore(SI))
continue;
Type *Ty = SI->getValueOperand()->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
continue;
unsigned TySize = DL.getTypeSizeInBits(Ty);
if ((TySize % 8) != 0)
continue;
Value *Ptr = SI->getPointerOperand();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / TySize;
VectorType *VecTy = dyn_cast<VectorType>(Ty);
if (TySize > VecRegSize / 2 ||
(VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
const ChainID ID = getChainID(Ptr);
StoreRefs[ID].push_back(SI);
}
}
return {LoadRefs, StoreRefs};
}
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
for (const std::pair<ChainID, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
unsigned Len = std::min<unsigned>(CE - CI, 64);
ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
Changed |= vectorizeInstructions(Chunk);
}
}
return Changed;
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
<< " instructions.\n");
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
for (int i = 0, e = Instrs.size(); i < e; ++i) {
ConsecutiveChain[i] = -1;
for (int j = e - 1; j >= 0; --j) {
if (i == j)
continue;
if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
if (ConsecutiveChain[i] != -1) {
int CurDistance = std::abs(ConsecutiveChain[i] - i);
int NewDistance = std::abs(ConsecutiveChain[i] - j);
if (j < i || NewDistance > CurDistance)
continue; }
Tails.push_back(j);
Heads.push_back(i);
ConsecutiveChain[i] = j;
}
}
}
bool Changed = false;
SmallPtrSet<Instruction *, 16> InstructionsProcessed;
for (int Head : Heads) {
if (InstructionsProcessed.count(Instrs[Head]))
continue;
bool LongerChainExists = false;
for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
if (Head == Tails[TIt] &&
!InstructionsProcessed.count(Instrs[Heads[TIt]])) {
LongerChainExists = true;
break;
}
if (LongerChainExists)
continue;
SmallVector<Instruction *, 16> Operands;
int I = Head;
while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
if (InstructionsProcessed.count(Instrs[I]))
break;
Operands.push_back(Instrs[I]);
I = ConsecutiveChain[I];
}
bool Vectorized = false;
if (isa<LoadInst>(*Operands.begin()))
Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
else
Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
Changed |= Vectorized;
}
return Changed;
}
bool Vectorizer::vectorizeStoreChain(
ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
StoreInst *S0 = cast<StoreInst>(Chain[0]);
Type *StoreTy = nullptr;
for (Instruction *I : Chain) {
StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
if (StoreTy->isIntOrIntVectorTy())
break;
if (StoreTy->isPtrOrPtrVectorTy()) {
StoreTy = Type::getIntNTy(F.getParent()->getContext(),
DL.getTypeSizeInBits(StoreTy));
break;
}
}
assert(StoreTy && "Failed to find store type");
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
unsigned AS = S0->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
Align Alignment = S0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
if (NewChain.empty()) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
if (NewChain.size() == 1) {
InstructionsProcessed->insert(NewChain.front());
return false;
}
Chain = NewChain;
ChainSize = Chain.size();
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
FixedVectorType *VecTy;
auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
if (VecStoreTy)
VecTy = FixedVectorType::get(StoreTy->getScalarType(),
Chain.size() * VecStoreTy->getNumElements());
else
VecTy = FixedVectorType::get(StoreTy, Chain.size());
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
bool Vectorized = false;
Vectorized |=
vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
Vectorized |=
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
return Vectorized;
}
LLVM_DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
for (Instruction *I : Chain)
dbgs() << " " << *I << "\n";
});
InstructionsProcessed->insert(Chain.begin(), Chain.end());
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
bool Vectorized = false;
Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
return Vectorized;
}
Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
Align(StackAdjustedAlignment),
DL, S0, nullptr, &DT);
if (NewAlign >= Alignment)
Alignment = NewAlign;
else
return false;
}
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
bool Vectorized = false;
Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
return Vectorized;
}
BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*Last);
Value *Vec = PoisonValue::get(VecTy);
if (VecStoreTy) {
unsigned VecWidth = VecStoreTy->getNumElements();
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
StoreInst *Store = cast<StoreInst>(Chain[I]);
for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
unsigned NewIdx = J + I * VecWidth;
Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
Builder.getInt32(J));
if (Extract->getType() != StoreTy->getScalarType())
Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
Value *Insert =
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
Vec = Insert;
}
}
} else {
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
StoreInst *Store = cast<StoreInst>(Chain[I]);
Value *Extract = Store->getValueOperand();
if (Extract->getType() != StoreTy->getScalarType())
Extract =
Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
Value *Insert =
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
Vec = Insert;
}
}
StoreInst *SI = Builder.CreateAlignedStore(
Vec,
Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
Alignment);
propagateMetadata(SI, Chain);
eraseInstructions(Chain);
++NumVectorInstructions;
NumScalarsVectorized += Chain.size();
return true;
}
bool Vectorizer::vectorizeLoadChain(
ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
LoadInst *L0 = cast<LoadInst>(Chain[0]);
Type *LoadTy = nullptr;
for (const auto &V : Chain) {
LoadTy = cast<LoadInst>(V)->getType();
if (LoadTy->isIntOrIntVectorTy())
break;
if (LoadTy->isPtrOrPtrVectorTy()) {
LoadTy = Type::getIntNTy(F.getParent()->getContext(),
DL.getTypeSizeInBits(LoadTy));
break;
}
}
assert(LoadTy && "Can't determine LoadInst type from chain");
unsigned Sz = DL.getTypeSizeInBits(LoadTy);
unsigned AS = L0->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
Align Alignment = L0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
if (NewChain.empty()) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
if (NewChain.size() == 1) {
InstructionsProcessed->insert(NewChain.front());
return false;
}
Chain = NewChain;
ChainSize = Chain.size();
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
VectorType *VecTy;
auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
if (VecLoadTy)
VecTy = FixedVectorType::get(LoadTy->getScalarType(),
Chain.size() * VecLoadTy->getNumElements());
else
VecTy = FixedVectorType::get(LoadTy, Chain.size());
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
bool Vectorized = false;
Vectorized |=
vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
Vectorized |=
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
return Vectorized;
}
InstructionsProcessed->insert(Chain.begin(), Chain.end());
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
bool Vectorized = false;
Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
return Vectorized;
}
Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
Align(StackAdjustedAlignment),
DL, L0, nullptr, &DT);
if (NewAlign >= Alignment)
Alignment = NewAlign;
else
return false;
}
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
bool Vectorized = false;
Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
return Vectorized;
}
LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
I->dump();
});
BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*First);
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
LoadInst *LI =
Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
propagateMetadata(LI, Chain);
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
Value *CV = Chain[I];
Value *V;
if (VecLoadTy) {
unsigned VecWidth = VecLoadTy->getNumElements();
auto Mask =
llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
} else {
V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
}
if (V->getType() != CV->getType()) {
V = Builder.CreateBitOrPointerCast(V, CV->getType());
}
CV->replaceAllUsesWith(V);
}
Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
eraseInstructions(Chain);
++NumVectorInstructions;
NumScalarsVectorized += Chain.size();
return true;
}
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
Align Alignment) {
if (Alignment.value() % SzInBytes == 0)
return false;
bool Fast = false;
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
SzInBytes * 8, AddressSpace,
Alignment, &Fast);
LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
<< " and fast? " << Fast << "\n";);
return !Allows || !Fast;
}