#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "loop-idiom"
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bitttest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");
bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);
bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);
bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);
static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling"
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);
namespace {
class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;
public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }
  bool runOnLoop(Loop *L);
private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;
  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;
    enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse               };
    
  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);
  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);
  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);
      
  bool runOnNoncountableLoop();
  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS();    void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);
  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();
  };
class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;
  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }
  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;
    if (skipLoop(L))
      return false;
    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();
                OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }
      void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};
} 
char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();
  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
        OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();
  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)
Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
      if (!L->getLoopPreheader())
    return false;
    StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;
    ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);
  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();
  return runOnNoncountableLoop();
}
bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable"
         "backedge-taken count");
      if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;
  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");
      SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;
  bool MadeChange = false;
    for (auto *BB : CurLoop->getBlocks()) {
        if (LI->getLoopFor(BB) != CurLoop)
      continue;
    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
    
        Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;
    uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;
    if (DL->isBigEndian())
    return nullptr;
    Size /= 8;
      if (Size > 16)
    return nullptr;
    if (Size == 16)
    return C;
    unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
    if (SI->isVolatile())
    return LegalStoreKind::None;
    if (!SI->isUnordered())
    return LegalStoreKind::None;
    if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;
  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();
      if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;
        TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;
        const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;
    if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;
  
          Value *SplatValue = isBytewiseValue(StoredVal, *DL);
    bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
      if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
                  CurLoop->isLoopInvariant(SplatValue)) {
        return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
            StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
        return LegalStoreKind::MemsetPattern;
  }
    if (HasMemcpy && !DisableLIRP::Memcpy) {
            APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;
        LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
        if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
        if (!LI->isUnordered())
      return LegalStoreKind::None;
                const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;
        if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;
        UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
    return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;
        switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
            break;
    case LegalStoreKind::Memset: {
            Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
            Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
        for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;
  bool MadeChange = false;
    collectStores(BB);
        for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
    for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
  return MadeChange;
}
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
    SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
      SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
        if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }
    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;
    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");
    IndexQueue.clear();
                    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);
    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);
      if (FirstStride != SecondStride)
        continue;
      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;
      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");
      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }
      SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;
    for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;
            SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;
        while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);
      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
            I = ConsecutiveChain[I];
    }
    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);
            if (StoreSize != Stride && StoreSize != -Stride)
      continue;
    bool IsNegStride = StoreSize == -Stride;
    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }
  return Changed;
}
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
        if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;
                  if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
    if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;
    if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;
  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;
        const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;
    uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;
      const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;
  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
    if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;
  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }
  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
    if (StoreStrideInt != LoadStrideInt)
    return false;
  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
    if (MSI->isVolatile())
    return false;
    if (!HasMemset || DisableLIRP::Memset)
    return false;
  Value *Pointer = MSI->getDest();
        const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }
  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;
  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
  if (IsConstantSize) {
                LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;
    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;
    IsNegStride = SizeInBytes == -Stride;
  } else {
                        LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not a loop-invariant, "
                        << "abort\n");
      return false;
    }
        IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");
    if (PositiveStrideSCEV != MemsetSizeSCEV) {
                  const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");
      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEV don't match, abort\n");
        return false;
      }
    }
  }
      Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;
  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, true);
}
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
        LocationSize AccessSize = LocationSize::afterPointer();
      const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());
          MemoryLocation StoreLoc(Ptr, AccessSize);
  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;
  return false;
}
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
        Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
    return SE->getMinusSCEV(Start, Index);
}
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
              if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }
  return TripCountS;
}
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;
  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);
  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");
        unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);
  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
  bool Changed = false;
  const SCEV *Start = Ev->getStart();
    if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
      if (!Expander.isSafeToExpand(Start))
    return Changed;
            Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
                Changed = true;
  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;
  if (avoidLIRForMultiBlockLoop(true, IsLoopMemset))
    return Changed;
  
  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
      if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;
  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);
    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
        Type *Int8PtrTy = DestInt8PtrTy;
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                            Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
            GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);     GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;
  NewCall->setDebugLoc(TheStore->getDebugLoc());
  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }
  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");
  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });
      for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
        Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}
  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
                  if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
                  int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }
private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;
public:
  const bool IsSameObject;
};
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
        if (isa<MemCpyInlineInst>(TheStore))
    return false;
        BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);
  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
    assert(ConstStoreSize && "store size is expected to be a constant");
  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;
    if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
              Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
                Changed = true;
  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);
  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
                if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }
  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
    if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
      Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
      MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }
  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;
  if (avoidLIRForMultiBlockLoop())
    return Changed;
  
  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);
  CallInst *NewCall = nullptr;
        if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
        if (UseMemMove)
      return Changed;
            assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (StoreAlign.value() < StoreSize || LoadAlign.value() < StoreSize)
      return Changed;
                    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;
                NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, StoreAlign.value(), LoadBasePtr, LoadAlign.value(),
        NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
        AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());
  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }
  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });
      if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }
  return false;
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");
  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;
  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;
  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;
  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);
  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);
  return nullptr;
}
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
                                 BasicBlock *LoopEntry) {
  auto *PhiX = dyn_cast<PHINode>(VarX);
  if (PhiX && PhiX->getParent() == LoopEntry &&
      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
    return PhiX;
  return nullptr;
}
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
      BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;
  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());
    {
    if (Value *T = matchCondition(
            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }
    {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;
    BinaryOperator *SubOneOp;
    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
      return false;
    ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
    if (!Dec ||
        !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubOneOp->getOpcode() == Instruction::Add &&
           Dec->isMinusOne()))) {
      return false;
    }
  }
    PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;
    {
    CountInst = nullptr;
    for (Instruction &Inst : llvm::make_range(
             LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
      if (Inst.getOpcode() != Instruction::Add)
        continue;
      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;
      PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
      if (!Phi)
        continue;
            bool LiveOutLoop = false;
      for (User *U : Inst.users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true;
          break;
        }
      }
      if (LiveOutLoop) {
        CountInst = &Inst;
        CountPhi = Phi;
        break;
      }
    }
    if (!CountInst)
      return false;
  }
      {
    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;
    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }
  return true;
}
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
                                      Intrinsic::ID &IntrinID, Value *&InitX,
                                      Instruction *&CntInst, PHINode *&CntPhi,
                                      Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;
  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());
    if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;
    if (!DefX || !DefX->isShift())
    return false;
  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
                                                     Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);
    PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;
  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
      if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
    return false;
                for (Instruction &Inst : llvm::make_range(
           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
    if (Inst.getOpcode() != Instruction::Add)
      continue;
    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
      continue;
    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
    if (!Phi)
      continue;
    CntInst = &Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;
  return true;
}
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
    if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;
  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;
      size_t IdiomCanonicalSize = 6;
  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
                                 CntInst, CntPhi, DefX))
    return false;
  bool IsCntPhiUsedOutsideLoop = false;
  for (User *U : CntPhi->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntPhiUsedOutsideLoop = true;
      break;
    }
  bool IsCntInstUsedOutsideLoop = false;
  for (User *U : CntInst->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntInstUsedOutsideLoop = true;
      break;
    }
      if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
    return false;
        bool ZeroCheck = false;
      BasicBlock *PH = CurLoop->getLoopPreheader();
          if (!IsCntPhiUsedOutsideLoop) {
    auto *PreCondBB = PH->getSinglePredecessor();
    if (!PreCondBB)
      return false;
    auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    if (!PreCondBI)
      return false;
    if (matchCondition(PreCondBI, PH) != InitX)
      return false;
    ZeroCheck = true;
  }
    
              
  const Value *Args[] = {InitX,
                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
    auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  uint32_t HeaderSize =
      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
  InstructionCost Cost =
    TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  if (HeaderSize != IdiomCanonicalSize &&
      Cost > TargetTransformInfo::TCC_Basic)
    return false;
  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                           DefX->getDebugLoc(), ZeroCheck,
                           IsCntPhiUsedOutsideLoop);
  return true;
}
bool LoopIdiomRecognize::recognizePopcount() {
  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false;
        
    if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;
  BasicBlock *LoopBody = *(CurLoop->block_begin());
  if (LoopBody->size() >= 20) {
        return false;
  }
    BasicBlock *PH = CurLoop->getLoopPreheader();
  if (!PH || &PH->front() != PH->getTerminator())
    return false;
  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  if (!EntryBI || EntryBI->isConditional())
    return false;
      auto *PreCondBB = PH->getSinglePredecessor();
  if (!PreCondBB)
    return false;
  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  if (!PreCondBI || PreCondBI->isUnconditional())
    return false;
  Instruction *CntInst;
  PHINode *CntPhi;
  Value *Val;
  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
    return false;
  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  return true;
}
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                       const DebugLoc &DL) {
  Value *Ops[] = {Val};
  Type *Tys[] = {Val->getType()};
  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);
  return CI;
}
static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                    const DebugLoc &DL, bool ZeroCheck,
                                    Intrinsic::ID IID) {
  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
  Type *Tys[] = {Val->getType()};
  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);
  return CI;
}
void LoopIdiomRecognize::transformLoopToCountable(
    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
    IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);
              Value *InitXNext;
  if (IsCntPhiUsedOutsideLoop) {
    if (DefX->getOpcode() == Instruction::AShr)
      InitXNext = Builder.CreateAShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::LShr)
      InitXNext = Builder.CreateLShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::Shl)       InitXNext = Builder.CreateShl(InitX, 1);
    else
      llvm_unreachable("Unexpected opcode!");
  } else
    InitXNext = InitX;
  Value *Count =
      createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  Type *CountTy = Count->getType();
  Count = Builder.CreateSub(
      ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  Value *NewCount = Count;
  if (IsCntPhiUsedOutsideLoop)
    Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
            ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero())
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  } else {
            NewCount = Builder.CreateSub(CntInitVal, NewCount);
  }
                  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
      TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);
  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
      if (IsCntPhiUsedOutsideLoop)
    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  else
    CntInst->replaceUsesOutsideBlock(NewCount, Body);
      SE->forgetLoop(CurLoop);
}
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
                                                 Instruction *CntInst,
                                                 PHINode *CntPhi, Value *Var) {
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  const DebugLoc &DL = CntInst->getDebugLoc();
      
    IRBuilder<> Builder(PreCondBr);
  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  {
    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
    NewCount = PopCntZext =
        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
    if (NewCount != PopCnt)
      (cast<Instruction>(NewCount))->setDebugLoc(DL);
        TripCnt = NewCount;
        Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero()) {
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
      (cast<Instruction>(NewCount))->setDebugLoc(DL);
    }
  }
          {
    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
    Value *Opnd0 = PopCntZext;
    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
    if (PreCond->getOperand(0) != Var)
      std::swap(Opnd0, Opnd1);
    ICmpInst *NewPreCond = cast<ICmpInst>(
        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
    PreCondBr->setCondition(NewPreCond);
    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
  }
                                          BasicBlock *Body = *(CurLoop->block_begin());
  {
    auto *LbBr = cast<BranchInst>(Body->getTerminator());
    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
    Type *Ty = TripCnt->getType();
    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
    Builder.SetInsertPoint(LbCond);
    Instruction *TcDec = cast<Instruction>(
        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                          "tcdec", false, true));
    TcPhi->addIncoming(TripCnt, PreHead);
    TcPhi->addIncoming(TcDec, Body);
    CmpInst::Predicate Pred =
        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
    LbCond->setPredicate(Pred);
    LbCond->setOperand(0, TcDec);
    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
  }
        CntInst->replaceUsesOutsideBlock(NewCount, Body);
      SE->forgetLoop(CurLoop);
}
template <typename SubPattern_t> struct match_LoopInvariant {
  SubPattern_t SubPattern;
  const Loop *L;
  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
      : SubPattern(SP), L(L) {}
  template <typename ITy> bool match(ITy *V) {
    return L->isLoopInvariant(V) && SubPattern.match(V);
  }
};
template <typename Ty>
inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
  return match_LoopInvariant<Ty>(M, L);
}
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
                                         Value *&BitMask, Value *&BitPos,
                                         Value *&CurrX, Instruction *&NextX) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-bittest idiom detection.\n");
    if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }
  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");
  using namespace PatternMatch;
  
  ICmpInst::Predicate Pred;
  Value *CmpLHS, *CmpRHS;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }
  
  auto MatchVariableBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS,
                 m_c_And(m_Value(CurrX),
                         m_CombineAnd(
                             m_Value(BitMask),
                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
                                             CurLoop))));
  };
  auto MatchConstantBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS, m_And(m_Value(CurrX),
                               m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  };
  auto MatchDecomposableConstantBitMask = [&]() {
    APInt Mask;
    return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
           ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
           (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
           (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
  };
  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
      !MatchDecomposableConstantBitMask()) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
    return false;
  }
    auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }
  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  NextX =
      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
  assert(CurLoop->isLoopInvariant(BaseX) &&
         "Expected BaseX to be avaliable in the preheader!");
  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
        LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }
  
  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");
    if (Pred != ICmpInst::Predicate::ICMP_EQ) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }
      if (TrueBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }
    return true;
}
bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  bool MadeChange = false;
  Value *X, *BitMask, *BitPos, *XCurr;
  Instruction *XNext;
  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
                                    XNext)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-bittest idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
    
  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");
  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");
  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
  Intrinsic::ID IntrID = Intrinsic::ctlz;
  Type *Ty = X->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;
        IntrinsicCostAttributes Attrs(
      IntrID, Ty, {UndefValue::get(Ty), Builder.getTrue()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }
  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
      TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
    return MadeChange;
  }
    MadeChange = true;
  
  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
                                        BitPos->getName() + ".lowbitmask");
  Value *Mask =
      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {XMasked, Builder.getTrue()},
      nullptr, XMasked->getName() + ".numleadingzeros");
  Value *XMaskedNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
      XMasked->getName() + ".numactivebits", true,
      Bitwidth != 2);
  Value *XMaskedLeadingOnePos =
      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
                        XMasked->getName() + ".leadingonepos", false,
                        Bitwidth > 2);
  Value *LoopBackedgeTakenCount = Builder.CreateSub(
      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
      true, true);
      Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", true,
                        Bitwidth != 2);
  
      Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, true);
  Value *NewXNext;
          if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else {
                NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
  }
  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, true);
    
  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
  
    Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
      Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        true, Bitwidth != 2);
    auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();
    IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);
    
  SE->forgetLoop(CurLoop);
  
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
  ++NumShiftUntilBitTest;
  return MadeChange;
}
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
                                      Instruction *&ValShiftedIsZero,
                                      Intrinsic::ID &IntrinID, Instruction *&IV,
                                      Value *&Start, Value *&Val,
                                      const SCEV *&ExtraOffsetExpr,
                                      bool &InvertedCond) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-zero idiom detection.\n");
    if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }
  Instruction *ValShifted, *NBits, *IVNext;
  Value *ExtraOffset;
  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");
  using namespace PatternMatch;
  
  ICmpInst::Predicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
                  m_BasicBlock(FalseBB))) ||
      !match(ValShiftedIsZero,
             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
      !ICmpInst::isEquality(Pred)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }
      if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
                                 m_Instruction(NBits)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
    return false;
  }
  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
                                                         : Intrinsic::ctlz;
  
  if (match(NBits, m_c_Add(m_Instruction(IV),
                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
  else if (match(NBits,
                 m_Sub(m_Instruction(IV),
                       m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
           NBits->hasNoSignedWrap())
    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
  else {
    IV = NBits;
    ExtraOffsetExpr = SE->getZero(NBits->getType());
  }
    auto *IVPN = dyn_cast<PHINode>(IV);
  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }
  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }
  
  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");
    InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
  if (InvertedCond) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }
      if (FalseBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }
              if (ValShifted->getOpcode() == Instruction::AShr &&
      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
    return false;
  }
    return true;
}
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  bool MadeChange = false;
  Instruction *ValShiftedIsZero;
  Intrinsic::ID IntrID;
  Instruction *IV;
  Value *Start, *Val;
  const SCEV *ExtraOffsetExpr;
  bool InvertedCond;
  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-zero idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
    
  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");
  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");
  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
  Type *Ty = Val->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;
        IntrinsicCostAttributes Attrs(
      IntrID, Ty, {UndefValue::get(Ty), Builder.getFalse()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }
    MadeChange = true;
  bool OffsetIsZero = false;
  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
    OffsetIsZero = ExtraOffsetExprC->isZero();
  
  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {Val, Builder.getFalse()},
      nullptr, Val->getName() + ".numleadingzeros");
  Value *ValNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
      Val->getName() + ".numactivebits", true,
      Bitwidth != 2);
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  Expander.setInsertPoint(&*Builder.GetInsertPoint());
  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
      OffsetIsZero, true);
  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
                                           {ValNumActiveBitsOffset, Start},
                                           nullptr, "iv.final");
  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
      OffsetIsZero, true));
  
    Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", true,
                        Bitwidth != 2);
    
  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
  
    Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
    Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
  auto *CIVNext =
      Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
                        true, Bitwidth != 2);
    auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
                                        CurLoop->getName() + ".ivcheck");
  auto *NewIVCheck = CIVCheck;
  if (InvertedCond) {
    NewIVCheck = Builder.CreateNot(CIVCheck);
    NewIVCheck->takeName(ValShiftedIsZero);
  }
    auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", false,
                                     true);   IVDePHId->takeName(IV);
    Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();
    CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  CIV->addIncoming(CIVNext, LoopHeaderBB);
    
  SE->forgetLoop(CurLoop);
    IV->replaceAllUsesWith(IVDePHId);
  IV->eraseFromParent();
  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
  ValShiftedIsZero->eraseFromParent();
  
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
  ++NumShiftUntilZero;
  return MadeChange;
}