#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
// Hidden command-line knobs used by the cost and unrolling heuristics below.

// Unroll threshold applied when a loop addresses private memory (default 2700).
static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
cl::init(2700), cl::Hidden);
// Unroll threshold applied when a loop addresses local memory (default 1000).
static cl::opt<unsigned> UnrollThresholdLocal(
"amdgpu-unroll-threshold-local",
cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
cl::init(1000), cl::Hidden);
// Amount added to the unroll threshold per qualifying conditional branch
// inside the loop (default 200).
static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);
// Value assigned to UP.Runtime when a loop qualifies through local memory use
// (default true).
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
cl::init(true), cl::Hidden);
// When set, useGPUDivergenceAnalysis() returns false (default false).
static cl::opt<bool> UseLegacyDA(
"amdgpu-use-legacy-divergence-analysis",
cl::desc("Enable legacy divergence analysis for AMDGPU"),
cl::init(false), cl::Hidden);
// Blocks of an innermost loop smaller than this raise
// UP.MaxIterationsCountToAnalyze (default 32).
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
"amdgpu-unroll-max-block-to-analyze",
cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
cl::init(32), cl::Hidden);
// Value returned by adjustInliningThreshold() when a call passes a pointer to
// a static alloca (default 4000).
static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
cl::Hidden, cl::init(4000),
cl::desc("Cost of alloca argument"));
// Upper bound on the summed alloca size for which ArgAllocaCost is still
// applied (default 256).
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
cl::init(256),
cl::desc("Maximum alloca size to use for inline cost"));
// Basic-block budget checked by areInlineCompatible(); a value of 0 disables
// the check (default 1100).
static cl::opt<size_t> InlineMaxBB(
"amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
cl::desc("Maximum number of BBs allowed in a function after inlining"
" (compile time constraint)"));
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
const Instruction *I = dyn_cast<Instruction>(Cond);
if (!I)
return false;
for (const Value *V : I->operand_values()) {
if (!L->contains(I))
continue;
if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
return SubLoop->contains(PHI); }))
return true;
} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
return true;
}
return false;
}
// Builds the common AMDGPU TTI implementation for function F: the base class
// receives the module's data layout, and the subtarget / target lowering
// pointers are taken from the target machine's subtarget for F.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
TargetTriple(TM->getTargetTriple()),
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()) {}
// Fills in the unrolling preferences UP for loop L.
//
// Starts from the function's "amdgpu-unroll-threshold" attribute (default
// 300), optionally overridden by "amdgpu.loop.unroll.threshold" loop metadata,
// then scans the blocks that belong directly to L (not to a sub-loop) and
// raises UP.Threshold for qualifying conditional branches and for GEPs into
// private / local / region address spaces. Returns as soon as the threshold
// reaches the largest boost allowed (MaxBoost). SE and ORE are not used here.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
const Function &F = *L->getHeader()->getParent();
UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;
// NOTE(review): presumably models extra back-edge branch overhead; the
// rationale for the value 3 is not visible here.
UP.BEInsns += 3;
// Largest static alloca (in allocation-size units returned by the data
// layout) still considered for the private-memory boost.
// NOTE(review): looks like (256 - 16) registers of 4 bytes each; confirm.
const unsigned MaxAlloca = (256 - 16) * 4;
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
// Loop metadata, when present with exactly two operands and a constant
// integer second operand, replaces the threshold, is mirrored into the
// partial threshold, and caps both address-space thresholds.
if (MDNode *LoopUnrollThreshold =
findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
if (LoopUnrollThreshold->getNumOperands() == 2) {
ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
LoopUnrollThreshold->getOperand(1));
if (MetaThresholdValue) {
UP.Threshold = MetaThresholdValue->getSExtValue();
UP.PartialThreshold = UP.Threshold;
ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
}
}
}
// No boost below may push the threshold beyond this value.
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
// Counts local/region GEPs that passed the threshold filter in this block.
unsigned LocalGEPsSeen = 0;
// Skip blocks that are part of a sub-loop.
if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
return SubLoop->contains(BB); }))
continue;
for (const Instruction &I : *BB) {
// Conditional branches: add UnrollThresholdIf when the condition depends
// on a PHI local to the loop, unless a successor inside the loop is itself
// a loop-exiting block.
if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
if (UP.Threshold < MaxBoost && Br->isConditional()) {
BasicBlock *Succ0 = Br->getSuccessor(0);
BasicBlock *Succ1 = Br->getSuccessor(1);
if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
(L->contains(Succ1) && L->isLoopExiting(Succ1)))
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
<< " for loop:\n"
<< *L << " due to " << *Br << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
}
continue;
}
// Everything else is only interesting if it is a GEP.
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
if (!GEP)
continue;
// Pick the candidate threshold from the GEP's address space; other address
// spaces are ignored.
unsigned AS = GEP->getAddressSpace();
unsigned Threshold = 0;
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
Threshold = ThresholdPrivate;
else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
Threshold = ThresholdLocal;
else
continue;
// Nothing to gain if the current threshold is already at least as large.
if (UP.Threshold >= Threshold)
continue;
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
// Private GEPs qualify only when based on a static alloca whose allocation
// size does not exceed MaxAlloca (unsized types count as size 0).
const Value *Ptr = GEP->getPointerOperand();
const AllocaInst *Alloca =
dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
if (!Alloca || !Alloca->isStaticAlloca())
continue;
Type *Ty = Alloca->getAllocatedType();
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;
} else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
AS == AMDGPUAS::REGION_ADDRESS) {
// Local/region GEPs qualify only for the first such GEP in the block, at
// loop depth <= 2, and when the pointer operand is a global variable or a
// function argument.
LocalGEPsSeen++;
if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
(!isa<GlobalVariable>(GEP->getPointerOperand()) &&
!isa<Argument>(GEP->getPointerOperand())))
continue;
LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
<< *L << " due to LDS use.\n");
UP.Runtime = UnrollRuntimeLocal;
}
// Require at least one GEP operand that is an instruction, is not loop
// invariant, and is not contained in a sub-loop.
bool HasLoopDef = false;
for (const Value *Op : GEP->operands()) {
const Instruction *Inst = dyn_cast<Instruction>(Op);
if (!Inst || L->isLoopInvariant(Op))
continue;
if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
return SubLoop->contains(Inst); }))
continue;
HasLoopDef = true;
break;
}
if (!HasLoopDef)
continue;
// Apply the address-space threshold and stop once the cap is reached.
UP.Threshold = Threshold;
LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
<< " for loop:\n"
<< *L << " due to " << *GEP << '\n');
if (UP.Threshold >= MaxBoost)
return;
}
// For innermost loops, a block smaller than UnrollMaxBlockToAnalyze raises
// the number of iterations the unroller is asked to analyze.
if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
UP.MaxIterationsCountToAnalyze = 32;
}
}
// Peeling preferences for loop L; defers entirely to the base implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
}
// Subtarget features that are masked out of both the caller's and the callee's
// feature bits before areInlineCompatible() compares them, so a difference in
// any of these features alone does not block inlining.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
AMDGPU::FeatureUnalignedAccessMode,
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
AMDGPU::FeatureTrapHandler,
AMDGPU::FeatureSRAMECC,
AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
// Builds the GCN TTI implementation for function F. Besides the subtarget and
// target lowering pointers it creates the shared CommonTTI helper, records
// whether F uses a graphics calling convention, and caches the FP32 and
// FP64/FP16 denormal settings taken from F's mode-register defaults.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
AMDGPU::SIModeRegisterDefaults Mode(F);
HasFP32Denormals = Mode.allFP32Denormals();
HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}
// Reports a fixed register count of 4 regardless of RCID.
// NOTE(review): this is far below the physical register file size, so it is
// presumably a deliberate heuristic value for vectorizer/interleaver
// pressure estimates; confirm against callers.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
return 4;
}
/// Returns the register width in bits for register kind \p K: 32 for scalars,
/// 64 or 32 for fixed-width vectors depending on packed FP32 support, and a
/// scalable size of 0 for scalable vectors.
TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  case TargetTransformInfo::RGK_FixedWidthVector: {
    const bool HasPackedFP32 = ST->hasPackedFP32Ops();
    return TypeSize::getFixed(HasPackedFP32 ? 64 : 32);
  }
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  }
  llvm_unreachable("Unsupported register kind");
}
// Minimum vector register width reported to the vectorizers: always 32 bits.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
/// Returns the maximum vectorization factor for elements of \p ElemWidth bits.
/// Loads and stores may fill 128 bits; other opcodes get 2 lanes only when the
/// subtarget has matching packed instructions (16-bit insts or packed FP32),
/// otherwise 1.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  const bool IsMemoryOp =
      Opcode == Instruction::Load || Opcode == Instruction::Store;
  if (IsMemoryOp)
    return 32 * 4 / ElemWidth;

  if (ElemWidth == 16 && ST->has16BitInsts())
    return 2;
  if (ElemWidth == 32 && ST->hasPackedFP32Ops())
    return 2;
  return 1;
}
/// Returns the vector factor to use for a load chain. When VF * LoadSize
/// exceeds 128 and the vector's scalar type is narrower than 32 bits the
/// factor is clamped to 128 / LoadSize; otherwise \p VF is returned as is.
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  const bool TooWide = VF * LoadSize > 128;
  if (TooWide && VecTy->getScalarSizeInBits() < 32)
    return 128 / LoadSize;
  return VF;
}
/// Returns the vector factor to use for a store chain: \p VF, clamped to
/// 128 / StoreSize whenever VF * StoreSize exceeds 128.
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  const bool TooWide = VF * StoreSize > 128;
  return TooWide ? 128 / StoreSize : VF;
}
/// Returns the widest load/store vector width, in bits, for \p AddrSpace:
/// 512 for global, constant, 32-bit constant and buffer-fat-pointer spaces,
/// 8 * the subtarget's maximum private element size for private memory, and
/// 128 for every other address space.
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  switch (AddrSpace) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_FAT_POINTER:
    return 512;
  case AMDGPUAS::PRIVATE_ADDRESS:
    return 8 * ST->getMaxPrivateElementSize();
  default:
    return 128;
  }
}
/// Returns whether a memory access chain may be vectorized. Only private
/// memory is restricted: the chain must be at least 4-byte aligned (or the
/// subtarget must allow unaligned scratch access) and must not exceed the
/// maximum private element size.
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  if (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
    return true;

  const bool AlignmentOk = Alignment >= 4 || ST->hasUnalignedScratchAccess();
  return AlignmentOk && ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
// Load chains follow the same legality rule as any memory chain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
// Store chains follow the same legality rule as any memory chain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
/// Chooses the element type used by the main loop of a lowered memcpy.
///
/// An atomic element size yields an integer of exactly that many bytes. A
/// minimum alignment of 2 yields i16. Otherwise the result is <2 x i32> when
/// either side is in the local or region address space, and <4 x i32> in all
/// remaining cases.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    Optional<uint32_t> AtomicElementSize) const {
  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  if (std::min(SrcAlign, DestAlign) == 2)
    return Type::getInt16Ty(Context);

  auto IsLocalOrRegion = [](unsigned AS) {
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
  };
  const unsigned NumDwords =
      (IsLocalOrRegion(SrcAddrSpace) || IsLocalOrRegion(DestAddrSpace)) ? 2 : 4;
  return FixedVectorType::get(Type::getInt32Ty(Context), NumDwords);
}
/// Appends to \p OpsOut the sequence of types used to copy the
/// \p RemainingBytes (< 16) left over after the main memcpy loop.
///
/// For atomic copies the base implementation chooses the types and nothing
/// else is appended. Otherwise the remainder is split greedily into i64 and
/// i32 pieces (skipped when the minimum alignment is exactly 2), then i16,
/// then i8.
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign,
    Optional<uint32_t> AtomicCpySize) const {
  assert(RemainingBytes < 16);

  if (AtomicCpySize) {
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);
    // The base class has already covered the whole remainder. Falling through
    // would append a second, non-atomic set of types for the same bytes.
    return;
  }

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // With a minimum alignment of exactly 2, only 16-bit and 8-bit pieces are
  // emitted.
  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}
/// Returns the maximum interleave factor: 1 when the loop is not vectorized
/// (VF == 1), 8 otherwise.
unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return VF == 1 ? 1 : 8;
}
/// Describes the memory behavior of the target atomic intrinsics listed below.
///
/// On success fills \p Info with the pointer (argument 0), the atomic ordering
/// (argument 2), read+write access and volatility (argument 4), and returns
/// true. Returns false for any other intrinsic, when the ordering or volatile
/// argument is not a constant integer, or when the ordering value is larger
/// than SequentiallyConsistent.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    break;
  default:
    return false;
  }

  auto *OrderingArg = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
  auto *VolatileArg = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
  if (!OrderingArg || !VolatileArg)
    return false;

  const unsigned RawOrdering = OrderingArg->getZExtValue();
  const unsigned MaxOrdering =
      static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent);
  if (RawOrdering > MaxOrdering)
    return false;

  Info.PtrVal = Inst->getArgOperand(0);
  Info.Ordering = static_cast<AtomicOrdering>(RawOrdering);
  Info.ReadMem = true;
  Info.WriteMem = true;
  Info.IsVolatile = !VolatileArg->isZero();
  return true;
}
// Estimates the cost of an arithmetic instruction with IR opcode Opcode on
// type Ty.
//
// The type is legalized first; costs are then expressed as
// (legalization factor) * (number of legal elements) * (per-op rate cost),
// with the element count halved (rounding up) where the subtarget has packed
// 16-bit or packed FP32 instructions. Opcodes and types not handled here fall
// back to the base implementation.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// LT.first is the legalization cost factor, LT.second the legalized type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// Element count and scalar element type of the legalized type.
unsigned NElts = LT.second.isVector() ?
LT.second.getVectorNumElements() : 1;
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
switch (ISD) {
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
// 64-bit shifts use the 64-bit rate; 16-bit shifts are counted per pair of
// elements when 16-bit instructions exist.
if (SLT == MVT::i64)
return get64BitInstrCost(CostKind) * LT.first * NElts;
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
return getFullRateInstrCost() * LT.first * NElts;
case ISD::ADD:
case ISD::SUB:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
// 64-bit integer ops are costed as two full-rate operations.
if (SLT == MVT::i64) {
return 2 * getFullRateInstrCost() * LT.first * NElts;
}
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
return LT.first * NElts * getFullRateInstrCost();
case ISD::MUL: {
const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
// 64-bit multiply: four quarter-rate plus four full-rate operations.
if (SLT == MVT::i64) {
const int FullRateCost = getFullRateInstrCost();
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
}
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
return QuarterRateCost * NElts * LT.first;
}
case ISD::FMUL:
// An fmul whose single user is an fadd/fsub is reported as free when the
// pair can be fused: f32 with mad/mac instructions and FP32 denormals off,
// f16 with 16-bit instructions and FP64/FP16 denormals off, or when fusion
// is permitted by target options or by contract flags on both instructions.
if (CxtI && CxtI->hasOneUse())
if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
if (OPC == ISD::FADD || OPC == ISD::FSUB) {
if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
return TargetTransformInfo::TCC_Free;
if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
return TargetTransformInfo::TCC_Free;
const TargetOptions &Options = TLI->getTargetMachine().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath ||
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
return TargetTransformInfo::TCC_Free;
}
}
// Not fusable: cost the fmul like fadd/fsub below.
LLVM_FALLTHROUGH;
case ISD::FADD:
case ISD::FSUB:
if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
NElts = (NElts + 1) / 2;
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
if (SLT == MVT::f32 || SLT == MVT::f16)
return LT.first * NElts * getFullRateInstrCost();
// Other scalar types fall through to the base implementation.
break;
case ISD::FDIV:
case ISD::FREM:
// f64 division: fixed mix of 64-bit, quarter-rate and half-rate operations,
// plus three full-rate operations when the div_scale condition output is
// not usable on this subtarget.
if (SLT == MVT::f64) {
int Cost = 7 * get64BitInstrCost(CostKind) +
getQuarterRateInstrCost(CostKind) +
3 * getHalfRateInstrCost(CostKind);
if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
return LT.first * Cost * NElts;
}
// Numerator is the constant 1.0 (a reciprocal): a single quarter-rate
// operation for f32 without denormals or f16 with 16-bit instructions.
if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
}
}
// General f16 division with 16-bit instruction support.
if (SLT == MVT::f16 && ST->has16BitInsts()) {
int Cost =
4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
return LT.first * Cost * NElts;
}
// General f32 division, or f16 without 16-bit instructions (which is
// charged four extra full-rate operations).
if (SLT == MVT::f32 || SLT == MVT::f16) {
int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
1 * getQuarterRateInstrCost(CostKind);
// Two more full-rate operations when FP32 denormals are not enabled.
// NOTE(review): presumably the cost of switching FP mode around the
// expansion; confirm.
if (!HasFP32Denormals) {
Cost += 2 * getFullRateInstrCost();
}
return LT.first * NElts * Cost;
}
break;
case ISD::FNEG:
// Free when the target folds fneg for this type, else one per element.
return TLI->isFNegFree(SLT) ? 0 : NElts;
default:
break;
}
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
/// Returns true for the intrinsics that getIntrinsicInstrCost costs itself
/// instead of deferring to the base implementation: fma, round and the
/// saturating add/sub family.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  return ID == Intrinsic::fma || ID == Intrinsic::round ||
         ID == Intrinsic::uadd_sat || ID == Intrinsic::usub_sat ||
         ID == Intrinsic::sadd_sat || ID == Intrinsic::ssub_sat;
}
/// Estimates the cost of the intrinsic call described by \p ICA.
///
/// fabs is free. Intrinsics without a packed-vector benefit are delegated to
/// the base implementation. For the rest the return type is legalized and the
/// cost is (legalization factor) * (element count) * (instruction rate), with
/// the element count halved (rounding up) for f16 with 16-bit instructions or
/// f32 with packed FP32 operations.
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();
  // LT.first is the legalization cost factor, LT.second the legalized type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // Quarter rate unless a case below selects something else.
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    // Saturating ops legalized to v2i16 or v4i16 are counted as one element.
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    // The lambda reads the legalized type, so LT is captured by reference.
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}
/// Estimates the cost of a control-flow instruction.
///
/// Branch, switch and return get fixed costs that are lower for the
/// size-oriented cost kinds (TCK_CodeSize, TCK_SizeAndLatency); any other
/// opcode is handled by the base implementation. \p I may be null, in which
/// case a branch is treated as conditional and a switch as having 4 targets.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");

  const bool IsSizeCost =
      CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency;
  const int CondBranchCost = IsSizeCost ? 5 : 7;

  if (Opcode == Instruction::Br) {
    const auto *Branch = dyn_cast_or_null<BranchInst>(I);
    if (!Branch || !Branch->isUnconditional())
      return CondBranchCost;
    return IsSizeCost ? 1 : 4;
  }

  if (Opcode == Instruction::Switch) {
    const auto *Switch = dyn_cast_or_null<SwitchInst>(I);
    // One target per case plus the default; each costs a conditional branch
    // plus one more unit.
    const unsigned NumTargets = Switch ? (Switch->getNumCases() + 1) : 4;
    return NumTargets * (CondBranchCost + 1);
  }

  if (Opcode == Instruction::Ret)
    return IsSizeCost ? 1 : 10;

  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
/// Estimates the cost of an arithmetic vector reduction.
///
/// Ordered reductions, subtargets without VOP3P instructions and element types
/// that are not 16 bits wide are delegated to the base implementation;
/// otherwise the cost is the legalization factor times the full-rate cost.
InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  const EVT ValueTy = TLI->getValueType(DL, Ty);
  const bool IsPacked16 =
      ST->hasVOP3PInsts() && ValueTy.getScalarSizeInBits() == 16;
  if (!IsPacked16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  const std::pair<InstructionCost, MVT> Legal =
      TLI->getTypeLegalizationCost(DL, Ty);
  return Legal.first * getFullRateInstrCost();
}
/// Estimates the cost of a min/max vector reduction.
///
/// Subtargets without VOP3P instructions and element types that are not
/// 16 bits wide are delegated to the base implementation; otherwise the cost
/// is the legalization factor times the half-rate cost.
InstructionCost
GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  const EVT ValueTy = TLI->getValueType(DL, Ty);
  const bool IsPacked16 =
      ST->hasVOP3PInsts() && ValueTy.getScalarSizeInBits() == 16;
  if (!IsPacked16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  const std::pair<InstructionCost, MVT> Legal =
      TLI->getTypeLegalizationCost(DL, Ty);
  return Legal.first * getHalfRateInstrCost(CostKind);
}
/// Estimates the cost of inserting or extracting a vector element.
///
/// For elements of 32 bits or more the access is free unless the index is
/// unknown (~0u), which costs 2. For narrower elements only element 0 of a
/// 16-bit vector on a subtarget with 16-bit instructions is free; everything
/// else, including other opcodes, uses the base implementation.
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  const bool IsElementAccess = Opcode == Instruction::ExtractElement ||
                               Opcode == Instruction::InsertElement;
  if (!IsElementAccess)
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);

  const unsigned EltBits =
      DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
  if (EltBits >= 32)
    return Index == ~0u ? 2 : 0;

  if (EltBits == 16 && Index == 0 && ST->has16BitInsts())
    return 0;
  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
// Decides whether the result of inline-asm call CI should be treated as
// divergent. Indices optionally selects a single output (as used by an
// extractvalue); when empty, every output is examined. Returns true if any
// examined output constraint does not resolve to an SGPR register class.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
const CallInst *CI, ArrayRef<unsigned> Indices) const {
// Nested extract indices are not analyzed; conservatively report divergent.
if (Indices.size() > 1)
return true;
const DataLayout &DL = CI->getModule()->getDataLayout();
const SIRegisterInfo *TRI = ST->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
// -1 means "no particular output requested".
const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
int OutputIdx = 0;
for (auto &TC : TargetConstraints) {
if (TC.Type != InlineAsm::isOutput)
continue;
// When a specific output is requested, skip the others. OutputIdx is only
// advanced when this comparison is evaluated, i.e. when an output was
// requested.
if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
continue;
TLI->ComputeConstraintToUse(TC, SDValue());
const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
TRI, TC.ConstraintCode, TC.ConstraintVT).second;
// An unresolved register class is treated the same as a non-SGPR class.
if (!RC || !TRI->isSGPRClass(RC))
return true;
}
return false;
}
// The GPU divergence analysis is used unless the
// -amdgpu-use-legacy-divergence-analysis option is set.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
return !UseLegacyDA;
}
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !AMDGPU::isArgPassedInSGPR(A);
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
if (CI->isInlineAsm())
return isInlineAsmSourceOfDivergence(CI);
return true;
}
if (isa<InvokeInst>(V))
return true;
return false;
}
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
default:
return false;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_icmp:
case Intrinsic::amdgcn_fcmp:
case Intrinsic::amdgcn_ballot:
case Intrinsic::amdgcn_if_break:
return true;
}
}
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
if (CI->isInlineAsm())
return !isInlineAsmSourceOfDivergence(CI);
return false;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
if (!ExtValue)
return false;
const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
if (!CI)
return false;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
switch (Intrinsic->getIntrinsicID()) {
default:
return false;
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
ArrayRef<unsigned> Indices = ExtValue->getIndices();
return Indices.size() == 1 && Indices[0] == 1;
}
}
}
if (CI->isInlineAsm())
return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
return false;
}
/// For the intrinsics listed below, records operand index 0 in \p OpIndexes
/// and returns true; returns false, leaving \p OpIndexes untouched, for any
/// other intrinsic.
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  const bool HasPointerArg0 =
      IID == Intrinsic::amdgcn_atomic_inc ||
      IID == Intrinsic::amdgcn_atomic_dec ||
      IID == Intrinsic::amdgcn_ds_fadd || IID == Intrinsic::amdgcn_ds_fmin ||
      IID == Intrinsic::amdgcn_ds_fmax || IID == Intrinsic::amdgcn_is_shared ||
      IID == Intrinsic::amdgcn_is_private;
  if (!HasPointerArg0)
    return false;

  OpIndexes.push_back(0);
  return true;
}
// Rewrites intrinsic call II so that it uses NewV (a pointer in a different
// address space) in place of OldV. Returns the value that replaces the
// intrinsic's result, or nullptr when no rewrite is performed.
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *OldV,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
// Volatile operations (argument 4 non-zero) are left untouched.
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile->isZero())
return nullptr;
// Re-declare the intrinsic for the new pointer type and update the call in
// place: operand 0 becomes NewV and the callee becomes the new declaration.
Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
Function *NewDecl =
Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
return II;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
// With the address space of NewV known, the query folds to a constant:
// true when it equals the space the intrinsic tests for, false otherwise.
unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
unsigned NewAS = NewV->getType()->getPointerAddressSpace();
LLVMContext &Ctx = NewV->getType()->getContext();
ConstantInt *NewVal = (TrueAS == NewAS) ?
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
return NewVal;
}
case Intrinsic::ptrmask: {
unsigned OldAS = OldV->getType()->getPointerAddressSpace();
unsigned NewAS = NewV->getType()->getPointerAddressSpace();
Value *MaskOp = II->getArgOperand(1);
Type *MaskTy = MaskOp->getType();
bool DoTruncate = false;
const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
// When the address space cast is not a no-op, only a 64-bit to 32-bit
// pointer change is handled, and only if the upper 32 bits of the mask are
// known to be all ones so that truncating the mask loses nothing.
if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
if (DL.getPointerSizeInBits(OldAS) != 64 ||
DL.getPointerSizeInBits(NewAS) != 32)
return nullptr;
KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
if (Known.countMinLeadingOnes() < 32)
return nullptr;
DoTruncate = true;
}
// Emit the replacement ptrmask (and the mask truncation, if needed)
// immediately before II.
IRBuilder<> B(II);
if (DoTruncate) {
MaskTy = B.getInt32Ty();
MaskOp = B.CreateTrunc(MaskOp, MaskTy);
}
return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
{NewV, MaskOp});
}
default:
return nullptr;
}
}
/// Estimates the cost of a vector shuffle.
///
/// On subtargets with VOP3P instructions, broadcast, reverse and single-source
/// permute shuffles of two-element vectors with 16-bit elements are free.
/// Everything else is costed by the base implementation, using the shuffle
/// kind refined from \p Mask.
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);

  const bool IsPacked2x16 =
      ST->hasVOP3PInsts() &&
      cast<FixedVectorType>(VT)->getNumElements() == 2 &&
      DL.getTypeSizeInBits(VT->getElementType()) == 16;
  const bool IsFreeKind = Kind == TTI::SK_Broadcast ||
                          Kind == TTI::SK_Reverse ||
                          Kind == TTI::SK_PermuteSingleSrc;
  if (IsPacked2x16 && IsFreeKind)
    return 0;

  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const GCNSubtarget *CallerST
= static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
const GCNSubtarget *CalleeST
= static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
const FeatureBitset &CallerBits = CallerST->getFeatureBits();
const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
return false;
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
if (!CallerMode.isInlineCompatible(CalleeMode))
return false;
if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
Callee->hasFnAttribute(Attribute::InlineHint))
return true;
if (InlineMaxBB) {
if (Callee->size() == 1)
return true;
size_t BBSize = Caller->size() + Callee->size() - 1;
return BBSize <= InlineMaxBB;
}
return true;
}
unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
uint64_t AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
for (Value *PtrArg : CB->args()) {
PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
continue;
PtrArg = getUnderlyingObject(PtrArg);
if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
continue;
AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
if (AllocaSize > ArgAllocaCutoff) {
AllocaSize = 0;
break;
}
}
}
if (AllocaSize)
return ArgAllocaCost;
return 0;
}
// Unrolling preferences are computed by the shared CommonTTI helper.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
// Peeling preferences are computed by the shared CommonTTI helper.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
CommonTTI.getPeelingPreferences(L, SE, PP);
}
/// Returns the cost of one 64-bit operation: the full-rate cost when the
/// subtarget runs 64-bit ops at full rate, the half-rate cost when it runs
/// them at half rate, and the quarter-rate cost otherwise.
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  if (ST->hasFullRate64Ops())
    return getFullRateInstrCost();
  if (ST->hasHalfRate64Ops())
    return getHalfRateInstrCost(CostKind);
  return getQuarterRateInstrCost(CostKind);
}