#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;
#define DEBUG_TYPE "x86tti"
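// This file implements the X86-specific InstCombine hooks that simplify and
// canonicalize x86 vector intrinsics.

// Return a constant boolean vector that has true elements in all positions
// where the input constant data vector has an element with the sign bit set.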
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
V = ConstantExpr::getBitCast(V, IntTy);
V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
V);
return V;
}
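// Convert an x86 vector mask to a vector of bools: either by testing the
// sign bit of each element in a constant mask, or by peeking through a
// sign-extension of an existing <N x i1> vector.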
static Value *getBoolVecFromMask(Value *Mask) {
if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
return getNegativeIsTrueBoolVec(ConstantMask);
Value *ExtMask;
if (PatternMatch::match(
Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
ExtMask->getType()->isIntOrIntVectorTy(1))
return ExtMask;
return nullptr;
}
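// Convert an x86 masked load intrinsic into the generic llvm.masked.load
// intrinsic when the mask can be expressed as a vector of bools, so that
// target-independent passes can reason about it.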
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
Value *Ptr = II.getOperand(0);
Value *Mask = II.getOperand(1);
Constant *ZeroVec = Constant::getNullValue(II.getType());
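// Zero Mask - this masked load instruction creates a zero vector.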
if (isa<ConstantAggregateZero>(Mask))
return IC.replaceInstUsesWith(II, ZeroVec);
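// The mask is constant or extended from a bool vector - convert to the LLVM
// masked load intrinsic. The pass-through value of an x86 masked load is a
// zero vector.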
if (Value *BoolMask = getBoolVecFromMask(Mask)) {
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
return IC.replaceInstUsesWith(II, NewMaskedLoad);
}
return nullptr;
}
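// Convert an x86 masked store intrinsic into the generic llvm.masked.store
// intrinsic. Returns true if the original call was erased.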
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
Value *Ptr = II.getOperand(0);
Value *Mask = II.getOperand(1);
Value *Vec = II.getOperand(2);
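// Zero Mask - this masked store instruction does nothing.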
if (isa<ConstantAggregateZero>(Mask)) {
IC.eraseInstFromFunction(II);
return true;
}
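// The SSE2 version is too weird (e.g. unaligned but non-temporal) to do
// anything else at this level.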
if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
return false;
if (Value *BoolMask = getBoolVecFromMask(Mask)) {
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
IC.eraseInstFromFunction(II);
return true;
}
return false;
}
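// Simplify an x86 packed shift where the shift amount is either an i32
// immediate or is taken from the low 64 bits of a vector operand.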
static Value *simplifyX86immShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
bool LogicalShift = false;
bool ShiftLeft = false;
bool IsImm = false;
switch (II.getIntrinsicID()) {
default:
llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
case Intrinsic::x86_avx512_psrai_q_128:
case Intrinsic::x86_avx512_psrai_q_256:
case Intrinsic::x86_avx512_psrai_d_512:
case Intrinsic::x86_avx512_psrai_q_512:
case Intrinsic::x86_avx512_psrai_w_512:
IsImm = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_avx2_psra_d:
case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx512_psra_q_128:
case Intrinsic::x86_avx512_psra_q_256:
case Intrinsic::x86_avx512_psra_d_512:
case Intrinsic::x86_avx512_psra_q_512:
case Intrinsic::x86_avx512_psra_w_512:
LogicalShift = false;
ShiftLeft = false;
break;
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
case Intrinsic::x86_avx512_psrli_d_512:
case Intrinsic::x86_avx512_psrli_q_512:
case Intrinsic::x86_avx512_psrli_w_512:
IsImm = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx512_psrl_d_512:
case Intrinsic::x86_avx512_psrl_q_512:
case Intrinsic::x86_avx512_psrl_w_512:
LogicalShift = true;
ShiftLeft = false;
break;
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
case Intrinsic::x86_avx512_pslli_d_512:
case Intrinsic::x86_avx512_pslli_q_512:
case Intrinsic::x86_avx512_pslli_w_512:
IsImm = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx512_psll_d_512:
case Intrinsic::x86_avx512_psll_q_512:
case Intrinsic::x86_avx512_psll_w_512:
LogicalShift = true;
ShiftLeft = true;
break;
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
Value *Vec = II.getArgOperand(0);
Value *Amt = II.getArgOperand(1);
auto *VT = cast<FixedVectorType>(Vec->getType());
Type *SVT = VT->getElementType();
Type *AmtVT = Amt->getType();
unsigned VWidth = VT->getNumElements();
unsigned BitWidth = SVT->getPrimitiveSizeInBits();
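// If the shift amount is guaranteed to be in-range, replace it with a
// generic IR shift. If it is guaranteed to be out of range, logical shifts
// combine to zero and arithmetic shifts are clamped to (BitWidth - 1).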
if (IsImm) {
assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
KnownBits KnownAmtBits =
llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
Amt = Builder.CreateVectorSplat(VWidth, Amt);
return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
: Builder.CreateLShr(Vec, Amt))
: Builder.CreateAShr(Vec, Amt));
}
if (KnownAmtBits.getMinValue().uge(BitWidth)) {
if (LogicalShift)
return ConstantAggregateZero::get(VT);
Amt = ConstantInt::get(SVT, BitWidth - 1);
return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
}
} else {
assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
cast<VectorType>(AmtVT)->getElementType() == SVT &&
"Unexpected shift-by-scalar type");
unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
KnownBits KnownLowerBits = llvm::computeKnownBits(
Amt, DemandedLower, II.getModule()->getDataLayout());
KnownBits KnownUpperBits = llvm::computeKnownBits(
Amt, DemandedUpper, II.getModule()->getDataLayout());
if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
(DemandedUpper.isZero() || KnownUpperBits.isZero())) {
SmallVector<int, 16> ZeroSplat(VWidth, 0);
Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
: Builder.CreateLShr(Vec, Amt))
: Builder.CreateAShr(Vec, Amt));
}
}
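// Otherwise we can only fold if the shift amount is a constant vector; the
// count is read from the low 64 bits of that operand.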
auto *CDV = dyn_cast<ConstantDataVector>(Amt);
if (!CDV)
return nullptr;
assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
cast<VectorType>(AmtVT)->getElementType() == SVT &&
"Unexpected shift-by-scalar type");
APInt Count(64, 0);
for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
unsigned SubEltIdx = (NumSubElts - 1) - i;
auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
Count <<= BitWidth;
Count |= SubElt->getValue().zextOrTrunc(64);
}
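// A shift by zero is an identity. For out-of-range counts, logical shifts
// produce zero and arithmetic shifts are clamped to (BitWidth - 1).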
if (Count.isZero())
return Vec;
if (Count.uge(BitWidth)) {
if (LogicalShift)
return ConstantAggregateZero::get(VT);
Count = APInt(64, BitWidth - 1);
}
auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
if (ShiftLeft)
return Builder.CreateShl(Vec, ShiftVec);
if (LogicalShift)
return Builder.CreateLShr(Vec, ShiftVec);
return Builder.CreateAShr(Vec, ShiftVec);
}
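// Simplify an x86 per-element (variable) vector shift, where each lane is
// shifted by the corresponding element of the second operand.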
static Value *simplifyX86varShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
bool LogicalShift = false;
bool ShiftLeft = false;
switch (II.getIntrinsicID()) {
default:
llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_avx2_psrav_d:
case Intrinsic::x86_avx2_psrav_d_256:
case Intrinsic::x86_avx512_psrav_q_128:
case Intrinsic::x86_avx512_psrav_q_256:
case Intrinsic::x86_avx512_psrav_d_512:
case Intrinsic::x86_avx512_psrav_q_512:
case Intrinsic::x86_avx512_psrav_w_128:
case Intrinsic::x86_avx512_psrav_w_256:
case Intrinsic::x86_avx512_psrav_w_512:
LogicalShift = false;
ShiftLeft = false;
break;
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_q_256:
case Intrinsic::x86_avx512_psrlv_d_512:
case Intrinsic::x86_avx512_psrlv_q_512:
case Intrinsic::x86_avx512_psrlv_w_128:
case Intrinsic::x86_avx512_psrlv_w_256:
case Intrinsic::x86_avx512_psrlv_w_512:
LogicalShift = true;
ShiftLeft = false;
break;
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_q_256:
case Intrinsic::x86_avx512_psllv_d_512:
case Intrinsic::x86_avx512_psllv_q_512:
case Intrinsic::x86_avx512_psllv_w_128:
case Intrinsic::x86_avx512_psllv_w_256:
case Intrinsic::x86_avx512_psllv_w_512:
LogicalShift = true;
ShiftLeft = true;
break;
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
Value *Vec = II.getArgOperand(0);
Value *Amt = II.getArgOperand(1);
auto *VT = cast<FixedVectorType>(II.getType());
Type *SVT = VT->getElementType();
int NumElts = VT->getNumElements();
int BitWidth = SVT->getIntegerBitWidth();
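// If every shift amount is known to be in range, this is just a generic IR
// shift.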
KnownBits KnownAmt =
llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
if (KnownAmt.getMaxValue().ult(BitWidth)) {
return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
: Builder.CreateLShr(Vec, Amt))
: Builder.CreateAShr(Vec, Amt));
}
auto *CShift = dyn_cast<Constant>(Amt);
if (!CShift)
return nullptr;
bool AnyOutOfRange = false;
SmallVector<int, 8> ShiftAmts;
for (int I = 0; I < NumElts; ++I) {
auto *CElt = CShift->getAggregateElement(I);
if (isa_and_nonnull<UndefValue>(CElt)) {
ShiftAmts.push_back(-1);
continue;
}
auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
if (!COp)
return nullptr;
APInt ShiftVal = COp->getValue();
if (ShiftVal.uge(BitWidth)) {
AnyOutOfRange = LogicalShift;
ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
continue;
}
ShiftAmts.push_back((int)ShiftVal.getZExtValue());
}
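// If all elements are out of range (or undef), the result is a vector of
// zeros/undefs; arithmetic shifts only reach this if every lane is undef.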
auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
if (llvm::all_of(ShiftAmts, OutOfRange)) {
SmallVector<Constant *, 8> ConstantVec;
for (int Idx : ShiftAmts) {
if (Idx < 0) {
ConstantVec.push_back(UndefValue::get(SVT));
} else {
assert(LogicalShift && "Logical shift expected");
ConstantVec.push_back(ConstantInt::getNullValue(SVT));
}
}
return ConstantVector::get(ConstantVec);
}
if (AnyOutOfRange)
return nullptr;
SmallVector<Constant *, 8> ShiftVecAmts;
for (int Idx : ShiftAmts) {
if (Idx < 0)
ShiftVecAmts.push_back(UndefValue::get(SVT));
else
ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
}
auto ShiftVec = ConstantVector::get(ShiftVecAmts);
if (ShiftLeft)
return Builder.CreateShl(Vec, ShiftVec);
if (LogicalShift)
return Builder.CreateLShr(Vec, ShiftVec);
return Builder.CreateAShr(Vec, ShiftVec);
}
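// Simplify an x86 pack intrinsic (packss/packus) with constant operands by
// clamping each element to the destination range, interleaving the operands
// per 128-bit lane, and truncating; with constant inputs the result folds.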
static Value *simplifyX86pack(IntrinsicInst &II,
InstCombiner::BuilderTy &Builder, bool IsSigned) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
Type *ResTy = II.getType();
if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
return UndefValue::get(ResTy);
auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
unsigned NumSrcElts = ArgTy->getNumElements();
assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
"Unexpected packing types");
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
"Unexpected packing types");
if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
return nullptr;
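// Clamp values - signed and unsigned packs both clamp with signed compares,
// but they differ on the min/max saturation values.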
APInt MinValue, MaxValue;
if (IsSigned) {
MinValue =
APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
MaxValue =
APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
} else {
MinValue = APInt::getZero(SrcScalarSizeInBits);
MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
}
auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
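// Shuffle the clamped args together at the lane level, then truncate to the
// destination element size.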
SmallVector<int, 32> PackMask;
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
}
auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
return Builder.CreateTrunc(Shuffle, ResTy);
}
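// Expand MOVMSK to compare/bitcast/zext, e.g. for PMOVMSKB(v16i8 x):
//   %cmp = icmp slt <16 x i8> %x, zeroinitializer
//   %int = bitcast <16 x i1> %cmp to i16
//   %res = zext i16 %int to i32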
static Value *simplifyX86movmsk(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
if (isa<UndefValue>(Arg))
return Constant::getNullValue(ResTy);
auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
if (!ArgTy)
return nullptr;
unsigned NumElts = ArgTy->getNumElements();
Type *IntegerTy = Builder.getIntNTy(NumElts);
Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
Res = Builder.CreateIsNeg(Res);
Res = Builder.CreateBitCast(Res, IntegerTy);
Res = Builder.CreateZExtOrTrunc(Res, ResTy);
return Res;
}
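// With a zero carry-in, x86 addcarry is just an unsigned add with overflow,
// so expand it to llvm.uadd.with.overflow.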
static Value *simplifyX86addcarry(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
Value *CarryIn = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
Value *Op2 = II.getArgOperand(2);
Type *RetTy = II.getType();
Type *OpTy = Op1->getType();
assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
"Unexpected types for x86 addcarry");
if (match(CarryIn, PatternMatch::m_ZeroInt())) {
Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
{Op1, Op2});
Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
Builder.getInt8Ty());
Value *Res = UndefValue::get(RetTy);
Res = Builder.CreateInsertValue(Res, UAddOV, 0);
return Builder.CreateInsertValue(Res, UAddResult, 1);
}
return nullptr;
}
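// Fold INSERTPS when the control immediate is constant: bits [3:0] are the
// zero mask, bits [5:4] the destination lane, bits [7:6] the source lane.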
static Value *simplifyX86insertps(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
if (!CInt)
return nullptr;
auto *VecTy = cast<FixedVectorType>(II.getType());
assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
uint8_t Imm = CInt->getZExtValue();
uint8_t ZMask = Imm & 0xf;
uint8_t DestLane = (Imm >> 4) & 0x3;
uint8_t SourceLane = (Imm >> 6) & 0x3;
ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
if (ZMask == 0xf)
return ZeroVector;
int ShuffleMask[4] = {0, 1, 2, 3};
Value *V1 = II.getArgOperand(1);
if (ZMask) {
if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
(ZMask & (1 << DestLane))) {
V1 = ZeroVector;
ShuffleMask[DestLane] = SourceLane;
for (unsigned i = 0; i < 4; ++i)
if ((ZMask >> i) & 0x1)
ShuffleMask[i] = i + 4;
} else {
return nullptr;
}
} else {
ShuffleMask[DestLane] = SourceLane + 4;
}
return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
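// Attempt to simplify SSE4A EXTRQ/EXTRQI to a shuffle or a constant,
// extracting Length bits starting at bit Index from the low 64 bits of Op0.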
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
ConstantInt *CILength, ConstantInt *CIIndex,
InstCombiner::BuilderTy &Builder) {
auto LowConstantHighUndef = [&](uint64_t Val) {
Type *IntTy64 = Type::getInt64Ty(II.getContext());
Constant *Args[] = {ConstantInt::get(IntTy64, Val),
UndefValue::get(IntTy64)};
return ConstantVector::get(Args);
};
auto *C0 = dyn_cast<Constant>(Op0);
auto *CI0 =
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
if (CILength && CIIndex) {
APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
APInt APLength = CILength->getValue().zextOrTrunc(6);
unsigned Index = APIndex.getZExtValue();
unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
unsigned End = Index + Length;
if (End > 64)
return UndefValue::get(II.getType());
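// If the extracted field is byte-aligned, lower to a byte shuffle that pulls
// the field down and zero-fills the remaining low bytes; the upper 64 bits
// of the result are undefined.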
if ((Length % 8) == 0 && (Index % 8) == 0) {
Length /= 8;
Index /= 8;
Type *IntTy8 = Type::getInt8Ty(II.getContext());
auto *ShufTy = FixedVectorType::get(IntTy8, 16);
SmallVector<int, 16> ShuffleMask;
for (int i = 0; i != (int)Length; ++i)
ShuffleMask.push_back(i + Index);
for (int i = Length; i != 8; ++i)
ShuffleMask.push_back(i + 16);
for (int i = 8; i != 16; ++i)
ShuffleMask.push_back(-1);
Value *SV = Builder.CreateShuffleVector(
Builder.CreateBitCast(Op0, ShufTy),
ConstantAggregateZero::get(ShufTy), ShuffleMask);
return Builder.CreateBitCast(SV, II.getType());
}
if (CI0) {
APInt Elt = CI0->getValue();
Elt.lshrInPlace(Index);
Elt = Elt.zextOrTrunc(Length);
return LowConstantHighUndef(Elt.getZExtValue());
}
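// EXTRQ with constant length/index operands can be converted to the
// immediate form EXTRQI.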
if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
Value *Args[] = {Op0, CILength, CIIndex};
Module *M = II.getModule();
Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
return Builder.CreateCall(F, Args);
}
}
if (CI0 && CI0->isZero())
return LowConstantHighUndef(0);
return nullptr;
}
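// Attempt to simplify SSE4A INSERTQ/INSERTQI: insert the low Length bits of
// Op1 into Op0 at bit position Index.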
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
APInt APLength, APInt APIndex,
InstCombiner::BuilderTy &Builder) {
APIndex = APIndex.zextOrTrunc(6);
APLength = APLength.zextOrTrunc(6);
unsigned Index = APIndex.getZExtValue();
unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
unsigned End = Index + Length;
if (End > 64)
return UndefValue::get(II.getType());
if ((Length % 8) == 0 && (Index % 8) == 0) {
Length /= 8;
Index /= 8;
Type *IntTy8 = Type::getInt8Ty(II.getContext());
auto *ShufTy = FixedVectorType::get(IntTy8, 16);
SmallVector<int, 16> ShuffleMask;
for (int i = 0; i != (int)Index; ++i)
ShuffleMask.push_back(i);
for (int i = 0; i != (int)Length; ++i)
ShuffleMask.push_back(i + 16);
for (int i = Index + Length; i != 8; ++i)
ShuffleMask.push_back(i);
for (int i = 8; i != 16; ++i)
ShuffleMask.push_back(-1);
Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
Builder.CreateBitCast(Op1, ShufTy),
ShuffleMask);
return Builder.CreateBitCast(SV, II.getType());
}
auto *C0 = dyn_cast<Constant>(Op0);
auto *C1 = dyn_cast<Constant>(Op1);
auto *CI00 =
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
auto *CI10 =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
if (CI00 && CI10) {
APInt V00 = CI00->getValue();
APInt V10 = CI10->getValue();
APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
V00 = V00 & ~Mask;
V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
APInt Val = V00 | V10;
Type *IntTy64 = Type::getInt64Ty(II.getContext());
Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
UndefValue::get(IntTy64)};
return ConstantVector::get(Args);
}
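// INSERTQ with constant length/index can be converted to the immediate form
// INSERTQI.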
if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
Type *IntTy8 = Type::getInt8Ty(II.getContext());
Constant *CILength = ConstantInt::get(IntTy8, Length, false);
Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
Value *Args[] = {Op0, Op1, CILength, CIIndex};
Module *M = II.getModule();
Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
return Builder.CreateCall(F, Args);
}
return nullptr;
}
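// Attempt to convert a constant PSHUFB mask into a shufflevector: the sign
// bit of each control byte selects zero, and the low 4 bits select a byte
// within the same 128-bit lane.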
static Value *simplifyX86pshufb(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
if (!V)
return nullptr;
auto *VecTy = cast<FixedVectorType>(II.getType());
unsigned NumElts = VecTy->getNumElements();
assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
"Unexpected number of elements in shuffle mask!");
int Indexes[64];
for (unsigned I = 0; I < NumElts; ++I) {
Constant *COp = V->getAggregateElement(I);
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
return nullptr;
if (isa<UndefValue>(COp)) {
Indexes[I] = -1;
continue;
}
int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
Indexes[I] = Index;
}
auto V1 = II.getArgOperand(0);
auto V2 = Constant::getNullValue(VecTy);
return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}
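// Attempt to convert a constant VPERMILVAR mask into a shufflevector; each
// control element selects a float/double within its own 128-bit lane.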
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
if (!V)
return nullptr;
auto *VecTy = cast<FixedVectorType>(II.getType());
unsigned NumElts = VecTy->getNumElements();
bool IsPD = VecTy->getScalarType()->isDoubleTy();
unsigned NumLaneElts = IsPD ? 2 : 4;
assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
int Indexes[16];
for (unsigned I = 0; I < NumElts; ++I) {
Constant *COp = V->getAggregateElement(I);
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
return nullptr;
if (isa<UndefValue>(COp)) {
Indexes[I] = -1;
continue;
}
APInt Index = cast<ConstantInt>(COp)->getValue();
Index = Index.zextOrTrunc(32).getLoBits(2);
if (IsPD)
Index.lshrInPlace(1);
Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
Indexes[I] = Index.getZExtValue();
}
auto V1 = II.getArgOperand(0);
return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}
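// Attempt to convert a constant VPERMV/VPERMD/VPERMPS mask into a
// shufflevector; indices wrap modulo the vector width.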
static Value *simplifyX86vpermv(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
if (!V)
return nullptr;
auto *VecTy = cast<FixedVectorType>(II.getType());
unsigned Size = VecTy->getNumElements();
assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
"Unexpected shuffle mask size");
int Indexes[64];
for (unsigned I = 0; I < Size; ++I) {
Constant *COp = V->getAggregateElement(I);
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
return nullptr;
if (isa<UndefValue>(COp)) {
Indexes[I] = -1;
continue;
}
uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
Index &= Size - 1;
Indexes[I] = Index;
}
auto V1 = II.getArgOperand(0);
return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}
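// Try to fold an x86-specific intrinsic call. Returning None means no
// target-specific simplification was applied.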
Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
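// Demand only the low DemandedWidth elements of Op; any unused upper
// elements may be replaced with undef.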
auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
unsigned DemandedWidth) {
APInt UndefElts(Width, 0);
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
case Intrinsic::x86_bmi_bextr_32:
case Intrinsic::x86_bmi_bextr_64:
case Intrinsic::x86_tbm_bextri_u32:
case Intrinsic::x86_tbm_bextri_u64:
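// BEXTR extracts Length bits of the source starting at bit Shift; the
// control operand packs these as (Length << 8) | Shift.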
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
uint64_t Shift = C->getZExtValue();
uint64_t Length = (Shift >> 8) & 0xff;
Shift &= 0xff;
unsigned BitWidth = II.getType()->getIntegerBitWidth();
if (Length == 0 || Shift >= BitWidth) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}
if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Result = InC->getZExtValue() >> Shift;
if (Length > BitWidth)
Length = BitWidth;
Result &= maskTrailingOnes<uint64_t>(Length);
return IC.replaceInstUsesWith(II,
ConstantInt::get(II.getType(), Result));
}
}
break;
case Intrinsic::x86_bmi_bzhi_32:
case Intrinsic::x86_bmi_bzhi_64:
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
uint64_t Index = C->getZExtValue() & 0xff;
unsigned BitWidth = II.getType()->getIntegerBitWidth();
if (Index >= BitWidth) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
if (Index == 0) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}
if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Result = InC->getZExtValue();
Result &= maskTrailingOnes<uint64_t>(Index);
return IC.replaceInstUsesWith(II,
ConstantInt::get(II.getType(), Result));
}
}
break;
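// PEXT gathers the source bits selected by the mask into the contiguous low
// bits of the result. With a shifted mask this is just (x & Mask) >> MaskIdx.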
case Intrinsic::x86_bmi_pext_32:
case Intrinsic::x86_bmi_pext_64:
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
if (MaskC->isNullValue()) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}
if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
unsigned MaskIdx, MaskLen;
if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
Value *Input = II.getArgOperand(0);
Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
return IC.replaceInstUsesWith(II, Shifted);
}
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();
uint64_t Result = 0;
uint64_t BitToSet = 1;
while (Mask) {
uint64_t BitToTest = Mask & -Mask;
if (BitToTest & Src)
Result |= BitToSet;
BitToSet <<= 1;
Mask &= Mask - 1;
}
return IC.replaceInstUsesWith(II,
ConstantInt::get(II.getType(), Result));
}
}
break;
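// PDEP scatters the contiguous low bits of the source to the bit positions
// selected by the mask. With a shifted mask this is (x << MaskIdx) & Mask.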
case Intrinsic::x86_bmi_pdep_32:
case Intrinsic::x86_bmi_pdep_64:
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
if (MaskC->isNullValue()) {
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
}
if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
unsigned MaskIdx, MaskLen;
if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
Value *Input = II.getArgOperand(0);
Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
return IC.replaceInstUsesWith(II, Masked);
}
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();
uint64_t Result = 0;
uint64_t BitToTest = 1;
while (Mask) {
uint64_t BitToSet = Mask & -Mask;
if (BitToTest & Src)
Result |= BitToSet;
BitToTest <<= 1;
Mask &= Mask - 1;
}
return IC.replaceInstUsesWith(II,
ConstantInt::get(II.getType(), Result));
}
}
break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
case Intrinsic::x86_avx512_vcvtss2si32:
case Intrinsic::x86_avx512_vcvtss2si64:
case Intrinsic::x86_avx512_vcvtss2usi32:
case Intrinsic::x86_avx512_vcvtss2usi64:
case Intrinsic::x86_avx512_vcvtsd2si32:
case Intrinsic::x86_avx512_vcvtsd2si64:
case Intrinsic::x86_avx512_vcvtsd2usi32:
case Intrinsic::x86_avx512_vcvtsd2usi64:
case Intrinsic::x86_avx512_cvttss2si:
case Intrinsic::x86_avx512_cvttss2si64:
case Intrinsic::x86_avx512_cvttss2usi:
case Intrinsic::x86_avx512_cvttss2usi64:
case Intrinsic::x86_avx512_cvttsd2si:
case Intrinsic::x86_avx512_cvttsd2si64:
case Intrinsic::x86_avx512_cvttsd2usi:
case Intrinsic::x86_avx512_cvttsd2usi64: {
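// These scalar conversions only read the lowest vector element.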
Value *Arg = II.getArgOperand(0);
unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
return IC.replaceOperand(II, 0, V);
}
break;
}
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_sse2_pmovmskb_128:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_avx2_pmovmskb:
if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_sse_comieq_ss:
case Intrinsic::x86_sse_comige_ss:
case Intrinsic::x86_sse_comigt_ss:
case Intrinsic::x86_sse_comile_ss:
case Intrinsic::x86_sse_comilt_ss:
case Intrinsic::x86_sse_comineq_ss:
case Intrinsic::x86_sse_ucomieq_ss:
case Intrinsic::x86_sse_ucomige_ss:
case Intrinsic::x86_sse_ucomigt_ss:
case Intrinsic::x86_sse_ucomile_ss:
case Intrinsic::x86_sse_ucomilt_ss:
case Intrinsic::x86_sse_ucomineq_ss:
case Intrinsic::x86_sse2_comieq_sd:
case Intrinsic::x86_sse2_comige_sd:
case Intrinsic::x86_sse2_comigt_sd:
case Intrinsic::x86_sse2_comile_sd:
case Intrinsic::x86_sse2_comilt_sd:
case Intrinsic::x86_sse2_comineq_sd:
case Intrinsic::x86_sse2_ucomieq_sd:
case Intrinsic::x86_sse2_ucomige_sd:
case Intrinsic::x86_sse2_ucomigt_sd:
case Intrinsic::x86_sse2_ucomile_sd:
case Intrinsic::x86_sse2_ucomilt_sd:
case Intrinsic::x86_sse2_ucomineq_sd:
case Intrinsic::x86_avx512_vcomi_ss:
case Intrinsic::x86_avx512_vcomi_sd:
case Intrinsic::x86_avx512_mask_cmp_ss:
case Intrinsic::x86_avx512_mask_cmp_sd: {
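// These comparisons only read the lowest element of each vector operand.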
bool MadeChange = false;
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
IC.replaceOperand(II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
IC.replaceOperand(II, 1, V);
MadeChange = true;
}
if (MadeChange) {
return &II;
}
break;
}
case Intrinsic::x86_avx512_add_ps_512:
case Intrinsic::x86_avx512_div_ps_512:
case Intrinsic::x86_avx512_mul_ps_512:
case Intrinsic::x86_avx512_sub_ps_512:
case Intrinsic::x86_avx512_add_pd_512:
case Intrinsic::x86_avx512_div_pd_512:
case Intrinsic::x86_avx512_mul_pd_512:
case Intrinsic::x86_avx512_sub_pd_512:
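// A rounding-mode operand of 4 (CUR_DIRECTION) means the default IEEE
// behavior, so the intrinsic reduces to a plain FP binary operator.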
if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
if (R->getValue() == 4) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
Value *V;
switch (IID) {
default:
llvm_unreachable("Case stmts out of sync!");
case Intrinsic::x86_avx512_add_ps_512:
case Intrinsic::x86_avx512_add_pd_512:
V = IC.Builder.CreateFAdd(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_sub_ps_512:
case Intrinsic::x86_avx512_sub_pd_512:
V = IC.Builder.CreateFSub(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_mul_ps_512:
case Intrinsic::x86_avx512_mul_pd_512:
V = IC.Builder.CreateFMul(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_div_ps_512:
case Intrinsic::x86_avx512_div_pd_512:
V = IC.Builder.CreateFDiv(Arg0, Arg1);
break;
}
return IC.replaceInstUsesWith(II, V);
}
}
break;
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
if (R->getValue() == 4) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
Value *V;
switch (IID) {
default:
llvm_unreachable("Case stmts out of sync!");
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
V = IC.Builder.CreateFAdd(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
V = IC.Builder.CreateFSub(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
V = IC.Builder.CreateFMul(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
V = IC.Builder.CreateFDiv(LHS, RHS);
break;
}
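// If the mask is a constant with bit 0 set, the result element is used
// unconditionally; otherwise blend with the low element of the passthrough
// operand.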
Value *Mask = II.getArgOperand(3);
auto *C = dyn_cast<ConstantInt>(Mask);
if (!C || !C->getValue()[0]) {
auto *MaskTy = FixedVectorType::get(
IC.Builder.getInt1Ty(),
cast<IntegerType>(Mask->getType())->getBitWidth());
Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
Value *Passthru =
IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
V = IC.Builder.CreateSelect(Mask, V, Passthru);
}
V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
return IC.replaceInstUsesWith(II, V);
}
}
break;
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
case Intrinsic::x86_avx512_psrai_q_128:
case Intrinsic::x86_avx512_psrai_q_256:
case Intrinsic::x86_avx512_psrai_d_512:
case Intrinsic::x86_avx512_psrai_q_512:
case Intrinsic::x86_avx512_psrai_w_512:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
case Intrinsic::x86_avx512_psrli_d_512:
case Intrinsic::x86_avx512_psrli_q_512:
case Intrinsic::x86_avx512_psrli_w_512:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
case Intrinsic::x86_avx512_pslli_d_512:
case Intrinsic::x86_avx512_pslli_q_512:
case Intrinsic::x86_avx512_pslli_w_512:
if (Value *V = simplifyX86immShift(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_avx2_psra_d:
case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx512_psra_q_128:
case Intrinsic::x86_avx512_psra_q_256:
case Intrinsic::x86_avx512_psra_d_512:
case Intrinsic::x86_avx512_psra_q_512:
case Intrinsic::x86_avx512_psra_w_512:
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx512_psrl_d_512:
case Intrinsic::x86_avx512_psrl_q_512:
case Intrinsic::x86_avx512_psrl_w_512:
case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx512_psll_d_512:
case Intrinsic::x86_avx512_psll_q_512:
case Intrinsic::x86_avx512_psll_w_512: {
if (Value *V = simplifyX86immShift(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
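// The shift amount is read from the low 64 bits of the 128-bit vector, so
// only the lower half of that operand's elements are demanded.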
Value *Arg1 = II.getArgOperand(1);
assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
"Unexpected packed shift size");
unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
return IC.replaceOperand(II, 1, V);
}
break;
}
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_q_256:
case Intrinsic::x86_avx512_psllv_d_512:
case Intrinsic::x86_avx512_psllv_q_512:
case Intrinsic::x86_avx512_psllv_w_128:
case Intrinsic::x86_avx512_psllv_w_256:
case Intrinsic::x86_avx512_psllv_w_512:
case Intrinsic::x86_avx2_psrav_d:
case Intrinsic::x86_avx2_psrav_d_256:
case Intrinsic::x86_avx512_psrav_q_128:
case Intrinsic::x86_avx512_psrav_q_256:
case Intrinsic::x86_avx512_psrav_d_512:
case Intrinsic::x86_avx512_psrav_q_512:
case Intrinsic::x86_avx512_psrav_w_128:
case Intrinsic::x86_avx512_psrav_w_256:
case Intrinsic::x86_avx512_psrav_w_512:
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_q_256:
case Intrinsic::x86_avx512_psrlv_d_512:
case Intrinsic::x86_avx512_psrlv_q_512:
case Intrinsic::x86_avx512_psrlv_w_128:
case Intrinsic::x86_avx512_psrlv_w_256:
case Intrinsic::x86_avx512_psrlv_w_512:
if (Value *V = simplifyX86varShift(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_avx2_packssdw:
case Intrinsic::x86_avx2_packsswb:
case Intrinsic::x86_avx512_packssdw_512:
case Intrinsic::x86_avx512_packsswb_512:
if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_sse2_packuswb_128:
case Intrinsic::x86_sse41_packusdw:
case Intrinsic::x86_avx2_packusdw:
case Intrinsic::x86_avx2_packuswb:
case Intrinsic::x86_avx512_packusdw_512:
case Intrinsic::x86_avx512_packuswb_512:
if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_pclmulqdq:
case Intrinsic::x86_pclmulqdq_256:
case Intrinsic::x86_pclmulqdq_512: {
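// Each PCLMULQDQ operand contributes one 64-bit half per 128-bit lane,
// selected by immediate bits 0 (for operand 0) and 4 (for operand 1); the
// other halves are not demanded.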
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
unsigned Imm = C->getZExtValue();
bool MadeChange = false;
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
unsigned VWidth =
cast<FixedVectorType>(Arg0->getType())->getNumElements();
APInt UndefElts1(VWidth, 0);
APInt DemandedElts1 =
APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
if (Value *V =
IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
IC.replaceOperand(II, 0, V);
MadeChange = true;
}
APInt UndefElts2(VWidth, 0);
APInt DemandedElts2 =
APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
if (Value *V =
IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
IC.replaceOperand(II, 1, V);
MadeChange = true;
}
if (DemandedElts1.isSubsetOf(UndefElts1) ||
DemandedElts2.isSubsetOf(UndefElts2)) {
return IC.replaceInstUsesWith(II,
ConstantAggregateZero::get(II.getType()));
}
if (MadeChange) {
return &II;
}
}
break;
}
case Intrinsic::x86_sse41_insertps:
if (Value *V = simplifyX86insertps(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_sse4a_extrq: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
VWidth1 == 16 && "Unexpected operand sizes");
auto *C1 = dyn_cast<Constant>(Op1);
auto *CILength =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
auto *CIIndex =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
IC.replaceOperand(II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
IC.replaceOperand(II, 1, V);
MadeChange = true;
}
if (MadeChange) {
return &II;
}
break;
}
case Intrinsic::x86_sse4a_extrqi: {
Value *Op0 = II.getArgOperand(0);
unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
"Unexpected operand size");
auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
return IC.replaceOperand(II, 0, V);
}
break;
}
case Intrinsic::x86_sse4a_insertq: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
"Unexpected operand size");
auto *C1 = dyn_cast<Constant>(Op1);
auto *CI11 =
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
if (CI11) {
const APInt &V11 = CI11->getValue();
APInt Len = V11.zextOrTrunc(6);
APInt Idx = V11.lshr(8).zextOrTrunc(6);
if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
return IC.replaceOperand(II, 0, V);
}
break;
}
case Intrinsic::x86_sse4a_insertqi: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
VWidth1 == 2 && "Unexpected operand sizes");
auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
if (CILength && CIIndex) {
APInt Len = CILength->getValue().zextOrTrunc(6);
APInt Idx = CIIndex->getValue().zextOrTrunc(6);
if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
}
bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
IC.replaceOperand(II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
IC.replaceOperand(II, 1, V);
MadeChange = true;
}
if (MadeChange) {
return &II;
}
break;
}
case Intrinsic::x86_sse41_pblendvb:
case Intrinsic::x86_sse41_blendvps:
case Intrinsic::x86_sse41_blendvpd:
case Intrinsic::x86_avx_blendv_ps_256:
case Intrinsic::x86_avx_blendv_pd_256:
case Intrinsic::x86_avx2_pblendvb: {
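// A blend of identical values, or one with an all-zero mask, returns Op0.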
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
Value *Mask = II.getArgOperand(2);
if (Op0 == Op1) {
return IC.replaceInstUsesWith(II, Op0);
}
if (isa<ConstantAggregateZero>(Mask)) {
return IC.replaceInstUsesWith(II, Op0);
}
if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
}
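// Peek through a bitcast to find a sign-extended bool vector; the blend is
// then a plain select on that vector.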
Value *BoolVec;
Mask = InstCombiner::peekThroughBitcast(Mask);
if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
BoolVec->getType()->isVectorTy() &&
BoolVec->getType()->getScalarSizeInBits() == 1) {
assert(Mask->getType()->getPrimitiveSizeInBits() ==
II.getType()->getPrimitiveSizeInBits() &&
"Not expecting mask and operands with different sizes");
unsigned NumMaskElts =
cast<FixedVectorType>(Mask->getType())->getNumElements();
unsigned NumOperandElts =
cast<FixedVectorType>(II.getType())->getNumElements();
if (NumMaskElts == NumOperandElts) {
return SelectInst::Create(BoolVec, Op1, Op0);
}
if (NumMaskElts < NumOperandElts) {
Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
return new BitCastInst(Sel, II.getType());
}
}
break;
}
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx512_vpermilvar_ps_512:
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
case Intrinsic::x86_avx512_vpermilvar_pd_512:
if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
case Intrinsic::x86_avx512_permvar_df_256:
case Intrinsic::x86_avx512_permvar_df_512:
case Intrinsic::x86_avx512_permvar_di_256:
case Intrinsic::x86_avx512_permvar_di_512:
case Intrinsic::x86_avx512_permvar_hi_128:
case Intrinsic::x86_avx512_permvar_hi_256:
case Intrinsic::x86_avx512_permvar_hi_512:
case Intrinsic::x86_avx512_permvar_qi_128:
case Intrinsic::x86_avx512_permvar_qi_256:
case Intrinsic::x86_avx512_permvar_qi_512:
case Intrinsic::x86_avx512_permvar_sf_512:
case Intrinsic::x86_avx512_permvar_si_512:
if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
case Intrinsic::x86_avx_maskload_pd_256:
case Intrinsic::x86_avx2_maskload_d:
case Intrinsic::x86_avx2_maskload_q:
case Intrinsic::x86_avx2_maskload_d_256:
case Intrinsic::x86_avx2_maskload_q_256:
if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
return I;
}
break;
case Intrinsic::x86_sse2_maskmov_dqu:
case Intrinsic::x86_avx_maskstore_ps:
case Intrinsic::x86_avx_maskstore_pd:
case Intrinsic::x86_avx_maskstore_ps_256:
case Intrinsic::x86_avx_maskstore_pd_256:
case Intrinsic::x86_avx2_maskstore_d:
case Intrinsic::x86_avx2_maskstore_q:
case Intrinsic::x86_avx2_maskstore_d_256:
case Intrinsic::x86_avx2_maskstore_q_256:
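// If the store was simplified, it has already been erased, so there is no
// replacement instruction to return.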
if (simplifyX86MaskedStore(II, IC)) {
return nullptr;
}
break;
case Intrinsic::x86_addcarry_32:
case Intrinsic::x86_addcarry_64:
if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
break;
default:
break;
}
return None;
}
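// Try to simplify an x86 intrinsic based on which bits of its result are
// demanded.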
Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
bool &KnownBitsComputed) const {
switch (II.getIntrinsicID()) {
default:
break;
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_sse2_pmovmskb_128:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_avx2_pmovmskb: {
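// MOVMSK copies the vector elements' sign bits to the low bits of the
// scalar result; for x86_mmx the operand is treated as <8 x i8>.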
unsigned ArgWidth;
if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
ArgWidth = 8;
} else {
auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
ArgWidth = ArgType->getNumElements();
}
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
Type *VTy = II.getType();
if (DemandedElts.isZero()) {
return ConstantInt::getNullValue(VTy);
}
Known.Zero.setBitsFrom(ArgWidth);
KnownBitsComputed = true;
break;
}
}
return None;
}
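// Try to simplify an x86 intrinsic based on which elements of its vector
// result are demanded.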
Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
simplifyAndSetOp) const {
unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
switch (II.getIntrinsicID()) {
default:
break;
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
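// VFRCZ zeroes the upper elements rather than passing them through, so if
// the low element is not demanded the whole result folds to zero.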
if (!DemandedElts[0]) {
IC.addToWorklist(&II);
return ConstantAggregateZero::get(II.getType());
}
DemandedElts = 1;
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
UndefElts = UndefElts[0];
break;
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
if (!DemandedElts[0]) {
IC.addToWorklist(&II);
return II.getArgOperand(0);
}
break;
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
case Intrinsic::x86_sse2_cmp_sd: {
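// Binary scalar-as-vector operations: the upper elements come from operand
// 0 and the low element is a function of both operands.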
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
if (!DemandedElts[0]) {
IC.addToWorklist(&II);
return II.getArgOperand(0);
}
DemandedElts = 1;
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
if (!UndefElts2[0])
UndefElts.clearBit(0);
break;
}
case Intrinsic::x86_sse41_round_ss:
case Intrinsic::x86_sse41_round_sd: {
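// ROUNDSS/ROUNDSD: the upper elements come from operand 0 and the low
// element comes from operand 1.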
APInt DemandedElts2 = DemandedElts;
DemandedElts2.clearBit(0);
simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
if (!DemandedElts[0]) {
IC.addToWorklist(&II);
return II.getArgOperand(0);
}
DemandedElts = 1;
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
UndefElts.clearBit(0);
UndefElts |= UndefElts2[0];
break;
}
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_max_ss_round:
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
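// Three-input scalar operations: the upper elements come from operand 0 and
// the low element depends on both sources and the passthrough.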
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
if (!DemandedElts[0]) {
IC.addToWorklist(&II);
return II.getArgOperand(0);
}
DemandedElts = 1;
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
if (!UndefElts2[0] || !UndefElts3[0])
UndefElts.clearBit(0);
break;
case Intrinsic::x86_sse3_addsub_pd:
case Intrinsic::x86_sse3_addsub_ps:
case Intrinsic::x86_avx_addsub_pd_256:
case Intrinsic::x86_avx_addsub_ps_256: {
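// ADDSUB subtracts in even lanes and adds in odd lanes; if only one kind of
// lane is demanded, the whole op reduces to a single fsub/fadd.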
APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
if (IsSubOnly || IsAddOnly) {
assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
IC.Builder.SetInsertPoint(&II);
Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
return IC.Builder.CreateBinOp(
IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
}
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
UndefElts &= UndefElts2;
break;
}
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_q_256:
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_q_256:
case Intrinsic::x86_avx2_psrav_d:
case Intrinsic::x86_avx2_psrav_d_256: {
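// Per-element shifts: each result element depends only on the matching
// elements of both operands.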
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
UndefElts &= UndefElts2;
break;
}
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
case Intrinsic::x86_sse41_packusdw:
case Intrinsic::x86_avx2_packssdw:
case Intrinsic::x86_avx2_packsswb:
case Intrinsic::x86_avx2_packusdw:
case Intrinsic::x86_avx2_packuswb:
case Intrinsic::x86_avx512_packssdw_512:
case Intrinsic::x86_avx512_packsswb_512:
case Intrinsic::x86_avx512_packusdw_512:
case Intrinsic::x86_avx512_packuswb_512: {
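// Packs interleave their operands per 128-bit lane, so map each demanded
// result element back to the source element of the corresponding operand.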
auto *Ty0 = II.getArgOperand(0)->getType();
unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
unsigned VWidthPerLane = VWidth / NumLanes;
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
for (int OpNum = 0; OpNum != 2; ++OpNum) {
APInt OpDemandedElts(InnerVWidth, 0);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
unsigned LaneIdx = Lane * VWidthPerLane;
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
if (DemandedElts[Idx])
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
}
}
APInt OpUndefElts(InnerVWidth, 0);
simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
OpUndefElts = OpUndefElts.zext(VWidth);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
UndefElts |= LaneElts;
}
}
break;
}
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx512_vpermilvar_ps_512:
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
case Intrinsic::x86_avx512_vpermilvar_pd_512:
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps: {
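// Only the shuffle control operand can be simplified element-wise; which
// data elements are demanded depends on the runtime mask.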
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
break;
}
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
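// SSE4A instructions leave the upper 64 bits of the 128-bit result in an
// undefined state.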
UndefElts.setHighBits(VWidth / 2);
break;
}
return None;
}