#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
// Hidden command-line switch ("-arm-memtransfer-tploop") selecting whether
// memory-transfer intrinsics may be lowered to tail-predicated loops.
// It defaults to TPLoop::ForceDisabled. The value is consulted by
// shouldGenerateInlineTPLoop() in this file, which is called from both the
// memcpy and the memset lowering, even though the help text mentions memcpy
// only.
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
"arm-memtransfer-tploop", cl::Hidden,
cl::desc("Control conversion of memcpy to "
"Tail predicated loops (WLSTP)"),
cl::init(TPLoop::ForceDisabled),
// Accepted spellings of the option value.
cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
"Don't convert memcpy to TP loop."),
clEnumValN(TPLoop::ForceEnabled, "force-enabled",
"Always convert memcpy to TP loop."),
clEnumValN(TPLoop::Allow, "allow",
"Allow (may be subject to certain conditions) "
"conversion of memcpy to TP loop.")));
/// Lower a memcpy/memmove/memset to a call to the matching specialized
/// "__aeabi_mem*" helper, choosing the plain, "4" or "8" variant from the
/// known alignment.
///
/// \param DAG    DAG the call is emitted into.
/// \param dl     Debug location attached to the emitted nodes.
/// \param Chain  Incoming chain the call is ordered after.
/// \param Dst    Destination pointer.
/// \param Src    Source pointer for MEMCPY/MEMMOVE; the fill value for MEMSET.
/// \param Size   Number of bytes to transfer.
/// \param Align  Known alignment in bytes; only used to pick the variant.
/// \param LC     Generic libcall being lowered (MEMCPY, MEMMOVE or MEMSET).
/// \returns The output chain of the emitted call, or an empty SDValue when the
///          default name registered for \p LC is missing or is not an
///          "__aeabi" function, or when \p LC is not one of the three
///          supported libcalls.
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();

  // Only use a specialized AEABI function if the default version of this
  // libcall is itself an AEABI function. A libcall with no registered name
  // is treated the same way; passing a null pointer to strncmp would be
  // undefined behaviour.
  const char *DefaultName = TLI->getLibcallName(LC);
  if (!DefaultName || std::strncmp(DefaultName, "__aeabi", 7) != 0)
    return SDValue();

  // Row index into FunctionNames below.
  enum {
    AEABI_MEMCPY = 0,
    AEABI_MEMMOVE,
    AEABI_MEMSET,
    AEABI_MEMCLR
  } AEABILibcall;

  switch (LC) {
  case RTLIB::MEMCPY:
    AEABILibcall = AEABI_MEMCPY;
    break;
  case RTLIB::MEMMOVE:
    AEABILibcall = AEABI_MEMMOVE;
    break;
  case RTLIB::MEMSET:
    AEABILibcall = AEABI_MEMSET;
    // A memset whose fill value is the constant zero becomes a memclr.
    if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
      if (ConstantSrc->getZExtValue() == 0)
        AEABILibcall = AEABI_MEMCLR;
    break;
  default:
    return SDValue();
  }

  // Column index into FunctionNames below: use the most aligned variant that
  // the known alignment permits.
  enum {
    ALIGN1 = 0,
    ALIGN4,
    ALIGN8
  } AlignVariant;

  if ((Align & 7) == 0)
    AlignVariant = ALIGN8;
  else if ((Align & 3) == 0)
    AlignVariant = ALIGN4;
  else
    AlignVariant = ALIGN1;

  // Build the argument list. The destination pointer always comes first.
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  Entry.Node = Dst;
  Args.push_back(Entry);
  if (AEABILibcall == AEABI_MEMCLR) {
    // memclr: (dest, size).
    Entry.Node = Size;
    Args.push_back(Entry);
  } else if (AEABILibcall == AEABI_MEMSET) {
    // memset: (dest, size, value) - size and value are swapped relative to
    // the order in which this function receives them.
    Entry.Node = Size;
    Args.push_back(Entry);

    // Extend or truncate the fill value so it is passed as an i32.
    if (Src.getValueType().bitsGT(MVT::i32))
      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    else if (Src.getValueType().bitsLT(MVT::i32))
      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

    Entry.Node = Src;
    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    Entry.IsSExt = false;
    Args.push_back(Entry);
  } else {
    // memcpy / memmove: (dest, src, size).
    Entry.Node = Src;
    Args.push_back(Entry);

    Entry.Node = Size;
    Args.push_back(Entry);
  }

  // Indexed by [AEABILibcall][AlignVariant]. Static and const so the table is
  // not rebuilt on every call.
  static const char *const FunctionNames[4][3] = {
    { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
    { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
  };

  // Emit the call with a void return type and a discarded result; only the
  // chain is of interest to the caller.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(
          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
                                TLI->getPointerTy(DAG.getDataLayout())),
          std::move(Args))
      .setDiscardResult();
  std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);

  return CallResult.second;
}
/// Decide whether a memory transfer should be expanded into an inline
/// tail-predicated loop.
///
/// \param Subtarget     Supplies the inline-size thresholds.
/// \param DAG           Used to reach the function being compiled.
/// \param ConstantSize  The transfer size if it is a constant, else null.
/// \param Alignment     Known alignment of the transfer.
/// \param IsMemcpy      True for memcpy, false for memset.
/// \returns true if the TP loop expansion should be used.
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                       const SelectionDAG &DAG,
                                       ConstantSDNode *ConstantSize,
                                       Align Alignment, bool IsMemcpy) {
  // The command-line switch takes precedence over every heuristic below.
  if (!EnableMemtransferTPLoop)
    return false;
  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    return true;

  // Never expand for optnone functions or when optimizing for size.
  const auto &Fn = DAG.getMachineFunction().getFunction();
  if (Fn.hasOptNone() || Fn.hasOptSize())
    return false;

  // Past this point memset is always expanded; memcpy has extra conditions.
  if (!IsMemcpy)
    return true;

  // Unknown size: only worthwhile when at least 4-byte aligned.
  if (!ConstantSize)
    return Alignment >= Align(4);

  // Known size: it must lie strictly between the two subtarget thresholds.
  const uint64_t Bytes = ConstantSize->getZExtValue();
  return Bytes > Subtarget.getMaxInlineSizeThreshold() &&
         Bytes < Subtarget.getMaxMemcpyTPInlineSizeThreshold();
}
/// Target-specific lowering of a memcpy.
///
/// In order of preference this emits an ARMISD::MEMCPYLOOP node (MVE targets,
/// when shouldGenerateInlineTPLoop agrees), a call to an AEABI helper
/// (non-constant size, or a constant size above the inline threshold unless
/// \p AlwaysInline is set), or a sequence of ARMISD::MEMCPY nodes followed by
/// halfword/byte loads and stores for the trailing 1-3 bytes.
///
/// \returns The resulting chain, or an empty SDValue when the copy is less
///          than 4-byte aligned or when the inline expansion would need more
///          than one MEMCPY node on a min-size subtarget.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &ST =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *SizeC = dyn_cast<ConstantSDNode>(Size);

  // MVE targets may use a tail-predicated copy loop instead.
  if (ST.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(ST, DAG, SizeC, Alignment, true)) {
    SDValue Count = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       Count);
  }

  // Everything below works in 4-byte words and needs 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();

  // A non-constant size, or one above the inline limit (unless inlining is
  // forced), is handed to the AEABI helper.
  const bool UseLibcall =
      !SizeC ||
      (!AlwaysInline && SizeC->getZExtValue() > ST.getMaxInlineSizeThreshold());
  if (UseLibcall)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  const uint64_t TotalBytes = SizeC->getZExtValue();
  const unsigned TailBytes = TotalBytes & 3;
  const unsigned WordCount = TotalBytes >> 2;
  const unsigned WordBytes = 4;

  // Thumb1 gets at most 4 registers per MEMCPY node, other modes 6.
  const unsigned RegsPerCopy = ST.isThumb1Only() ? 4 : 6;
  // Minimum number of MEMCPY nodes needed to move all the words.
  const unsigned CopyCount = (WordCount + RegsPerCopy - 1) / RegsPerCopy;

  // On min-size subtargets, give up if more than one MEMCPY node is required.
  if (CopyCount > 1 && ST.hasMinSize())
    return SDValue();

  SDVTList CopyVTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  unsigned WordsDone = 0;
  for (unsigned Idx = 0; Idx != CopyCount; ++Idx) {
    // Spread the words evenly across the MEMCPY nodes.
    const unsigned WordsTarget = WordCount * (Idx + 1) / CopyCount;
    const unsigned Regs = WordsTarget - WordsDone;

    SDValue RegCount = DAG.getConstant(Regs, dl, MVT::i32);
    SDValue Copy =
        DAG.getNode(ARMISD::MEMCPY, dl, CopyVTs, Chain, Dst, Src, RegCount);
    // Results 0, 1 and 2 become the new destination, source and chain.
    Dst = Copy;
    Src = Copy.getValue(1);
    Chain = Copy.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(Regs * WordBytes);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(Regs * WordBytes);

    WordsDone = WordsTarget;
  }

  if (TailBytes == 0)
    return Chain;

  // Copy the trailing 1-3 bytes: a halfword while at least two bytes remain,
  // otherwise a single byte. All loads are issued before any store.
  SDValue Tokens[6];
  SDValue TailLoads[6];

  unsigned NumLoads = 0;
  uint64_t LoadOff = 0;
  for (unsigned Remaining = TailBytes; Remaining != 0;) {
    const unsigned PieceBytes = (Remaining >= 2) ? 2 : 1;
    EVT PieceVT = (Remaining >= 2) ? MVT::i16 : MVT::i8;
    SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                               DAG.getConstant(LoadOff, dl, MVT::i32));
    TailLoads[NumLoads] = DAG.getLoad(PieceVT, dl, Chain, Addr,
                                      SrcPtrInfo.getWithOffset(LoadOff));
    Tokens[NumLoads] = TailLoads[NumLoads].getValue(1);
    ++NumLoads;
    LoadOff += PieceBytes;
    Remaining -= PieceBytes;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(Tokens, NumLoads));

  unsigned NumStores = 0;
  uint64_t StoreOff = 0;
  for (unsigned Remaining = TailBytes; Remaining != 0;) {
    const unsigned PieceBytes = (Remaining >= 2) ? 2 : 1;
    SDValue Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                               DAG.getConstant(StoreOff, dl, MVT::i32));
    Tokens[NumStores] = DAG.getStore(Chain, dl, TailLoads[NumStores], Addr,
                                     DstPtrInfo.getWithOffset(StoreOff));
    ++NumStores;
    StoreOff += PieceBytes;
    Remaining -= PieceBytes;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     makeArrayRef(Tokens, NumStores));
}
/// Target-specific lowering of a memmove: always attempts the specialized
/// AEABI libcall and returns whatever EmitSpecializedLibcall yields (the call
/// chain, or an empty SDValue when that helper declines). \p isVolatile and
/// the pointer-info arguments are not used.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // The helper only needs the alignment as a plain byte count.
  const auto AlignBytes = Alignment.value();
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, AlignBytes,
                                RTLIB::MEMMOVE);
}
/// Target-specific lowering of a memset.
///
/// On MVE targets, when shouldGenerateInlineTPLoop agrees, this emits an
/// ARMISD::MEMSETLOOP node whose value operand is the fill byte splatted
/// across a v16i8 vector. Otherwise it attempts the specialized AEABI
/// libcall, unless \p AlwaysInline is set, in which case an empty SDValue is
/// returned.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {
  const ARMSubtarget &ST =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *SizeC = dyn_cast<ConstantSDNode>(Size);

  const bool UseTPLoop =
      ST.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(ST, DAG, SizeC, Alignment, false);

  if (!UseTPLoop) {
    // A libcall is not acceptable when inlining is mandatory.
    if (AlwaysInline)
      return SDValue();
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMSET);
  }

  // Truncate the fill value to a byte and splat it across a 16 x i8 vector.
  SDValue FillByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src);
  SDValue Splat = DAG.getSplatBuildVector(MVT::v16i8, dl, FillByte);
  SDValue Count = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
  return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Splat,
                     Count);
}