#include <cassert>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>
#define GENERATE_NEON
#define GENERATE_NEON_INS
struct Operator;
static inline unsigned short MakeMask(unsigned V0, unsigned V1,
unsigned V2, unsigned V3) {
return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
}
static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
return (Mask >> ((3-Elt)*4)) & 0xF;
}
static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
unsigned FieldShift = ((3-Elt)*4);
return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
}
static bool isValidMask(unsigned short Mask) {
unsigned short UndefBits = Mask & 0x8888;
return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
}
static bool hasUndefElements(unsigned short Mask) {
return (Mask & 0x8888) != 0;
}
static bool isOnlyLHSMask(unsigned short Mask) {
return (Mask & 0x4444) == 0;
}
#if 0#endif
static unsigned getCompressedMask(unsigned short Mask) {
return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
}
static void PrintMask(unsigned i, std::ostream &OS) {
OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
<< "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
<< "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
<< "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
<< ">";
}
struct ShuffleVal {
Operator *Op; unsigned Cost; unsigned short Arg0, Arg1;
ShuffleVal() : Cost(1000000) {}
};
static ShuffleVal ShufTab[65536];
static std::vector<Operator*> TheOperators;
struct Operator {
const char *Name;
unsigned short ShuffleMask;
unsigned short OpNum;
unsigned Cost;
Operator(unsigned short shufflemask, const char *name, unsigned opnum,
unsigned cost = 1)
: Name(name), ShuffleMask(shufflemask), OpNum(opnum),Cost(cost) {
TheOperators.push_back(this);
}
~Operator() {
assert(TheOperators.back() == this);
TheOperators.pop_back();
}
bool isOnlyLHSOperator() const {
return isOnlyLHSMask(ShuffleMask);
}
const char *getName() const { return Name; }
unsigned getCost() const { return Cost; }
unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
unsigned Result = 0;
for (unsigned i = 0; i != 4; ++i) {
unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
unsigned ResElt;
if (SrcElt < 4)
ResElt = getMaskElt(LHSMask, SrcElt);
else if (SrcElt < 8)
ResElt = getMaskElt(RHSMask, SrcElt-4);
else {
assert(SrcElt == 8 && "Bad src elt!");
ResElt = 8;
}
Result |= ResElt << (4*i);
}
return Result;
}
};
#ifdef GENERATE_NEON_INS
static Operator InsOp(0, "ins", 15, 1);
#endif
static const char *getZeroCostOpName(unsigned short Op) {
if (ShufTab[Op].Arg0 == 0x0123)
return "LHS";
else if (ShufTab[Op].Arg0 == 0x4567)
return "RHS";
else {
assert(0 && "bad zero cost operation");
abort();
}
}
static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
unsigned short ThisOp = Vals[ValNo];
std::cerr << "t" << ValNo;
PrintMask(ThisOp, std::cerr);
std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
} else {
for (unsigned i = 0; ; ++i)
if (Vals[i] == ShufTab[ThisOp].Arg0) {
std::cerr << "t" << i;
break;
}
}
#ifdef GENERATE_NEON_INS
if (ShufTab[ThisOp].Op == &InsOp) {
std::cerr << ", lane " << ShufTab[ThisOp].Arg1;
} else
#endif
if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
std::cerr << ", ";
if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
} else {
for (unsigned i = 0; ; ++i)
if (Vals[i] == ShufTab[ThisOp].Arg1) {
std::cerr << "t" << i;
break;
}
}
}
std::cerr << ") ";
}
static unsigned getNumEntered() {
unsigned Count = 0;
for (unsigned i = 0; i != 65536; ++i)
Count += ShufTab[i].Cost < 100;
return Count;
}
static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
unsigned &NumVals) {
if (ShufTab[Elt].Cost == 0) return;
#ifdef GENERATE_NEON_INS
if (ShufTab[Elt].Op == &InsOp) {
EvaluateOps(ShufTab[Elt].Arg0, Vals, NumVals);
Vals[NumVals++] = Elt;
return;
}
#endif
for (unsigned i = 0, e = NumVals; i != e; ++i)
if (Vals[i] == Elt) return;
unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
if (ShufTab[Arg0].Cost)
EvaluateOps(Arg0, Vals, NumVals);
if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
EvaluateOps(Arg1, Vals, NumVals);
Vals[NumVals++] = Elt;
}
int main() {
ShufTab[0x0123].Cost = 0;
ShufTab[0x0123].Op = nullptr;
ShufTab[0x0123].Arg0 = 0x0123;
ShufTab[0x4567].Cost = 0;
ShufTab[0x4567].Op = nullptr;
ShufTab[0x4567].Arg0 = 0x4567;
bool MadeChange = true;
unsigned OpCount = 0;
while (MadeChange) {
MadeChange = false;
++OpCount;
std::cerr << "Starting iteration #" << OpCount << " with "
<< getNumEntered() << " entries established.\n";
unsigned MaxCost = ShufTab[0].Cost;
for (unsigned i = 1; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost > MaxCost)
MaxCost = ShufTab[i].Cost;
if (hasUndefElements(i)) {
unsigned UndefIdx;
if (i & 0x8000)
UndefIdx = 0;
else if (i & 0x0800)
UndefIdx = 1;
else if (i & 0x0080)
UndefIdx = 2;
else if (i & 0x0008)
UndefIdx = 3;
else
abort();
unsigned MinVal = i;
unsigned MinCost = ShufTab[i].Cost;
for (unsigned j = 0; j != 8; ++j) {
unsigned NewElt = setMaskElt(i, UndefIdx, j);
if (ShufTab[NewElt].Cost < MinCost) {
MinCost = ShufTab[NewElt].Cost;
MinVal = NewElt;
}
}
if (i != MinVal) {
MadeChange = true;
ShufTab[i] = ShufTab[MinVal];
}
}
#ifdef GENERATE_NEON_INS
else {
for (unsigned LaneIdx = 0; LaneIdx < 4; LaneIdx++) {
if (getMaskElt(i, LaneIdx) == 8)
continue;
unsigned NewElt = setMaskElt(i, LaneIdx, 8);
if (ShufTab[NewElt].Cost + 1 < ShufTab[i].Cost) {
MadeChange = true;
ShufTab[i].Cost = ShufTab[NewElt].Cost + 1;
ShufTab[i].Op = &InsOp;
ShufTab[i].Arg0 = NewElt;
ShufTab[i].Arg1 = LaneIdx;
}
}
for (unsigned LaneIdx = 0; LaneIdx < 4; LaneIdx += 2) {
unsigned Ln0 = getMaskElt(i, LaneIdx);
unsigned Ln1 = getMaskElt(i, LaneIdx + 1);
if ((Ln0 == 0 && Ln1 == 1) || (Ln0 == 2 && Ln1 == 3) ||
(Ln0 == 4 && Ln1 == 5) || (Ln0 == 6 && Ln1 == 7)) {
unsigned NewElt = setMaskElt(i, LaneIdx, 8);
NewElt = setMaskElt(NewElt, LaneIdx + 1, 8);
if (ShufTab[NewElt].Cost + 1 < ShufTab[i].Cost) {
MadeChange = true;
ShufTab[i].Cost = ShufTab[NewElt].Cost + 1;
ShufTab[i].Op = &InsOp;
ShufTab[i].Arg0 = NewElt;
ShufTab[i].Arg1 = (LaneIdx >> 1) | 0x4;
}
}
}
}
#endif
}
for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
if (!isValidMask(LHS)) continue;
if (ShufTab[LHS].Cost > 1000) continue;
if (ShufTab[LHS].Cost + 1 >= MaxCost)
continue;
for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
Operator *Op = TheOperators[opnum];
#ifdef GENERATE_NEON_INS
if (Op == &InsOp)
continue;
#endif
unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
if (Cost < ShufTab[ResultMask].Cost) {
ShufTab[ResultMask].Cost = Cost;
ShufTab[ResultMask].Op = Op;
ShufTab[ResultMask].Arg0 = LHS;
ShufTab[ResultMask].Arg1 = LHS;
MadeChange = true;
}
if (Op->isOnlyLHSOperator()) continue;
for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
if (!isValidMask(RHS)) continue;
if (ShufTab[RHS].Cost > 1000) continue;
if (ShufTab[RHS].Cost + 1 >= MaxCost)
continue;
unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
if (ShufTab[ResultMask].Cost <= OpCount ||
ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
continue;
unsigned short Vals[30];
unsigned NumVals = 0;
EvaluateOps(LHS, Vals, NumVals);
EvaluateOps(RHS, Vals, NumVals);
unsigned Cost = NumVals + Op->getCost();
if (Cost < ShufTab[ResultMask].Cost) {
ShufTab[ResultMask].Cost = Cost;
ShufTab[ResultMask].Op = Op;
ShufTab[ResultMask].Arg0 = LHS;
ShufTab[ResultMask].Arg1 = RHS;
MadeChange = true;
}
}
}
}
}
std::cerr << "Finished Table has " << getNumEntered()
<< " entries established.\n";
unsigned CostArray[10] = { 0 };
for (unsigned i = 0; i != 65536; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost > 9)
++CostArray[9];
else
++CostArray[ShufTab[i].Cost];
}
for (unsigned i = 0; i != 9; ++i)
if (CostArray[i])
std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
if (CostArray[9])
std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
for (unsigned i = 0; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
unsigned CostSat = ShufTab[i].Cost;
if (CostSat > 4) CostSat = 4;
if (CostSat == 0) CostSat = 1;
--CostSat;
unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
assert(OpNum < 16 && "Too few bits to encode operation!");
unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
std::cout << " " << std::setw(10) << Val << "U, // ";
PrintMask(i, std::cout);
std::cout << ": Cost " << ShufTab[i].Cost;
std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
std::cout << " ";
if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
std::cout << getZeroCostOpName(ShufTab[i].Arg0);
} else {
PrintMask(ShufTab[i].Arg0, std::cout);
}
if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
std::cout << ", ";
if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
std::cout << getZeroCostOpName(ShufTab[i].Arg1);
} else {
PrintMask(ShufTab[i].Arg1, std::cout);
}
}
#ifdef GENERATE_NEON_INS
else if (ShufTab[i].Op == &InsOp) {
std::cout << ", lane " << ShufTab[i].Arg1;
}
#endif
std::cout << "\n";
}
std::cout << " 0\n};\n";
if (false) {
for (unsigned i = 0; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost < 1000) {
PrintMask(i, std::cerr);
std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
unsigned short Vals[30];
unsigned NumVals = 0;
EvaluateOps(i, Vals, NumVals);
for (unsigned j = 0, e = NumVals; j != e; ++j)
PrintOperation(j, Vals);
std::cerr << "\n";
}
}
}
}
#ifdef GENERATE_ALTIVEC
enum {
OP_COPY = 0, OP_VMRGHW,
OP_VMRGLW,
OP_VSPLTISW0,
OP_VSPLTISW1,
OP_VSPLTISW2,
OP_VSPLTISW3,
OP_VSLDOI4,
OP_VSLDOI8,
OP_VSLDOI12
};
struct vmrghw : public Operator {
vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
} the_vmrghw;
struct vmrglw : public Operator {
vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
} the_vmrglw;
template<unsigned Elt>
struct vspltisw : public Operator {
vspltisw(const char *N, unsigned Opc)
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
};
vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
template<unsigned N>
struct vsldoi : public Operator {
vsldoi(const char *Name, unsigned Opc)
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
}
};
vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
#endif
#ifdef GENERATE_NEON
enum {
OP_COPY = 0, OP_VREV,
OP_VDUP0,
OP_VDUP1,
OP_VDUP2,
OP_VDUP3,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, OP_VUZPR, OP_VZIPL, OP_VZIPR, OP_VTRNL, OP_VTRNR };
struct vrev : public Operator {
vrev() : Operator(0x1032, "vrev", OP_VREV) {}
} the_vrev;
template<unsigned Elt>
struct vdup : public Operator {
vdup(const char *N, unsigned Opc)
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
};
vdup<0> the_vdup0("vdup0", OP_VDUP0);
vdup<1> the_vdup1("vdup1", OP_VDUP1);
vdup<2> the_vdup2("vdup2", OP_VDUP2);
vdup<3> the_vdup3("vdup3", OP_VDUP3);
template<unsigned N>
struct vext : public Operator {
vext(const char *Name, unsigned Opc)
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
}
};
vext<1> the_vext1("vext1", OP_VEXT1);
vext<2> the_vext2("vext2", OP_VEXT2);
vext<3> the_vext3("vext3", OP_VEXT3);
struct vuzpl : public Operator {
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 1) {}
} the_vuzpl;
struct vuzpr : public Operator {
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 1) {}
} the_vuzpr;
struct vzipl : public Operator {
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 1) {}
} the_vzipl;
struct vzipr : public Operator {
vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 1) {}
} the_vzipr;
struct vtrnl : public Operator {
vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 1) {}
} the_vtrnl;
struct vtrnr : public Operator {
vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 1) {}
} the_vtrnr;
#endif