Skip to content

Commit 3366f6a

Browse files
Remove unnecessary delayfree from xarch FMA and TERNLOG instructions (#128350)
These instructions are fully reorderable and so much like various commutative nodes do not need to be marked delay free except in special scenarios. This resolves #62215
1 parent 74b0096 commit 3366f6a

2 files changed

Lines changed: 131 additions & 81 deletions

File tree

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,6 +1571,44 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
15711571
#endif // DEBUG
15721572
}
15731573
}
1574+
else if (node->GetHWIntrinsicId() == NI_AVX512_TernaryLogic)
1575+
{
1576+
// These are the control bytes used for TernaryLogic
1577+
1578+
const uint8_t A = 0xF0;
1579+
const uint8_t B = 0xCC;
1580+
const uint8_t C = 0xAA;
1581+
1582+
uint8_t control = static_cast<uint8_t>(ival);
1583+
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
1584+
TernaryLogicUseFlags useFlags = info.GetAllUseFlags();
1585+
1586+
if (useFlags == TernaryLogicUseFlags::ABC)
1587+
{
1588+
// We're using all the operands and can potentially have any
1589+
// operand overlap with the target register. So we need to
1590+
// detect those cases and adjust the control byte accordingly.
1591+
1592+
if (targetReg == op2Reg)
1593+
{
1594+
std::swap(op1, op2);
1595+
std::swap(op1Reg, op2Reg);
1596+
1597+
control = TernaryLogicInfo::GetTernaryControlByte(info, B, A, C);
1598+
ival = static_cast<int8_t>(control);
1599+
}
1600+
else if ((targetReg == op3->GetRegNum()) && !op3->isUsedFromSpillTemp())
1601+
{
1602+
assert(!op3->isContained());
1603+
1604+
std::swap(op1, op3);
1605+
op1Reg = op1->GetRegNum();
1606+
1607+
control = TernaryLogicInfo::GetTernaryControlByte(info, C, B, A);
1608+
ival = static_cast<int8_t>(control);
1609+
}
1610+
}
1611+
}
15741612

15751613
assert(targetReg != REG_NA);
15761614
assert(op1Reg != REG_NA);

src/coreclr/jit/lsraxarch.cpp

Lines changed: 93 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -2520,99 +2520,36 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
25202520
case NI_AVX512_FusedMultiplySubtractNegated:
25212521
case NI_AVX512_FusedMultiplySubtractNegatedScalar:
25222522
{
2523+
// While this operation is RMW, it is also almost freely reorderable
2524+
// and so we do not need to set the operands as delay free unless
2525+
// we are copying the upper bits and therefore op1 must be the target
2526+
25232527
assert((numArgs == 3) || (intrinsicTree->OperIsEmbRoundingEnabled()));
25242528
assert(isRMW);
25252529
assert(HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));
25262530

25272531
const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
25282532

2529-
LIR::Use use;
2530-
GenTree* user = nullptr;
2531-
2532-
if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
2533-
{
2534-
user = use.User();
2535-
}
2536-
unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);
2537-
2538-
unsigned containedOpNum = 0;
2539-
2540-
// containedOpNum remains 0 when no operand is contained or regOptional
2541-
if (op1->isContained() || op1->IsRegOptional())
2542-
{
2543-
containedOpNum = 1;
2544-
}
2545-
else if (op2->isContained() || op2->IsRegOptional())
2546-
{
2547-
containedOpNum = 2;
2548-
}
2549-
else if (op3->isContained() || op3->IsRegOptional())
2550-
{
2551-
containedOpNum = 3;
2552-
}
2553-
25542533
GenTree* emitOp1 = op1;
25552534
GenTree* emitOp2 = op2;
25562535
GenTree* emitOp3 = op3;
25572536

2558-
// Intrinsics with CopyUpperBits semantics must have op1 as target
2559-
assert(containedOpNum != 1 || !copiesUpperBits);
2560-
2561-
// We need to keep this in sync with hwintrinsiccodegenxarch.cpp
2562-
// Ideally we'd actually swap the operands here and simplify codegen
2563-
// but its a bit more complicated to do so for many operands as well
2564-
// as being complicated to tell codegen how to pick the right instruction
2565-
2566-
if (containedOpNum == 1)
2537+
if (op1->isContained() || op1->IsRegOptional())
25672538
{
2568-
// https://github.com/dotnet/runtime/issues/62215
2569-
// resultOpNum might change between lowering and lsra, comment out assertion for now.
2570-
// assert(containedOpNum != resultOpNum);
2571-
// resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3
2539+
assert(!copiesUpperBits);
25722540
std::swap(emitOp1, emitOp3);
2573-
2574-
if (resultOpNum == 2)
2575-
{
2576-
// op2 = ([op1] * op2) + op3
2577-
std::swap(emitOp1, emitOp2);
2578-
}
2579-
}
2580-
else if (containedOpNum == 3)
2581-
{
2582-
// assert(containedOpNum != resultOpNum);
2583-
if (resultOpNum == 2 && !copiesUpperBits)
2584-
{
2585-
// op2 = (op1 * op2) + [op3]
2586-
std::swap(emitOp1, emitOp2);
2587-
}
2588-
// else: op1/? = (op1 * op2) + [op3]
25892541
}
2590-
else if (containedOpNum == 2)
2542+
else if (op2->isContained() || op2->IsRegOptional())
25912543
{
2592-
// assert(containedOpNum != resultOpNum);
2593-
2594-
// op1/? = (op1 * [op2]) + op3
25952544
std::swap(emitOp2, emitOp3);
2596-
if (resultOpNum == 3 && !copiesUpperBits)
2597-
{
2598-
// op3 = (op1 * [op2]) + op3
2599-
std::swap(emitOp1, emitOp2);
2600-
}
2601-
}
2602-
else
2603-
{
2604-
// containedOpNum == 0
2605-
// no extra work when resultOpNum is 0 or 1
2606-
if (resultOpNum == 2)
2607-
{
2608-
std::swap(emitOp1, emitOp2);
2609-
}
2610-
else if (resultOpNum == 3)
2611-
{
2612-
std::swap(emitOp1, emitOp3);
2613-
}
26142545
}
26152546

2547+
// We don't have different intrinsic IDs for the 3 instruction forms and so,
2548+
// unlike other intrinsics, the operand order and the emit order are not
2549+
// consistent and have not been adjusted by lowering. Because of this, we
2550+
// need to track what the expected emit order is to know how to build the use
2551+
// but still need build the users in the order they exist in LIR.
2552+
26162553
GenTree* ops[] = {op1, op2, op3};
26172554
for (GenTree* op : ops)
26182555
{
@@ -2623,14 +2560,33 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
26232560
}
26242561
else if (op == emitOp2)
26252562
{
2626-
srcCount += BuildDelayFreeUses(op, emitOp1);
2563+
if (copiesUpperBits)
2564+
{
2565+
srcCount += BuildDelayFreeUses(op, emitOp1);
2566+
}
2567+
else
2568+
{
2569+
tgtPrefUse2 = BuildUse(op);
2570+
srcCount++;
2571+
}
26272572
}
26282573
else if (op == emitOp3)
26292574
{
2630-
SingleTypeRegSet apxAwareRegCandidates =
2631-
ForceLowGprForApxIfNeeded(op, RBM_NONE, canHWIntrinsicUseApxRegs);
2632-
srcCount += op->isContained() ? BuildOperandUses(op, apxAwareRegCandidates)
2633-
: BuildDelayFreeUses(op, emitOp1);
2575+
if (op->isContained())
2576+
{
2577+
SingleTypeRegSet apxAwareRegCandidates =
2578+
ForceLowGprForApxIfNeeded(op, RBM_NONE, canHWIntrinsicUseApxRegs);
2579+
srcCount += BuildOperandUses(op, apxAwareRegCandidates);
2580+
}
2581+
else if (copiesUpperBits)
2582+
{
2583+
srcCount += BuildDelayFreeUses(op, emitOp1);
2584+
}
2585+
else
2586+
{
2587+
tgtPrefUse3 = BuildUse(op);
2588+
srcCount++;
2589+
}
26342590
}
26352591
}
26362592

@@ -2748,6 +2704,62 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
27482704
break;
27492705
}
27502706

2707+
case NI_AVX512_TernaryLogic:
2708+
{
2709+
// While this operation can be RMW when all operands are used, it
2710+
// is also almost freely reorderable and so we do not need to set
2711+
// the operands as delay free unless the control byte is unknown.
2712+
2713+
assert(numArgs == 4);
2714+
2715+
if (!op4->isContainedIntOrIImmed())
2716+
{
2717+
break;
2718+
}
2719+
2720+
// We have a bit of a special consideration where not all operands may be "used"
2721+
// and where the leading operands can be contained vector constants that will be
2722+
// ignored in codegen. Only op3 can be the "true" contained operand from memory.
2723+
2724+
if (op1->isContained())
2725+
{
2726+
srcCount += BuildOperandUses(op1);
2727+
}
2728+
else
2729+
{
2730+
tgtPrefUse = BuildUse(op1);
2731+
srcCount++;
2732+
}
2733+
2734+
if (op2->isContained())
2735+
{
2736+
srcCount += BuildOperandUses(op2);
2737+
}
2738+
else
2739+
{
2740+
tgtPrefUse2 = BuildUse(op2);
2741+
srcCount++;
2742+
}
2743+
2744+
if (op3->isContained())
2745+
{
2746+
SingleTypeRegSet apxAwareRegCandidates =
2747+
ForceLowGprForApxIfNeeded(op3, RBM_NONE, canHWIntrinsicUseApxRegs);
2748+
srcCount += BuildOperandUses(op3, apxAwareRegCandidates);
2749+
}
2750+
else
2751+
{
2752+
tgtPrefUse3 = BuildUse(op3);
2753+
srcCount++;
2754+
}
2755+
2756+
assert(op4->isContained());
2757+
srcCount += BuildOperandUses(op4);
2758+
2759+
buildUses = false;
2760+
break;
2761+
}
2762+
27512763
case NI_AVXVNNI_MultiplyWideningAndAdd:
27522764
case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
27532765
case NI_AVXVNNIINT_MultiplyWideningAndAdd:

0 commit comments

Comments
 (0)