@@ -2520,99 +2520,36 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
25202520 case NI_AVX512_FusedMultiplySubtractNegated:
25212521 case NI_AVX512_FusedMultiplySubtractNegatedScalar:
25222522 {
2523+ // While this operation is RMW, it is also almost freely reorderable
2524+ // and so we do not need to set the operands as delay free unless
2525+ // we are copying the upper bits and therefore op1 must be the target
2526+
25232527 assert ((numArgs == 3 ) || (intrinsicTree->OperIsEmbRoundingEnabled ()));
25242528 assert (isRMW);
25252529 assert (HWIntrinsicInfo::IsFmaIntrinsic (intrinsicId));
25262530
25272531 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits (intrinsicId);
25282532
2529- LIR::Use use;
2530- GenTree* user = nullptr ;
2531-
2532- if (LIR::AsRange (blockSequence[curBBSeqNum]).TryGetUse (intrinsicTree, &use))
2533- {
2534- user = use.User ();
2535- }
2536- unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic (user, op1, op2, op3);
2537-
2538- unsigned containedOpNum = 0 ;
2539-
2540- // containedOpNum remains 0 when no operand is contained or regOptional
2541- if (op1->isContained () || op1->IsRegOptional ())
2542- {
2543- containedOpNum = 1 ;
2544- }
2545- else if (op2->isContained () || op2->IsRegOptional ())
2546- {
2547- containedOpNum = 2 ;
2548- }
2549- else if (op3->isContained () || op3->IsRegOptional ())
2550- {
2551- containedOpNum = 3 ;
2552- }
2553-
25542533 GenTree* emitOp1 = op1;
25552534 GenTree* emitOp2 = op2;
25562535 GenTree* emitOp3 = op3;
25572536
2558- // Intrinsics with CopyUpperBits semantics must have op1 as target
2559- assert (containedOpNum != 1 || !copiesUpperBits);
2560-
2561- // We need to keep this in sync with hwintrinsiccodegenxarch.cpp
2562- // Ideally we'd actually swap the operands here and simplify codegen
2563- // but its a bit more complicated to do so for many operands as well
2564- // as being complicated to tell codegen how to pick the right instruction
2565-
2566- if (containedOpNum == 1 )
2537+ if (op1->isContained () || op1->IsRegOptional ())
25672538 {
2568- // https://github.com/dotnet/runtime/issues/62215
2569- // resultOpNum might change between lowering and lsra, comment out assertion for now.
2570- // assert(containedOpNum != resultOpNum);
2571- // resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3
2539+ assert (!copiesUpperBits);
25722540 std::swap (emitOp1, emitOp3);
2573-
2574- if (resultOpNum == 2 )
2575- {
2576- // op2 = ([op1] * op2) + op3
2577- std::swap (emitOp1, emitOp2);
2578- }
2579- }
2580- else if (containedOpNum == 3 )
2581- {
2582- // assert(containedOpNum != resultOpNum);
2583- if (resultOpNum == 2 && !copiesUpperBits)
2584- {
2585- // op2 = (op1 * op2) + [op3]
2586- std::swap (emitOp1, emitOp2);
2587- }
2588- // else: op1/? = (op1 * op2) + [op3]
25892541 }
2590- else if (containedOpNum == 2 )
2542+ else if (op2-> isContained () || op2-> IsRegOptional () )
25912543 {
2592- // assert(containedOpNum != resultOpNum);
2593-
2594- // op1/? = (op1 * [op2]) + op3
25952544 std::swap (emitOp2, emitOp3);
2596- if (resultOpNum == 3 && !copiesUpperBits)
2597- {
2598- // op3 = (op1 * [op2]) + op3
2599- std::swap (emitOp1, emitOp2);
2600- }
2601- }
2602- else
2603- {
2604- // containedOpNum == 0
2605- // no extra work when resultOpNum is 0 or 1
2606- if (resultOpNum == 2 )
2607- {
2608- std::swap (emitOp1, emitOp2);
2609- }
2610- else if (resultOpNum == 3 )
2611- {
2612- std::swap (emitOp1, emitOp3);
2613- }
26142545 }
26152546
2547+ // We don't have different intrinsic IDs for the 3 instruction forms and so,
2548+ // unlike other intrinsics, the operand order and the emit order are not
2549+ // consistent and have not been adjusted by lowering. Because of this, we
2550+ // need to track what the expected emit order is to know how to build the use
2551+ // but still need build the users in the order they exist in LIR.
2552+
26162553 GenTree* ops[] = {op1, op2, op3};
26172554 for (GenTree* op : ops)
26182555 {
@@ -2623,14 +2560,33 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
26232560 }
26242561 else if (op == emitOp2)
26252562 {
2626- srcCount += BuildDelayFreeUses (op, emitOp1);
2563+ if (copiesUpperBits)
2564+ {
2565+ srcCount += BuildDelayFreeUses (op, emitOp1);
2566+ }
2567+ else
2568+ {
2569+ tgtPrefUse2 = BuildUse (op);
2570+ srcCount++;
2571+ }
26272572 }
26282573 else if (op == emitOp3)
26292574 {
2630- SingleTypeRegSet apxAwareRegCandidates =
2631- ForceLowGprForApxIfNeeded (op, RBM_NONE, canHWIntrinsicUseApxRegs);
2632- srcCount += op->isContained () ? BuildOperandUses (op, apxAwareRegCandidates)
2633- : BuildDelayFreeUses (op, emitOp1);
2575+ if (op->isContained ())
2576+ {
2577+ SingleTypeRegSet apxAwareRegCandidates =
2578+ ForceLowGprForApxIfNeeded (op, RBM_NONE, canHWIntrinsicUseApxRegs);
2579+ srcCount += BuildOperandUses (op, apxAwareRegCandidates);
2580+ }
2581+ else if (copiesUpperBits)
2582+ {
2583+ srcCount += BuildDelayFreeUses (op, emitOp1);
2584+ }
2585+ else
2586+ {
2587+ tgtPrefUse3 = BuildUse (op);
2588+ srcCount++;
2589+ }
26342590 }
26352591 }
26362592
@@ -2748,6 +2704,62 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
27482704 break ;
27492705 }
27502706
2707+ case NI_AVX512_TernaryLogic:
2708+ {
2709+ // While this operation can be RMW when all operands are used, it
2710+ // is also almost freely reorderable and so we do not need to set
2711+ // the operands as delay free unless the control byte is unknown.
2712+
2713+ assert (numArgs == 4 );
2714+
2715+ if (!op4->isContainedIntOrIImmed ())
2716+ {
2717+ break ;
2718+ }
2719+
2720+ // We have a bit of a special consideration where not all operands may be "used"
2721+ // and where the leading operands can be contained vector constants that will be
2722+ // ignored in codegen. Only op3 can be the "true" contained operand from memory.
2723+
2724+ if (op1->isContained ())
2725+ {
2726+ srcCount += BuildOperandUses (op1);
2727+ }
2728+ else
2729+ {
2730+ tgtPrefUse = BuildUse (op1);
2731+ srcCount++;
2732+ }
2733+
2734+ if (op2->isContained ())
2735+ {
2736+ srcCount += BuildOperandUses (op2);
2737+ }
2738+ else
2739+ {
2740+ tgtPrefUse2 = BuildUse (op2);
2741+ srcCount++;
2742+ }
2743+
2744+ if (op3->isContained ())
2745+ {
2746+ SingleTypeRegSet apxAwareRegCandidates =
2747+ ForceLowGprForApxIfNeeded (op3, RBM_NONE, canHWIntrinsicUseApxRegs);
2748+ srcCount += BuildOperandUses (op3, apxAwareRegCandidates);
2749+ }
2750+ else
2751+ {
2752+ tgtPrefUse3 = BuildUse (op3);
2753+ srcCount++;
2754+ }
2755+
2756+ assert (op4->isContained ());
2757+ srcCount += BuildOperandUses (op4);
2758+
2759+ buildUses = false ;
2760+ break ;
2761+ }
2762+
27512763 case NI_AVXVNNI_MultiplyWideningAndAdd:
27522764 case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
27532765 case NI_AVXVNNIINT_MultiplyWideningAndAdd:
0 commit comments