Skip to content

Commit d1163e5

Browse files
saucecontroltannergoodingCopilot
authored
JIT: Accelerate floating->long casts on x86 (#125180)
This adds floating->long/ulong cast codegen for AVX-512 and AVX10.2 on x86. With this, all non-overflow casts are now hardware accelerated.

This is the last bit pulled from #116805.

Typical Diff (double->long AVX-512):

```diff
-       sub      esp, 8
-       vzeroupper
-       vmovsd   xmm0, qword ptr [esp+0x0C]
-       sub      esp, 8
-       ; npt arg push 0
-       ; npt arg push 1
-       vmovsd   qword ptr [esp], xmm0
-       call     CORINFO_HELP_DBL2LNG
-       ; gcr arg pop 2
+       vmovsd   xmm0, qword ptr [esp+0x04]
+       vcmpordsd k1, xmm0, xmm0
+       vcmpge_oqsd k2, xmm0, qword ptr [@RWD00]
+       vcvttpd2qq xmm0 {k1}{z}, xmm0
+       vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
+       vmovd    eax, xmm0
+       vpextrd  edx, xmm0, 1
-       add      esp, 8
        ret      8

+RWD00  dq 43E0000000000000h
+RWD08  dq 7FFFFFFFFFFFFFFFh

-; Total bytes of code 31
+; Total bytes of code 53
```

Full [Diffs](https://dev.azure.com/dnceng-public/public/_build/results?buildId=1391699&view=ms.vss-build-web.run-extensions-tab)

Breakdown of the double->long asm:

```asm
; load the scalar double
vmovsd xmm0, qword ptr [esp+0x04]

; set the low bit of k1 if the scalar value is not NaN
vcmpordsd k1, xmm0, xmm0

; set the low bit of k2 if the input was greater than or equal to 2^63 (nearest double greater than long.MaxValue)
vcmpge_oqsd k2, xmm0, qword ptr [@RWD00]

; convert, using k1 mask bit. if the mask bit is not set (meaning we have a NaN), set the value to zero
vcvttpd2qq xmm0 {k1}{z}, xmm0

; if the low bit of k2 is set (meaning overflow), set the value to long.MaxValue, otherwise take the conversion result
vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}

; extract the two 32-bit halves of the long result
vmovd eax, xmm0
vpextrd edx, xmm0, 1
```

---------

Co-authored-by: Tanner Gooding <tagoo@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 6499da1 commit d1163e5

6 files changed

Lines changed: 238 additions & 43 deletions

File tree

src/coreclr/jit/decomposelongs.cpp

Lines changed: 146 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -587,40 +587,172 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
587587
}
588588

589589
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
590-
if (varTypeIsFloating(dstType))
590+
if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
591591
{
592592
// We will reach this path only if morph did not convert the cast to a helper call,
593593
// meaning we can perform the cast using SIMD instructions.
594-
// The sequence this creates is simply:
595-
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
596-
597-
NamedIntrinsic intrinsicId = NI_Illegal;
598-
GenTree* srcOp = cast->CastOp();
599594

600595
assert(!cast->gtOverflow());
601596
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
602597

603-
intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
598+
GenTree* srcOp = cast->CastOp();
599+
GenTree* castResult = nullptr;
600+
LIR::Range castRange = LIR::EmptyRange();
604601

605-
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
606-
GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
607-
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
602+
// This creates the equivalent of the following C# code:
603+
// var srcVec = Vector128.CreateScalarUnsafe(castOp);
608604

609-
Range().InsertAfter(cast, createScalar, convert, toScalar);
610-
Range().Remove(cast);
605+
GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
606+
castRange.InsertAtEnd(srcVector);
611607

612-
if (createScalar->IsCnsVec())
608+
if (srcVector->IsCnsVec())
613609
{
614610
Range().Remove(srcOp);
615611
}
616612

613+
if (varTypeIsFloating(dstType))
614+
{
615+
// long->floating casts don't require any kind of fixup. We simply use the vector
616+
// form of the instructions, because the scalar form is not supported on 32-bit.
617+
618+
NamedIntrinsic intrinsicId =
619+
(dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
620+
621+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
622+
}
623+
else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
624+
{
625+
// Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
626+
// but we have to use the vector form.
627+
628+
NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
629+
? NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation
630+
: NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation;
631+
632+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
633+
}
634+
else if (dstType == TYP_ULONG)
635+
{
636+
// AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
637+
// we only need to fix up negative or NaN values before conversion.
638+
//
639+
// maxs[sd] will take the value from the second operand if the first operand's value is
640+
// NaN, which allows us to fix up both negative and NaN values with a single instruction.
641+
//
642+
// This creates the equivalent of the following C# code:
643+
// var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
644+
// castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
645+
646+
GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
647+
GenTree* fixupVal =
648+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16);
649+
650+
castRange.InsertAtEnd(zero);
651+
castRange.InsertAtEnd(fixupVal);
652+
653+
castResult =
654+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
655+
NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16);
656+
}
657+
else
658+
{
659+
assert(dstType == TYP_LONG);
660+
661+
// We will use the input value multiple times, so we replace it with a lclVar.
662+
LIR::Use srcUse;
663+
LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
664+
srcUse.ReplaceWithLclVar(m_compiler);
665+
srcVector = srcUse.Def();
666+
667+
// This logic is similar to the floating->long saturating logic in Lowering::LowerCast,
668+
// except that here we must keep everything in SIMD registers. We can also take advantage
669+
// of EVEX masking since the conversion itself requires AVX-512.
670+
//
671+
// We fix up NaN values by masking in zero during conversion. Negative saturation is handled
672+
// correctly by the conversion instructions. Positive saturation is handled after conversion,
673+
// because MaxValue is not precisely representable in the floating format.
674+
//
675+
// This creates roughly the equivalent of the following C# code:
676+
// var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
677+
//
678+
// var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
679+
// var ovfFloatingValue = Vector128.Create(9223372036854775808.0);
680+
// var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode);
681+
682+
GenTree* srcClone = m_compiler->gtClone(srcVector);
683+
GenTree* compareMode =
684+
m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
685+
GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
686+
NI_AVX512_CompareScalarMask, srcType, 16);
687+
688+
castRange.InsertAtEnd(srcClone);
689+
castRange.InsertAtEnd(compareMode);
690+
castRange.InsertAtEnd(nanMask);
691+
692+
compareMode = m_compiler->gtNewIconNode(
693+
static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
694+
695+
GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
696+
ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63
697+
698+
srcClone = m_compiler->gtClone(srcVector);
699+
GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode,
700+
NI_AVX512_CompareScalarMask, srcType, 16);
701+
702+
castRange.InsertAtEnd(srcClone);
703+
castRange.InsertAtEnd(ovfFloatingValue);
704+
castRange.InsertAtEnd(compareMode);
705+
castRange.InsertAtEnd(ovfMask);
706+
707+
// Now we convert, using the masks created above for NaN and positive overflow saturation.
708+
//
709+
// This creates roughly the equivalent of the following C# code:
710+
// var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
711+
// var convertMasked = Avx512F.VL.BlendVariable(Vector128<long>.Zero, convert, nanMask);
712+
//
713+
// var maxLong = Vector128.Create(long.MaxValue);
714+
// castResult = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask);
715+
716+
GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
717+
718+
srcClone = m_compiler->gtClone(srcVector);
719+
GenTree* convert =
720+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
721+
NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16);
722+
723+
castRange.InsertAtEnd(zero);
724+
castRange.InsertAtEnd(srcClone);
725+
castRange.InsertAtEnd(convert);
726+
727+
GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask,
728+
NI_AVX512_BlendVariableMask, dstType, 16);
729+
730+
GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16);
731+
maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX);
732+
733+
castRange.InsertAtEnd(convertMasked);
734+
castRange.InsertAtEnd(maxLong);
735+
736+
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask,
737+
NI_AVX512_BlendVariableMask, dstType, 16);
738+
}
739+
740+
// Because the results are in a SIMD register, we need to ToScalar() them out.
741+
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16);
742+
743+
castRange.InsertAtEnd(castResult);
744+
castRange.InsertAtEnd(toScalar);
745+
746+
Range().InsertAfter(cast, std::move(castRange));
747+
Range().Remove(cast);
748+
617749
if (use.IsDummyUse())
618750
{
619751
toScalar->SetUnusedValue();
620752
}
621753
use.ReplaceWith(toScalar);
622754

623-
return toScalar->gtNext;
755+
return toScalar;
624756
}
625757
#endif // FEATURE_HW_INTRINSICS && TARGET_X86
626758

src/coreclr/jit/flowgraph.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o
13391339
}
13401340

13411341
#if defined(TARGET_X86) || defined(TARGET_ARM)
1342-
if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
1343-
{
1344-
return true;
1345-
}
1346-
1347-
if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
1342+
if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) ||
1343+
(varTypeIsFloating(fromType) && varTypeIsLong(toType)))
13481344
{
13491345
#if defined(TARGET_X86)
13501346
return !compOpportunisticallyDependsOn(InstructionSet_AVX512);

src/coreclr/jit/gentree.cpp

Lines changed: 86 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6495,7 +6495,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
64956495
{
64966496
var_types dstType = tree->AsCast()->CastToType();
64976497

6498-
if (varTypeIsLong(dstType))
6498+
if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
6499+
{
6500+
#if defined(TARGET_X86)
6501+
if (varTypeIsLong(dstType))
6502+
{
6503+
// unsigned: vcvttp*2uqqs xmm0, xmm0
6504+
// vmovq [mem], xmm0
6505+
//
6506+
// signed: vcvttp*2qqs xmm0, xmm0
6507+
// vmovq [mem], xmm0
6508+
6509+
costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX
6510+
costSz = 6 + 6; // 12
6511+
6512+
if (op1Type == TYP_FLOAT)
6513+
{
6514+
// vector widening float->long instructions take 1 extra cycle
6515+
// compared to same-size conversion
6516+
costEx += 1;
6517+
}
6518+
}
6519+
else
6520+
#endif
6521+
{
6522+
// unsigned: vcvtts*2usis eax, xmm0
6523+
// signed: vcvtts*2sis eax, xmm0
6524+
6525+
costEx = 7;
6526+
costSz = 6;
6527+
}
6528+
}
6529+
else if (varTypeIsLong(dstType))
64996530
{
65006531
#if defined(TARGET_AMD64)
65016532
if (varTypeIsUnsigned(dstType))
@@ -6543,24 +6574,59 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
65436574
costSz = 5 + 4 + 10 + 5 + 8 + 4; // 36
65446575
}
65456576
#else
6546-
// unsigned: ...
6547-
// call CORINFO_HELP_DBL2ULNG
6548-
//
6549-
// signed: ...
6550-
// call CORINFO_HELP_DBL2ULNG
6551-
6552-
costEx = 5 + (3 * IND_COST_EX); // CALL
6553-
costSz = 5; // 5
6577+
if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
6578+
{
6579+
if (varTypeIsUnsigned(dstType))
6580+
{
6581+
// vxorps xmm1, xmm1, xmm1
6582+
// vmaxs* xmm0, xmm0, xmm1
6583+
// vcvttp*2uqq xmm0, xmm0
6584+
// vmovq [mem], xmm0
65546585

6555-
level++;
6586+
costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX
6587+
costSz = 4 + 4 + 6 + 6; // 20
6588+
}
6589+
else
6590+
{
6591+
// vcmpords* k1, xmm0, xmm0
6592+
// vcmpge_oqs* k2, xmm0, qword ptr [@RWD00]
6593+
// vcvttp*2qq xmm0 {k1}{z}, xmm0
6594+
// vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
6595+
// vmovq [mem], xmm0
6596+
6597+
costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) +
6598+
FLT_IND_COST_EX; // 13 + (3 * FLT_IND_COST_EX)
6599+
costSz = 7 + 11 + 6 + 10 + 6; // 40
6600+
}
65566601

6557-
if (op1Type == TYP_FLOAT)
6602+
if (op1Type == TYP_FLOAT)
6603+
{
6604+
// vector widening float->long instructions take 1 extra cycle
6605+
// compared to same-size conversion
6606+
costEx += 1;
6607+
}
6608+
}
6609+
else
65586610
{
6559-
// vcvtss2sd xmm0, xmm0, xmm0
6560-
// ...
6611+
// unsigned: ...
6612+
// call CORINFO_HELP_DBL2ULNG
6613+
//
6614+
// signed: ...
6615+
// call CORINFO_HELP_DBL2LNG
6616+
6617+
costEx = 5 + (3 * IND_COST_EX); // CALL
6618+
costSz = 5; // 5
6619+
6620+
level++;
65616621

6562-
costEx += 4; // 4 + CALL
6563-
costSz += 4; // 9
6622+
if (op1Type == TYP_FLOAT)
6623+
{
6624+
// vcvtss2sd xmm0, xmm0, xmm0
6625+
// ...
6626+
6627+
costEx += 4; // 4 + CALL
6628+
costSz += 4; // 9
6629+
}
65646630
}
65656631
#endif
65666632
}
@@ -34999,8 +35065,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3499935065
break;
3500035066
}
3500135067

35002-
bool maskIsZero = false;
35003-
bool maskIsAllOnes = false;
35068+
bool maskIsZero = false;
35069+
bool maskIsAllBitsSet = false;
3500435070

3500535071
if (op3->IsCnsMsk())
3500635072
{
@@ -35011,7 +35077,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3501135077
GenTreeMskCon* mask = op3->AsMskCon();
3501235078
uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
3501335079

35014-
maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
35080+
maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
3501535081
}
3501635082
}
3501735083
else
@@ -35022,11 +35088,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3502235088

3502335089
if (!maskIsZero)
3502435090
{
35025-
maskIsAllOnes = op3->IsVectorAllBitsSet();
35091+
maskIsAllBitsSet = op3->IsVectorAllBitsSet();
3502635092
}
3502735093
}
3502835094

35029-
if (maskIsAllOnes)
35095+
if (maskIsAllBitsSet)
3503035096
{
3503135097
if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
3503235098
{

src/coreclr/jit/hwintrinsiclistxarch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,6 +1252,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask,
12521252
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
12531253
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
12541254
HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
1255+
HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, -1, 4, HW_Category_IMM, HW_Flag_ReturnsPerElementMask)
12551256
HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
12561257
HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, 3, 3, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg)
12571258
HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, -1, -1, HW_Category_MemoryStore, HW_Flag_NoFlag)

0 commit comments

Comments
 (0)