JIT: Accelerate floating->long casts on x86 (#125180)

saucecontrol · tannergooding · Copilot · web-flow · commit d1163e5a8f3f · 2026-04-29T13:06:58.000Z
This adds floating->long/ulong cast codegen for AVX-512 and AVX10.2 on x86. With this, all non-overflow casts are now hardware accelerated.

This is the last bit pulled from #116805.

Typical Diff (double->long AVX-512):

```diff
- sub esp, 8
- vzeroupper
- vmovsd xmm0, qword ptr [esp+0x0C]
- sub esp, 8
- ; npt arg push 0
- ; npt arg push 1
- vmovsd qword ptr [esp], xmm0
- call CORINFO_HELP_DBL2LNG
- ; gcr arg pop 2
+ vmovsd xmm0, qword ptr [esp+0x04]
+ vcmpordsd k1, xmm0, xmm0
+ vcmpge_oqsd k2, xmm0, qword ptr [@RWD00]
+ vcvttpd2qq xmm0 {k1}{z}, xmm0
+ vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
+ vmovd eax, xmm0
+ vpextrd edx, xmm0, 1
- add esp, 8
  ret 8

+RWD00 dq 43E0000000000000h
+RWD08 dq 7FFFFFFFFFFFFFFFh

-; Total bytes of code 31
+; Total bytes of code 53
```

Full [Diffs](https://dev.azure.com/dnceng-public/public/_build/results?buildId=1391699&view=ms.vss-build-web.run-extensions-tab)

Breakdown of the double->long asm:

```asm
; load the scalar double
vmovsd xmm0, qword ptr [esp+0x04]

; set the low bit of k1 if the scalar value is not NaN
vcmpordsd k1, xmm0, xmm0

; set the low bit of k2 if the input was greater than or equal to 2^63 (nearest double greater than long.MaxValue)
vcmpge_oqsd k2, xmm0, qword ptr [@RWD00]

; convert, using k1 mask bit. if the mask bit is not set (meaning we have a NaN), set the value to zero
vcvttpd2qq xmm0 {k1}{z}, xmm0

; if the low bit of k2 is set (meaning overflow), set the value to long.MaxValue, otherwise take the conversion result
vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}

; extract the two 32-bit halves of the long result
vmovd eax, xmm0
vpextrd edx, xmm0, 1
```

---------

Co-authored-by: Tanner Gooding <tagoo@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
@@ -587,40 +587,172 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        //    AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree*       srcOp       = cast->CastOp();
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*   srcOp      = cast->CastOp();
+        GenTree*   castResult = nullptr;
+        LIR::Range castRange  = LIR::EmptyRange();
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
-        GenTree* convert  = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
+        castRange.InsertAtEnd(srcVector);
 
-        if (createScalar->IsCnsVec())
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // This logic is similar to the floating->long saturating logic in Lowering::LowerCast,
+            // except that here we must keep everything in SIMD registers. We can also take advantage
+            // of EVEX masking since the conversion itself requires AVX-512.
+            //
+            // We fix up NaN values by masking in zero during conversion. Negative saturation is handled
+            // correctly by the conversion instructions. Positive saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
+            //
+            //   var compareMode      = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var ovfFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var ovfMask          = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode);
+
+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* compareMode =
+                m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
+            GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(nanMask);
+
+            compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+
+            GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+            ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63
+
+            srcClone         = m_compiler->gtClone(srcVector);
+            GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(ovfFloatingValue);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(ovfMask);
+
+            // Now we convert, using the masks created above for NaN and positive overflow saturation.
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var convert       = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
+            //   var convertMasked = Avx512F.VL.BlendVariable(Vector128<long>.Zero, convert, nanMask);
+            //
+            //   var maxLong       = Vector128.Create(long.MaxValue);
+            //   castResult        = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask);
+
+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+
+            srcClone = m_compiler->gtClone(srcVector);
+            GenTree* convert =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(convert);
+
+            GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask,
+                                                                          NI_AVX512_BlendVariableMask, dstType, 16);
+
+            GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16);
+            maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX);
+
+            castRange.InsertAtEnd(convertMasked);
+            castRange.InsertAtEnd(maxLong);
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask,
+                                                              NI_AVX512_BlendVariableMask, dstType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
         use.ReplaceWith(toScalar);
 
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
 
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
@@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o
     }
 
 #if defined(TARGET_X86) || defined(TARGET_ARM)
-    if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
-    {
-        return true;
-    }
-
-    if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
+    if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) ||
+        (varTypeIsFloating(fromType) && varTypeIsLong(toType)))
     {
 #if defined(TARGET_X86)
         return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
@@ -6495,7 +6495,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                     {
                         var_types dstType = tree->AsCast()->CastToType();
 
-                        if (varTypeIsLong(dstType))
+                        if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+                        {
+#if defined(TARGET_X86)
+                            if (varTypeIsLong(dstType))
+                            {
+                                // unsigned: vcvttp*2uqqs xmm0, xmm0
+                                //           vmovq        [mem], xmm0
+                                //
+                                // signed:   vcvttp*2qqs  xmm0, xmm0
+                                //           vmovq        [mem], xmm0
+
+                                costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX
+                                costSz = 6 + 6;               // 12
+
+                                if (op1Type == TYP_FLOAT)
+                                {
+                                    // vector widening float->long instructions take 1 extra cycle
+                                    // compared to same-size conversion
+                                    costEx += 1;
+                                }
+                            }
+                            else
+#endif
+                            {
+                                // unsigned: vcvtts*2usis eax, xmm0
+                                // signed:   vcvtts*2sis  eax, xmm0
+
+                                costEx = 7;
+                                costSz = 6;
+                            }
+                        }
+                        else if (varTypeIsLong(dstType))
                         {
 #if defined(TARGET_AMD64)
                             if (varTypeIsUnsigned(dstType))
@@ -6543,24 +6574,59 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                                 costSz = 5 + 4 + 10 + 5 + 8 + 4;                    // 36
                             }
 #else
-                            // unsigned: ...
-                            //           call CORINFO_HELP_DBL2ULNG
-                            //
-                            // signed:   ...
-                            //           call CORINFO_HELP_DBL2ULNG
-
-                            costEx = 5 + (3 * IND_COST_EX); // CALL
-                            costSz = 5;                     // 5
+                            if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
+                            {
+                                if (varTypeIsUnsigned(dstType))
+                                {
+                                    // vxorps       xmm1, xmm1, xmm1
+                                    // vmaxs*       xmm0, xmm0, xmm1
+                                    // vcvttp*2uqq  xmm0, xmm0
+                                    // vmovq        [mem], xmm0
 
-                            level++;
+                                    costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX
+                                    costSz = 4 + 4 + 6 + 6;               // 20
+                                }
+                                else
+                                {
+                                    // vcmpords*    k1, xmm0, xmm0
+                                    // vcmpge_oqs*  k2, xmm0, qword ptr [@RWD00]
+                                    // vcvttp*2qq   xmm0 {k1}{z}, xmm0
+                                    // vpblendmq    xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
+                                    // vmovq        [mem], xmm0
+
+                                    costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) +
+                                             FLT_IND_COST_EX;     // 13 + (3 * FLT_IND_COST_EX)
+                                    costSz = 7 + 11 + 6 + 10 + 6; // 40
+                                }
 
-                            if (op1Type == TYP_FLOAT)
+                                if (op1Type == TYP_FLOAT)
+                                {
+                                    // vector widening float->long instructions take 1 extra cycle
+                                    // compared to same-size conversion
+                                    costEx += 1;
+                                }
+                            }
+                            else
                             {
-                                // vcvtss2sd xmm0, xmm0, xmm0
-                                // ...
+                                // unsigned: ...
+                                //           call CORINFO_HELP_DBL2ULNG
+                                //
+                                // signed:   ...
+                                //           call CORINFO_HELP_DBL2LNG
+
+                                costEx = 5 + (3 * IND_COST_EX); // CALL
+                                costSz = 5;                     // 5
+
+                                level++;
 
-                                costEx += 4; // 4 + CALL
-                                costSz += 4; // 9
+                                if (op1Type == TYP_FLOAT)
+                                {
+                                    // vcvtss2sd xmm0, xmm0, xmm0
+                                    // ...
+
+                                    costEx += 4; // 4 + CALL
+                                    costSz += 4; // 9
+                                }
                             }
 #endif
                         }
@@ -34999,8 +35065,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                     break;
                 }
 
-                bool maskIsZero    = false;
-                bool maskIsAllOnes = false;
+                bool maskIsZero       = false;
+                bool maskIsAllBitsSet = false;
 
                 if (op3->IsCnsMsk())
                 {
@@ -35011,7 +35077,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                         GenTreeMskCon* mask      = op3->AsMskCon();
                         uint32_t       elemCount = simdSize / genTypeSize(simdBaseType);
 
-                        maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
+                        maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
                     }
                 }
                 else
@@ -35022,11 +35088,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
                     if (!maskIsZero)
                     {
-                        maskIsAllOnes = op3->IsVectorAllBitsSet();
+                        maskIsAllBitsSet = op3->IsVectorAllBitsSet();
                     }
                 }
 
-                if (maskIsAllOnes)
+                if (maskIsAllBitsSet)
                 {
                     if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
                     {
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1252,6 +1252,7 @@ HARDWARE_INTRINSIC(AVX512,          CompareNotGreaterThanOrEqualMask,
 HARDWARE_INTRINSIC(AVX512,          CompareNotLessThanMask,                                          -1,              2,     {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},             1,         4,         HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512,          CompareNotLessThanOrEqualMask,                                   -1,              2,     {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},             1,         4,         HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512,          CompareOrderedMask,                                              -1,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcmpps,             INS_vcmppd},            -1,         4,         HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512,          CompareScalarMask,                                               16,              3,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcmpss,             INS_vcmpsd},            -1,         4,         HW_Category_IMM,                    HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512,          CompareUnorderedMask,                                            -1,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcmpps,             INS_vcmppd},            -1,         4,         HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512,          CompressMask,                                                    -1,              3,     {INS_vpcompressb,       INS_vpcompressb,        INS_vpcompressw,        INS_vpcompressw,        INS_vpcompressd,        INS_vpcompressd,        INS_vpcompressq,        INS_vpcompressq,        INS_vcompressps,        INS_vcompresspd},        3,         3,         HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX512,          CompressStoreMask,                                               -1,              3,     {INS_vpcompressb,       INS_vpcompressb,        INS_vpcompressw,        INS_vpcompressw,        INS_vpcompressd,        INS_vpcompressd,        INS_vpcompressq,        INS_vpcompressq,        INS_vcompressps,        INS_vcompresspd},       -1,        -1,         HW_Category_MemoryStore,            HW_Flag_NoFlag)
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp

Original file line number	Diff line number	Diff line change
`@@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o`
`1339`	`1339`	`}`
`1340`	`1340`
`1341`	`1341`	`#if defined(TARGET_X86) \|\| defined(TARGET_ARM)`
`1342`		`- if (varTypeIsFloating(fromType) && varTypeIsLong(toType))`
`1343`		`- {`
`1344`		`- return true;`
`1345`		`- }`
`1346`		`-`
`1347`		`- if (varTypeIsLong(fromType) && varTypeIsFloating(toType))`
	`1342`	`+ if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) \|\|`
	`1343`	`+ (varTypeIsFloating(fromType) && varTypeIsLong(toType)))`
`1348`	`1344`	`{`
`1349`	`1345`	`#if defined(TARGET_X86)`
`1350`	`1346`	`return !compOpportunisticallyDependsOn(InstructionSet_AVX512);`