From a6450c7913313e204576fe977e07bbcd01dcb22e Mon Sep 17 00:00:00 2001 From: Sebastian Nickolls Date: Fri, 30 Jan 2026 09:20:12 +0000 Subject: [PATCH 1/2] Arm64: Improve support for HW_Flag_ReturnsPerElementMask When wrapping an intrinsic node that has an embedded mask with a ConditionalSelect, ensure that the constant node in op3 has a mask type when the intrinsic has the HW_Flag_ReturnsPerElementMask flag. Build out further support for ConditionalSelect_Predicates, and use this to wrap nodes with HW_Flag_ReturnsPerElementMask. Add GenTree::IsSelectZero and update various areas in HW intrinsic codegen to ensure this intrinsic assembles correctly. Use a tree visitor for assigning `TYP_MASK` to intrinsics that have `HW_Flag_ReturnsPerElementMask`. The current version of `impHWIntrinsic` does not process child nodes of the tree it returns for mask types, only the root node. --- src/coreclr/jit/emitarm64sve.cpp | 7 +- src/coreclr/jit/gentree.cpp | 8 +++ src/coreclr/jit/gentree.h | 17 +++++ src/coreclr/jit/hwintrinsic.cpp | 79 ++++++++++++++++++--- src/coreclr/jit/hwintrinsic.h | 14 ++++ src/coreclr/jit/hwintrinsicarm64.cpp | 2 +- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 15 ++-- src/coreclr/jit/hwintrinsiclistarm64sve.h | 42 +++++------ src/coreclr/jit/lowerarmarch.cpp | 75 ++++++++++--------- src/coreclr/jit/lsraarm64.cpp | 5 +- 10 files changed, 189 insertions(+), 75 deletions(-) diff --git a/src/coreclr/jit/emitarm64sve.cpp b/src/coreclr/jit/emitarm64sve.cpp index 2246bfcb29bcfe..a6ef13deff4062 100644 --- a/src/coreclr/jit/emitarm64sve.cpp +++ b/src/coreclr/jit/emitarm64sve.cpp @@ -4042,6 +4042,7 @@ void emitter::emitInsSve_R_R_R(instruction ins, assert(isPredicateRegister(reg1)); // MMMM assert(isPredicateRegister(reg2)); // gggg assert(isPredicateRegister(reg3)); // NNNN + opt = INS_OPTS_SCALABLE_B; fmt = IF_SVE_DC_3A; break; @@ -6336,11 +6337,15 @@ void emitter::emitInsSve_R_R_R_R(instruction ins, } else { - assert(opt == INS_OPTS_SCALABLE_B); + 
assert(insOptsScalable(opt)); assert(isPredicateRegister(reg1)); // dddd assert(isPredicateRegister(reg2)); // gggg assert(isPredicateRegister(reg3)); // nnnn assert(isPredicateRegister(reg4)); // mmmm + // We support all lane arrangements, although we require byte arrangement for the + // encoding as there is only one encoding. This operation is bitwise, so it will + // preserve other lane arrangements anyway. + opt = INS_OPTS_SCALABLE_B; fmt = IF_SVE_CZ_4A; } break; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ef9ed60c7e3518..1cfc49cda0dc6b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -9419,6 +9419,14 @@ GenTree* Compiler::gtNewZeroConNode(var_types type) vecCon->gtSimdVal = simd_t::Zero(); return vecCon; } +#ifdef FEATURE_MASKED_HW_INTRINSICS + else if (varTypeIsMask(type)) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + mskCon->gtSimdMaskVal = simdmask_t::Zero(); + return mskCon; + } +#endif // FEATURE_MASKED_HW_INTRINSICS #endif // FEATURE_SIMD type = genActualType(type); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 1897b43014f17d..49c0ddbb084671 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1968,6 +1968,7 @@ struct GenTree inline bool IsMaskZero() const; inline bool IsMaskAllBitsSet() const; inline bool IsTrueMask(var_types simdBaseType) const; + inline bool IsSelectZero() const; inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType); @@ -9847,6 +9848,22 @@ inline bool GenTree::IsTrueMask(var_types simdBaseType) const return false; } +//------------------------------------------------------------------------ +// IsSelectZero: Is the given node a zero value for the purposes of +// conditional selection. ConditionalSelect can operate on all +// vectors or all masks. +// +// Returns true if the node is an all false mask node or a zero vector node. 
+// +// If such a node is used in op3 of ConditionalSelect, it will result in a +// simple filtering operation on the vector or mask node in op2, using the mask +// provided in op1. +// +inline bool GenTree::IsSelectZero() const +{ + return IsVectorZero() || IsMaskZero(); +} + //------------------------------------------------------------------- // GetIntegralVectorConstElement: Gets the value of a given element in an integral vector constant // diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index aed5d2f236e6d3..db170f45256cf3 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -1928,6 +1928,64 @@ bool Compiler::CheckHWIntrinsicImmRange(NamedIntrinsic intrinsic, return true; } +#ifdef FEATURE_MASKED_HW_INTRINSICS +//------------------------------------------------------------------------ +// MaskVisitor: Traverses HW Intrinsic trees and applies mask types to nodes +// +// This visitor is used on HW intrinsic trees as they are imported to ensure +// the tree has correct type assignments where intrinsics operate on masks. +// +// For example, an intrinsic marked with HW_Flag_ReturnsPerElementMask should +// have the node type assigned to TYP_MASK. +// +// This work needs to be done before the 'Optimize Mask Conversions' pass, as +// that pass requires all locals have had the opportunity to have been assigned +// a mask type. +// +// Ideally mask types could be assigned correctly on creation during import, but +// this puts the burden on the importer code to ensure these architecture +// specific details are correct. Ideally this visitor is used sparingly on small +// HW intrinsic trees to avoid any potential traversal cost. 
+// +class MaskVisitor : public GenTreeVisitor +{ +public: + enum + { + DoPreOrder = true, + }; + + MaskVisitor(Compiler* comp) + : GenTreeVisitor(comp) + { + } + + fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) + { + if ((*use)->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* intrin = (*use)->AsHWIntrinsic(); + + if (HWIntrinsicInfo::ReturnsPerElementMask(intrin->GetHWIntrinsicId()) && !intrin->TypeIs(TYP_MASK)) + { + var_types previousType = intrin->TypeGet(); + intrin->gtType = TYP_MASK; + GenTree* converted = + m_compiler->gtNewSimdCvtMaskToVectorNode(previousType, intrin, intrin->GetSimdBaseType(), + intrin->GetSimdSize()); + *use = converted; + } + else if (intrin->TypeIs(TYP_MASK)) + { + assert(HWIntrinsicInfo::ReturnsPerElementMask(intrin->GetHWIntrinsicId())); + } + } + + return fgWalkResult::WALK_CONTINUE; + } +}; +#endif + //------------------------------------------------------------------------ // impHWIntrinsic: Import a hardware intrinsic as a GT_HWINTRINSIC node if possible // @@ -2233,13 +2291,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } var_types nodeRetType = retType; -#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_ARM64) - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsic)) - { - // Ensure the result is generated to a mask. - nodeRetType = TYP_MASK; - } -#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_ARM64 // table-driven importer of simple intrinsics if (impIsTableDrivenHWIntrinsic(intrinsic, category)) @@ -2686,11 +2737,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } } - if (nodeRetType == TYP_MASK) +#ifdef FEATURE_MASKED_HW_INTRINSICS + if (retNode != nullptr) { - // HWInstrinsic returns a mask, but all returns must be vectors, so convert mask to vector. - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseType, simdSize); + // Introduce mask types into the tree. 
+ MaskVisitor visitor(this); + visitor.WalkTree(&retNode, nullptr); } +#endif + #endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_ARM64 if ((retNode != nullptr) && retNode->OperIs(GT_HWINTRINSIC)) @@ -2698,6 +2753,10 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, assert(!retNode->OperMayThrow(this) || ((retNode->gtFlags & GTF_EXCEPT) != 0)); assert(!retNode->OperRequiresAsgFlag() || ((retNode->gtFlags & GTF_ASG) != 0)); assert(!retNode->OperIsImplicitIndir() || ((retNode->gtFlags & GTF_GLOB_REF) != 0)); + +#ifdef FEATURE_MASKED_HW_INTRINSICS + assert(!retNode->TypeIs(TYP_MASK)); +#endif } return retNode; diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 6ed6541fb309f3..86028273b67f0b 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -1196,6 +1196,20 @@ struct HWIntrinsicInfo } #endif // FEATURE_MASKED_HW_INTRINSICS + // IsSveConditionalSelect: Is this intrinsic a ConditionalSelect intrinsic? + // + // Arguments: + // id -- Intrinsic ID to test + // + // Return value: + // Returns true if the ID is either the vector or the mask variant of + // ConditionalSelect. 
+ // + static bool IsSveConditionalSelect(NamedIntrinsic id) + { + return (id == NI_Sve_ConditionalSelect) || (id == NI_Sve_ConditionalSelect_Predicates); + } + #endif // TARGET_ARM64 static bool HasSpecialSideEffect(NamedIntrinsic id) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 7275c2ffe4f305..4972efe7f72938 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2948,7 +2948,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } // Was not able to generate a pattern, instead import a truemaskall - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseType, simdSize); break; } diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 5ab1fe078b43ed..555c69cf3b8a92 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -552,7 +552,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { // Handle case where op2 is operation that needs embedded mask GenTree* op2 = intrin.op2; - assert(intrin.id == NI_Sve_ConditionalSelect); + assert(HWIntrinsicInfo::IsSveConditionalSelect(intrin.id)); assert(op2->OperIsHWIntrinsic()); assert(op2->isContained()); @@ -597,7 +597,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // Shared code for setting up embedded mask arg for intrinsics with 3+ operands auto emitEmbeddedMaskSetupInstrs = [&] { - if (intrin.op3->IsVectorZero() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg)) + if (intrin.op3->IsSelectZero() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg)) { return 1; } @@ -605,7 +605,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) }; auto emitEmbeddedMaskSetup = [&] { - if (intrin.op3->IsVectorZero()) + if (intrin.op3->IsSelectZero()) { // If `falseReg` is zero, then move 
the first operand of `intrinEmbMask` in the // destination using /Z. @@ -712,7 +712,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) if (intrin.op3->isContained()) { - assert(intrin.op3->IsVectorZero()); + assert(intrin.op3->IsSelectZero()); if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node->GetSimdBaseType())) { @@ -818,7 +818,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // Predicate functionality is currently not exposed for this API, // but the FADDA instruction only has a predicated variant. // Thus, we expect the JIT to wrap this with CndSel. - assert(intrin.op3->IsVectorZero()); + assert(intrin.op3->IsSelectZero()); break; case NI_Sve2_AddSaturate: @@ -881,7 +881,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } }; - if (intrin.op3->IsVectorZero()) + if ((intrin.op3->IsSelectZero())) { // If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the // destination using /Z. @@ -1228,7 +1228,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // This handles optimizations for instructions that have // an implicit 'zero' vector of what would be the second operand. 
if (HWIntrinsicInfo::SupportsContainment(intrin.id) && intrin.op2->isContained() && - intrin.op2->IsVectorZero()) + intrin.op2->IsSelectZero()) { GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op1Reg, opt); } @@ -2787,7 +2787,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_CreateBreakAfterPropagateMask: case NI_Sve_CreateBreakBeforePropagateMask: - case NI_Sve_ConditionalSelect_Predicates: { GetEmitter()->emitInsSve_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, INS_OPTS_SCALABLE_B); break; diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index a636c5901c804e..33318b568bfefa 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -63,28 +63,28 @@ HARDWARE_INTRINSIC(Sve, CreateBreakAfterPropagateMask, HARDWARE_INTRINSIC(Sve, CreateBreakBeforeMask, -1, 2, {INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, CreateBreakBeforePropagateMask, -1, 3, {INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, CreateBreakPropagateMask, -1, -1, {INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_HasRMWSemantics|HW_Flag_ZeroingMaskedOperation) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskByte, -1, 0, {INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskDouble, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt16, -1, 0, {INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskSByte, -1, 0, {INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskSingle, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt16, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskByte, -1, 0, {INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskDouble, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt16, -1, 0, {INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskSByte, -1, 0, {INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, 
HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskSingle, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt16, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, CreateMaskForFirstActiveElement, -1, 2, {INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, CreateMaskForNextActiveElement, -1, 2, {INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_HasRMWSemantics) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt16, -1, 1, {INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskSByte, -1, 1, {INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, 
HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt16, -1, 1, {INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskSByte, -1, 1, {INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, CreateWhileLessThanMaskByte, -1, 2, {INS_invalid, INS_sve_whilelt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, 
CreateWhileLessThanMaskDouble, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_whilelt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, CreateWhileLessThanMaskInt16, -1, 2, {INS_invalid, INS_invalid, INS_sve_whilelt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) @@ -546,7 +546,7 @@ HARDWARE_INTRINSIC(Sve, And_Predicates, HARDWARE_INTRINSIC(Sve, BitwiseClear_Predicates, -1, 2, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, Or_Predicates, -1, 2, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, Xor_Predicates, -1, 2, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, ConditionalSelect_Predicates, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_ExplicitMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, ConditionalSelect_Predicates, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, 
INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment) HARDWARE_INTRINSIC(Sve, ZipHigh_Predicates, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, ZipLow_Predicates, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, UnzipEven_Predicates, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 6d9fe77b48a6cf..edf48da5dab83b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1577,6 +1577,17 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) node->gtType = TYP_SIMD16; } +#ifdef TARGET_ARM64 + // Enforce invariant HW_Flag_ReturnsPerElementMask <==> node->TypeIs(TYP_MASK) + // This should happen at all stages of the compiler, but it's especially important to check here, + // as some Lowering analyses (such as embedded masks) will depend on this consistency. 
+ if (node->TypeIs(TYP_MASK) || HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId())) + { + assert(HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId())); + assert(node->TypeIs(TYP_MASK)); + } +#endif + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); bool isScalar = false; @@ -1869,6 +1880,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; case NI_Sve_ConditionalSelect: + case NI_Sve_ConditionalSelect_Predicates: return LowerHWIntrinsicCndSel(node); case NI_Sve_SetFfr: @@ -2007,47 +2019,42 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // Use lastOp to verify if it's a ConditionlSelectNode. size_t lastOpNum = node->GetOperandCount(); - if (node->Op(lastOpNum)->OperIsHWIntrinsic() && - node->Op(lastOpNum)->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect && - TryContainingCselOp(node, node->Op(lastOpNum)->AsHWIntrinsic())) + if (node->Op(lastOpNum)->OperIsHWIntrinsic() && TryContainingCselOp(node, node->Op(lastOpNum)->AsHWIntrinsic())) { LABELEDDISPTREERANGE("Contained conditional select", BlockRange(), node); return node->gtNext; } - // Wrap a conditional select around the embedded mask operation + // Get the existing use of the node before modifying the graph. + bool foundUse = BlockRange().TryGetUse(node, &use); - unsigned simdSize = node->GetSimdSize(); - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + if (foundUse) + { + // For Vector operations: ConditionalSelect(CreateTrueMask(), Op(...), Vector.Zero) + // For Mask operations: ConditionalSelect(CreateTrueMask(), Op(...), CreateFalseMask()) + bool isMaskOp = HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId()); - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* trueMask = m_compiler->gtNewSimdAllTrueMaskNode(node->GetSimdBaseType()); - GenTree* falseVal = m_compiler->gtNewZeroConNode(simdType); - var_types nodeType = simdType; + var_types selectType = isMaskOp ? 
TYP_MASK : Compiler::getSIMDTypeForSize(node->GetSimdSize()); + NamedIntrinsic selectIntrin = isMaskOp ? NI_Sve_ConditionalSelect_Predicates : NI_Sve_ConditionalSelect; - if (HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId())) - { - nodeType = TYP_MASK; - } + GenTree* trueMask = m_compiler->gtNewSimdAllTrueMaskNode(node->GetSimdBaseType()); + GenTree* falseVal = m_compiler->gtNewZeroConNode(selectType); + BlockRange().InsertBefore(node, trueMask); + BlockRange().InsertBefore(node, falseVal); - BlockRange().InsertBefore(node, trueMask); - BlockRange().InsertBefore(node, falseVal); + GenTreeHWIntrinsic* condSelNode = + m_compiler->gtNewSimdHWIntrinsicNode(selectType, trueMask, node, falseVal, selectIntrin, + node->GetSimdBaseType(), node->GetSimdSize()); + BlockRange().InsertAfter(node, condSelNode); - GenTreeHWIntrinsic* condSelNode = - m_compiler->gtNewSimdHWIntrinsicNode(nodeType, trueMask, node, falseVal, NI_Sve_ConditionalSelect, - node->GetSimdBaseType(), simdSize); - BlockRange().InsertAfter(node, condSelNode); - if (foundUse) - { use.ReplaceWith(condSelNode); + + LABELEDDISPTREERANGE("Wrapped embedded-mask intrinsic with ConditionalSelect", BlockRange(), condSelNode); } else { - node->ClearUnusedValue(); - condSelNode->SetUnusedValue(); + assert(node->IsUnusedValue()); } - - LABELEDDISPTREERANGE("Embedded HWIntrinisic inside conditional select", BlockRange(), condSelNode); } ContainCheckHWIntrinsic(node); @@ -3745,7 +3752,10 @@ bool Lowering::TryLowerNegToMulLongOp(GenTreeOp* op, GenTree** next) // bool Lowering::TryContainingCselOp(GenTreeHWIntrinsic* parentNode, GenTreeHWIntrinsic* childNode) { - assert(childNode->GetHWIntrinsicId() == NI_Sve_ConditionalSelect); + if (!HWIntrinsicInfo::IsSveConditionalSelect(childNode->GetHWIntrinsicId())) + { + return false; + } if (childNode->Op(2)->IsEmbMaskOp()) { @@ -4010,6 +4020,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; case NI_Sve_ConditionalSelect: + case 
NI_Sve_ConditionalSelect_Predicates: { assert(intrin.numOperands == 3); GenTree* op1 = intrin.op1; @@ -4085,7 +4096,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } // Handle op3 - if (op3->IsVectorZero() && op1->IsTrueMask(node->GetSimdBaseType()) && op2->IsEmbMaskOp()) + if (op3->IsSelectZero() && op1->IsTrueMask(node->GetSimdBaseType()) && op2->IsEmbMaskOp()) { // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. @@ -4206,14 +4217,14 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) { - assert(cndSelNode->OperIsHWIntrinsic(NI_Sve_ConditionalSelect)); - + NamedIntrinsic selectIntrin = cndSelNode->GetHWIntrinsicId(); + assert(HWIntrinsicInfo::IsSveConditionalSelect(selectIntrin)); GenTree* op1 = cndSelNode->Op(1); GenTree* op2 = cndSelNode->Op(2); GenTree* op3 = cndSelNode->Op(3); GenTree* lowerCndSel = cndSelNode; - if (op2->OperIsHWIntrinsic(NI_Sve_ConditionalSelect)) + if (op2->OperIsHWIntrinsic(selectIntrin)) { // Handle cases where there is a nested ConditionalSelect for `trueValue` GenTreeHWIntrinsic* nestedCndSel = op2->AsHWIntrinsic(); @@ -4231,7 +4242,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) if (nestedOp1->IsTrueMask(cndSelNode->GetSimdBaseType()) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && - (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) + (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsSelectZero())) { GenTree* nestedOp2 = nestedCndSel->Op(2); GenTree* nestedOp3 = nestedCndSel->Op(3); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 0d74688745910d..1e023bd619302b 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -2535,7 +2535,7 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo // 
GenTreeHWIntrinsic* LinearScan::getEmbeddedMaskOperand(const HWIntrinsic intrin) { - if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) + if (HWIntrinsicInfo::IsSveConditionalSelect(intrin.id) && (intrin.op2->IsEmbMaskOp())) { assert(intrin.op2->OperIsHWIntrinsic()); return intrin.op2->AsHWIntrinsic(); @@ -2559,7 +2559,8 @@ GenTreeHWIntrinsic* LinearScan::getContainedCselOperand(GenTreeHWIntrinsic* intr GenTree* currentOp = intrinsicTree->Op(opNum); if (currentOp->OperIs(GT_HWINTRINSIC) && - (currentOp->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect) && currentOp->isContained()) + HWIntrinsicInfo::IsSveConditionalSelect(currentOp->AsHWIntrinsic()->GetHWIntrinsicId()) && + currentOp->isContained()) { return currentOp->AsHWIntrinsic(); } From 0510dd638ebaaefa980032563ca3a7e0f070e4d4 Mon Sep 17 00:00:00 2001 From: Sebastian Nickolls Date: Wed, 27 May 2026 13:07:38 +0100 Subject: [PATCH 2/2] Revert general changes to HWIntrinsic importer --- src/coreclr/jit/hwintrinsic.cpp | 79 +++++---------------------------- 1 file changed, 10 insertions(+), 69 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 40acdcbf1ef629..f9808f6ff44063 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -1888,64 +1888,6 @@ bool Compiler::CheckHWIntrinsicImmRange(NamedIntrinsic intrinsic, return true; } -#ifdef FEATURE_MASKED_HW_INTRINSICS -//------------------------------------------------------------------------ -// MaskVisitor: Traverses HW Intrinsic trees and applies mask types to nodes -// -// This visitor is used on HW intrinsic trees as they are imported to ensure -// the tree has correct type assignments where intrinsics operate on masks. -// -// For example, an intrinsic marked with HW_Flag_ReturnsPerElementMask should -// have the node type assigned to TYP_MASK. 
-// -// This work needs to be done before the 'Optimize Mask Conversions' pass, as -// that pass requires all locals have had the opportunity to have been assigned -// a mask type. -// -// Ideally mask types could be assigned correctly on creation during import, but -// this puts the burden on the importer code to ensure these architecture -// specific details are correct. Ideally this visitor is used sparingly on small -// HW intrinsic trees to avoid any potential traversal cost. -// -class MaskVisitor : public GenTreeVisitor -{ -public: - enum - { - DoPreOrder = true, - }; - - MaskVisitor(Compiler* comp) - : GenTreeVisitor(comp) - { - } - - fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) - { - if ((*use)->OperIsHWIntrinsic()) - { - GenTreeHWIntrinsic* intrin = (*use)->AsHWIntrinsic(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(intrin->GetHWIntrinsicId()) && !intrin->TypeIs(TYP_MASK)) - { - var_types previousType = intrin->TypeGet(); - intrin->gtType = TYP_MASK; - GenTree* converted = - m_compiler->gtNewSimdCvtMaskToVectorNode(previousType, intrin, intrin->GetSimdBaseType(), - intrin->GetSimdSize()); - *use = converted; - } - else if (intrin->TypeIs(TYP_MASK)) - { - assert(HWIntrinsicInfo::ReturnsPerElementMask(intrin->GetHWIntrinsicId())); - } - } - - return fgWalkResult::WALK_CONTINUE; - } -}; -#endif - //------------------------------------------------------------------------ // impHWIntrinsic: Import a hardware intrinsic as a GT_HWINTRINSIC node if possible // @@ -2251,6 +2193,13 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } var_types nodeRetType = retType; +#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_ARM64) + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsic)) + { + // Ensure the result is generated to a mask. 
+ nodeRetType = TYP_MASK; + } +#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_ARM64 // table-driven importer of simple intrinsics if (impIsTableDrivenHWIntrinsic(intrinsic, category)) @@ -2697,15 +2646,11 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } } -#ifdef FEATURE_MASKED_HW_INTRINSICS - if (retNode != nullptr) + if (nodeRetType == TYP_MASK) { - // Introduce mask types into the tree. - MaskVisitor visitor(this); - visitor.WalkTree(&retNode, nullptr); + // HWIntrinsic returns a mask, but all returns must be vectors, so convert mask to vector. + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseType, simdSize); } -#endif - #endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_ARM64 if ((retNode != nullptr) && retNode->OperIs(GT_HWINTRINSIC)) @@ -2713,10 +2658,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, assert(!retNode->OperMayThrow(this) || ((retNode->gtFlags & GTF_EXCEPT) != 0)); assert(!retNode->OperRequiresAsgFlag() || ((retNode->gtFlags & GTF_ASG) != 0)); assert(!retNode->OperIsImplicitIndir() || ((retNode->gtFlags & GTF_GLOB_REF) != 0)); - -#ifdef FEATURE_MASKED_HW_INTRINSICS - assert(!retNode->TypeIs(TYP_MASK)); -#endif } return retNode;