[LV][RFC] Generating conditional VPBB that will be skip when the mask is inactive in VPlan.#141900
[LV][RFC] Generating conditional VPBB that will be skip when the mask is inactive in VPlan.#141900ElvisWang123 wants to merge 6 commits into
Conversation
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-analysis Author: Elvis Wang (ElvisWang123) ChangesThis patch create an overview of VPlan transformation of creating conditional VPBB and will be split off into multiple patches later. This patch add the transformation that convert flatten control flow First, this transformation will collect all masked stores and operands Second, this transformation will split original vector loop and insert E.g. After: { vector.if.bb: vector.loop.split: Co-authored-by: nikolaypanchenko <kolya.panchenko@sifive.com> Patch is 20.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141900.diff 9 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..20c4b52098770 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1822,6 +1822,10 @@ class TargetTransformInfo {
/// otherwise scalar epilogue loop.
LLVM_ABI bool preferEpilogueVectorization() const;
+ /// Return true if the loop vectorizer shoud consider vectorizing with
+ /// flattern control flow, otherwise create conditional vector basic block.
+ bool preferFlattenControlFlow() const;
+
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..1211c80f8ff51 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {
virtual bool preferEpilogueVectorization() const { return true; }
+ virtual bool preferFlattenControlFlow() const { return true; }
+
virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
return true;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ff8778168686d..5000bef968213 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,6 +777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::preferPredicateOverEpilogue(TFI);
}
+ bool preferFlattenControlFlow() const override {
+ return thisT()->preferFlattenControlFlow();
+ }
+
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0f857399660fe..d455e1dc63c13 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -371,6 +371,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
return TTIImpl->preferPredicateOverEpilogue(TFI);
}
+bool TargetTransformInfo::preferFlattenControlFlow() const {
+ return TTIImpl->preferFlattenControlFlow();
+}
+
TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
bool IVUpdateMayOverflow) const {
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..412dd8ae2b991 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -146,6 +146,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return false;
}
+ bool preferFlattenControlFlow() const { return false; }
+
InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2fe59a464457f..a834b6106f028 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -351,6 +351,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
+static cl::opt<bool>
+ PreferFlattenControlFlow("prefer-flatten-control-flow", cl::init(true),
+ cl::Hidden,
+ cl::desc("Prefer flatten control flow."));
+
cl::opt<bool> llvm::EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
@@ -9287,6 +9292,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
}
VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
+ if (!PreferFlattenControlFlow && !TTI.preferFlattenControlFlow())
+ VPlanTransforms::optimizeConditionalVPBB(*Plan);
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc5be520505eb..e1c8690986599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2087,6 +2087,23 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+static bool replaceHeaderMaskToEVL(VPValue *HeaderMask, VPRecipeBase *R) {
+ using namespace llvm::VPlanPatternMatch;
+ VPValue *EdgeMask;
+ if (!R)
+ return false;
+ if (match(R, m_Binary<VPInstruction::BranchOnCount>(
+ m_VPInstruction<VPInstruction::AnyOf>(
+ m_Binary<VPInstruction::LogicalAnd>(
+ m_Specific(HeaderMask), m_VPValue(EdgeMask))),
+ m_VPValue()))) {
+
+ cast<VPInstruction>(R->getOperand(0))->setOperand(0, EdgeMask);
+ return true;
+ }
+ return false;
+}
+
/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
/// nullptr if no EVL-based recipe could be created.
/// \p HeaderMask Header Mask.
@@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
+ if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe))
+ continue;
VPRecipeBase *EVLRecipe = createEVLRecipe(
HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL);
if (!EVLRecipe)
@@ -3202,3 +3221,160 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
removeDeadRecipes(Plan);
}
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
+
+ SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+
+ // Get the mask from the store recipes.
+ auto GetMask = [&HeaderMasks](VPRecipeBase &R) -> VPValue * {
+ using namespace llvm::VPlanPatternMatch;
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
+ VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+ if (!OrigMask)
+ return OrigMask;
+
+ if (any_of(HeaderMasks, [OrigMask](VPValue *HeaderMask) {
+ return OrigMask == HeaderMask;
+ }))
+ return nullptr;
+
+ // Match active.lane.mask.
+ if (match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_VPValue(), m_VPValue())))
+ return nullptr;
+
+ return OrigMask;
+ }
+ return nullptr;
+ };
+
+ // First, collect all masked stores.
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (VPValue *Mask = GetMask(R))
+ MaskedStores.emplace_back(&R, Mask);
+ }
+ }
+
+ DenseSet<VPRecipeBase *> Candidates;
+ auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+ for (VPValue *Op : R->operands())
+ if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+ Candidates.insert(OpR);
+ };
+
+ SmallVector<SetVector<VPRecipeBase *>> Tries;
+ while (!MaskedStores.empty()) {
+ auto [LR, M] = MaskedStores.pop_back_val();
+ Candidates.clear();
+ AddOperandsToCandidates(LR);
+
+ SetVector<VPRecipeBase *> CurrentTree;
+ CurrentTree.insert(LR);
+
+ VPBasicBlock *MaskBlock =
+ M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+ auto End = MaskBlock == LR->getParent()
+ ? M->getDefiningRecipe()->getReverseIterator()
+ : LR->getParent()->getFirstNonPhi()->getReverseIterator();
+ // Greedily add all recipes that are used to compute stored value to the
+ // tree. All users of the added recipe must dominate the store
+ // recipe.
+ for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
+ // Recipe is not a part of the tree
+ if (!Candidates.contains(&R))
+ continue;
+
+ if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+ if (UR == LR || VPDT.properlyDominates(UR, LR))
+ continue;
+ }
+ return true;
+ }
+ return false;
+ }))
+ continue;
+
+ CurrentTree.insert(&R);
+ AddOperandsToCandidates(&R);
+ }
+ // Previous traversal could add recipes that are used by non-added recipes,
+ // thus need to be removed from the list.
+ DenseSet<VPRecipeBase *> ToRemove;
+ bool Changed;
+ do {
+ Changed = false;
+ for (VPRecipeBase *R : CurrentTree) {
+ if (ToRemove.contains(R))
+ continue;
+ if (any_of(R->definedValues(), [&](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U))
+ if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+ return true;
+ }
+ return false;
+ })) {
+ Changed = true;
+ ToRemove.insert(R);
+ }
+ }
+ } while (Changed);
+
+ for (VPRecipeBase *R : ToRemove)
+ CurrentTree.remove(R);
+
+ if (CurrentTree.size() > 1)
+ Tries.push_back(CurrentTree);
+ }
+ for (const auto &List : Tries) {
+ VPRecipeBase *LR = List.front();
+ VPValue *M = cast<VPWidenMemoryRecipe>(LR)->getMask();
+ assert(M && "Mask VPValue must exist at this point");
+ auto Recipes = reverse(List.getArrayRef());
+
+ // Split current basic block at LR point so that VPConditionalRegionBlock
+ // can be added inbetween.
+ VPBasicBlock *ParentBB = LR->getParent();
+ VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
+
+ // Create VPBB and insert it between ParentBB and ContBB.
+ VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+ VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+ if (ContBB->getNumSuccessors() == 0)
+ ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+ // Copy recipes into conditional block.
+ for (VPRecipeBase *R : Recipes)
+ R->moveBefore(*IfBB, IfBB->end());
+
+ // Add the condition and brach in the parent block.
+ auto *ActiveLane =
+ new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+ Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ LLVMContext &Ctx = CanonicalIVType->getContext();
+ VPValue *Zero =
+ Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt1Ty(Ctx), 0));
+
+ auto *BranchOnCount =
+ new VPInstruction(VPInstruction::BranchOnCount, {ActiveLane, Zero});
+ ParentBB->appendRecipe(ActiveLane);
+ ParentBB->appendRecipe(BranchOnCount);
+
+ // Set proper predecessor and successors for modifed basicblocks.
+ ParentBB->clearSuccessors();
+ ParentBB->setTwoSuccessors(ContBB, IfBB);
+ ContBB->clearPredecessors();
+ ContBB->setPredecessors({ParentBB, IfBB});
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..0e535d6bf7c36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,6 +234,29 @@ struct VPlanTransforms {
/// removed in the future.
static DenseMap<VPBasicBlock *, VPValue *>
introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+
+ /// Try to convet flatten control flow to the conditional vector basic block.
+ /// If no active bits in the mask, will skip all the masked operations.
+ /// This transformation will collect all masked operations bottom-up from the
+ /// masked stores and put all of masked operations in a new vector basic
+ /// block. This original vector.loop will be split and the new created basic
+ /// block will inserted in between.
+ ///
+ /// After transformation the vplan will looks like.
+ /// vector.loop:
+ /// ...
+ /// %any.active.mask = any-of(%Mask)
+ /// Branch-On-Count %any.active.mask, 0
+ /// successors vector.loop.split, vector.if.bb
+ ///
+ /// vector.if.bb:
+ /// (Masked operations)
+ /// successors vector.loop.split
+ ///
+ /// vector.loop.split:
+ /// ...
+ /// successors middle.block, vector.loop
+ static void optimizeConditionalVPBB(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..2b8f2542ab544
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-flatten-control-flow=false %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT: [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT: [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT: [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN9_SPLIT:.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i1 [[TMP13]], false
+; CHECK-NEXT: br i1 [[TMP14]], label %[[IF_THEN9_SPLIT]], label %[[VECTOR_IF_BB:.*]]
+; CHECK: [[VECTOR_IF_BB]]:
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP11]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: br label %[[IF_THEN9_SPLIT]]
+; CHECK: [[IF_THEN9_SPLIT]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT: br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK: [[IF_THEN9]]:
+; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT: store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp1 = icmp sgt i32 %reg.4.val, 0
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %sh_prom = zext nneg i32 %control1 to i64
+ %shl = shl nuw i64 1, %sh_prom
+ %sh_prom5 = zext nneg i32 %control2 to i64
+ %shl6 = shl nuw i64 1, %sh_prom5
+ %sh_prom10 = zext nneg i32 %target to i64
+ %shl11 = shl nuw nsw i64 1, %sh_prom10
+ %wide.trip.count = zext nneg i32 %reg.4.val to i64
+ %0 = freeze i64 %shl6
+ %1 = or i64 %shl, %0
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+ %2 = load i64, ptr %arrayidx, align 8
+ %3 = and i64 %2, %1
+ %or.cond.not = icmp eq i64 %3, %1
+ br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+ %xor = xor i64 %2, %shl11
+ store i64 %xor, ptr %arrayidx, align 8
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loope...
[truncated]
|
There was a problem hiding this comment.
Could profiling data help guide the best vectorisation strategy here too? This could be from PGO or it could be the likely or unlikely C level branch hints.
There was a problem hiding this comment.
Thanks for your comment!
I think the challenge here is that the branch taken probability also changes by the VF.
I am not quite familiar with the PGO. Could PGO collect the probability of branch after vectorization?
There was a problem hiding this comment.
That really depends on what will be collected in PGO. Naive hit-rate information won't be sufficient to make the best decision. For example, having a 25% hits of the condition within a loop with 100 iterations could mean and :
- condition at first 25 iterations are true, 75 others are false. In this case generating CF within vector loop will be good
- condition at every 4th iteration is true, such control flow will add extra overhead. (assuming 128bit vector register and 32bit data)
having a more fine-grained profile, i.e. hit-rate within a VLEN could be useful, especially for VLS.
e2382bf to
6dd5b38
Compare
fhahn
left a comment
There was a problem hiding this comment.
Thanks for sharing. I'm curious if you have any eprformance data you could share?
Yes. We got ~30% performance improvement on 462.libquantum on sifive-p470 with this optimization. |
| @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |||
| for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { | |||
| for (VPUser *U : collectUsersRecursively(HeaderMask)) { | |||
| auto *CurRecipe = cast<VPRecipeBase>(U); | |||
| if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe)) | |||
There was a problem hiding this comment.
JFYI, this line is unsound. The next line can return without replacing a a recipe with a VL recipe, and thus you've just dropped a mask without adding any form of predication.
There was a problem hiding this comment.
I think when this function being invoked, the loop will be EVL tail-folding.
This function is focus on peeling out the HeaderMask (which will be replaced by EVL) in the BlockInMasks when generating VPInstruction::BranchOnCount %Masks for conditional basic block.
Currently only load/store and reduction will generate EVL recipes. So I think it is fine to skip following EVL replacements.
There was a problem hiding this comment.
Would it be safer to do this optimisation in optimizeConditionalVPBB, and move optimizeConditionalVPBB till after the EVL transform has happened?
I think we should generally try to do the EVL transform as early as possible, rather than teaching it to work around various other optimisations
There was a problem hiding this comment.
Yes, move the optimizationConditionalVPBB() pass after EVL transformation pass. thanks!
6dd5b38 to
3d111a1
Compare
There was a problem hiding this comment.
Rebase and ping :)
@fhahn Do you have any concern of this patch? Just try to make sure this optimization will not break any rules in LV and VPlan.
| @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |||
| for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { | |||
| for (VPUser *U : collectUsersRecursively(HeaderMask)) { | |||
| auto *CurRecipe = cast<VPRecipeBase>(U); | |||
| if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe)) | |||
There was a problem hiding this comment.
I think when this function being invoked, the loop will be EVL tail-folding.
This function is focus on peeling out the HeaderMask (which will be replaced by EVL) in the BlockInMasks when generating VPInstruction::BranchOnCount %Masks for conditional basic block.
Currently only load/store and reduction will generate EVL recipes. So I think it is fine to skip following EVL replacements.
lukel97
left a comment
There was a problem hiding this comment.
Out of curiosity did you see any regressions in any workloads with this flag enabled? I.e. loops where the mask is true most of the time so it doesn't amortise the cost of the vector.reduce.or
There was a problem hiding this comment.
Nit, would this flag be better inverted, like prefer-control-flow or something?
There was a problem hiding this comment.
Inverted, thanks!
There was a problem hiding this comment.
Is it possible to do this later in VPlanTransforms::optimize?
There was a problem hiding this comment.
Move after EVL insertion pass. This can help to remove replaceHeaderMaskToEVL(). Thanks!
| @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |||
| for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { | |||
| for (VPUser *U : collectUsersRecursively(HeaderMask)) { | |||
| auto *CurRecipe = cast<VPRecipeBase>(U); | |||
| if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe)) | |||
There was a problem hiding this comment.
Would it be safer to do this optimisation in optimizeConditionalVPBB, and move optimizeConditionalVPBB till after the EVL transform has happened?
I think we should generally try to do the EVL transform as early as possible, rather than teaching it to work around various other optimisations
There was a problem hiding this comment.
For EVL tail folding, do we not still need to convert the anyof into a vp.reduce.or or something? Otherwise we will be counting lanes in edgemask that are outside the EVL
There was a problem hiding this comment.
The AnyOf(%mask) can lower to vcpop.m. Not sure if VPOptimizer can handle this properly or not.
If not, need to put this pass before EVL transformations and using vp.reduce.or.
3d111a1 to
66f9454
Compare
ElvisWang123
left a comment
There was a problem hiding this comment.
Rebase and address comments.
There was a problem hiding this comment.
Inverted, thanks!
There was a problem hiding this comment.
Move after EVL insertion pass. This can help to remove replaceHeaderMaskToEVL(). Thanks!
| @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |||
| for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { | |||
| for (VPUser *U : collectUsersRecursively(HeaderMask)) { | |||
| auto *CurRecipe = cast<VPRecipeBase>(U); | |||
| if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe)) | |||
There was a problem hiding this comment.
Yes, move the optimizationConditionalVPBB() pass after EVL transformation pass. thanks!
There was a problem hiding this comment.
The AnyOf(%mask) can lower to vcpop.m. Not sure if VPOptimizer can handle this properly or not.
If not, need to put this pass before EVL transformations and using vp.reduce.or.
66f9454 to
0867545
Compare
|
Gentle ping :) |
|
Hi @ElvisWang123 and @npanchen, thank you for working on this! I just want to post some information to support this change. I tried this patch with Flang compiler on zen4 with setting It looks like the AOCC compiler also has a similar optimization, which they call BOSCC (https://reviews.llvm.org/D139074). Some performance results of BOSCC for 481.wrf are also reported in #137213. So this change seems to be profitable for multiple benchmarks and multiple architectures, and it will be great to land it in llvm. On the downside, I got a few failures on other benchmarks, and they are all related to |
|
@vzakhari Thanks for your support and review! |
0867545 to
2ba3f32
Compare
2ba3f32 to
aa1d309
Compare
Yes, the cost is hard to calculate. The VL and the data distribution will change the time that conditional BB being executed. One possible (and simple) approach would be just divide 2 for the cost of the entire conditional BB for those target can gain benefit on this optimization.
Will collect the other benchmark results this week. Indeed the reduce-or might be quite expensive in the loop body. In RISCV, this reduce.or can be optimized to the |
|
Hi @lukel97 , On our CPUs that see improvements from this optimization on 462.libquantum (like the sifive-p470), we don't see any significant performance drops on other SPEC2006 Int or SPEC2017 benchmarks. That said, there are some CPUs that don't benefit from this optimization and might even see regressions on libquantum. |
fhahn
left a comment
There was a problem hiding this comment.
Hi @lukel97 ,
On our CPUs that see improvements from this optimization on 462.libquantum (like the sifive-p470), we don't see any significant performance drops on other SPEC2006 Int or SPEC2017 benchmarks.
That said, there are some CPUs that don't benefit from this optimization and might even see regressions on libquantum.
it would be good to have a micro benchmarks to cover a number of cases, including branch alternating every iteration, alternativing infrequently and some in between to get a better idea what the best case and worst case performance is, independent of libquantum. It would at least be good to know what the performance impact is for a common loop when the branches are alternating, which presumably would be the worst case?
Could we use block frequencies to determine when profitable?
| %cmp1 = icmp sgt i32 %reg.4.val, 0 | ||
| br i1 %cmp1, label %for.body.lr.ph, label %for.end | ||
|
|
||
| for.body.lr.ph: | ||
| %sh_prom = zext nneg i32 %control1 to i64 | ||
| %shl = shl nuw i64 1, %sh_prom | ||
| %sh_prom5 = zext nneg i32 %control2 to i64 | ||
| %shl6 = shl nuw i64 1, %sh_prom5 | ||
| %sh_prom10 = zext nneg i32 %target to i64 | ||
| %shl11 = shl nuw nsw i64 1, %sh_prom10 | ||
| %wide.trip.count = zext nneg i32 %reg.4.val to i64 | ||
| %0 = freeze i64 %shl6 | ||
| %1 = or i64 %shl, %0 | ||
| br label %for.body |
There was a problem hiding this comment.
is all this needed? Looks like the test can be simplified. and as @lukel97 mentioned, would be good to extend test coverage, to also stores with header masks, active-lane-masks etc.
There was a problem hiding this comment.
Simplified, thanks!
The header-masked and active-lane-masked tests were added under AArch64.
| return nullptr; | ||
| VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask(); | ||
| if (!OrigMask || OrigMask == HeaderMask || | ||
| match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>( |
There was a problem hiding this comment.
can we have active-lane-mask without being header mask?
There was a problem hiding this comment.
Removed, thanks!
| @@ -5182,3 +5182,161 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, | |||
| } | |||
| } | |||
| } | |||
|
|
|||
| void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) { | |||
There was a problem hiding this comment.
It seems like this should at least check the cost of the masked store? i.e. don't do it if the cost of the masked store is cheap?
There was a problem hiding this comment.
It is difficult to determine whether a masked store is cheap.
If a cost model is needed, a possible approach is to compare the cost of the (conditional vpbb / 2) + reduce.or + icmp against the original cost.
WDYT?
6978329 to
76560fc
Compare
I have created some micro-benchmarks to test these scenarios. Will share the results once the testing is complete.
It is difficult to use PGO block frequencies directly because the entry rate of the conditional vector block is affected by both the vector length and the distribution of the taken branches (mask bits). |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
| ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]] | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]] | ||
| ; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]] | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) |
There was a problem hiding this comment.
Could we check how AnyOf is handled when loop is tail folded with EVL?
I expect AnyOf to be replaced by vp.reduce.or with VL in optimizeMaskToEVL to prevent branch-on-poison.
There was a problem hiding this comment.
Add EVL tests and also convert AnyOf to vp.reduce.or in b63cec1. Thanks!
76560fc to
b63cec1
Compare
I checked the worst-case scenario on our CPU (p470) using the following benchmark. The worst case for this optimization occurs when there is only one active lane in the entire vector. This forces the conditional instruction to execute, but we still pay the extra cost for the any_of check and the branch. I tested with strides ranging from 1 to 1024 (powers of 2) and found that there is no significant difference between the version optimized by conditional VPBB and the unoptimized baseline. |
|
Great thanks for checking! It would be great if you could add a benchmark to https://github.com/llvm/llvm-test-suite/tree/main/MicroBenchmarks/LoopVectorization, so it is easy to test on other platforms and track in the future |
b63cec1 to
88cb007
Compare
llvm/llvm-test-suite#345 is ready for review. Please let me know if any changes are needed. |
|
Ping! |
88cb007 to
3a7cc35
Compare
| } | ||
| } | ||
|
|
||
| if (MaskedStores.empty()) |
There was a problem hiding this comment.
This limits the transformation to store-rooted trees only.
Conditional blocks that contain only expensive computations i.e. FP math, masked loads, gathers, replicated recipes, or interleave groups won't be considered.
For example, a loop like:
if (cond[i]) {
temp += sqrt(A[i]) * log(B[C[i]]);
}
Such limitations are not in PR #197423
There was a problem hiding this comment.
Yes. This patch focus on masked store first. We could expand this to support more operation in the futue.
There was a problem hiding this comment.
This is more than scoping — it's a fundamental design limitation. Because the transform is rooted at stores, anything not anchored by a masked store is structurally invisible: conditional blocks dominated by expensive compute (FP math, sqrt/log, gathers, replicated/interleave recipes) are significant opportunities where guarding pays off, and they're skipped by construction. BOSCC guards the whole block on the condition code, regardless of whether a store terminates it. A per-block redesign gets these for free instead of as future reworks.
| bool Changed; | ||
| do { | ||
| Changed = false; | ||
| for (VPRecipeBase *R : CurrentTree) { |
There was a problem hiding this comment.
This conservatively removes any recipe whose values have users outside the conditional tree.
An alternative approach (used in PR #197423) is to introduce a ConditionalMerge PHI in a join block that merges the guarded value with poison on the skip path.
This would allow more recipes to be moved into the guarded block.
There was a problem hiding this comment.
Live-out value from the conditional VPBB needs extra works.
I would like to keep first patch simple and support this in the future patch.
There was a problem hiding this comment.
Agree this requires extra work, which may require some portion to be redesigned.
| for (VPRecipeBase *R : ToRemove) | ||
| CurrentTree.remove(R); | ||
|
|
||
| if (CurrentTree.size() > 1) |
There was a problem hiding this comment.
Can this be parameterized by a flag ?
There was a problem hiding this comment.
Removed, since this optimization is already controlled by flag and a masked store usually can be move the conditional VPBB with vector-pointer.
We could add the cost-model based decision (or flag) to control this in the future.
| }; | ||
|
|
||
| SmallVector<SetVector<VPRecipeBase *>> Tries; | ||
| while (!MaskedStores.empty()) { |
There was a problem hiding this comment.
Is it true that each masked store produces its own independent conditional block with its own any-of + branch ?
for (int i = 0; i < n; i++) {
if (cond[i]) {
A[i] = ExpensiveOp1(data[i]);
B[i] = ExpensiveOp2(data[i]);
}
}
If multiple stores share the same mask, is this creates multiple guard branches per iteration ?
i.e.
vector.loop:
%data = load data[i]
%cmp = icmp ... cond[i]
%any1 = any-of(%cmp)
branch %any1 -> if.bb.1, split.1
if.bb.1:
%exp1 = ExpensiveOp1(%data)
masked-store(%exp1, A, %cmp)
-> split.1
split.1:
%any2 = any-of(%cmp)
branch %any2 -> if.bb.2, split.2
if.bb.2:
%exp2 = ExpensiveOp2(%data)
masked-store(%exp2, B, %cmp)
-> split.2
There was a problem hiding this comment.
Yes, 1 conditional block will be generated for each masked store.
To merge multiple conditional blocks with same mask into same conditional block needs some extra works (e.g. make sure this merge is legal).
I would like to keep this patch simple and we can do this optimization in the future.
There was a problem hiding this comment.
This confirms the core concern of store-rooting: for if (cond[i]) { A[i]=Op1(..); B[i]=Op2(..); }, store-rooting emits a separate any-of + branch per store despite one shared predicate — redundant branch/code-size overhead. "Merge same-mask blocks later" is just re-deriving the block as the unit of guarding, which is where BOSCC starts. A block-centric redesign yields one branch by construction addresses this and previous limitations highlighted in my comments — strong evidence that per-store rooting is the wrong primitive.
|
Ping! |
|
Ping x2! |
There was a problem hiding this comment.
Rebased and updated. Also addressed some of comments from @nema-ashutosh.
I've updated the branch but somehow the PR isn't updated.
| } | ||
| } | ||
|
|
||
| if (MaskedStores.empty()) |
There was a problem hiding this comment.
Yes. This patch focus on masked store first. We could expand this to support more operation in the futue.
| }; | ||
|
|
||
| SmallVector<SetVector<VPRecipeBase *>> Tries; | ||
| while (!MaskedStores.empty()) { |
There was a problem hiding this comment.
Yes, 1 conditional block will be generated for each masked store.
To merge multiple conditional blocks with same mask into same conditional block needs some extra works (e.g. make sure this merge is legal).
I would like to keep this patch simple and we can do this optimization in the future.
| bool Changed; | ||
| do { | ||
| Changed = false; | ||
| for (VPRecipeBase *R : CurrentTree) { |
There was a problem hiding this comment.
Live-out value from the conditional VPBB needs extra works.
I would like to keep first patch simple and support this in the future patch.
| for (VPRecipeBase *R : ToRemove) | ||
| CurrentTree.remove(R); | ||
|
|
||
| if (CurrentTree.size() > 1) |
There was a problem hiding this comment.
Removed, since this optimization is already controlled by flag and a masked store usually can be move the conditional VPBB with vector-pointer.
We could add the cost-model based decision (or flag) to control this in the future.
nema-ashutosh
left a comment
There was a problem hiding this comment.
Thanks for working on this, the patch is clean, but my main concern is structural.
Rooting the transform at masked stores is the wrong primitive: the inline limitations (skipped compute-heavy blocks, live-outs, redundant per-store guards) are all symptoms of store-rooting. The original BOSCC approach guards a whole conditional block, which dissolves these naturally. I'd prefer we settle block-vs-store before landing this (see also #197423).
| } | ||
| } | ||
|
|
||
| if (MaskedStores.empty()) |
There was a problem hiding this comment.
This is more than scoping — it's a fundamental design limitation. Because the transform is rooted at stores, anything not anchored by a masked store is structurally invisible: conditional blocks dominated by expensive compute (FP math, sqrt/log, gathers, replicated/interleave recipes) are significant opportunities where guarding pays off, and they're skipped by construction. BOSCC guards the whole block on the condition code, regardless of whether a store terminates it. A per-block redesign gets these for free instead of as future reworks.
| }; | ||
|
|
||
| SmallVector<SetVector<VPRecipeBase *>> Tries; | ||
| while (!MaskedStores.empty()) { |
There was a problem hiding this comment.
This confirms the core concern of store-rooting: for if (cond[i]) { A[i]=Op1(..); B[i]=Op2(..); }, store-rooting emits a separate any-of + branch per store despite one shared predicate — redundant branch/code-size overhead. "Merge same-mask blocks later" is just re-deriving the block as the unit of guarding, which is where BOSCC starts. A block-centric redesign yields one branch by construction addresses this and previous limitations highlighted in my comments — strong evidence that per-store rooting is the wrong primitive.
| bool Changed; | ||
| do { | ||
| Changed = false; | ||
| for (VPRecipeBase *R : CurrentTree) { |
There was a problem hiding this comment.
Agree this requires extra work, which may require some portion to be redesigned.
This patch add a attribute in TTI to let LV knows which is better. Default value of preferControlFlowVectorization is false to match current TTI implementation. If preferFlattenControlFlow() return true, LV will try to generate conditional VPBB if possible.
This patch add the transformation that convert flatten control flow
with conditional vector basic block.
This transformation can help program skip masked operations without any
active lane.
First, this transformation will collect all masked stores and operands
bottom-up. And put these msaked operations into a new vector basic
block.
Second, this transformation will split original vector loop and insert
the new basic block between split blocks. And update the conditional
branch in the orignal blocks.
E.g.
Before: {
vector.loop:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
After: {
vector.loop:
...
%any.active.mask = any-of(%mask)
BranchOnCond %any.active.mask
Successors vector.if.bb, vector.loop.split
vector.if.bb:
... (Masked operations)
Successors vector.loop.split
vector.loop.split:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
c1ef8f0 to
3838e24
Compare
Thanks for your comments! I'm working on some refactor this patch to make it possible to support (live-out, compute-heavy blocks, common mask merging, etc) in the future. |
This patch create an overview of VPlan transformation of creating conditional VPBB and will be split off into multiple patches later.
RFC: https://discourse.llvm.org/t/rfc-lv-generating-conditional-vpbb-that-will-be-skip-when-the-mask-is-inactive-in-vplan/86591
This patch add the transformation that convert flatten control flow
with conditional vector basic block.
This transformation can help program skip masked operations without any
active lane.
First, this transformation will collect all masked stores and operands
bottom-up. And put these masked operations into a new vector basic
block.
Second, this transformation will split original vector loop and insert
the new basic block between split blocks. And update the conditional
branch in the original blocks.
E.g.
Before: {
vector.loop:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
After: {
vector.loop:
...
%any.active.mask = any-of(%mask)
BranchOnCount %any.active.mask, 0
Successors vector.loop.split, vector.if.bb
vector.if.bb:
... (Masked operations)
Successors vector.loop.split
vector.loop.split:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
Co-authored-by: nikolaypanchenko kolya.panchenko@sifive.com