From 6da6c9cc7ba8c331b0a47f7a19441ca0a3c63d15 Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Wed, 13 May 2026 17:32:32 +0100 Subject: [PATCH 1/4] Add MicroBenchmark for Small Trip Count Loop vectorization For targets where getMinTripCountTailFoldingThreshold returns a value greater than zero, https://github.com/llvm/llvm-project/pull/195823 has enabled better vectorization of loops where applicable. This micro benchmark is intended to show the impact of these changes on the relevant targets. For targets where getMinTripCountTailFoldingThreshold returns zero, there will be no effect on runtime when comparing scalar vs vector. --- .../LoopVectorization/CMakeLists.txt | 1 + .../LoopVectorization/SmallLoopTripCount.cpp | 102 ++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt index 516cafbe6141..9280d4d18e0b 100644 --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -14,6 +14,7 @@ llvm_test_executable(LoopVectorizationBenchmarks main.cpp MathFunctions.cpp RuntimeChecks.cpp + SmallLoopTripCount.cpp VectorOperations.cpp EarlyExit.cpp ) diff --git a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp new file mode 100644 index 000000000000..b6d50f305550 --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp @@ -0,0 +1,102 @@ +// This program tests the performance impact of vectorization in loops with +// small trip counts. These cases exercise the LoopVectorize path that accepts +// trip counts one larger than the vectorization factor. 
+ +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define NOINLINE __attribute__((noinline)) +#define LOOP_VECTORIZE_ENABLE \ + _Pragma("clang loop vectorize(enable) unroll(disable)") +#define LOOP_VECTORIZE_DISABLE \ + _Pragma("clang loop vectorize(disable) interleave(disable) unroll(disable)") +#define LOOP_INTERLEAVE_COUNT_2 \ + _Pragma("clang loop vectorize(enable) interleave_count(2) unroll(disable)") + +static uint64_t g_small_loop_trip_count_sum = 0; + +template +NOINLINE void loopTc5Vector(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_ENABLE + for (uint64_t I = 0; I != 5; ++I) + B[I] = A[I] + static_cast(1); +} + +template +NOINLINE void loopTc5Scalar(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_DISABLE + for (uint64_t I = 0; I != 5; ++I) + B[I] = A[I] + static_cast(1); +} + +NOINLINE void loopTc5I64InterleaveCount2Vector(const uint64_t *__restrict A, + uint64_t *__restrict B) { + LOOP_INTERLEAVE_COUNT_2 + for (uint64_t I = 0; I != 5; ++I) + B[I] = A[I] + 1; +} + +template using KernelFn = void (*)(const Ty *, Ty *); + +template static void initData(std::array &A) { + for (size_t I = 0; I != A.size(); ++I) + A[I] = static_cast(0x0102030405060708ULL + I); +} + +template static uint64_t checksum(const std::array &A) { + uint64_t Sum = 0; + for (size_t I = 0; I != 5; ++I) { + auto Value = static_cast(A[I]); + for (size_t Byte = 0; Byte != sizeof(Ty); ++Byte) { + Sum = Sum * 131 + (Value & std::numeric_limits::max()); + Value >>= 8; + } + } + return Sum; +} + +template +static void runBenchForSmallLoopTripCount(benchmark::State &State, + KernelFn Fn) { + std::array A; + std::array B = {}; + initData(A); + + for (auto _ : State) { + benchmark::DoNotOptimize(A.data()); + benchmark::DoNotOptimize(B.data()); + Fn(A.data(), B.data()); + benchmark::ClobberMemory(); + } + + g_small_loop_trip_count_sum ^= checksum(B); + benchmark::DoNotOptimize(g_small_loop_trip_count_sum); + 
State.SetItemsProcessed(State.iterations() * 5); +} + +template void benchTc5Vector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5Vector); +} + +template void benchTc5Scalar(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5Scalar); +} + +void benchTc5I64InterleaveCount2Vector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, + loopTc5I64InterleaveCount2Vector); +} + +BENCHMARK_TEMPLATE(benchTc5Vector, uint8_t)->Name("tc5/i8/vector"); +BENCHMARK_TEMPLATE(benchTc5Scalar, uint8_t)->Name("tc5/i8/scalar"); +BENCHMARK_TEMPLATE(benchTc5Vector, uint16_t)->Name("tc5/i16/vector"); +BENCHMARK_TEMPLATE(benchTc5Scalar, uint16_t)->Name("tc5/i16/scalar"); +BENCHMARK_TEMPLATE(benchTc5Vector, uint32_t)->Name("tc5/i32/vector"); +BENCHMARK_TEMPLATE(benchTc5Scalar, uint32_t)->Name("tc5/i32/scalar"); +BENCHMARK_TEMPLATE(benchTc5Vector, uint64_t)->Name("tc5/i64/vector"); +BENCHMARK_TEMPLATE(benchTc5Scalar, uint64_t)->Name("tc5/i64/scalar"); +BENCHMARK(benchTc5I64InterleaveCount2Vector)->Name("tc5/i64/ic2/vector"); From 9b8a4ae65426428f0108af326de2f8f04405aff6 Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Tue, 19 May 2026 10:39:38 +0000 Subject: [PATCH 2/4] Respond to review comments --- .../LoopVectorization/SmallLoopTripCount.cpp | 110 +++++++++++++----- 1 file changed, 84 insertions(+), 26 deletions(-) diff --git a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp index b6d50f305550..8ec377722662 100644 --- a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp +++ b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include "benchmark/benchmark.h" @@ -17,8 +16,6 @@ #define LOOP_INTERLEAVE_COUNT_2 \ _Pragma("clang loop vectorize(enable) interleave_count(2) unroll(disable)") -static uint64_t g_small_loop_trip_count_sum = 0; - template NOINLINE void loopTc5Vector(const Ty *__restrict A, Ty 
*__restrict B) { LOOP_VECTORIZE_ENABLE @@ -33,13 +30,55 @@ NOINLINE void loopTc5Scalar(const Ty *__restrict A, Ty *__restrict B) { B[I] = A[I] + static_cast(1); } -NOINLINE void loopTc5I64InterleaveCount2Vector(const uint64_t *__restrict A, - uint64_t *__restrict B) { +template +NOINLINE void loopTc5I64InterleaveCount2Vector(const Ty *__restrict A, Ty *__restrict B) { LOOP_INTERLEAVE_COUNT_2 for (uint64_t I = 0; I != 5; ++I) - B[I] = A[I] + 1; + B[I] = A[I] + static_cast(1); +} + +template +NOINLINE void loopTc5ScalarizedDivVector(const Ty *__restrict A, + Ty *__restrict B) { + LOOP_VECTORIZE_ENABLE + for (uint64_t I = 0; I != 5; ++I) { + Ty Den = (A[I] & static_cast(15)) + static_cast(1); + B[I] = (A[I] * static_cast(13)) / Den; + } +} + +template +NOINLINE void loopTc5ScalarizedDivScalar(const Ty *__restrict A, + Ty *__restrict B) { + LOOP_VECTORIZE_DISABLE + for (uint64_t I = 0; I != 5; ++I) { + Ty Den = (A[I] & static_cast(15)) + static_cast(1); + B[I] = (A[I] * static_cast(13)) / Den; + } +} + +template +NOINLINE void loopTc3Vector(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_ENABLE + for (uint64_t I = 0; I != 3; ++I) + B[I] = A[I] + static_cast(1); +} + +template +NOINLINE void loopTc3Scalar(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_DISABLE + for (uint64_t I = 0; I != 3; ++I) + B[I] = A[I] + static_cast(1); } +template +NOINLINE void loopTc3I64InterleaveCount2Vector(const Ty *__restrict A, Ty *__restrict B) { + LOOP_INTERLEAVE_COUNT_2 + for (uint64_t I = 0; I != 3; ++I) + B[I] = A[I] + static_cast(1); +} + + template using KernelFn = void (*)(const Ty *, Ty *); template static void initData(std::array &A) { @@ -47,18 +86,6 @@ template static void initData(std::array &A) { A[I] = static_cast(0x0102030405060708ULL + I); } -template static uint64_t checksum(const std::array &A) { - uint64_t Sum = 0; - for (size_t I = 0; I != 5; ++I) { - auto Value = static_cast(A[I]); - for (size_t Byte = 0; Byte != sizeof(Ty); ++Byte) { - Sum 
= Sum * 131 + (Value & std::numeric_limits::max()); - Value >>= 8; - } - } - return Sum; -} - template static void runBenchForSmallLoopTripCount(benchmark::State &State, KernelFn Fn) { @@ -72,10 +99,6 @@ static void runBenchForSmallLoopTripCount(benchmark::State &State, Fn(A.data(), B.data()); benchmark::ClobberMemory(); } - - g_small_loop_trip_count_sum ^= checksum(B); - benchmark::DoNotOptimize(g_small_loop_trip_count_sum); - State.SetItemsProcessed(State.iterations() * 5); } template void benchTc5Vector(benchmark::State &State) { @@ -86,9 +109,32 @@ template void benchTc5Scalar(benchmark::State &State) { runBenchForSmallLoopTripCount(State, loopTc5Scalar); } -void benchTc5I64InterleaveCount2Vector(benchmark::State &State) { - runBenchForSmallLoopTripCount(State, - loopTc5I64InterleaveCount2Vector); +template void benchTc5I64InterleaveCount2Vector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, + loopTc5I64InterleaveCount2Vector); +} + +template +void benchTc5ScalarizedDivVector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5ScalarizedDivVector); +} + +template +void benchTc5ScalarizedDivScalar(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5ScalarizedDivScalar); +} + +template void benchTc3Vector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc3Vector); +} + +template void benchTc3Scalar(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc3Scalar); +} + +template void benchTc3I64InterleaveCount2Vector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, + loopTc3I64InterleaveCount2Vector); } BENCHMARK_TEMPLATE(benchTc5Vector, uint8_t)->Name("tc5/i8/vector"); @@ -99,4 +145,16 @@ BENCHMARK_TEMPLATE(benchTc5Vector, uint32_t)->Name("tc5/i32/vector"); BENCHMARK_TEMPLATE(benchTc5Scalar, uint32_t)->Name("tc5/i32/scalar"); BENCHMARK_TEMPLATE(benchTc5Vector, uint64_t)->Name("tc5/i64/vector"); BENCHMARK_TEMPLATE(benchTc5Scalar, 
uint64_t)->Name("tc5/i64/scalar"); -BENCHMARK(benchTc5I64InterleaveCount2Vector)->Name("tc5/i64/ic2/vector"); +BENCHMARK_TEMPLATE(benchTc5I64InterleaveCount2Vector, uint64_t)->Name("tc5/i64/ic2/vector"); +BENCHMARK_TEMPLATE(benchTc5ScalarizedDivVector, uint64_t)->Name("tc5/i64/scalarized-div/vector"); +BENCHMARK_TEMPLATE(benchTc5ScalarizedDivScalar, uint64_t)->Name("tc5/i64/scalarized-div/scalar"); + +BENCHMARK_TEMPLATE(benchTc3Vector, uint8_t)->Name("tc3/i8/vector"); +BENCHMARK_TEMPLATE(benchTc3Scalar, uint8_t)->Name("tc3/i8/scalar"); +BENCHMARK_TEMPLATE(benchTc3Vector, uint16_t)->Name("tc3/i16/vector"); +BENCHMARK_TEMPLATE(benchTc3Scalar, uint16_t)->Name("tc3/i16/scalar"); +BENCHMARK_TEMPLATE(benchTc3Vector, uint32_t)->Name("tc3/i32/vector"); +BENCHMARK_TEMPLATE(benchTc3Scalar, uint32_t)->Name("tc3/i32/scalar"); +BENCHMARK_TEMPLATE(benchTc3Vector, uint64_t)->Name("tc3/i64/vector"); +BENCHMARK_TEMPLATE(benchTc3Scalar, uint64_t)->Name("tc3/i64/scalar"); +BENCHMARK_TEMPLATE(benchTc3I64InterleaveCount2Vector, uint64_t)->Name("tc3/i64/ic2/vector"); From 5918831ca0b1f7b1f82ca387b5a3e94e4673fedc Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Tue, 19 May 2026 10:40:30 +0000 Subject: [PATCH 3/4] formatting --- .../LoopVectorization/SmallLoopTripCount.cpp | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp index 8ec377722662..1e8894b30c5d 100644 --- a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp +++ b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp @@ -31,7 +31,8 @@ NOINLINE void loopTc5Scalar(const Ty *__restrict A, Ty *__restrict B) { } template -NOINLINE void loopTc5I64InterleaveCount2Vector(const Ty *__restrict A, Ty *__restrict B) { +NOINLINE void loopTc5I64InterleaveCount2Vector(const Ty *__restrict A, + Ty *__restrict B) { LOOP_INTERLEAVE_COUNT_2 for (uint64_t I = 0; I != 5; ++I) B[I] = A[I] + 
static_cast(1); @@ -72,13 +73,13 @@ NOINLINE void loopTc3Scalar(const Ty *__restrict A, Ty *__restrict B) { } template -NOINLINE void loopTc3I64InterleaveCount2Vector(const Ty *__restrict A, Ty *__restrict B) { +NOINLINE void loopTc3I64InterleaveCount2Vector(const Ty *__restrict A, + Ty *__restrict B) { LOOP_INTERLEAVE_COUNT_2 for (uint64_t I = 0; I != 3; ++I) B[I] = A[I] + static_cast(1); } - template using KernelFn = void (*)(const Ty *, Ty *); template static void initData(std::array &A) { @@ -109,9 +110,10 @@ template void benchTc5Scalar(benchmark::State &State) { runBenchForSmallLoopTripCount(State, loopTc5Scalar); } -template void benchTc5I64InterleaveCount2Vector(benchmark::State &State) { +template +void benchTc5I64InterleaveCount2Vector(benchmark::State &State) { runBenchForSmallLoopTripCount(State, - loopTc5I64InterleaveCount2Vector); + loopTc5I64InterleaveCount2Vector); } template @@ -132,9 +134,10 @@ template void benchTc3Scalar(benchmark::State &State) { runBenchForSmallLoopTripCount(State, loopTc3Scalar); } -template void benchTc3I64InterleaveCount2Vector(benchmark::State &State) { +template +void benchTc3I64InterleaveCount2Vector(benchmark::State &State) { runBenchForSmallLoopTripCount(State, - loopTc3I64InterleaveCount2Vector); + loopTc3I64InterleaveCount2Vector); } BENCHMARK_TEMPLATE(benchTc5Vector, uint8_t)->Name("tc5/i8/vector"); @@ -145,9 +148,12 @@ BENCHMARK_TEMPLATE(benchTc5Vector, uint32_t)->Name("tc5/i32/vector"); BENCHMARK_TEMPLATE(benchTc5Scalar, uint32_t)->Name("tc5/i32/scalar"); BENCHMARK_TEMPLATE(benchTc5Vector, uint64_t)->Name("tc5/i64/vector"); BENCHMARK_TEMPLATE(benchTc5Scalar, uint64_t)->Name("tc5/i64/scalar"); -BENCHMARK_TEMPLATE(benchTc5I64InterleaveCount2Vector, uint64_t)->Name("tc5/i64/ic2/vector"); -BENCHMARK_TEMPLATE(benchTc5ScalarizedDivVector, uint64_t)->Name("tc5/i64/scalarized-div/vector"); -BENCHMARK_TEMPLATE(benchTc5ScalarizedDivScalar, uint64_t)->Name("tc5/i64/scalarized-div/scalar"); 
+BENCHMARK_TEMPLATE(benchTc5I64InterleaveCount2Vector, uint64_t) + ->Name("tc5/i64/ic2/vector"); +BENCHMARK_TEMPLATE(benchTc5ScalarizedDivVector, uint64_t) + ->Name("tc5/i64/scalarized-div/vector"); +BENCHMARK_TEMPLATE(benchTc5ScalarizedDivScalar, uint64_t) + ->Name("tc5/i64/scalarized-div/scalar"); BENCHMARK_TEMPLATE(benchTc3Vector, uint8_t)->Name("tc3/i8/vector"); BENCHMARK_TEMPLATE(benchTc3Scalar, uint8_t)->Name("tc3/i8/scalar"); @@ -157,4 +163,5 @@ BENCHMARK_TEMPLATE(benchTc3Vector, uint32_t)->Name("tc3/i32/vector"); BENCHMARK_TEMPLATE(benchTc3Scalar, uint32_t)->Name("tc3/i32/scalar"); BENCHMARK_TEMPLATE(benchTc3Vector, uint64_t)->Name("tc3/i64/vector"); BENCHMARK_TEMPLATE(benchTc3Scalar, uint64_t)->Name("tc3/i64/scalar"); -BENCHMARK_TEMPLATE(benchTc3I64InterleaveCount2Vector, uint64_t)->Name("tc3/i64/ic2/vector"); +BENCHMARK_TEMPLATE(benchTc3I64InterleaveCount2Vector, uint64_t) + ->Name("tc3/i64/ic2/vector"); From 2d40f92c495eb5e2d214a99eb86526c8e33689b6 Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Fri, 29 May 2026 13:20:00 +0000 Subject: [PATCH 4/4] Add benchmarks for examples where vectorisation is not beneficial --- .../LoopVectorization/SmallLoopTripCount.cpp | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp index 1e8894b30c5d..da1dee375cca 100644 --- a/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp +++ b/MicroBenchmarks/LoopVectorization/SmallLoopTripCount.cpp @@ -80,6 +80,38 @@ NOINLINE void loopTc3I64InterleaveCount2Vector(const Ty *__restrict A, B[I] = A[I] + static_cast(1); } +template +NOINLINE void loopTc3SelectionVector(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_ENABLE + for (int i = 0; i < 3; i++) { + B[i] = B[i] < A[i] ? 
B[i] : A[i]; + } +} + +template +NOINLINE void loopTc3SelectionScalar(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_DISABLE + for (int i = 0; i < 3; i++) { + B[i] = B[i] < A[i] ? B[i] : A[i]; + } +} + +template +NOINLINE void loopTc5ModulusVector(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_ENABLE + for (int i = 0; i < 5; i++) { + B[i] %= (A[i] | 1); + } +} + +template +NOINLINE void loopTc5ModulusScalar(const Ty *__restrict A, Ty *__restrict B) { + LOOP_VECTORIZE_DISABLE + for (int i = 0; i < 5; i++) { + B[i] %= (A[i] | 1); + } +} + template using KernelFn = void (*)(const Ty *, Ty *); template static void initData(std::array &A) { @@ -140,6 +172,24 @@ void benchTc3I64InterleaveCount2Vector(benchmark::State &State) { loopTc3I64InterleaveCount2Vector); } +template +void benchloopTc3SelectionVector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc3SelectionVector); +} + +template +void benchloopTc3SelectionScalar(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc3SelectionScalar); +} + +template void benchloopTc5ModulusVector(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5ModulusVector); +} + +template void benchloopTc5ModulusScalar(benchmark::State &State) { + runBenchForSmallLoopTripCount(State, loopTc5ModulusScalar); +} + BENCHMARK_TEMPLATE(benchTc5Vector, uint8_t)->Name("tc5/i8/vector"); BENCHMARK_TEMPLATE(benchTc5Scalar, uint8_t)->Name("tc5/i8/scalar"); BENCHMARK_TEMPLATE(benchTc5Vector, uint16_t)->Name("tc5/i16/vector"); @@ -165,3 +215,15 @@ BENCHMARK_TEMPLATE(benchTc3Vector, uint64_t)->Name("tc3/i64/vector"); BENCHMARK_TEMPLATE(benchTc3Scalar, uint64_t)->Name("tc3/i64/scalar"); BENCHMARK_TEMPLATE(benchTc3I64InterleaveCount2Vector, uint64_t) ->Name("tc3/i64/ic2/vector"); +BENCHMARK_TEMPLATE(benchloopTc3SelectionVector, int8_t) + ->Name("tc3Selection/i8/vector"); +BENCHMARK_TEMPLATE(benchloopTc3SelectionScalar, int8_t) + ->Name("tc3Selection/i8/scalar"); 
+BENCHMARK_TEMPLATE(benchloopTc3SelectionVector, int16_t) + ->Name("tc3Selection/i16/vector"); +BENCHMARK_TEMPLATE(benchloopTc3SelectionScalar, int16_t) + ->Name("tc3Selection/i16/scalar"); +BENCHMARK_TEMPLATE(benchloopTc5ModulusVector, int32_t) + ->Name("tc5Modulus/i32/vector"); +BENCHMARK_TEMPLATE(benchloopTc5ModulusScalar, int32_t) + ->Name("tc5Modulus/i32/scalar");