antgroup
diff --git a/‎src/simd/avx.cpp‎
Lines changed: 95 additions & 841 deletions b/‎src/simd/avx.cpp‎
Lines changed: 95 additions & 841 deletions
diff --git a/‎src/simd/avx2.cpp‎
Lines changed: 108 additions & 1037 deletions b/‎src/simd/avx2.cpp‎
Lines changed: 108 additions & 1037 deletions
diff --git a/‎src/simd/avx512.cpp‎
Lines changed: 107 additions & 975 deletions b/‎src/simd/avx512.cpp‎
Lines changed: 107 additions & 975 deletions
diff --git a/‎src/simd/generic.cpp‎
Lines changed: 96 additions & 236 deletions b/‎src/simd/generic.cpp‎
Lines changed: 96 additions & 236 deletions
diff --git a/‎src/simd/kernels/binary_op.h‎
Lines changed: 75 additions & 0 deletions b/‎src/simd/kernels/binary_op.h‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎src/simd/kernels/bit_op.h‎
Lines changed: 123 additions & 0 deletions b/‎src/simd/kernels/bit_op.h‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎src/simd/kernels/butterfly.h‎
Lines changed: 96 additions & 0 deletions b/‎src/simd/kernels/butterfly.h‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎src/simd/kernels/compute_batch4.h‎
Lines changed: 102 additions & 0 deletions b/‎src/simd/kernels/compute_batch4.h‎
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,75 @@
+
+// Copyright 2024-present the vsag project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Element-wise binary op kernel: z[i] = op(x[i], y[i]).
+// Used by FP32Add / FP32Sub / FP32Mul / FP32Div across all ISAs.
+//
+// Op selects the per-element operation via a tag dispatched at compile time
+// to the corresponding traits method (add/sub/mul/div). The Generic backend
+// (Width == 1) compiles out the fallback branch via `if constexpr`.
+
+#include <cstdint>
+
+namespace vsag::simd {
+
+// Pointer to the routine BinaryOpImpl delegates to for inputs shorter than
+// one vector and for the leftover tail; invoked as fallback(x, y, z, count).
+using BinaryFallback = void (*)(const float*, const float*, float*, uint64_t);
+
+// Compile-time selector for the element-wise operation. binary_apply maps
+// each value to the matching traits method (add / sub / mul / div).
+enum class BinaryOp { Add, Sub, Mul, Div };
+
+// Applies the element-wise operation selected by `Op` to two vectors by
+// forwarding to T::add, T::sub, T::mul or T::div. The choice is made with
+// `if constexpr`, so only the selected traits method is instantiated.
+// Any Op value other than Add / Sub / Mul takes the T::div branch.
+template <typename T, BinaryOp Op>
+inline __attribute__((always_inline)) auto
+binary_apply(typename T::FloatVec a, typename T::FloatVec b) -> typename T::FloatVec {
+    constexpr bool kIsAdd = (Op == BinaryOp::Add);
+    constexpr bool kIsSub = (Op == BinaryOp::Sub);
+    constexpr bool kIsMul = (Op == BinaryOp::Mul);
+
+    if constexpr (kIsAdd) {
+        return T::add(a, b);
+    } else if constexpr (kIsSub) {
+        return T::sub(a, b);
+    } else if constexpr (kIsMul) {
+        return T::mul(a, b);
+    } else {
+        return T::div(a, b);
+    }
+}
+
+// Element-wise kernel: z[i] = op(x[i], y[i]) for i in [0, dim), with the
+// operation chosen at compile time by `Op`.
+//
+//   x, y      input arrays of at least `dim` floats
+//   z         output array of at least `dim` floats
+//   dim       number of elements to process
+//   fallback  optional routine for the elements the vector loop cannot
+//             cover (inputs shorter than one vector, and the tail)
+//
+// With T::Width == 1 the main loop covers every element and the fallback
+// code is compiled out. With wider traits, leftover elements go to
+// `fallback` when one is supplied; when it is left at its default nullptr
+// they are computed with plain scalar arithmetic instead of calling through
+// a null pointer.
+template <typename T, BinaryOp Op>
+inline void
+BinaryOpImpl(
+    const float* x, const float* y, float* z, uint64_t dim, BinaryFallback fallback = nullptr) {
+    using V = typename T::FloatVec;
+    constexpr int W = T::Width;
+
+    if constexpr (W > 1) {
+        // Shorter than a single vector: hand the whole input to the fallback.
+        // Without a fallback the scalar tail below does the work.
+        if (fallback != nullptr && dim < static_cast<uint64_t>(W)) {
+            fallback(x, y, z, dim);
+            return;
+        }
+    }
+
+    uint64_t i = 0;
+    for (; i + W <= dim; i += W) {
+        V a = T::load(x + i);
+        V b = T::load(y + i);
+        T::store(z + i, binary_apply<T, Op>(a, b));
+    }
+
+    if constexpr (W > 1) {
+        if (dim > i) {
+            if (fallback != nullptr) {
+                fallback(x + i, y + i, z + i, dim - i);
+            } else {
+                // Scalar tail mirroring the branch order of binary_apply.
+                for (; i < dim; ++i) {
+                    if constexpr (Op == BinaryOp::Add) {
+                        z[i] = x[i] + y[i];
+                    } else if constexpr (Op == BinaryOp::Sub) {
+                        z[i] = x[i] - y[i];
+                    } else if constexpr (Op == BinaryOp::Mul) {
+                        z[i] = x[i] * y[i];
+                    } else {
+                        z[i] = x[i] / y[i];
+                    }
+                }
+            }
+        }
+    }
+}
+
+}  // namespace vsag::simd
@@ -0,0 +1,123 @@
+// Copyright 2024-present the vsag project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Bitwise operation kernels: AND, OR, XOR, NOT.
+//
+// Parameterized on BitTraits<ISA>, which must expose:
+//   IntVec              - integer vector type (e.g. __m128i, __m256i, __m512i)
+//   ByteWidth           - number of bytes per vector (16, 32, 64)
+//   load(const uint8_t* p)     -> IntVec
+//   store(uint8_t* p, IntVec v)
+//   bit_and(IntVec a, IntVec b) -> IntVec
+//   bit_or(IntVec a, IntVec b)  -> IntVec
+//   bit_xor(IntVec a, IntVec b) -> IntVec
+//   bit_not(IntVec a)            -> IntVec    (optional, only needed for BitNotImpl)
+
+#include <cstdint>
+
+namespace vsag::simd {
+
+// Fallback for the two-input kernels (AND / OR / XOR); invoked as
+// fallback(x, y, num_byte, result) on the bytes the vector loop leaves over.
+using BitOpFallback = void (*)(const uint8_t*, const uint8_t*, uint64_t, uint8_t*);
+// Fallback for BitNotImpl; invoked as fallback(x, num_byte, result).
+using BitNotFallback = void (*)(const uint8_t*, uint64_t, uint8_t*);
+
+// Computes result[k] = x[k] & y[k] for k in [0, num_byte).
+//
+// Full T::ByteWidth-sized chunks go through T::load / T::bit_and / T::store.
+// Whatever is left over (including inputs shorter than one vector) is handed
+// to `fallback`. When `fallback` is left at its default nullptr the leftover
+// bytes are processed with a scalar loop instead of calling through a null
+// pointer. A zero-length input returns immediately.
+template <typename T>
+inline void
+BitAndImpl(const uint8_t* x,
+           const uint8_t* y,
+           uint64_t num_byte,
+           uint8_t* result,
+           BitOpFallback fallback = nullptr) {
+    constexpr uint64_t W = static_cast<uint64_t>(T::ByteWidth);
+    if (num_byte == 0) {
+        return;
+    }
+
+    // The index is unsigned like the length, so no signed/unsigned casts are
+    // needed; `num_byte - i >= W` cannot wrap because i never exceeds num_byte.
+    uint64_t i = 0;
+    for (; num_byte - i >= W; i += W) {
+        T::store(result + i, T::bit_and(T::load(x + i), T::load(y + i)));
+    }
+    if (i == num_byte) {
+        return;
+    }
+
+    if (fallback != nullptr) {
+        fallback(x + i, y + i, num_byte - i, result + i);
+        return;
+    }
+    for (; i < num_byte; ++i) {
+        result[i] = static_cast<uint8_t>(x[i] & y[i]);
+    }
+}
+
+// Computes result[k] = x[k] | y[k] for k in [0, num_byte).
+//
+// Full T::ByteWidth-sized chunks go through T::load / T::bit_or / T::store.
+// Whatever is left over (including inputs shorter than one vector) is handed
+// to `fallback`. When `fallback` is left at its default nullptr the leftover
+// bytes are processed with a scalar loop instead of calling through a null
+// pointer. A zero-length input returns immediately.
+template <typename T>
+inline void
+BitOrImpl(const uint8_t* x,
+          const uint8_t* y,
+          uint64_t num_byte,
+          uint8_t* result,
+          BitOpFallback fallback = nullptr) {
+    constexpr uint64_t W = static_cast<uint64_t>(T::ByteWidth);
+    if (num_byte == 0) {
+        return;
+    }
+
+    // The index is unsigned like the length, so no signed/unsigned casts are
+    // needed; `num_byte - i >= W` cannot wrap because i never exceeds num_byte.
+    uint64_t i = 0;
+    for (; num_byte - i >= W; i += W) {
+        T::store(result + i, T::bit_or(T::load(x + i), T::load(y + i)));
+    }
+    if (i == num_byte) {
+        return;
+    }
+
+    if (fallback != nullptr) {
+        fallback(x + i, y + i, num_byte - i, result + i);
+        return;
+    }
+    for (; i < num_byte; ++i) {
+        result[i] = static_cast<uint8_t>(x[i] | y[i]);
+    }
+}
+
+// Computes result[k] = x[k] ^ y[k] for k in [0, num_byte).
+//
+// Full T::ByteWidth-sized chunks go through T::load / T::bit_xor / T::store.
+// Whatever is left over (including inputs shorter than one vector) is handed
+// to `fallback`. When `fallback` is left at its default nullptr the leftover
+// bytes are processed with a scalar loop instead of calling through a null
+// pointer. A zero-length input returns immediately.
+template <typename T>
+inline void
+BitXorImpl(const uint8_t* x,
+           const uint8_t* y,
+           uint64_t num_byte,
+           uint8_t* result,
+           BitOpFallback fallback = nullptr) {
+    constexpr uint64_t W = static_cast<uint64_t>(T::ByteWidth);
+    if (num_byte == 0) {
+        return;
+    }
+
+    // The index is unsigned like the length, so no signed/unsigned casts are
+    // needed; `num_byte - i >= W` cannot wrap because i never exceeds num_byte.
+    uint64_t i = 0;
+    for (; num_byte - i >= W; i += W) {
+        T::store(result + i, T::bit_xor(T::load(x + i), T::load(y + i)));
+    }
+    if (i == num_byte) {
+        return;
+    }
+
+    if (fallback != nullptr) {
+        fallback(x + i, y + i, num_byte - i, result + i);
+        return;
+    }
+    for (; i < num_byte; ++i) {
+        result[i] = static_cast<uint8_t>(x[i] ^ y[i]);
+    }
+}
+
+// Computes result[k] = ~x[k] for k in [0, num_byte).
+//
+// Full T::ByteWidth-sized chunks go through T::load / T::bit_not / T::store.
+// Whatever is left over (including inputs shorter than one vector) is handed
+// to `fallback`. When `fallback` is left at its default nullptr the leftover
+// bytes are processed with a scalar loop instead of calling through a null
+// pointer. A zero-length input returns immediately.
+template <typename T>
+inline void
+BitNotImpl(const uint8_t* x,
+           uint64_t num_byte,
+           uint8_t* result,
+           BitNotFallback fallback = nullptr) {
+    constexpr uint64_t W = static_cast<uint64_t>(T::ByteWidth);
+    if (num_byte == 0) {
+        return;
+    }
+
+    // The index is unsigned like the length, so no signed/unsigned casts are
+    // needed; `num_byte - i >= W` cannot wrap because i never exceeds num_byte.
+    uint64_t i = 0;
+    for (; num_byte - i >= W; i += W) {
+        T::store(result + i, T::bit_not(T::load(x + i)));
+    }
+    if (i == num_byte) {
+        return;
+    }
+
+    if (fallback != nullptr) {
+        fallback(x + i, num_byte - i, result + i);
+        return;
+    }
+    for (; i < num_byte; ++i) {
+        // ~ promotes to int; narrow back to one byte explicitly.
+        result[i] = static_cast<uint8_t>(~x[i]);
+    }
+}
+
+}  // namespace vsag::simd
@@ -0,0 +1,96 @@
+// Copyright 2024-present the vsag project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Butterfly-pattern kernels used in Fast Hadamard Transform (FHT):
+//   RotateOp:  with a = data[i+j] and b = data[i+j+step] read up front,
+//              data[i+j]      = a + b
+//              data[i+j+step] = a - b
+//   KacsWalk:  in-place butterfly on two halves of the array.
+//
+// These use only load/store/add/sub from the traits interface.
+
+#include <cmath>
+#include <cstdint>
+
+namespace vsag::simd {
+
+// Fallback signature for RotateOp. Its parameter types (float*, int, int, int)
+// line up with RotateOpImpl's (data, idx, dim_, step); RotateOpImpl itself
+// takes no fallback argument.
+using RotateOpFallback = void (*)(float*, int, int, int);
+// Fallback for KacsWalkImpl; invoked as fallback(data, len) when len / 2 is
+// smaller than one vector.
+using KacsWalkFallback = void (*)(float*, uint64_t);
+
+// RotateOp: butterfly on stride `step` within [idx, dim_).
+//
+// Starting at `idx` and advancing by 2 * step, each group pairs data[i + j]
+// with data[i + j + step] for j in [0, step) and overwrites the pair with
+// its sum and difference (both computed from the values read before either
+// store). Runs of T::Width lanes use the traits' load / add / sub / store.
+// When step is smaller than T::Width, or not a multiple of it, the remaining
+// pairs are handled by the scalar loop.
+//
+// A non-positive `step` would never advance the outer loop, so it is
+// rejected up front and the data is left untouched.
+//
+// NOTE(review): the caller appears responsible for keeping i + j + step
+// inside the buffer (e.g. dim_ - idx a multiple of 2 * step) - confirm.
+template <typename T>
+inline void
+RotateOpImpl(float* data, int idx, int dim_, int step) {
+    using V = typename T::FloatVec;
+    constexpr int W = T::Width;
+
+    if (step <= 0) {
+        return;
+    }
+
+    for (int i = idx; i < dim_; i += step * 2) {
+        float* lo = data + i;   // first half of the current group
+        float* hi = lo + step;  // second half, `step` elements further on
+        int j = 0;
+        for (; j + W <= step; j += W) {
+            const V g1 = T::load(lo + j);
+            const V g2 = T::load(hi + j);
+            T::store(lo + j, T::add(g1, g2));
+            T::store(hi + j, T::sub(g1, g2));
+        }
+        for (; j < step; ++j) {
+            const float g1 = lo[j];
+            const float g2 = hi[j];
+            lo[j] = g1 + g2;
+            hi[j] = g1 - g2;
+        }
+    }
+}
+
+// KacsWalk: in-place butterfly on data[0..len/2-1] vs data[offset..offset+len/2-1],
+// where offset = len/2 + (len % 2). Each pair (x, y) becomes (x + y, x - y).
+// For odd-length arrays, the middle element is scaled by sqrt(2).
+//
+// When len / 2 is smaller than one vector (T::Width > 1) the whole call is
+// delegated to `fallback` if one was supplied. Without a fallback the scalar
+// loop below processes every pair, so the default nullptr is never called.
+template <typename T>
+inline void
+KacsWalkImpl(float* data, uint64_t len, KacsWalkFallback fallback = nullptr) {
+    using V = typename T::FloatVec;
+    constexpr int W = T::Width;
+
+    const uint64_t half = len / 2;
+
+    if constexpr (W > 1) {
+        if (fallback != nullptr && half < static_cast<uint64_t>(W)) {
+            fallback(data, len);
+            return;
+        }
+    }
+
+    const uint64_t base = len % 2;        // 1 when there is a middle element
+    const uint64_t offset = base + half;  // start of the second half
+    uint64_t i = 0;
+
+    for (; i + W <= half; i += W) {
+        V x = T::load(&data[i]);
+        V y = T::load(&data[i + offset]);
+        T::store(&data[i], T::add(x, y));
+        T::store(&data[i + offset], T::sub(x, y));
+    }
+
+    // Scalar tail: pairs that do not fill a whole vector.
+    for (; i < half; i++) {
+        const float x = data[i];
+        const float y = data[i + offset];
+        data[i] = x + y;
+        data[i + offset] = x - y;
+    }
+
+    if (base != 0) {
+        data[half] *= std::sqrt(2.0f);
+    }
+}
+
+}  // namespace vsag::simd
@@ -0,0 +1,102 @@
+
+// Copyright 2024-present the vsag project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Batch-of-4 IP / L2 kernel: one query vector against four code vectors.
+// Results are accumulated into r1..r4 (the caller must initialise
+// them before invocation, e.g. to 0). Matches the existing semantics of
+// FP32ComputeIPBatch4 / FP32ComputeL2SqrBatch4: the four accumulators
+// share the same query load, so we get 4x reuse of every q-cacheline.
+
+#include <cstdint>
+
+#include "simd/simd_marco.h"
+
+namespace vsag::simd {
+
+// Routine ComputeBatch4Impl delegates to for inputs shorter than one vector
+// and for the leftover tail. It receives the same arguments as the kernel
+// (query, dim, four code pointers, four result references). The tail call
+// is made after the vector partial sums were added to r1..r4, so the
+// fallback is expected to add to the results rather than overwrite them.
+using Batch4Fallback = void (*)(const float* RESTRICT query,
+                                uint64_t dim,
+                                const float* RESTRICT c1,
+                                const float* RESTRICT c2,
+                                const float* RESTRICT c3,
+                                const float* RESTRICT c4,
+                                float& r1,
+                                float& r2,
+                                float& r3,
+                                float& r4);
+
+// Compile-time selector for the accumulated quantity: IP feeds (q, c) to
+// T::fmadd directly, L2 feeds the difference q - c to T::fmadd twice.
+enum class Batch4Kind { IP, L2 };
+
+// One accumulation step for a single (query, code) vector pair.
+//   Kind == IP : returns T::fmadd(q, c, acc)
+//   otherwise  : forms d = T::sub(q, c) and returns T::fmadd(d, d, acc)
+// NOTE(review): T::fmadd(a, b, acc) presumably computes a * b + acc - confirm
+// against the traits definition.
+template <typename T, Batch4Kind Kind>
+inline __attribute__((always_inline)) auto
+batch4_accumulate(typename T::FloatVec q, typename T::FloatVec c, typename T::FloatVec acc)
+    -> typename T::FloatVec {
+    using Vec = typename T::FloatVec;
+    constexpr bool kUsesDifference = (Kind != Batch4Kind::IP);
+
+    if constexpr (kUsesDifference) {
+        const Vec diff = T::sub(q, c);
+        return T::fmadd(diff, diff, acc);
+    } else {
+        return T::fmadd(q, c, acc);
+    }
+}
+
+// Accumulates, for one query against four code vectors, either the inner
+// product (Kind == IP) or the squared difference sum (Kind == L2) into
+// r1..r4.
+//
+//   query     query vector, at least `dim` floats
+//   dim       number of elements per vector
+//   c1..c4    the four code vectors, at least `dim` floats each
+//   r1..r4    running results; they are added to, never reset here
+//   fallback  optional routine for elements the vector loop cannot cover
+//
+// Each loop iteration loads the query chunk once and reuses it for all four
+// accumulators. With T::Width > 1, inputs shorter than one vector and the
+// leftover tail go to `fallback` when one is supplied. When it is left at
+// its default nullptr those elements are accumulated with scalar arithmetic
+// instead of calling through a null pointer.
+template <typename T, Batch4Kind Kind>
+inline void
+ComputeBatch4Impl(const float* RESTRICT query,
+                  uint64_t dim,
+                  const float* RESTRICT c1,
+                  const float* RESTRICT c2,
+                  const float* RESTRICT c3,
+                  const float* RESTRICT c4,
+                  float& r1,
+                  float& r2,
+                  float& r3,
+                  float& r4,
+                  Batch4Fallback fallback = nullptr) {
+    using V = typename T::FloatVec;
+    constexpr int W = T::Width;
+
+    if constexpr (W > 1) {
+        // Shorter than a single vector: delegate everything to the fallback.
+        // Without a fallback the scalar tail below does the work.
+        if (fallback != nullptr && dim < static_cast<uint64_t>(W)) {
+            fallback(query, dim, c1, c2, c3, c4, r1, r2, r3, r4);
+            return;
+        }
+    }
+
+    V s1 = T::zero();
+    V s2 = T::zero();
+    V s3 = T::zero();
+    V s4 = T::zero();
+
+    uint64_t i = 0;
+    for (; i + W <= dim; i += W) {
+        V q = T::load(query + i);
+        s1 = batch4_accumulate<T, Kind>(q, T::load(c1 + i), s1);
+        s2 = batch4_accumulate<T, Kind>(q, T::load(c2 + i), s2);
+        s3 = batch4_accumulate<T, Kind>(q, T::load(c3 + i), s3);
+        s4 = batch4_accumulate<T, Kind>(q, T::load(c4 + i), s4);
+    }
+    // Fold the per-lane partial sums into the caller's running results.
+    r1 += T::reduce_add(s1);
+    r2 += T::reduce_add(s2);
+    r3 += T::reduce_add(s3);
+    r4 += T::reduce_add(s4);
+
+    if constexpr (W > 1) {
+        if (dim > i) {
+            if (fallback != nullptr) {
+                fallback(query + i, dim - i, c1 + i, c2 + i, c3 + i, c4 + i, r1, r2, r3, r4);
+            } else {
+                // Scalar tail following the same IP / L2 split as
+                // batch4_accumulate.
+                for (; i < dim; ++i) {
+                    const float q = query[i];
+                    if constexpr (Kind == Batch4Kind::IP) {
+                        r1 += q * c1[i];
+                        r2 += q * c2[i];
+                        r3 += q * c3[i];
+                        r4 += q * c4[i];
+                    } else {
+                        const float d1 = q - c1[i];
+                        const float d2 = q - c2[i];
+                        const float d3 = q - c3[i];
+                        const float d4 = q - c4[i];
+                        r1 += d1 * d1;
+                        r2 += d2 * d2;
+                        r3 += d3 * d3;
+                        r4 += d4 * d4;
+                    }
+                }
+            }
+        }
+    }
+}
+
+}  // namespace vsag::simd