From 0ff0d30d8a756e7ca17502191d5a6cbe269cfd80 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 2 Jun 2026 14:13:06 -0700
Subject: [PATCH 01/22] #525 adding LagrangeBasis/Serendipity function support
 and unit tests for refactored basis functions

---
 Code/Source/solver/CMakeLists.txt             |   25 +-
 Code/Source/solver/FE/Basis/BasisCache.cpp    |  309 +
 Code/Source/solver/FE/Basis/BasisCache.h      |  456 +
 Code/Source/solver/FE/Basis/BasisExceptions.h |  134 +
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  160 +
 Code/Source/solver/FE/Basis/BasisFactory.h    |   57 +
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  366 +
 Code/Source/solver/FE/Basis/BasisFunction.h   |  426 +
 Code/Source/solver/FE/Basis/BasisTolerance.h  |   52 +
 Code/Source/solver/FE/Basis/BasisTraits.h     |  218 +
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 8323 +++++++++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  175 +
 .../solver/FE/Basis/LagrangeBasisFast.h       | 1378 +++
 .../solver/FE/Basis/LagrangeBasisPyramid.cpp  | 2069 ++++
 .../solver/FE/Basis/LagrangeBasisPyramid.h    |   67 +
 .../solver/FE/Basis/LagrangeBasisSimplex.cpp  | 2457 +++++
 .../solver/FE/Basis/LagrangeBasisSimplex.h    |   78 +
 .../solver/FE/Basis/LagrangeBasisUtility.h    |   25 +
 .../FE/Basis/NodeOrderingConventions.cpp      |  818 ++
 .../solver/FE/Basis/NodeOrderingConventions.h |  538 ++
 .../solver/FE/Basis/PyramidModalBasis.h       |  265 +
 .../solver/FE/Basis/SerendipityBasis.cpp      |  882 ++
 .../Source/solver/FE/Basis/SerendipityBasis.h |   70 +
 Code/Source/solver/FE/Basis/VectorBasis.h     |  255 +
 .../FE/Basis/VectorBasisEvaluationHelpers.cpp |  593 ++
 .../FE/Basis/VectorBasisEvaluationHelpers.h   |  751 ++
 .../FE/Basis/VectorBasisModalPolynomial.h     |   77 +
 Code/Source/solver/FE/Common/Alignment.h      |   23 +
 Code/Source/solver/FE/Common/Types.h          |  532 ++
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  480 +
 .../solver/FE/Math/DenseLinearAlgebra.h       |  119 +
 .../solver/FE/Math/DenseTransformKernels.h    |   78 +
 Code/Source/solver/FE/Math/ExpressionOps.h    |   99 +
 Code/Source/solver/FE/Math/IntegerMath.h      |   98 +
 Code/Source/solver/FE/Math/MathConstants.h    |  388 +
 Code/Source/solver/FE/Math/Matrix.h           | 1487 +++
 Code/Source/solver/FE/Math/MatrixExpr.h       |  626 ++
 Code/Source/solver/FE/Math/Vector.h           |  831 ++
 Code/Source/solver/FE/Math/VectorExpr.h       |  418 +
 .../solver/FE/Quadrature/QuadratureRule.h     |  237 +
 Code/Source/solver/fs.cpp                     |   71 +-
 Code/Source/solver/nn.cpp                     |  666 +-
 .../FE/Basis/test_BasisCacheFactory.cpp       |  256 +
 .../FE/Basis/test_BasisErrorPaths.cpp         |  203 +
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  314 +
 .../FE/Basis/test_ConstexprBasis.cpp          |  226 +
 .../FE/Basis/test_HigherOrderWedgePyramid.cpp |  173 +
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 3028 ++++++
 .../FE/Basis/test_SerendipityTensorModal.cpp  |  116 +
 .../FE/Math/test_DenseLinearAlgebra.cpp       |  265 +
 .../unitTests/FE/Math/test_ExpressionOps.cpp  |  509 +
 .../unitTests/FE/Math/test_MathConstants.cpp  |  341 +
 tests/unitTests/FE/Math/test_Matrix.cpp       |  594 ++
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |  528 ++
 tests/unitTests/FE/Math/test_Vector.cpp       |  589 ++
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |  409 +
 56 files changed, 34681 insertions(+), 47 deletions(-)
 create mode 100644 Code/Source/solver/FE/Basis/BasisCache.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisCache.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisExceptions.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisFactory.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisFactory.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisFunction.cpp
 create mode 100644 Code/Source/solver/FE/Basis/BasisFunction.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisTolerance.h
 create mode 100644 Code/Source/solver/FE/Basis/BasisTraits.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasis.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisFast.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
 create mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
 create mode 100644 Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
 create mode 100644 Code/Source/solver/FE/Basis/NodeOrderingConventions.h
 create mode 100644 Code/Source/solver/FE/Basis/PyramidModalBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/SerendipityBasis.cpp
 create mode 100644 Code/Source/solver/FE/Basis/SerendipityBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasis.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
 create mode 100644 Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
 create mode 100644 Code/Source/solver/FE/Common/Alignment.h
 create mode 100644 Code/Source/solver/FE/Common/Types.h
 create mode 100644 Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
 create mode 100644 Code/Source/solver/FE/Math/DenseLinearAlgebra.h
 create mode 100644 Code/Source/solver/FE/Math/DenseTransformKernels.h
 create mode 100644 Code/Source/solver/FE/Math/ExpressionOps.h
 create mode 100644 Code/Source/solver/FE/Math/IntegerMath.h
 create mode 100644 Code/Source/solver/FE/Math/MathConstants.h
 create mode 100644 Code/Source/solver/FE/Math/Matrix.h
 create mode 100644 Code/Source/solver/FE/Math/MatrixExpr.h
 create mode 100644 Code/Source/solver/FE/Math/Vector.h
 create mode 100644 Code/Source/solver/FE/Math/VectorExpr.h
 create mode 100644 Code/Source/solver/FE/Quadrature/QuadratureRule.h
 create mode 100644 tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_BasisHessians.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
 create mode 100644 tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
 create mode 100644 tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
 create mode 100644 tests/unitTests/FE/Math/test_ExpressionOps.cpp
 create mode 100644 tests/unitTests/FE/Math/test_MathConstants.cpp
 create mode 100644 tests/unitTests/FE/Math/test_Matrix.cpp
 create mode 100644 tests/unitTests/FE/Math/test_MatrixExpr.cpp
 create mode 100644 tests/unitTests/FE/Math/test_Vector.cpp
 create mode 100644 tests/unitTests/FE/Math/test_VectorExpr.cpp

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index c546c2822..e42391862 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -23,15 +23,18 @@
 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
 include_directories(${SV_SOURCE_DIR}/ThirdParty/eigen/include)
+include_directories(${SV_SOURCE_DIR}/ThirdParty/eigen/include/eigen3)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/parmetis_internal/simvascular_parmetis_internal/ParMETISLib)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/tetgen/simvascular_tetgen)
 include_directories(${SV_SOURCE_DIR}/ThirdParty/tinyxml/simvascular_tinyxml)
 include_directories(${MPI_C_INCLUDE_PATH})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/Core)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/FE)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/FE/Common)
 
 # Find Trilinos package if requested
@@ -86,7 +89,7 @@ endif()
 # add trilinos flags and defines
 if(USE_TRILINOS)
   ADD_DEFINITIONS(-DWITH_TRILINOS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20")
 endif()
 
 # Build with the PETSc linear algebra package.
@@ -245,9 +248,27 @@ file(GLOB SOLVER_FE_COMMON_SRCS CONFIGURE_DEPENDS
   FE/Common/*.h
 )
 
+file(GLOB SOLVER_FE_BASIS_SRCS CONFIGURE_DEPENDS  # CONFIGURE_DEPENDS: re-check the glob at build time
+  FE/Basis/*.cpp
+  FE/Basis/*.h
+)
+
+file(GLOB SOLVER_FE_MATH_SRCS CONFIGURE_DEPENDS  # sources and headers under FE/Math
+  FE/Math/*.cpp
+  FE/Math/*.h
+)
+
+file(GLOB SOLVER_FE_QUADRATURE_SRCS CONFIGURE_DEPENDS  # sources and headers under FE/Quadrature
+  FE/Quadrature/*.cpp
+  FE/Quadrature/*.h
+)
+
 list(APPEND CSRCS
   ${SOLVER_CORE_SRCS}
   ${SOLVER_FE_COMMON_SRCS}
+  ${SOLVER_FE_BASIS_SRCS}
+  ${SOLVER_FE_MATH_SRCS}
+  ${SOLVER_FE_QUADRATURE_SRCS}
 )
 
   # Set PETSc interace code.
diff --git a/Code/Source/solver/FE/Basis/BasisCache.cpp b/Code/Source/solver/FE/Basis/BasisCache.cpp
new file mode 100644
index 000000000..6d8a4ede3
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisCache.cpp
@@ -0,0 +1,309 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisCache.h"
+#include <utility>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+// Builds the quadrature part of a cache key from the rule's point fingerprint:
+// dimension, point count, and the two point hashes, in that field order.
+QuadratureCacheKey make_quadrature_cache_key(const quadrature::QuadratureRule& quad) noexcept {
+    const auto fp = quad.point_fingerprint();
+    QuadratureCacheKey key{fp.dimension, fp.num_points, fp.points_hash_a, fp.points_hash_b};
+    return key;
+}
+
+// Folds `word` into both running hashes; each lane uses its own constant and shift pair.
+void mix_hash_word(std::uint64_t word, std::uint64_t& hash_a, std::uint64_t& hash_b) noexcept {
+    const std::uint64_t delta_a = word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
+    hash_a ^= delta_a;
+    hash_b ^= (hash_b << 7u) + (hash_b >> 3u) + (word + 0xbf58476d1ce4e5b9ULL);
+}
+
+// Two-lane fingerprint of `identity`: mixes the length, then each byte in order.
+std::pair<std::uint64_t, std::uint64_t> identity_fingerprint(const std::string& identity) noexcept {
+    std::pair<std::uint64_t, std::uint64_t> lanes{0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL};
+    mix_hash_word(static_cast<std::uint64_t>(identity.size()), lanes.first, lanes.second);
+    for (std::size_t i = 0; i < identity.size(); ++i) {
+        const auto byte = static_cast<unsigned char>(identity[i]);
+        mix_hash_word(static_cast<std::uint64_t>(byte), lanes.first, lanes.second);
+    }
+    return lanes;
+}
+
+// Builds the cache key for (basis, quadrature rule, requested derivative set).
+//
+// Bases whose cache identity is purely structural get a StructuralBasisKey.
+// All others get a ParameterizedBasisKey that also carries the basis identity,
+// either as structured words (preferred) or as the identity string, together
+// with a two-lane hash of that identity.
+BasisCacheKey make_basis_cache_key(const BasisFunction& basis,
+                                   const quadrature::QuadratureRule& quad,
+                                   bool gradients,
+                                   bool hessians) {
+    StructuralBasisKey structural_key{
+        basis.basis_type(),
+        basis.element_type(),
+        basis.dimension(),
+        basis.order(),
+        basis.size(),
+        basis.is_vector_valued(),
+        make_quadrature_cache_key(quad),
+        gradients,
+        hessians
+    };
+
+    BasisCacheKey key;
+    if (basis.cache_identity_is_structural()) {
+        key.value = structural_key;
+        return key;
+    }
+
+    std::vector<std::uint64_t> basis_identity_words;
+    const bool uses_structured_identity = basis.cache_identity_words(basis_identity_words);
+    std::string basis_identity;  // stays empty when the structured words are used
+    std::pair<std::uint64_t, std::uint64_t> identity_hash{};
+    if (uses_structured_identity) {
+        // Prefer a fingerprint the basis already provides; otherwise hash the words.
+        BasisIdentityFingerprint fingerprint{};
+        if (!basis.cache_identity_fingerprint(fingerprint.hash_a, fingerprint.hash_b)) {
+            const auto computed = compute_basis_identity_fingerprint(basis_identity_words);
+            fingerprint.hash_a = computed.hash_a;
+            fingerprint.hash_b = computed.hash_b;
+        }
+        identity_hash = {fingerprint.hash_a, fingerprint.hash_b};
+    } else {
+        basis_identity_words.clear();  // discard anything the failed query wrote
+        basis_identity = basis.cache_identity();
+        identity_hash = identity_fingerprint(basis_identity);
+    }
+
+    // Both identity containers are moved into the key, so no per-lookup copy is made.
+    key.value = ParameterizedBasisKey{
+        structural_key,
+        uses_structured_identity,
+        identity_hash.first,
+        identity_hash.second,
+        std::move(basis_identity_words),
+        std::move(basis_identity)
+    };
+    return key;
+}
+
+} // namespace
+
+BasisCache& BasisCache::instance() {
+    static BasisCache shared_cache;  // function-local static: constructed on the first call
+    return shared_cache;
+}
+
+// Returns a reference owned by the cache slot only; it dangles once clear() drops the entry.
+const BasisCacheEntry& BasisCache::get_or_compute(
+    const BasisFunction& basis, const quadrature::QuadratureRule& quad,
+    bool gradients, bool hessians) {
+    const auto shared_entry = get_or_compute_shared(basis, quad, gradients, hessians);
+    return *shared_entry;
+}
+
+std::shared_ptr<const BasisCacheEntry> BasisCache::get_or_compute_shared(
+    const BasisFunction& basis,              // basis to tabulate
+    const quadrature::QuadratureRule& quad,  // supplies the evaluation points
+    bool gradients,                          // request first-derivative tables
+    bool hessians) {                         // request second-derivative tables
+    const BasisCacheKey key = make_basis_cache_key(basis, quad, gradients, hessians);
+    // Returns the entry for `key`; concurrent misses on the same key share one computation.
+    // Warm path: shared (reader) lock allows concurrent cache hits.
+    {
+        std::shared_lock<std::shared_mutex> read_lock(mutex_);
+        auto it = slots_.find(key);
+        if (it != slots_.end() && it->second.entry) {
+            return it->second.entry;
+        }
+    }
+    // Miss: take the exclusive lock and re-check, since another thread may have finished meanwhile.
+    std::shared_ptr<InFlightComputation> in_flight;
+    bool owner = false;
+    {
+        std::unique_lock<std::shared_mutex> write_lock(mutex_);
+        auto& slot = slots_[key];  // creates an empty slot if the key is new
+        if (slot.entry) {
+            return slot.entry;
+        }
+        // The first caller for this key claims the computation; later callers share its state.
+        if (!slot.pending) {
+            in_flight = std::make_shared<InFlightComputation>();
+            slot.pending = in_flight;
+            owner = true;
+        } else {
+            in_flight = slot.pending;
+        }
+    }
+    // Joiners block until the owner has published either an entry or an exception.
+    if (!owner) {
+        std::unique_lock<std::mutex> wait_lock(in_flight->mutex);
+        in_flight->ready_cv.wait(wait_lock, [&in_flight] { return in_flight->ready; });
+        if (in_flight->exception) {
+            std::rethrow_exception(in_flight->exception);
+        }
+        return in_flight->entry;
+    }
+    // Owner: evaluate with no cache lock held, then publish to the slot and to any joiners.
+    try {
+        auto entry = std::make_shared<BasisCacheEntry>(compute(basis, quad, gradients, hessians));
+        {
+            std::unique_lock<std::shared_mutex> write_lock(mutex_);
+            auto slot_it = slots_.find(key);
+            if (slot_it == slots_.end()) {
+                slot_it = slots_.emplace(key, CacheSlot{}).first;
+            }
+            auto& slot = slot_it->second;
+            if (slot.entry) {
+                entry = slot.entry;  // an entry is already stored: hand that one out instead
+            } else {
+                slot.entry = entry;
+            }
+            if (slot.pending == in_flight) {  // retire the marker only if it is still this computation's
+                slot.pending.reset();
+            }
+        }
+        {
+            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
+            in_flight->entry = entry;
+            in_flight->ready = true;
+        }
+        in_flight->ready_cv.notify_all();
+        return entry;
+    } catch (...) {
+        {
+            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
+            in_flight->exception = std::current_exception();  // joiners rethrow this same exception
+            in_flight->ready = true;
+        }
+        {
+            std::unique_lock<std::shared_mutex> write_lock(mutex_);
+            auto slot_it = slots_.find(key);
+            if (slot_it != slots_.end() && slot_it->second.pending == in_flight) {
+                slot_it->second.pending.reset();
+                if (!slot_it->second.entry) {  // nothing cached: remove the slot entirely
+                    slots_.erase(slot_it);
+                }
+            }
+        }
+        in_flight->ready_cv.notify_all();
+        throw;  // the owner's caller sees the original exception
+    }
+}
+
+// Forwards to get_or_compute() unchanged; named for call sites that fill the
+// cache ahead of use.
+const BasisCacheEntry& BasisCache::prewarm(
+    const BasisFunction& basis, const quadrature::QuadratureRule& quad,
+    bool gradients, bool hessians) {
+    return get_or_compute(basis, quad, gradients, hessians);
+}
+
+// Fills the cache like get_or_compute_shared() and wraps the resulting shared
+// entry pointer in a BasisCacheHandle.
+BasisCacheHandle BasisCache::prewarm_handle(
+    const BasisFunction& basis, const quadrature::QuadratureRule& quad,
+    bool gradients, bool hessians) {
+    return BasisCacheHandle(get_or_compute_shared(basis, quad, gradients, hessians));
+}
+
+// Evaluates the tables directly via compute(); the cache map is neither read
+// nor written, so every call redoes the full evaluation.
+BasisCacheEntry BasisCache::compute_uncached(
+    const BasisFunction& basis, const quadrature::QuadratureRule& quad,
+    bool gradients, bool hessians) const {
+    return compute(basis, quad, gradients, hessians);
+}
+
+// Drops finished entries; a slot with a computation in flight is kept, minus any stored entry.
+void BasisCache::clear() {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    auto slot_it = slots_.begin();
+    while (slot_it != slots_.end()) {
+        auto& slot = slot_it->second;
+        if (!slot.pending) { slot_it = slots_.erase(slot_it); continue; }
+        slot.entry.reset();
+        ++slot_it;
+    }
+}
+
+// Number of slots holding a finished entry. Slots that only track a pending
+// computation are not counted. Takes the shared (reader) lock, so it may run
+// concurrently with cache hits.
+std::size_t BasisCache::size() const {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    std::size_t finished = 0;
+    for (auto it = slots_.begin(); it != slots_.end(); ++it) {
+        finished += it->second.entry ? 1u : 0u;
+    }
+    return finished;
+}
+
+// Tabulates `basis` at the points of `quad` into a fresh entry (no cache access).
+// Scalar bases fill scalar_values plus, on request, gradients and hessians.
+// Vector bases fill vector_values_xyz; `gradients` also enables whichever of
+// Jacobian, curl, and divergence the basis supports, and `hessians` is ignored.
+BasisCacheEntry BasisCache::compute(const BasisFunction& basis,
+                                    const quadrature::QuadratureRule& quad,
+                                    bool gradients,
+                                    bool hessians) const {
+    BasisCacheEntry entry;
+    const auto& points = quad.points();
+    entry.num_qpts = points.size();
+    entry.num_dofs = basis.size();
+    const std::size_t table_size = entry.num_dofs * entry.num_qpts;
+    const bool has_work = entry.num_dofs > 0 && entry.num_qpts > 0;
+
+    if (basis.is_vector_valued()) {
+        entry.vector_values_xyz.assign(3u * table_size, Real(0));
+        if (gradients) {
+            if (basis.supports_vector_jacobians()) {
+                entry.vector_jacobians.assign(9u * table_size, Real(0));
+            }
+            if (basis.supports_curl()) {
+                entry.vector_curls_xyz.assign(3u * table_size, Real(0));
+            }
+            if (basis.supports_divergence()) {
+                entry.vector_divergence.assign(table_size, Real(0));
+            }
+        }
+        if (has_work) {
+            // Derivative outputs are passed as null when their table was not allocated.
+            Real* const jacobians = entry.vector_jacobians.empty() ? nullptr : entry.vector_jacobians.data();
+            Real* const curls = entry.vector_curls_xyz.empty() ? nullptr : entry.vector_curls_xyz.data();
+            Real* const divergence = entry.vector_divergence.empty() ? nullptr : entry.vector_divergence.data();
+            basis.evaluate_vector_at_quadrature_points(
+                points, entry.vector_values_xyz.data(), jacobians, curls, divergence);
+        }
+        return entry;
+    }
+
+    entry.scalar_values.assign(table_size, Real(0));
+    if (gradients) {
+        entry.gradients.assign(3u * table_size, Real(0));
+    }
+    if (hessians) {
+        entry.hessians.assign(9u * table_size, Real(0));
+    }
+    if (has_work) {
+        basis.fill_scalar_cache_entry(points, entry.num_qpts, entry.scalar_values.data(),
+                                      gradients ? entry.gradients.data() : nullptr,
+                                      hessians ? entry.hessians.data() : nullptr);
+    }
+    return entry;
+}
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisCache.h b/Code/Source/solver/FE/Basis/BasisCache.h
new file mode 100644
index 000000000..a84c0e87a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisCache.h
@@ -0,0 +1,456 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISCACHE_H
+#define SVMP_FE_BASIS_BASISCACHE_H
+
+/**
+ * @file BasisCache.h
+ * @brief Cache for basis evaluations at quadrature points
+ */
+
+#include "BasisFunction.h"
+#include "Quadrature/QuadratureRule.h"
+#include <cstddef>
+#include <condition_variable>
+#include <exception>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <cstdint>
+#include <shared_mutex>
+#include <span>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <variant>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+// Identifies a quadrature point set inside a basis cache key.
+struct QuadratureCacheKey {
+    int dimension{0};
+    std::size_t num_points{0};
+    // Two-lane fingerprint of the exact Real bit patterns of the quadrature
+    // coordinates, so -0.0 and +0.0 produce distinct keys unless a future API
+    // normalizes them. Weights and rule class are deliberately left out: basis
+    // values depend only on reference coordinates, so bit-identical point sets
+    // share cache entries.
+    std::uint64_t points_hash_a{0};
+    std::uint64_t points_hash_b{0};
+
+    bool operator==(const QuadratureCacheKey& rhs) const noexcept {
+        const bool same_shape = dimension == rhs.dimension && num_points == rhs.num_points;
+        const bool same_points = points_hash_a == rhs.points_hash_a && points_hash_b == rhs.points_hash_b;
+        return same_shape && same_points;
+    }
+};
+
+// Cache-key part shared by every basis: what is tabulated (basis kind, element,
+// dimension, order, dof count, scalar/vector), at which points (quadrature
+// fingerprint), and which derivative tables were requested.
+struct StructuralBasisKey {
+    BasisType basis_type{BasisType::Custom};
+    ElementType element_type{ElementType::Unknown};
+    int dimension{0};
+    int order{0};
+    std::size_t num_dofs{0};
+    bool vector_valued{false};
+    QuadratureCacheKey quadrature;
+    bool with_gradients{false};
+    bool with_hessians{false};
+
+    bool operator==(const StructuralBasisKey& rhs) const noexcept {
+        const bool same_basis = basis_type == rhs.basis_type && element_type == rhs.element_type &&
+                                dimension == rhs.dimension && order == rhs.order &&
+                                num_dofs == rhs.num_dofs && vector_valued == rhs.vector_valued;
+        const bool same_request = with_gradients == rhs.with_gradients &&
+                                  with_hessians == rhs.with_hessians;
+        return same_basis && quadrature == rhs.quadrature && same_request;
+    }
+};
+
+// Key for bases whose identity is not purely structural. Carries the identity
+// as structured words or as a string, plus a two-lane hash of that identity.
+struct ParameterizedBasisKey {
+    StructuralBasisKey structural;
+    bool uses_structured_identity{false};
+    std::uint64_t identity_hash_a{0};
+    std::uint64_t identity_hash_b{0};
+    std::vector<std::uint64_t> basis_identity_words;
+    std::string basis_identity;
+
+    bool operator==(const ParameterizedBasisKey& rhs) const noexcept {
+        if (!(structural == rhs.structural)) return false;
+        if (uses_structured_identity != rhs.uses_structured_identity) return false;
+        if (identity_hash_a != rhs.identity_hash_a || identity_hash_b != rhs.identity_hash_b) return false;
+        return basis_identity_words == rhs.basis_identity_words && basis_identity == rhs.basis_identity;
+    }
+};
+
+// Cache key: either a purely structural key or one that also carries a basis identity.
+struct BasisCacheKey {
+    std::variant<StructuralBasisKey, ParameterizedBasisKey> value;
+
+    // Equal only when both hold the same alternative and that alternative compares equal.
+    bool operator==(const BasisCacheKey& rhs) const noexcept { return value == rhs.value; }
+};
+
+// Hash functor for BasisCacheKey: folds a tag for the active alternative, the structural fields,
+// and (parameterized keys only) the identity flag and hash lanes into one running seed.
+struct BasisCacheKeyHash {
+    std::size_t operator()(const BasisCacheKey& key) const noexcept {
+        std::size_t seed = 0;
+        const auto mix = [&seed](std::size_t v) noexcept {
+            seed ^= v + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
+        };
+        const auto mix_structural = [&mix](const StructuralBasisKey& s) noexcept {
+            mix(std::hash<int>()(s.quadrature.dimension));
+            mix(std::hash<std::size_t>()(s.quadrature.num_points));
+            mix(std::hash<std::uint64_t>()(s.quadrature.points_hash_a));
+            mix(std::hash<std::uint64_t>()(s.quadrature.points_hash_b));
+            mix(std::hash<int>()(static_cast<int>(s.basis_type)));
+            mix(std::hash<int>()(static_cast<int>(s.element_type)));
+            mix(std::hash<int>()(s.dimension));
+            mix(std::hash<int>()(s.order));
+            mix(std::hash<std::size_t>()(s.num_dofs));
+            const unsigned flags = (s.vector_valued ? 1u : 0u) |
+                                   (s.with_gradients ? 2u : 0u) |
+                                   (s.with_hessians ? 4u : 0u);
+            mix(std::hash<unsigned>()(flags));
+        };
+        // The leading tags are the ASCII bytes "STRUCTKE" and "PARAMKEY".
+        std::visit([&](const auto& alternative) {
+            using Alternative = std::decay_t<decltype(alternative)>;
+            if constexpr (std::is_same_v<Alternative, ParameterizedBasisKey>) {
+                mix(0x504152414d4b4559ULL);
+                mix_structural(alternative.structural);
+                mix(alternative.uses_structured_identity ? 1u : 0u);
+                mix(std::hash<std::uint64_t>()(alternative.identity_hash_a));
+                mix(std::hash<std::uint64_t>()(alternative.identity_hash_b));
+            } else {
+                mix(0x5354525543544b45ULL);
+                mix_structural(alternative);
+            }
+        }, key.value);
+        return seed;
+    }
+};
+
+struct BasisCacheEntry {
+    std::size_t num_qpts{0};
+    std::size_t num_dofs{0};
+    // Scalar basis values in dof-major SoA layout: [dof * num_qpts + qp].
+    std::vector<Real> scalar_values;
+    // Scalar reference gradients in dof/component/qpt SoA layout:
+    // [(dof * 3 + component) * num_qpts + qp].
+    std::vector<Real> gradients;
+    // Scalar reference Hessians in dof/component/qpt SoA layout:
+    // [(dof * 9 + row * 3 + col) * num_qpts + qp].
+    std::vector<Real> hessians;
+
+    // Vector basis values in dof/component/qpt SoA layout:
+    // [(dof * 3 + component) * num_qpts + qp].
+    std::vector<Real> vector_values_xyz;
+    // Vector basis reference Jacobians in dof/component/derivative/qpt layout:
+    // [(dof * 9 + component * 3 + derivative) * num_qpts + qp].
+    std::vector<Real> vector_jacobians;
+    // Vector basis curls in dof/component/qpt SoA layout.
+    std::vector<Real> vector_curls_xyz;
+    // Vector basis divergences in dof/qpt SoA layout.
+    std::vector<Real> vector_divergence;
+
+    // The object-returning accessors below are convenience helpers for tests,
+    // diagnostics, and occasional scalar use. Hot loops should prefer the SoA
+    // span accessors so they do not reconstruct Gradient, Hessian, or matrix
+    // objects per DOF and quadrature point.
+
+    [[nodiscard]] Real scalarValue(std::size_t dof, std::size_t qp) const noexcept {
+        return scalar_values[dof * num_qpts + qp];  // dof-major table; indices are not range-checked
+    }
+
+    [[nodiscard]] std::span<const Real> scalarValuesForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || scalar_values.empty()) return {};  // nothing tabulated: no storage to view
+        return std::span<const Real>(scalar_values.data() + dof * num_qpts, num_qpts);  // row of `dof`
+    }
+
+    // Reference-gradient component of `dof` at point `qp`; indices are not range-checked.
+    [[nodiscard]] Real gradientValue(std::size_t dof, std::size_t component, std::size_t qp) const noexcept {
+        const std::size_t row = dof * 3u + component;
+        return gradients[row * num_qpts + qp];
+    }
+
+    // Gathers the three reference-gradient components of `dof` at `qp` into one
+    // Gradient object (convenience helper; see gradientValue for the raw lookup).
+    [[nodiscard]] Gradient gradientVector(std::size_t dof, std::size_t qp) const noexcept {
+        Gradient grad{};
+        for (std::size_t axis = 0; axis != 3u; ++axis) { grad[axis] = gradientValue(dof, axis, qp); }
+        return grad;
+    }
+
+    // One gradient component of `dof` over all points; empty when gradients were not tabulated.
+    [[nodiscard]] std::span<const Real> gradientsForDofComponent(std::size_t dof, std::size_t component) const noexcept {
+        if (num_qpts == 0 || gradients.empty()) return {};
+        return std::span<const Real>(gradients.data() + (dof * 3u + component) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> gradientsForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || gradients.empty()) return {};  // gradients not tabulated: no storage to view
+        return std::span<const Real>(gradients.data() + dof * 3u * num_qpts, 3u * num_qpts);  // 3 rows
+    }
+
+    // Entry (row, col) of the reference Hessian of `dof` at point `qp`; indices are
+    // not range-checked.
+    [[nodiscard]] Real hessianValue(std::size_t dof, std::size_t row, std::size_t col, std::size_t qp) const noexcept {
+        const std::size_t table_row = dof * 9u + row * 3u + col;
+        return hessians[table_row * num_qpts + qp];
+    }
+
+    // Assembles the 3x3 reference Hessian of `dof` at `qp` from the per-entry
+    // tables, visiting entries in row-major order.
+    [[nodiscard]] Hessian hessianMatrix(std::size_t dof, std::size_t qp) const noexcept {
+        Hessian hess{};
+        for (std::size_t entry = 0; entry < 9u; ++entry) {
+            hess(entry / 3u, entry % 3u) = hessianValue(dof, entry / 3u, entry % 3u, qp);
+        }
+        return hess;
+    }
+
+    // Hessian entry (row, col) of `dof` over all points; empty when Hessians were not tabulated.
+    [[nodiscard]] std::span<const Real> hessiansForDofComponent(
+        std::size_t dof, std::size_t row, std::size_t col) const noexcept {
+        if (num_qpts == 0 || hessians.empty()) return {};
+        return std::span<const Real>(hessians.data() + (dof * 9u + row * 3u + col) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> hessiansForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || hessians.empty()) return {};  // Hessians not tabulated: no storage to view
+        return std::span<const Real>(hessians.data() + dof * 9u * num_qpts, 9u * num_qpts);  // 9 rows
+    }
+
+    // One component of vector basis function `dof` at point `qp`; indices are not range-checked.
+    [[nodiscard]] Real vectorValue(std::size_t dof, std::size_t component, std::size_t qp) const noexcept {
+        const std::size_t row = dof * 3u + component;
+        return vector_values_xyz[row * num_qpts + qp];
+    }
+
+    // Gathers the three components of vector basis function `dof` at `qp` into one 3-vector.
+    [[nodiscard]] math::Vector<Real, 3> vectorValue(std::size_t dof, std::size_t qp) const noexcept {
+        math::Vector<Real, 3> value{};
+        for (std::size_t axis = 0; axis != 3u; ++axis) {
+            value[axis] = vectorValue(dof, axis, qp);
+        }
+        return value;
+    }
+
+    // One vector component of `dof` over all points; empty when no vector values are stored.
+    [[nodiscard]] std::span<const Real> vectorValuesForDofComponent(std::size_t dof, std::size_t component) const noexcept {
+        if (num_qpts == 0 || vector_values_xyz.empty()) return {};
+        return std::span<const Real>(vector_values_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> vectorValuesForDof(std::size_t dof) const noexcept {
+        if (num_qpts == 0 || vector_values_xyz.empty()) return {};  // no vector values stored
+        return std::span<const Real>(vector_values_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);  // 3 component rows
+    }
+
+    // Reference-Jacobian entry (component, derivative) of `dof` at `qp`; unchecked indices.
+    [[nodiscard]] Real vectorJacobianValue(
+        std::size_t dof, std::size_t component, std::size_t derivative, std::size_t qp) const noexcept {
+        const std::size_t row = dof * 9u + component * 3u + derivative;
+        return vector_jacobians[row * num_qpts + qp];
+    }
+
+    // Assembles the 3x3 reference Jacobian of `dof` at `qp`; rows index the vector
+    // component and columns the derivative direction.
+    [[nodiscard]] VectorJacobian vectorJacobianMatrix(std::size_t dof, std::size_t qp) const noexcept {
+        VectorJacobian jac{};
+        for (std::size_t entry = 0; entry < 9u; ++entry) {
+            const std::size_t component = entry / 3u;
+            const std::size_t derivative = entry % 3u;
+            jac(component, derivative) = vectorJacobianValue(dof, component, derivative, qp);
+        }
+        return jac;
+    }
+
+    /// Span over all quadrature points for one (dof, component, derivative) Jacobian entry.
+    [[nodiscard]] std::span<const Real> vectorJacobiansForDofComponentDerivative(
+        std::size_t dof,
+        std::size_t component,
+        std::size_t derivative) const noexcept {
+        if (vector_jacobians.empty() || num_qpts == 0) return {};  // no Jacobian data stored
+        const std::size_t row = dof * 9u + component * 3u + derivative;
+        return std::span<const Real>(vector_jacobians.data() + row * num_qpts, num_qpts);
+    }
+
+    [[nodiscard]] std::span<const Real> vectorJacobiansForDof(std::size_t dof) const noexcept {
+        if (vector_jacobians.empty() || num_qpts == 0) return {};  // no Jacobian data stored
+        return {vector_jacobians.data() + dof * 9u * num_qpts, 9u * num_qpts};  // all nine entries of this DOF
+    }
+
+    /// Read one curl component; storage is indexed by (dof, component) and then by qp.
+    [[nodiscard]] Real vectorCurlValue(std::size_t dof, std::size_t component, std::size_t qp) const noexcept {
+        const std::size_t row = dof * 3u + component;
+        return vector_curls_xyz[row * num_qpts + qp];
+    }
+
+    /// Collect the three stored curl components of DOF @p dof at quadrature point @p qp.
+    [[nodiscard]] math::Vector<Real, 3> vectorCurl(std::size_t dof, std::size_t qp) const noexcept {
+        math::Vector<Real, 3> result{};
+        result[0] = vectorCurlValue(dof, 0u, qp);
+        result[1] = vectorCurlValue(dof, 1u, qp);
+        result[2] = vectorCurlValue(dof, 2u, qp);
+        return result;
+    }
+
+    [[nodiscard]] std::span<const Real> vectorCurlsForDofComponent(std::size_t dof,
+                                                                    std::size_t component) const noexcept {
+        if (vector_curls_xyz.empty() || num_qpts == 0) return {};  // no curl data stored
+        return {vector_curls_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts};
+    }
+
+    [[nodiscard]] std::span<const Real> vectorCurlsForDof(std::size_t dof) const noexcept {
+        if (vector_curls_xyz.empty() || num_qpts == 0) return {};  // no curl data stored
+        return {vector_curls_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts};  // all three components of this DOF
+    }
+
+    /// Read the stored divergence of DOF @p dof at quadrature point @p qp (one value per pair).
+    [[nodiscard]] Real vectorDivergenceValue(std::size_t dof, std::size_t qp) const noexcept {
+        return vector_divergence[qp + num_qpts * dof];
+    }
+
+    [[nodiscard]] std::span<const Real> vectorDivergenceForDof(std::size_t dof) const noexcept {
+        if (vector_divergence.empty() || num_qpts == 0) return {};  // no divergence data stored
+        return {vector_divergence.data() + dof * num_qpts, num_qpts};  // one value per quadrature point
+    }
+};
+
+/// Shared-ownership handle to a cache entry; only BasisCache can construct one from an entry.
+class BasisCacheHandle {
+public:
+    BasisCacheHandle() = default;
+
+    [[nodiscard]] bool valid() const noexcept { return static_cast<bool>(entry_); }
+    explicit operator bool() const noexcept { return valid(); }
+
+    /// Dereference the handle; BASIS_CHECK_CONFIG throws when the handle is empty.
+    [[nodiscard]] const BasisCacheEntry& entry() const {
+        BASIS_CHECK_CONFIG(valid(), "BasisCacheHandle: attempted to access an empty handle");
+        return *entry_;
+    }
+
+private:
+    friend class BasisCache;
+
+    explicit BasisCacheHandle(std::shared_ptr<const BasisCacheEntry> entry)
+        : entry_(std::move(entry)) {}
+    std::shared_ptr<const BasisCacheEntry> entry_;
+};
+
+class BasisCache {
+public:
+    static BasisCache& instance();  // accessor for the shared cache; the constructor is private and copying is deleted
+    /// NOTE(review): by its name and the prewarm() notes, returns the cached entry, computing it on a miss — confirm in BasisCache.cpp.
+    const BasisCacheEntry& get_or_compute(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Compute an entry without consulting, publishing to, or waiting on
+     * the shared cache.
+     */
+    BasisCacheEntry compute_uncached(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false) const;
+
+    /**
+     * @brief Eagerly populate the cache for the given (basis, quadrature) key.
+     *
+     * Pays the compute cost up front so that subsequent get_or_compute calls
+     * for the same key hit the warm-cache path immediately. The caching effect
+     * is that of a get_or_compute call whose result the caller may ignore.
+     *
+     * @return The inserted (or pre-existing) entry, for convenience.
+     */
+    const BasisCacheEntry& prewarm(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Eagerly populate the cache and return a hot-loop handle.
+     *
+     * The returned handle owns a shared reference to the completed entry. Access
+     * through BasisCacheHandle::entry() performs no key construction, hashing,
+     * map lookup, or cache mutex acquisition. Calling clear() removes the entry
+     * from the global lookup map but does not invalidate existing handles.
+     */
+    BasisCacheHandle prewarm_handle(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients = true,
+        bool hessians = false);
+
+    /**
+     * @brief Remove completed cache entries.
+     *
+     * This is a soft clear: computations that were already in flight before
+     * clear() was called are allowed to publish their completed entry afterward.
+     * This preserves the returned-reference lifetime contract for concurrent
+     * get_or_compute() callers while still dropping all entries that had already
+     * completed at the time of the call.
+     */
+    void clear();
+    std::size_t size() const;  // NOTE(review): presumably the number of cached slots — confirm in BasisCache.cpp
+
+private:
+    struct InFlightComputation {  // hand-off state for one pending computation; its use lives in BasisCache.cpp
+        std::mutex mutex;
+        std::condition_variable ready_cv;
+        bool ready{false};
+        std::shared_ptr<BasisCacheEntry> entry;
+        std::exception_ptr exception;
+    };
+
+    struct CacheSlot {  // per-key record: a completed entry and/or a pending computation
+        std::shared_ptr<BasisCacheEntry> entry;
+        std::shared_ptr<InFlightComputation> pending;
+    };
+
+    BasisCache() = default;
+    BasisCache(const BasisCache&) = delete;
+    BasisCache& operator=(const BasisCache&) = delete;
+
+    BasisCacheEntry compute(const BasisFunction& basis,
+                            const quadrature::QuadratureRule& quad,
+                            bool gradients,
+                            bool hessians) const;
+
+    std::shared_ptr<const BasisCacheEntry> get_or_compute_shared(
+        const BasisFunction& basis,
+        const quadrature::QuadratureRule& quad,
+        bool gradients,
+        bool hessians);
+
+    mutable std::shared_mutex mutex_;  // NOTE(review): presumably guards slots_ — confirm in BasisCache.cpp
+    std::unordered_map<BasisCacheKey, CacheSlot, BasisCacheKeyHash> slots_;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISCACHE_H
diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
new file mode 100644
index 000000000..8ee92a3dd
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -0,0 +1,134 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISEXCEPTIONS_H
+#define SVMP_FE_BASIS_BASISEXCEPTIONS_H
+
+#include "FEException.h"
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief Base exception for the Basis module; hands message, status and call-site location to FEException.
+ */
+class BasisException : public FEException {
+public:
+    BasisException(const std::string& message,
+                   const char* file = "",
+                   int line = 0,
+                   const char* function = "",
+                   StatusCode status = StatusCode::Unknown)
+        : FEException(message, status, file, line, function) {}  // FEException takes status second
+};
+
+/**
+ * @brief Invalid Basis request or configuration; always reports StatusCode::InvalidArgument.
+ */
+class BasisConfigurationException : public BasisException {
+public:
+    BasisConfigurationException(const std::string& message,
+                                const char* file = "",
+                                int line = 0,
+                                const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Requested element topology is incompatible with the basis family; reports StatusCode::InvalidArgument.
+ */
+class BasisElementCompatibilityException : public BasisException {
+public:
+    BasisElementCompatibilityException(const std::string& message,
+                                       const char* file = "",
+                                       int line = 0,
+                                       const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Basis evaluation request cannot be satisfied; reports StatusCode::InvalidArgument.
+ */
+class BasisEvaluationException : public BasisException {
+public:
+    BasisEvaluationException(const std::string& message,
+                             const char* file = "",
+                             int line = 0,
+                             const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Public-to-canonical node ordering or coordinate lookup failure; reports StatusCode::InvalidArgument.
+ */
+class BasisNodeOrderingException : public BasisException {
+public:
+    BasisNodeOrderingException(const std::string& message,
+                               const char* file = "",
+                               int line = 0,
+                               const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InvalidArgument) {}
+};
+
+/**
+ * @brief Internal basis construction or transform setup failure; reports StatusCode::InternalError.
+ */
+class BasisConstructionException : public BasisException {
+public:
+    BasisConstructionException(const std::string& message,
+                               const char* file = "",
+                               int line = 0,
+                               const char* function = "")
+        : BasisException(message, file, line, function, StatusCode::InternalError) {}
+};
+/// Throw BasisConfigurationException tagged with the call site unless `condition` holds; `message` is only evaluated on failure.
+#define BASIS_CHECK_CONFIG(condition, message)                                                 \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisConfigurationException((message),                    \
+                                                                  __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+/// Throw BasisElementCompatibilityException tagged with the call site unless `condition` holds.
+#define BASIS_CHECK_COMPAT(condition, message)                                                 \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisElementCompatibilityException((message),             \
+                                                                         __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+/// Throw BasisEvaluationException tagged with the call site unless `condition` holds.
+#define BASIS_CHECK_EVAL(condition, message)                                                   \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisEvaluationException((message),                       \
+                                                               __FILE__, __LINE__, __func__);  \
+        }                                                                                      \
+    } while (false)
+/// Throw BasisNodeOrderingException tagged with the call site unless `condition` holds.
+#define BASIS_CHECK_NODE_ORDER(condition, message)                                             \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisNodeOrderingException((message),                     \
+                                                                 __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+/// Throw BasisConstructionException tagged with the call site unless `condition` holds.
+#define BASIS_CHECK_CONSTRUCTION(condition, message)                                           \
+    do {                                                                                       \
+        if (!(condition)) {                                                                    \
+            throw ::svmp::FE::basis::BasisConstructionException((message),                     \
+                                                                 __FILE__, __LINE__, __func__); \
+        }                                                                                      \
+    } while (false)
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISEXCEPTIONS_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
new file mode 100644
index 000000000..dddbd4c5c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -0,0 +1,160 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisFactory.h"
+
+#include "LagrangeBasis.h"
+#include "SerendipityBasis.h"
+
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using CustomRegistryMap =
+    std::unordered_map<std::string, basis_factory::CustomFactory>;
+
+CustomRegistryMap& custom_registry() {
+    static CustomRegistryMap factories_by_id;  // function-local static, created on first call
+    return factories_by_id;
+}
+
+std::mutex& custom_registry_mutex() {
+    static std::mutex registry_guard;  // function-local static, created on first call
+    return registry_guard;
+}
+
+// Return req.order, throwing BasisConfigurationException when it is unset or negative.
+int require_basis_order(const BasisRequest& req,
+                        const char* missing_message,
+                        const char* negative_message) {
+    const auto& requested = req.order;
+    if (!requested) {
+        throw BasisConfigurationException(missing_message, __FILE__, __LINE__, __func__);
+    }
+    if (requested.value() < 0) {
+        throw BasisConfigurationException(negative_message, __FILE__, __LINE__, __func__);
+    }
+    return requested.value();
+}
+
+// Reject any request that is not a scalar field with C0 continuity (field type is checked first).
+void require_scalar_c0_request(const BasisRequest& req) {
+    const char* violation = nullptr;
+    if (req.field_type != FieldType::Scalar) {
+        violation = "BasisFactory: Lagrange/Serendipity bases currently support scalar fields only";
+    } else if (req.continuity != Continuity::C0) {
+        violation = "BasisFactory: migrated Lagrange/Serendipity scope supports C0 continuity only";
+    }
+    if (violation != nullptr) {
+        throw BasisConfigurationException(violation, __FILE__, __LINE__, __func__);
+    }
+}
+
+// Validate field type, continuity and order, then build a LagrangeBasis for req.element_type.
+std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
+    require_scalar_c0_request(req);
+    constexpr const char* kMissing = "BasisFactory: Lagrange creation requires an explicit order";
+    constexpr const char* kNegative = "BasisFactory: Lagrange requires non-negative order";
+    const int order = require_basis_order(req, kMissing, kNegative);
+    return std::make_shared<LagrangeBasis>(req.element_type, order);
+}
+
+// Validate field type, continuity and order, then build a SerendipityBasis for req.element_type.
+std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
+    require_scalar_c0_request(req);
+    constexpr const char* kMissing = "BasisFactory: Serendipity creation requires an explicit order";
+    constexpr const char* kNegative = "BasisFactory: Serendipity requires non-negative order";
+    const int order = require_basis_order(req, kMissing, kNegative);
+    return std::make_shared<SerendipityBasis>(req.element_type, order);
+}
+
+std::shared_ptr<BasisFunction> create_custom(const BasisRequest& req) {
+    const std::string& id = req.custom_id;
+    if (id.empty()) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom basis requests require custom_id",
+            __FILE__, __LINE__, __func__);
+    }
+    // Copy the factory out under the lock so user code runs without holding the registry mutex.
+    basis_factory::CustomFactory make_basis;
+    {
+        std::lock_guard<std::mutex> guard(custom_registry_mutex());
+        auto& registry = custom_registry();
+        const auto found = registry.find(id);
+        if (found == registry.end()) {
+            throw BasisConfigurationException(
+                "BasisFactory: no custom basis factory registered for id '" + id + "'",
+                __FILE__, __LINE__, __func__);
+        }
+        make_basis = found->second;
+    }
+    // Invoked outside the critical section; a null result is reported as a construction error.
+    auto created = make_basis(req);
+    if (!created) {
+        throw BasisConstructionException(
+            "BasisFactory: custom factory returned null basis for id '" + id + "'",
+            __FILE__, __LINE__, __func__);
+    }
+    return created;
+}
+
+} // namespace
+
+namespace basis_factory {
+
+std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
+    if (req.basis_type == BasisType::Lagrange) {
+        return create_lagrange(req);
+    }
+    if (req.basis_type == BasisType::Serendipity) {
+        return create_serendipity(req);
+    }
+    if (req.basis_type == BasisType::Custom) {
+        return create_custom(req);
+    }
+    throw BasisConfigurationException(  // every other basis family is rejected here
+        "BasisFactory: requested basis family is outside the migrated Lagrange/Serendipity scope",
+        __FILE__, __LINE__, __func__);
+}
+
+// Store `factory` under `custom_id`, replacing any earlier registration; both arguments are validated first.
+void register_custom(std::string custom_id, CustomFactory factory) {
+    if (custom_id.empty()) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom factory id must not be empty",
+            __FILE__, __LINE__, __func__);
+    }
+    if (factory == nullptr) {
+        throw BasisConfigurationException(
+            "BasisFactory: custom factory must be callable",
+            __FILE__, __LINE__, __func__);
+    }
+    const std::lock_guard<std::mutex> guard(custom_registry_mutex());
+    custom_registry().insert_or_assign(std::move(custom_id), std::move(factory));
+}
+
+void unregister_custom(const std::string& custom_id) {
+    const std::scoped_lock guard(custom_registry_mutex());  // same mutex as register_custom and create_custom
+    custom_registry().erase(custom_id);  // erasing an id that is not present removes nothing
+}
+
+void clear_custom_registry_for_tests() {
+    const std::scoped_lock guard(custom_registry_mutex());  // same mutex as register_custom and create_custom
+    custom_registry().clear();  // drops every registered custom factory
+}
+
+} // namespace basis_factory
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
new file mode 100644
index 000000000..cedf1ba6d
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -0,0 +1,57 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISFACTORY_H
+#define SVMP_FE_BASIS_BASISFACTORY_H
+
+/**
+ * @file BasisFactory.h
+ * @brief Runtime creation of basis families
+ */
+
+#include "BasisFunction.h"
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+struct BasisRequest {
+    ElementType element_type;  // passed to the LagrangeBasis/SerendipityBasis constructor
+    BasisType basis_type;  // selects the creation path in basis_factory::create
+    std::optional<int> order{};  // must be set and >= 0 for Lagrange/Serendipity
+    Continuity continuity{Continuity::C0};  // Lagrange/Serendipity accept C0 only
+    FieldType field_type{FieldType::Scalar};  // Lagrange/Serendipity accept Scalar only
+    std::vector<Real> knot_vector{};  // this and the vector fields below are not read by the built-in
+    std::vector<Real> weights{};  // Lagrange/Serendipity paths in BasisFactory.cpp; custom factories
+    std::vector<int> axis_orders{};  // receive the whole request and may use them
+    std::vector<std::vector<Real>> axis_knot_vectors{};
+    std::vector<std::vector<Real>> axis_weights{};
+    std::vector<int> tensor_extents{};
+    std::string custom_id{};  // registry key; must be non-empty for BasisType::Custom
+};
+
+namespace basis_factory {
+
+using CustomFactory = std::function<std::shared_ptr<BasisFunction>(const BasisRequest&)>;
+
+[[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
+void register_custom(std::string custom_id, CustomFactory factory);
+void unregister_custom(const std::string& custom_id);
+void clear_custom_registry_for_tests();
+
+} // namespace basis_factory
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISFACTORY_H
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
new file mode 100644
index 000000000..49c8d8763
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -0,0 +1,366 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "BasisFunction.h"
+#include "VectorBasisEvaluationHelpers.h"
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+struct BasisFunctionScratch {  // reusable temporaries for the BasisFunction default evaluators in this file
+    std::vector<Real> scalar_values;
+    std::vector<Gradient> scalar_gradients;
+    std::vector<Hessian> scalar_hessians;
+    std::vector<math::Vector<Real, 3>> vector_values;
+    std::vector<VectorJacobian> vector_jacobians;
+    std::vector<math::Vector<Real, 3>> vector_curls;
+    std::vector<Real> vector_divergences;
+    // Reserve capacity for max_size elements in every buffer (sizes are left unchanged).
+    void prewarm(std::size_t max_size) {
+        scalar_values.reserve(max_size);
+        scalar_gradients.reserve(max_size);
+        scalar_hessians.reserve(max_size);
+        vector_values.reserve(max_size);
+        vector_jacobians.reserve(max_size);
+        vector_curls.reserve(max_size);
+        vector_divergences.reserve(max_size);
+    }
+};
+
+BasisFunctionScratch& basis_function_scratch() {
+    // One scratch object per thread, on purpose: production assembly uses a
+    // persistent worker-thread team, so buffers stay warm on each worker.
+    thread_local BasisFunctionScratch per_thread_scratch;
+    return per_thread_scratch;
+}
+// Fold one 64-bit word into both running hashes; each hash uses its own additive constant and shift pair.
+void mix_identity_hash_word(std::uint64_t word,
+                            std::uint64_t& hash_a,
+                            std::uint64_t& hash_b) noexcept {
+    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
+    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
+}
+
+} // namespace
+
+// Hash the word count first, then every word in order, into two 64-bit accumulators.
+BasisIdentityFingerprint
+compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept {
+    // Fixed starting values for the two accumulators.
+    std::uint64_t a = 0x243f6a8885a308d3ULL;
+    std::uint64_t b = 0x13198a2e03707344ULL;
+    mix_identity_hash_word(static_cast<std::uint64_t>(words.size()), a, b);
+    for (std::size_t i = 0; i < words.size(); ++i) {
+        mix_identity_hash_word(words[i], a, b);
+    }
+    return BasisIdentityFingerprint{a, b};
+}
+
+// Default identity string: basis family, element type, dimension, order, size and vector flag.
+std::string BasisFunction::cache_identity() const {
+    std::ostringstream identity;
+    identity << "basis=" << static_cast<int>(basis_type());
+    identity << "|elem=" << static_cast<int>(element_type());
+    identity << "|dim=" << dimension();
+    identity << "|order=" << order();
+    identity << "|size=" << size() << "|vector=" << is_vector_valued();
+    return identity.str();
+}
+
+bool BasisFunction::cache_identity_words(std::vector<std::uint64_t>& words) const {
+    static_cast<void>(words);  // base implementation leaves the output untouched
+    return false;  // NOTE(review): presumably means "no word identity provided" — confirm against callers
+}
+
+bool BasisFunction::cache_identity_fingerprint(std::uint64_t& hash_a,
+                                               std::uint64_t& hash_b) const {
+    static_cast<void>(hash_a);  // base implementation writes nothing to either output
+    static_cast<void>(hash_b);
+    return false;  // NOTE(review): presumably means "no fingerprint available" — confirm against callers
+}
+
+void prewarm_basis_function_scratch(std::size_t max_size, std::size_t max_qpts) {
+    static_cast<void>(max_qpts);  // accepted but unused: only max_size drives the reservation
+    BasisFunctionScratch& scratch = basis_function_scratch();  // the calling thread's scratch
+    scratch.prewarm(max_size);
+}
+
+// Base-class fallback: always throws BasisEvaluationException; both arguments are unused.
+void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi, std::vector<Gradient>& gradients) const {
+    static_cast<void>(xi);
+    static_cast<void>(gradients);
+    throw BasisEvaluationException("Analytic gradient evaluation is not implemented for this basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+// Base-class fallback: always throws BasisEvaluationException; both arguments are unused.
+void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi, std::vector<Hessian>& hessians) const {
+    static_cast<void>(xi);
+    static_cast<void>(hessians);
+    throw BasisEvaluationException("Analytic Hessian evaluation is not implemented for this basis",
+                                   __FILE__, __LINE__, __func__);
+}
+// Default combined evaluation: calls evaluate_values, evaluate_gradients and evaluate_hessians, in that order.
+void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
+                                 std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients,
+                                 std::vector<Hessian>& hessians) const {
+    evaluate_values(xi, values);
+    evaluate_gradients(xi, gradients);
+    evaluate_hessians(xi, hessians);
+}
+
+void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out) const {
+    std::vector<Real>& staged = basis_function_scratch().scalar_values;  // thread-local staging buffer
+    staged.resize(size());
+    evaluate_values(xi, staged);
+    std::copy(staged.begin(), staged.end(), values_out);  // copies everything evaluate_values left in the buffer
+}
+
+void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    std::vector<Gradient>& staged = basis_function_scratch().scalar_gradients;
+    staged.resize(size());
+    evaluate_gradients(xi, staged);
+    // Flatten to three consecutive Reals per basis function.
+    Real* cursor = gradients_out;
+    for (const Gradient& g : staged) {
+        for (std::size_t c = 0; c < 3u; ++c) *cursor++ = g[c];
+    }
+}
+
+void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                         Real* SVMP_RESTRICT hessians_out) const {
+    std::vector<Hessian>& staged = basis_function_scratch().scalar_hessians;
+    staged.resize(size());
+    evaluate_hessians(xi, staged);
+    for (std::size_t dof = 0, count = staged.size(); dof != count; ++dof) {
+        store_hessian(staged[dof], hessians_out + 9u * dof);  // output advances by nine Reals per basis function
+    }
+}
+
+void BasisFunction::evaluate_at_quadrature_points(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    const std::size_t packed_stride = points.size();  // tightly packed output: stride equals the point count
+    evaluate_at_quadrature_points_strided(points, packed_stride, values_out, gradients_out, hessians_out);
+}
+
+void BasisFunction::evaluate_at_quadrature_points_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    // Layout: values at [dof*stride + q], gradients at [(dof*3 + component)*stride + q]; the Hessian
+    // block of each dof starts at dof*9*stride. A null output pointer skips that quantity entirely.
+    const std::size_t point_count = points.size();
+    const std::size_t dof_count = size();
+    if (output_stride < point_count) {
+        throw BasisConfigurationException(
+            "BasisFunction strided evaluation requires output_stride >= points.size()",
+            __FILE__, __LINE__, __func__);
+    }
+
+    const bool want_values = (values_out != nullptr);
+    const bool want_gradients = (gradients_out != nullptr);
+    const bool want_hessians = (hessians_out != nullptr);
+
+    BasisFunctionScratch& scratch = basis_function_scratch();
+    std::vector<Real>& vals = scratch.scalar_values;
+    std::vector<Gradient>& grads = scratch.scalar_gradients;
+    std::vector<Hessian>& hess = scratch.scalar_hessians;
+    if (want_values) vals.resize(dof_count);
+    if (want_gradients) grads.resize(dof_count);
+    if (want_hessians) hess.resize(dof_count);
+
+    for (std::size_t q = 0; q < point_count; ++q) {
+        const math::Vector<Real, 3>& xi = points[q];
+        if (want_values && want_gradients && want_hessians) {
+            evaluate_all(xi, vals, grads, hess);  // one combined call when everything is requested
+        } else {
+            if (want_values) evaluate_values(xi, vals);
+            if (want_gradients) evaluate_gradients(xi, grads);
+            if (want_hessians) evaluate_hessians(xi, hess);
+        }
+        // Scatter this point's results into position q of each requested output.
+        for (std::size_t dof = 0; want_values && dof < dof_count; ++dof) {
+            values_out[dof * output_stride + q] = vals[dof];
+        }
+        for (std::size_t dof = 0; want_gradients && dof < dof_count; ++dof) {
+            Real* row = gradients_out + dof * 3u * output_stride + q;
+            row[0] = grads[dof][0];
+            row[output_stride] = grads[dof][1];
+            row[2u * output_stride] = grads[dof][2];
+        }
+        for (std::size_t dof = 0; want_hessians && dof < dof_count; ++dof) {
+            store_hessian_strided(hess[dof], hessians_out + dof * 9u * output_stride, output_stride, q);
+        }
+    }
+}
+// Default implementation: forwards every argument unchanged to evaluate_at_quadrature_points_strided.
+void BasisFunction::fill_scalar_cache_entry(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_at_quadrature_points_strided(
+        points, output_stride, values_out, gradients_out, hessians_out);
+}
+
+void BasisFunction::evaluate_vector_at_quadrature_points(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) const {
+    evaluate_vector_at_quadrature_points_strided(  // tightly packed output: stride equals points.size()
+        points, points.size(), values_out, jacobians_out, curls_out, divergence_out);
+}
+
+void BasisFunction::evaluate_vector_at_quadrature_points_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) const {
+    // Per point: evaluate each requested quantity into thread-local scratch, then pass it to the
+    // matching detail::vector_common writer with (output_stride, q). Null outputs are skipped.
+    namespace vc = detail::vector_common;
+    const std::size_t point_count = points.size();
+    const std::size_t dof_count = size();
+    vc::validate_vector_strided_outputs(
+        point_count, output_stride, "BasisFunction");
+
+    BasisFunctionScratch& scratch = basis_function_scratch();
+    if (values_out) scratch.vector_values.resize(dof_count);
+    if (jacobians_out) scratch.vector_jacobians.resize(dof_count);
+    if (curls_out) scratch.vector_curls.resize(dof_count);
+    if (divergence_out) scratch.vector_divergences.resize(dof_count);
+
+    for (std::size_t q = 0; q < point_count; ++q) {
+        const math::Vector<Real, 3>& xi = points[q];
+        if (values_out) {
+            evaluate_vector_values(xi, scratch.vector_values);
+            vc::write_vector_values_strided(
+                scratch.vector_values, dof_count, output_stride, q, values_out);
+        }
+
+        if (jacobians_out) {
+            evaluate_vector_jacobians(xi, scratch.vector_jacobians);
+            vc::write_vector_jacobians_strided(
+                scratch.vector_jacobians, dof_count, output_stride, q, jacobians_out);
+        }
+
+        if (curls_out) {
+            evaluate_curl(xi, scratch.vector_curls);
+            vc::write_vector_curl_strided(
+                scratch.vector_curls, dof_count, output_stride, q, curls_out);
+        }
+
+        if (divergence_out) {
+            evaluate_divergence(xi, scratch.vector_divergences);
+            vc::write_vector_divergence_strided(
+                scratch.vector_divergences, dof_count, output_stride, q, divergence_out);
+        }
+    }
+}
+// Base-class implementation: always throws BasisEvaluationException; both arguments are ignored.
+void BasisFunction::evaluate_vector_values(
+    const math::Vector<Real, 3>&,
+    std::vector<math::Vector<Real, 3>>&) const {
+    throw BasisEvaluationException("Vector-valued evaluation requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+// Base-class implementation: always throws BasisEvaluationException; both arguments are ignored.
+void BasisFunction::evaluate_vector_jacobians(
+    const math::Vector<Real, 3>&,
+    std::vector<VectorJacobian>&) const {
+    throw BasisEvaluationException("Vector-basis Jacobian evaluation requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+// Base-class implementation: always throws BasisEvaluationException; both arguments are ignored.
+void BasisFunction::evaluate_divergence(
+    const math::Vector<Real, 3>&,
+    std::vector<Real>&) const {
+    throw BasisEvaluationException("Divergence requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+// Base-class implementation: always throws BasisEvaluationException; both arguments are ignored.
+void BasisFunction::evaluate_curl(
+    const math::Vector<Real, 3>&,
+    std::vector<math::Vector<Real, 3>>&) const {
+    throw BasisEvaluationException("Curl requested on scalar basis",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
+                                       std::vector<Gradient>& gradients,
+                                       Real eps) const {
+    // Central differences of evaluate_values along each of the first dimension() axes.
+    std::vector<Real> reference;
+    evaluate_values(xi, reference);  // only its size is used below
+    const std::size_t count = reference.size();
+    gradients.assign(count, Gradient{});
+    const Real two_eps = Real(2) * eps;
+    for (int axis = 0; axis < dimension(); ++axis) {
+        const auto a = static_cast<std::size_t>(axis);
+        math::Vector<Real, 3> plus = xi;
+        math::Vector<Real, 3> minus = xi;
+        plus[a] += eps;
+        minus[a] -= eps;
+        std::vector<Real> at_plus, at_minus;
+        evaluate_values(plus, at_plus);
+        evaluate_values(minus, at_minus);
+        for (std::size_t i = 0; i < count; ++i) {
+            gradients[i][a] = (at_plus[i] - at_minus[i]) / two_eps;
+        }
+    }
+}
+
+void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
+                                      std::vector<Hessian>& hessians,
+                                      Real eps) const {
+    // Central differences of evaluate_gradients: column `col` comes from perturbing xi[col] by +/- eps.
+    std::vector<Gradient> reference;
+    evaluate_gradients(xi, reference);  // only its size is used below
+    const std::size_t count = reference.size();
+    hessians.assign(count, Hessian{});
+    const Real two_eps = Real(2) * eps;
+    for (int axis = 0; axis < dimension(); ++axis) {
+        const auto col = static_cast<std::size_t>(axis);
+        math::Vector<Real, 3> plus = xi;
+        math::Vector<Real, 3> minus = xi;
+        plus[col] += eps;
+        minus[col] -= eps;
+        std::vector<Gradient> grad_plus, grad_minus;
+        evaluate_gradients(plus, grad_plus);
+        evaluate_gradients(minus, grad_minus);
+        for (std::size_t i = 0; i < count; ++i) {
+            for (int k = 0; k < dimension(); ++k) {
+                const auto row = static_cast<std::size_t>(k);
+                hessians[i](row, col) = (grad_plus[i][row] - grad_minus[i][row]) / two_eps;
+            }
+        }
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
new file mode 100644
index 000000000..ee38a5b19
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -0,0 +1,426 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISFUNCTION_H
+#define SVMP_FE_BASIS_BASISFUNCTION_H
+
+/**
+ * @file BasisFunction.h
+ * @brief Abstract interface for basis function evaluation on reference elements
+ *
+ * The Basis module operates purely on reference elements and is independent of
+ * mesh-specific data structures. Implementations may leverage Math and
+ * Quadrature utilities but must not read mesh connectivity or geometry.
+ */
+
+#include "Types.h"
+#include "BasisExceptions.h"
+#include "Math/Vector.h"
+#include "Math/Matrix.h"
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using Gradient = math::Vector<Real, 3>;  // one basis gradient, indexed [0..2] by reference axis
+using Hessian  = math::Matrix<Real, 3, 3>;  // one basis Hessian, accessed as H(row, col)
+using VectorJacobian = math::Matrix<Real, 3, 3>;  // J(component, derivative_direction) of a vector basis function
+
+struct BasisIdentityFingerprint {  // two 64-bit hash words, both zero unless set
+    std::uint64_t hash_a = 0u;
+    std::uint64_t hash_b = 0u;
+};
+
+[[nodiscard]] BasisIdentityFingerprint
+compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept;  // NOTE(review): presumably folds `words` into hash_a/hash_b; defined out of line
+
+void prewarm_basis_function_scratch(std::size_t max_size,
+                                    std::size_t max_qpts = 0);  // NOTE(review): presumably pre-sizes evaluation scratch storage; confirm against the definition
+
+/// Build a symmetric 3x3 Hessian from its six independent entries: the three
+/// diagonal terms plus xy, xz, yz, each written to both mirrored positions.
+[[nodiscard]] inline Hessian make_symmetric_hessian(Real xx, Real yy, Real zz,
+                                                    Real xy, Real xz, Real yz) {
+    Hessian result{};
+    // Diagonal entries.
+    result(0, 0) = xx;
+    result(1, 1) = yy;
+    result(2, 2) = zz;
+    // Mirrored off-diagonal pairs.
+    result(0, 1) = xy;
+    result(1, 0) = xy;
+    result(0, 2) = xz;
+    result(2, 0) = xz;
+    result(1, 2) = yz;
+    result(2, 1) = yz;
+    return result;
+}
+
+// Raw Hessian buffers use row-major 3x3 blocks:
+// dst[row * 3 + col] = H(row, col).
+/// Flatten @p hessian into nine consecutive Real values starting at @p dst,
+/// one matrix row after another (the row-major layout described above).
+/// @p dst must have room for all nine entries.
+inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
+    std::size_t flat = 0u;  // running index row * 3 + col
+    for (std::size_t row = 0u; row < 3u; ++row) {
+        for (std::size_t col = 0u; col < 3u; ++col) {
+            dst[flat++] = hessian(row, col);
+        }
+    }
+}
+
+/// Strided counterpart of store_hessian: component k = row * 3 + col of
+/// @p hessian is written to dst[k * stride + offset], so the nine components
+/// sit @p stride entries apart instead of back to back.
+inline void store_hessian_strided(const Hessian& hessian,
+                                  Real* dst,
+                                  std::size_t stride,
+                                  std::size_t offset) noexcept {
+    for (std::size_t row = 0u; row < 3u; ++row) {
+        for (std::size_t col = 0u; col < 3u; ++col) {
+            const std::size_t component = row * 3u + col;
+            dst[component * stride + offset] = hessian(row, col);
+        }
+    }
+}
+
+/// Copy nine contiguous Hessian components from @p src into a strided buffer:
+/// dst[k * stride + offset] = src[k] for k = 0..8, written in ascending k.
+/// Same destination layout as store_hessian_strided, but the source is a raw
+/// component array rather than a Hessian object.
+inline void scatter_hessian_components_strided(const Real* src,
+                                               Real* dst,
+                                               std::size_t stride,
+                                               std::size_t offset) noexcept {
+    constexpr std::size_t component_count = 9u;
+    // One write per component; entries between the strided slots are untouched.
+    for (std::size_t k = 0u; k < component_count; ++k) {
+        dst[k * stride + offset] = src[k];
+    }
+}
+
+/// Rebuild a Hessian from nine row-major values: H(row, col) = src[row * 3 + col].
+/// Reads exactly the layout that store_hessian writes.
+/// @p src must point to at least nine readable Real values.
+[[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
+    Hessian result{};
+    std::size_t flat = 0u;  // running index row * 3 + col
+    for (std::size_t row = 0u; row < 3u; ++row) {
+        for (std::size_t col = 0u; col < 3u; ++col) {
+            result(row, col) = src[flat++];
+        }
+    }
+    return result;
+}
+
+/// Accumulate a scaled Hessian in place: target(r, c) += scale * source(r, c)
+/// for all nine entries. @p source is only read; @p target keeps whatever it
+/// held before, plus the scaled contribution.
+inline void add_scaled_hessian(Hessian& target,
+                               const Hessian& source,
+                               Real scale) noexcept {
+    for (std::size_t row = 0u; row < 3u; ++row) {
+        for (std::size_t col = 0u; col < 3u; ++col) {
+            // Each entry is updated independently of the others.
+            target(row, col) += scale * source(row, col);
+        }
+    }
+}
+
+/**
+ * @brief Base interface for scalar and vector-valued basis families
+ *
+ * All basis implementations operate in reference space. Physical mappings are
+ * handled by the Geometry module. Derivatives are returned with unused
+ * components set to zero for lower dimensional elements.
+ */
+class BasisFunction {
+public:
+    virtual ~BasisFunction() = default;  // virtual: derived bases may be destroyed through a BasisFunction pointer
+
+    /// Basis family identifier
+    virtual BasisType basis_type() const noexcept = 0;
+
+    /// Underlying element type on the reference domain
+    virtual ElementType element_type() const noexcept = 0;
+
+    /// Reference dimensionality (1, 2, or 3)
+    virtual int dimension() const noexcept = 0;
+
+    /// Polynomial order (modal/nodal definition dependent)
+    virtual int order() const noexcept = 0;
+
+    /// Number of basis functions (scalar or vector-valued)
+    virtual std::size_t size() const noexcept = 0;
+
+    /**
+     * @brief Whether BasisCache can key this basis from common structural fields.
+     *
+     * Return true only when basis_type/element_type/dimension/order/size and
+     * vector-valued status fully determine evaluation behavior. Parameterized
+     * bases such as splines and custom user bases should keep the default false
+     * so BasisCache includes cache_identity() in the key.
+     */
+    virtual bool cache_identity_is_structural() const noexcept { return false; }
+
+    /// Whether the basis is vector-valued (H(div)/H(curl))
+    virtual bool is_vector_valued() const noexcept { return false; }
+
+    /// Whether vector-valued basis Jacobians are available.
+    virtual bool supports_vector_jacobians() const noexcept { return false; }
+
+    /// Whether vector-valued basis curls are available.
+    virtual bool supports_curl() const noexcept { return false; }
+
+    /// Whether vector-valued basis divergences are available.
+    virtual bool supports_divergence() const noexcept { return false; }
+
+    /**
+     * @brief Stable semantic identity used by BasisCache
+     *
+     * Derived classes should override this when evaluation depends on
+     * additional state beyond basis family / element / order metadata.
+     */
+    virtual std::string cache_identity() const;
+
+    /**
+     * @brief Optional exact structured identity payload for BasisCache keys.
+     *
+     * Parameterized bases may append stable integer/bit-pattern words and
+     * return true to let BasisCache avoid using cache_identity() as the exact
+     * key payload. The human-readable cache_identity() remains available for
+     * diagnostics and for custom bases that do not implement this path.
+     */
+    virtual bool cache_identity_words(std::vector<std::uint64_t>& words) const;
+
+    /**
+     * @brief Optional cached fingerprint for structured identity words.
+     *
+     * Implementations that precompute cache_identity_words() may also cache the
+     * corresponding fingerprint. BasisCache still retains exact identity words
+     * for equality after hash matches.
+     */
+    virtual bool cache_identity_fingerprint(std::uint64_t& hash_a,
+                                            std::uint64_t& hash_b) const;
+
+    /**
+     * @brief Evaluate scalar basis values at a reference point
+     * @param xi Reference coordinates (unused entries are ignored)
+     * @param[out] values Output array resized to size()
+     */
+    virtual void evaluate_values(const math::Vector<Real, 3>& xi,
+                                 std::vector<Real>& values) const = 0;
+
+    /**
+     * @brief Evaluate gradients of scalar basis functions
+     *
+     * Production bases must override this with analytic derivatives.
+     * Use numerical_gradient explicitly in tests or diagnostics when a finite
+     * difference approximation is intended.
+     */
+    virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                    std::vector<Gradient>& gradients) const;
+
+    /**
+     * @brief Evaluate Hessians of scalar basis functions
+     *
+     * Production bases must override this with analytic second derivatives.
+     * Use numerical_hessian explicitly in tests or diagnostics when a finite
+     * difference approximation is intended.
+     */
+    virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                   std::vector<Hessian>& hessians) const;
+
+    /**
+     * @brief Fused evaluation of values, gradients, and Hessians at one point
+     *
+     * Default implementation calls evaluate_values, evaluate_gradients, and
+     * evaluate_hessians in sequence. Bases that share intermediate
+     * computations (e.g., LagrangeBasis sharing per-axis 1D evaluations)
+     * should override this to avoid redundant work.
+     */
+    virtual void evaluate_all(const math::Vector<Real, 3>& xi,
+                              std::vector<Real>& values,
+                              std::vector<Gradient>& gradients,
+                              std::vector<Hessian>& hessians) const;
+
+    /**
+     * @brief Fill SoA buffers with basis evaluations at all quadrature points
+     *
+     * Outputs are written directly to caller-provided strided buffers in
+     * DOF-major SoA layout — no scratch+transpose required by the caller.
+     * Pass `nullptr` for any output that is not needed.
+     *
+     *   values_out:    size num_dofs * num_qpts; element [d * num_qpts + q]
+     *   gradients_out: size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   hessians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + r*3 + c) * num_qpts + q]
+     *
+     * Non-null output ranges must not overlap each other. Implementations may
+     * fill requested quantities in any order that is efficient for the basis.
+     *
+     * Default implementation calls evaluate_all (or evaluate_values/gradients/
+     * hessians as appropriate) per QP, materializing into temp buffers then
+     * scatter-writing to the output. Performance-sensitive bases must override
+     * this path so batched assembly does not fall back to Q virtual point
+     * evaluations. Unit coverage keeps an explicit list of hot bases that are
+     * expected to provide a direct strided implementation.
+     */
+    virtual void evaluate_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill strided SoA buffers with basis evaluations at quadrature points
+     *
+     * Same component layout as evaluate_at_quadrature_points, but each
+     * dof/component row advances by `output_stride` rather than `points.size()`.
+     * This lets padded SIMD cache storage be filled directly. Non-null output
+     * ranges have the same non-overlap requirement.
+     */
+    virtual void evaluate_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill zero-initialized scalar cache storage.
+     *
+     * BasisCache allocates and zero-initializes its scalar SoA buffers before
+     * calling this hook. The default implementation overwrites all requested
+     * entries through the public strided evaluator. Sparse-support bases may
+     * override this and write only active entries, relying on the caller's
+     * zero-initialization for inactive DOFs and unused derivative components.
+     */
+    virtual void fill_scalar_cache_entry(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Fill SoA buffers with vector-basis evaluations at all quadrature points
+     *
+     * Outputs are written in DOF-major SoA layout. Pass `nullptr` for any
+     * quantity that is not needed.
+     *
+     *   values_out:     size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   jacobians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + c*3 + r) * num_qpts + q]
+     *   curls_out:      size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
+     *   divergence_out: size num_dofs * num_qpts; element [d * num_qpts + q]
+     *
+     * Non-null output ranges must not overlap each other. Implementations may
+     * fill requested quantities in any order that is efficient for the basis.
+     */
+    virtual void evaluate_vector_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const;
+
+    /**
+     * @brief Fill strided SoA buffers with vector-basis evaluations
+     *
+     * Same component layout as evaluate_vector_at_quadrature_points, but each
+     * dof/component row advances by `output_stride` rather than `points.size()`.
+     * Non-null output ranges have the same non-overlap requirement.
+     *
+     * The base fallback loops over quadrature points through virtual point
+     * evaluation. H(div)/H(curl) bases used in assembly should override this
+     * method directly, and tests track the current hot vector families.
+     */
+    virtual void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const;
+
+    /**
+     * @brief Evaluate scalar basis values into a caller-provided raw buffer
+     *
+     * Caller is responsible for providing a buffer of at least size() Real
+     * entries. This avoids the per-call std::vector::resize() cost of the
+     * vector-output overload. Default implementation forwards through a temp
+     * vector; bases should override for direct write.
+     */
+    virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                    Real* SVMP_RESTRICT values_out) const;
+
+    /**
+     * @brief Evaluate gradients into a flat caller-provided buffer
+     *
+     * Layout: gradients_out[i * 3 + c] = component c of gradient of basis i.
+     * Caller provides a buffer of size() * 3 Real entries.
+     */
+    virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT gradients_out) const;
+
+    /**
+     * @brief Evaluate Hessians into a flat caller-provided buffer
+     *
+     * Layout: hessians_out[i * 9 + r * 3 + c] = H_i(r, c); that indexing spans size() * 9 Real entries.
+     */
+    virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                      Real* SVMP_RESTRICT hessians_out) const;
+
+    /**
+     * @brief Evaluate vector-valued basis functions (H(div)/H(curl))
+     *
+     * Default implementation throws; vector bases must override.
+     */
+    virtual void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                        std::vector<math::Vector<Real, 3>>& values) const;
+
+    /**
+     * @brief Evaluate reference-space Jacobians of vector-valued basis functions
+     *
+     * The returned matrix for basis function `i` has entries
+     * `jacobians[i](component, derivative_direction) = d phi_i_component / d xi_direction`.
+     * Unused rows/columns are zero-filled for lower-dimensional elements.
+     */
+    virtual void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                           std::vector<VectorJacobian>& jacobians) const;
+
+    /// Evaluate divergence of vector-valued basis functions (if applicable); output holds scalar Real values
+    virtual void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                                     std::vector<Real>& divergence) const;
+
+    /// Evaluate curl of vector-valued basis functions (if applicable); output holds 3-component vectors
+    virtual void evaluate_curl(const math::Vector<Real, 3>& xi,
+                               std::vector<math::Vector<Real, 3>>& curl) const;
+
+protected:
+    /// Central finite-difference helper for gradients of scalar bases (perturbs evaluate_values by +/- eps per axis)
+    void numerical_gradient(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients,
+                            Real eps = Real(1e-6)) const;
+
+    /// Central finite-difference helper for Hessians of scalar bases (perturbs evaluate_gradients by +/- eps per axis)
+    void numerical_hessian(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians,
+                           Real eps = Real(1e-5)) const;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISFUNCTION_H
diff --git a/Code/Source/solver/FE/Basis/BasisTolerance.h b/Code/Source/solver/FE/Basis/BasisTolerance.h
new file mode 100644
index 000000000..423551f09
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisTolerance.h
@@ -0,0 +1,52 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISTOLERANCE_H
+#define SVMP_FE_BASIS_BASISTOLERANCE_H
+
+#include "Types.h"
+
+#include <limits>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {  // constexpr absolute value
+    return !(value < Real(0)) ? value : -value;
+}
+
+[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {  // constexpr maximum; lhs wins unless lhs < rhs
+    return !(lhs < rhs) ? lhs : rhs;
+}
+
+[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
+                                                    Real multiplier = Real(64)) noexcept {
+    const Real magnitude = basis_max(Real(1), basis_abs(scale));  // floor the scale at 1
+    return multiplier * std::numeric_limits<Real>::epsilon() * magnitude;
+}
+
+[[nodiscard]] constexpr bool basis_near_zero(Real value, Real scale = Real(1),
+                                             Real multiplier = Real(64)) noexcept {
+    const Real limit = basis_scaled_tolerance(scale, multiplier);  // largest magnitude still treated as zero
+    return basis_abs(value) <= limit;
+}
+
+[[nodiscard]] constexpr bool basis_nearly_equal(Real a, Real b,
+                                                Real multiplier = Real(64)) noexcept {
+    const Real largest = basis_max(basis_abs(a), basis_abs(b));
+    const Real scale = basis_max(Real(1), largest);  // tolerance grows with the larger operand once it exceeds 1
+    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISTOLERANCE_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
new file mode 100644
index 000000000..835dfe705
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -0,0 +1,218 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_BASISTRAITS_H
+#define SVMP_FE_BASIS_BASISTRAITS_H
+
+#include "Types.h"
+
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+enum class BasisTopology {
+    Unknown,        // returned by topology() when no is_* predicate matches
+    Point,          // Point1
+    Line,           // Line2, Line3
+    Triangle,       // Triangle3, Triangle6
+    Quadrilateral,  // Quad4, Quad8, Quad9
+    Tetrahedron,    // Tetra4, Tetra10
+    Hexahedron,     // Hex8, Hex20, Hex27
+    Wedge,          // Wedge6, Wedge15, Wedge18
+    Pyramid,        // Pyramid5, Pyramid13, Pyramid14
+};
+
+[[nodiscard]] constexpr bool is_point(ElementType type) noexcept {  // true only for Point1
+    return ElementType::Point1 == type;
+}
+
+[[nodiscard]] constexpr bool is_line(ElementType type) noexcept {  // true for Line2 and Line3
+    return type == ElementType::Line3 || type == ElementType::Line2;
+}
+
+[[nodiscard]] constexpr bool is_triangle(ElementType type) noexcept {  // true for Triangle3 and Triangle6
+    return type == ElementType::Triangle6 || type == ElementType::Triangle3;
+}
+
+[[nodiscard]] constexpr bool is_quadrilateral(ElementType type) noexcept {
+    const bool lagrange_alias = type == ElementType::Quad4 || type == ElementType::Quad9;
+    return lagrange_alias || type == ElementType::Quad8;  // Quad8 has no canonical_lagrange_type mapping
+}
+
+[[nodiscard]] constexpr bool is_tetrahedron(ElementType type) noexcept {  // true for Tetra4 and Tetra10
+    return type == ElementType::Tetra10 || type == ElementType::Tetra4;
+}
+
+[[nodiscard]] constexpr bool is_hexahedron(ElementType type) noexcept {
+    const bool lagrange_alias = type == ElementType::Hex8 || type == ElementType::Hex27;
+    return lagrange_alias || type == ElementType::Hex20;  // Hex20 has no canonical_lagrange_type mapping
+}
+
+[[nodiscard]] constexpr bool is_wedge(ElementType type) noexcept {
+    const bool lagrange_alias = type == ElementType::Wedge6 || type == ElementType::Wedge18;
+    return lagrange_alias || type == ElementType::Wedge15;  // Wedge15 has no canonical_lagrange_type mapping
+}
+
+[[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
+    const bool lagrange_alias = type == ElementType::Pyramid5 || type == ElementType::Pyramid14;
+    return lagrange_alias || type == ElementType::Pyramid13;  // Pyramid13 has no canonical_lagrange_type mapping
+}
+
+[[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {  // triangle or tetrahedron family
+    return is_tetrahedron(type) || is_triangle(type);
+}
+
+[[nodiscard]] constexpr bool is_tensor_product(ElementType type) noexcept {  // line, quadrilateral, or hexahedron family
+    return is_hexahedron(type) || is_quadrilateral(type) || is_line(type);
+}
+
+[[nodiscard]] constexpr int reference_dimension(ElementType type) noexcept {  // basis-namespace spelling of element_dimension()
+    return element_dimension(type);  // element_dimension is not declared in this header; presumably provided by Types.h
+}
+
+/**
+ * @brief Classify an element type into its topological family.
+ *
+ * The is_* predicates above are tried from Point through Pyramid and the
+ * first match wins; types matched by none yield BasisTopology::Unknown.
+ */
+[[nodiscard]] constexpr BasisTopology topology(ElementType type) noexcept {
+    BasisTopology family = BasisTopology::Unknown;
+    if (is_point(type)) {
+        family = BasisTopology::Point;
+    } else if (is_line(type)) {
+        family = BasisTopology::Line;
+    } else if (is_triangle(type)) {
+        family = BasisTopology::Triangle;
+    } else if (is_quadrilateral(type)) {
+        family = BasisTopology::Quadrilateral;
+    } else if (is_tetrahedron(type)) {
+        family = BasisTopology::Tetrahedron;
+    } else if (is_hexahedron(type)) {
+        family = BasisTopology::Hexahedron;
+    } else if (is_wedge(type)) {
+        family = BasisTopology::Wedge;
+    } else if (is_pyramid(type)) {
+        family = BasisTopology::Pyramid;
+    }
+    return family;
+}
+
+/**
+ * @brief Collapse a complete Lagrange alias onto the lowest-order type of its shape.
+ *
+ * The second-order aliases (Line3, Triangle6, Quad9, Tetra10, Hex27, Wedge18,
+ * Pyramid14) map to Line2, Triangle3, Quad4, Tetra4, Hex8, Wedge6 and Pyramid5.
+ * Every other value, including those lowest-order types, is returned unchanged.
+ */
+[[nodiscard]] constexpr ElementType canonical_lagrange_type(ElementType type) noexcept {
+    switch (type) {
+        case ElementType::Line3:
+            return ElementType::Line2;
+        case ElementType::Triangle6:
+            return ElementType::Triangle3;
+        case ElementType::Quad9:
+            return ElementType::Quad4;
+        case ElementType::Tetra10:
+            return ElementType::Tetra4;
+        case ElementType::Hex27:
+            return ElementType::Hex8;
+        case ElementType::Wedge18:
+            return ElementType::Wedge6;
+        case ElementType::Pyramid14:
+            return ElementType::Pyramid5;
+        default:
+            return type;
+    }
+}
+
+/**
+ * @brief Polynomial order encoded by a complete Lagrange alias type.
+ *
+ * Returns 1 for the lowest-order type of each shape, 2 for its second-order
+ * alias, and -1 for every other element type.
+ */
+[[nodiscard]] constexpr int complete_lagrange_alias_order(ElementType type) noexcept {
+    const ElementType base = canonical_lagrange_type(type);
+    switch (base) {
+        case ElementType::Line2:
+        case ElementType::Triangle3:
+        case ElementType::Quad4:
+        case ElementType::Tetra4:
+        case ElementType::Hex8:
+        case ElementType::Wedge6:
+        case ElementType::Pyramid5:
+            // A type that is its own canonical form is first order; otherwise it is the second-order alias.
+            return base == type ? 1 : 2;
+        default:
+            return -1;
+    }
+}
+
+[[nodiscard]] constexpr std::size_t line_lagrange_size(int order) noexcept {  // p + 1 nodes, 0 if order < 0
+    return order >= 0 ? static_cast<std::size_t>(order) + 1u : 0u;  // widen first: order + 1 in int overflows at INT_MAX
+}
+
+[[nodiscard]] constexpr std::size_t triangle_lagrange_size(int order) noexcept {  // (p+1)(p+2)/2 nodes, 0 if order < 0; formed in size_t so the product cannot overflow int
+    return order >= 0 ? (static_cast<std::size_t>(order) + 1u) * (static_cast<std::size_t>(order) + 2u) / 2u : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t quad_lagrange_size(int order) noexcept {  // (p+1)^2 nodes, 0 if order < 0; formed in size_t so the square cannot overflow int
+    return order >= 0 ? (static_cast<std::size_t>(order) + 1u) * (static_cast<std::size_t>(order) + 1u) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t tetra_lagrange_size(int order) noexcept {  // (p+1)(p+2)(p+3)/6 nodes, 0 if order < 0; formed in size_t so the product cannot overflow int
+    return order >= 0 ? (static_cast<std::size_t>(order) + 1u) * (static_cast<std::size_t>(order) + 2u) * (static_cast<std::size_t>(order) + 3u) / 6u : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t hex_lagrange_size(int order) noexcept {  // (p+1)^3 nodes, 0 if order < 0; formed in size_t so the cube cannot overflow int
+    return order >= 0 ? (static_cast<std::size_t>(order) + 1u) * (static_cast<std::size_t>(order) + 1u) * (static_cast<std::size_t>(order) + 1u) : 0u;
+}
+
+[[nodiscard]] constexpr std::size_t wedge_lagrange_size(int order) noexcept {  // triangle node count times line node count
+    return line_lagrange_size(order) * triangle_lagrange_size(order);
+}
+
+// Node count of the order-p pyramid: (p + 1)(p + 2)(2p + 3) / 6, which equals
+// the sum of squares 1^2 + 2^2 + ... + (p + 1)^2. Negative orders give 0.
+// All arithmetic is done in std::size_t.
+[[nodiscard]] constexpr std::size_t pyramid_lagrange_size(int order) noexcept {
+    const std::size_t n = order < 0 ? 0u : static_cast<std::size_t>(order) + 1u;  // p + 1, or 0
+    return n * (n + 1u) * (2u * n + 1u) / 6u;
+}
+
+/**
+ * @brief Number of nodes of a complete Lagrange alias element type.
+ *
+ * Point1 always has one node. For the other shapes the count is the
+ * per-shape size formula evaluated at the alias order. Types that are not
+ * complete Lagrange aliases yield 0.
+ */
+[[nodiscard]] constexpr std::size_t complete_lagrange_alias_size(ElementType type) noexcept {
+    if (type == ElementType::Point1) {
+        return 1u;
+    }
+    const int p = complete_lagrange_alias_order(type);
+    switch (canonical_lagrange_type(type)) {
+        case ElementType::Line2:     return line_lagrange_size(p);
+        case ElementType::Triangle3: return triangle_lagrange_size(p);
+        case ElementType::Quad4:     return quad_lagrange_size(p);
+        case ElementType::Tetra4:    return tetra_lagrange_size(p);
+        case ElementType::Hex8:      return hex_lagrange_size(p);
+        case ElementType::Wedge6:    return wedge_lagrange_size(p);
+        case ElementType::Pyramid5:  return pyramid_lagrange_size(p);
+        default:                     return 0u;
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_BASISTRAITS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
new file mode 100644
index 000000000..63b947516
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -0,0 +1,8323 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "LagrangeBasis.h"
+#include "BasisTraits.h"
+#include "BasisTolerance.h"
+#include "LagrangeBasisFast.h"
+#include "NodeOrderingConventions.h"
+#include "LagrangeBasisPyramid.h"
+#include "LagrangeBasisSimplex.h"
+#include "LagrangeBasisUtility.h"
+#include <algorithm>
+#include <cmath>
+#include <unordered_map>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using LagrangeTopology = BasisTopology;
+
+// Compiler-specific annotations local to this translation unit.
+//   SVMP_LAGRANGE_NOINLINE - expands to the compiler's "never inline" attribute on
+//                            MSVC and GCC/Clang, and to nothing elsewhere.
+//   SVMP_LAGRANGE_ALIGN64  - expands to a 64-byte alignment attribute on GCC/Clang
+//                            only; it is empty on MSVC and on unknown compilers.
+#if defined(_MSC_VER)
+#define SVMP_LAGRANGE_NOINLINE __declspec(noinline)
+#define SVMP_LAGRANGE_ALIGN64
+#elif defined(__GNUC__) || defined(__clang__)
+#define SVMP_LAGRANGE_NOINLINE __attribute__((noinline))
+#define SVMP_LAGRANGE_ALIGN64 __attribute__((aligned(64)))
+#else
+#define SVMP_LAGRANGE_NOINLINE
+#define SVMP_LAGRANGE_ALIGN64
+#endif
+
+// FE_ALWAYS_INLINE is only defined here when no earlier header has provided it.
+// It forces inlining on MSVC and GCC/Clang and degrades to plain `inline` otherwise.
+#ifndef FE_ALWAYS_INLINE
+#if defined(_MSC_VER)
+#define FE_ALWAYS_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FE_ALWAYS_INLINE __attribute__((always_inline)) inline
+#else
+#define FE_ALWAYS_INLINE inline
+#endif
+#endif
+
+// Forward declaration; the definition appears later in this file. Unlike the
+// value kernels it takes a point count instead of the points themselves.
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out);
+
+// Plain aggregate pairing a basis topology with an integer dimension.
+struct LagrangeTopologyTraits {
+    LagrangeTopology topology;  // alias of BasisTopology (see the using-declaration above)
+    int dimension;              // presumably the spatial dimension of `topology` - TODO confirm at use sites
+};
+
+/// Hash functor for four-component integer exponent tuples.
+///
+/// Starts from a fixed seed and folds the components in index order; each step
+/// adds the component, the seed constant and two shifted copies of the running
+/// hash, then XORs the sum into the running hash.
+struct SimplexExponentHash {
+    std::size_t operator()(const std::array<int, 4>& exponents) const noexcept {
+        constexpr std::size_t kMix = 0x9e3779b97f4a7c15ull;
+        std::size_t hash = kMix;
+        for (std::size_t k = 0; k < exponents.size(); ++k) {
+            const std::size_t term = static_cast<std::size_t>(exponents[k]);
+            const std::size_t mixed = term + kMix + (hash << 6u) + (hash >> 2u);
+            hash ^= mixed;
+        }
+        return hash;
+    }
+};
+
+/// Replaces the contents of `out` with a copy of the N elements of `values`.
+template<typename T, std::size_t N>
+void assign_array(std::vector<T>& out, const std::array<T, N>& values) {
+    out.assign(values.cbegin(), values.cend());
+}
+
+// Returns whether `coord` equals `expected` according to
+// detail::basis_nearly_equal; the tolerance used is defined by that helper.
+bool coordinate_matches_expected(Real coord, Real expected) noexcept {
+    return detail::basis_nearly_equal(coord, expected);
+}
+
+/// Evaluates the compile-time basis `FastBasis` at `xi` and copies each requested
+/// quantity into its output vector.
+///
+/// A null output pointer skips the corresponding evaluation. Every non-null
+/// vector is overwritten with exactly FastBasis::n_dofs entries.
+template<typename FastBasis>
+void evaluate_fast_outputs(const math::Vector<Real, 3>& xi,
+                           std::vector<Real>* values,
+                           std::vector<Gradient>* gradients,
+                           std::vector<Hessian>* hessians) {
+    if (values) {
+        std::array<Real, FastBasis::n_dofs> value_buffer{};
+        FastBasis::evaluate(xi, value_buffer);
+        values->assign(value_buffer.begin(), value_buffer.end());
+    }
+
+    if (gradients) {
+        std::array<Gradient, FastBasis::n_dofs> gradient_buffer{};
+        FastBasis::evaluate_gradients(xi, gradient_buffer);
+        gradients->assign(gradient_buffer.begin(), gradient_buffer.end());
+    }
+
+    if (hessians) {
+        std::array<Hessian, FastBasis::n_dofs> hessian_buffer{};
+        FastBasis::evaluate_hessians(xi, hessian_buffer);
+        hessians->assign(hessian_buffer.begin(), hessian_buffer.end());
+    }
+}
+
+/// Evaluates `FastBasis` at `xi` into caller-provided flat buffers.
+///
+/// Layout per basis function i: one value at values_out[i], three gradient
+/// components at gradients_out[3 * i ...], and nine Hessian entries written by
+/// store_hessian at hessians_out[9 * i ...]. Null buffers are skipped.
+template<typename FastBasis>
+void evaluate_fast_outputs_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT values_out,
+                              Real* SVMP_RESTRICT gradients_out,
+                              Real* SVMP_RESTRICT hessians_out) {
+    if (values_out) {
+        std::array<Real, FastBasis::n_dofs> value_buffer{};
+        FastBasis::evaluate(xi, value_buffer);
+        std::size_t dof = 0;
+        for (const Real& value : value_buffer) {
+            values_out[dof++] = value;
+        }
+    }
+
+    if (gradients_out) {
+        std::array<Gradient, FastBasis::n_dofs> gradient_buffer{};
+        FastBasis::evaluate_gradients(xi, gradient_buffer);
+        for (std::size_t dof = 0; dof < gradient_buffer.size(); ++dof) {
+            const Gradient& gradient = gradient_buffer[dof];
+            Real* dst = gradients_out + dof * 3u;
+            dst[0] = gradient[0];
+            dst[1] = gradient[1];
+            dst[2] = gradient[2];
+        }
+    }
+
+    if (hessians_out) {
+        std::array<Hessian, FastBasis::n_dofs> hessian_buffer{};
+        FastBasis::evaluate_hessians(xi, hessian_buffer);
+        for (std::size_t dof = 0; dof < hessian_buffer.size(); ++dof) {
+            store_hessian(hessian_buffer[dof], hessians_out + dof * 9u);
+        }
+    }
+}
+
+/// Evaluates `FastBasis` at every point and scatters the results into strided
+/// buffers, with the point index q as the fastest-varying index.
+///
+/// For basis function i and point q:
+///   values_out[i * output_stride + q]
+///   gradients_out[(3 * i + c) * output_stride + q],         c in 0..2
+///   hessians_out[(9 * i + 3 * r + c) * output_stride + q],  r, c in 0..2
+/// Null buffers are skipped.
+template<typename FastBasis>
+void evaluate_fast_outputs_strided(const std::vector<math::Vector<Real, 3>>& points,
+                                   std::size_t output_stride,
+                                   Real* SVMP_RESTRICT values_out,
+                                   Real* SVMP_RESTRICT gradients_out,
+                                   Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_points = points.size();
+    for (std::size_t q = 0; q < num_points; ++q) {
+        const auto& xi = points[q];
+
+        if (values_out) {
+            std::array<Real, FastBasis::n_dofs> value_buffer{};
+            FastBasis::evaluate(xi, value_buffer);
+            for (std::size_t dof = 0; dof < value_buffer.size(); ++dof) {
+                values_out[dof * output_stride + q] = value_buffer[dof];
+            }
+        }
+
+        if (gradients_out) {
+            std::array<Gradient, FastBasis::n_dofs> gradient_buffer{};
+            FastBasis::evaluate_gradients(xi, gradient_buffer);
+            for (std::size_t dof = 0; dof < gradient_buffer.size(); ++dof) {
+                const Gradient& gradient = gradient_buffer[dof];
+                Real* block = gradients_out + dof * 3u * output_stride;
+                block[q] = gradient[0];
+                block[output_stride + q] = gradient[1];
+                block[2u * output_stride + q] = gradient[2];
+            }
+        }
+
+        if (hessians_out) {
+            std::array<Hessian, FastBasis::n_dofs> hessian_buffer{};
+            FastBasis::evaluate_hessians(xi, hessian_buffer);
+            for (std::size_t dof = 0; dof < hessian_buffer.size(); ++dof) {
+                const Hessian& hessian = hessian_buffer[dof];
+                Real* block = hessians_out + dof * 9u * output_stride;
+                // Row-major walk over the 3x3 matrix; entry (r, c) goes to slot 3r + c.
+                for (int r = 0; r < 3; ++r) {
+                    for (int c = 0; c < 3; ++c) {
+                        const std::size_t slot = static_cast<std::size_t>(3 * r + c);
+                        block[slot * output_stride + q] = hessian(r, c);
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Routes a single-point evaluation to the fixed-order fast basis matching
+/// `topology`, filling the optional output vectors.
+///
+/// Line, triangle, quadrilateral, tetrahedron and hexahedron are handled; for
+/// any other topology nothing is evaluated and false is returned.
+template<int Order>
+bool evaluate_fixed_lagrange_fast_order(LagrangeTopology topology,
+                                        const math::Vector<Real, 3>& xi,
+                                        std::vector<Real>* values,
+                                        std::vector<Gradient>* gradients,
+                                        std::vector<Hessian>* hessians) {
+    bool handled = true;
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs<LagrangeLineFast<Order>>(xi, values, gradients, hessians);
+            break;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs<LagrangeTriFast<Order>>(xi, values, gradients, hessians);
+            break;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs<LagrangeQuadFast<Order>>(xi, values, gradients, hessians);
+            break;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs<LagrangeTetFast<Order>>(xi, values, gradients, hessians);
+            break;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs<LagrangeHexFast<Order>>(xi, values, gradients, hessians);
+            break;
+        default:
+            handled = false;
+            break;
+    }
+    return handled;
+}
+
+/// Routes a single-point evaluation into flat caller buffers to the fixed-order
+/// fast basis matching `topology`.
+///
+/// Line, triangle, quadrilateral, tetrahedron and hexahedron are handled; for
+/// any other topology nothing is written and false is returned.
+template<int Order>
+bool evaluate_fixed_lagrange_fast_to_order(LagrangeTopology topology,
+                                           const math::Vector<Real, 3>& xi,
+                                           Real* SVMP_RESTRICT values_out,
+                                           Real* SVMP_RESTRICT gradients_out,
+                                           Real* SVMP_RESTRICT hessians_out) {
+    bool handled = true;
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs_to<LagrangeLineFast<Order>>(
+                xi, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs_to<LagrangeTriFast<Order>>(
+                xi, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs_to<LagrangeQuadFast<Order>>(
+                xi, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs_to<LagrangeTetFast<Order>>(
+                xi, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs_to<LagrangeHexFast<Order>>(
+                xi, values_out, gradients_out, hessians_out);
+            break;
+        default:
+            handled = false;
+            break;
+    }
+    return handled;
+}
+
+/// Routes a multi-point strided evaluation to the fixed-order fast basis
+/// matching `topology`.
+///
+/// Line, triangle, quadrilateral, tetrahedron and hexahedron are handled; for
+/// any other topology nothing is written and false is returned.
+template<int Order>
+bool evaluate_fixed_lagrange_fast_strided_order(
+    LagrangeTopology topology,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    bool handled = true;
+    switch (topology) {
+        case LagrangeTopology::Line:
+            evaluate_fast_outputs_strided<LagrangeLineFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Triangle:
+            evaluate_fast_outputs_strided<LagrangeTriFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Quadrilateral:
+            evaluate_fast_outputs_strided<LagrangeQuadFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Tetrahedron:
+            evaluate_fast_outputs_strided<LagrangeTetFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+        case LagrangeTopology::Hexahedron:
+            evaluate_fast_outputs_strided<LagrangeHexFast<Order>>(
+                points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+        default:
+            handled = false;
+            break;
+    }
+    return handled;
+}
+
+/// Writes the ten order-3 triangle shape-function values for every point into
+/// values_out[node * output_stride + q].
+///
+/// With l1 = xi[0], l2 = xi[1], l0 = 1 - l1 - l2 and s_k = 3 * l_k, the building
+/// blocks are
+///   pair(s)   = 0.5 * s * (s - 1)
+///   triple(s) = s * (s - 1) * (s - 2) / 6
+/// Rows 0..2 hold triple(s_k), rows 3..8 hold products of one pair() term with
+/// one linear term, and row 9 holds s_0 * s_1 * s_2.
+void evaluate_triangle_order3_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr std::size_t kNodes = 10u;
+    Real* rows[kNodes];
+    for (std::size_t node = 0; node < kNodes; ++node) {
+        rows[node] = values_out + node * output_stride;
+    }
+
+    const auto pair_term = [](Real s) { return Real(0.5) * s * (s - Real(1)); };
+    const auto triple_term = [](Real s) {
+        return (s * (s - Real(1)) * (s - Real(2))) / Real(6);
+    };
+
+    // Dense four-point case: stage all factors first, then store.
+    if (points.size() == 4u && output_stride == 4u) {
+        Real lin[3][4];
+        Real quad[3][4];
+        Real cub[3][4];
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+
+            lin[0][q] = Real(3) * l0;
+            lin[1][q] = Real(3) * l1;
+            lin[2][q] = Real(3) * l2;
+            for (std::size_t k = 0; k < 3u; ++k) {
+                quad[k][q] = pair_term(lin[k][q]);
+                cub[k][q] = triple_term(lin[k][q]);
+            }
+        }
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            rows[0][q] = cub[0][q];
+            rows[1][q] = cub[1][q];
+            rows[2][q] = cub[2][q];
+            rows[3][q] = quad[0][q] * lin[1][q];
+            rows[4][q] = lin[0][q] * quad[1][q];
+            rows[5][q] = quad[1][q] * lin[2][q];
+            rows[6][q] = lin[1][q] * quad[2][q];
+            rows[7][q] = lin[0][q] * quad[2][q];
+            rows[8][q] = quad[0][q] * lin[2][q];
+            rows[9][q] = lin[0][q] * lin[1][q] * lin[2][q];
+        }
+        return;
+    }
+
+    const std::size_t num_points = points.size();
+    for (std::size_t q = 0; q < num_points; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        const Real s0 = Real(3) * l0;
+        const Real s1 = Real(3) * l1;
+        const Real s2 = Real(3) * l2;
+        const Real d0 = pair_term(s0);
+        const Real d1 = pair_term(s1);
+        const Real d2 = pair_term(s2);
+
+        rows[0][q] = triple_term(s0);
+        rows[1][q] = triple_term(s1);
+        rows[2][q] = triple_term(s2);
+        rows[3][q] = d0 * s1;
+        rows[4][q] = s0 * d1;
+        rows[5][q] = d1 * s2;
+        rows[6][q] = s1 * d2;
+        rows[7][q] = s0 * d2;
+        rows[8][q] = d0 * s2;
+        rows[9][q] = s0 * s1 * s2;
+    }
+}
+
+/// Writes the six order-2 triangle shape-function values for every point into
+/// values_out[node * output_stride + q].
+///
+/// With l1 = xi[0], l2 = xi[1] and l0 = 1 - l1 - l2:
+///   rows 0..2: l_k * (2 * l_k - 1)
+///   row 3: 4 * l0 * l1,  row 4: 4 * l1 * l2,  row 5: 4 * l0 * l2
+void evaluate_triangle_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr std::size_t kNodes = 6u;
+    Real* rows[kNodes];
+    for (std::size_t node = 0; node < kNodes; ++node) {
+        rows[node] = values_out + node * output_stride;
+    }
+
+    const auto corner = [](Real l) { return l * (Real(2) * l - Real(1)); };
+    const auto store = [&rows, &corner](std::size_t q, Real l0, Real l1, Real l2) {
+        rows[0][q] = corner(l0);
+        rows[1][q] = corner(l1);
+        rows[2][q] = corner(l2);
+        rows[3][q] = Real(4) * l0 * l1;
+        rows[4][q] = Real(4) * l1 * l2;
+        rows[5][q] = Real(4) * l0 * l2;
+    };
+    const auto store_point = [&points, &store](std::size_t q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        store(q, Real(1) - l1 - l2, l1, l2);
+    };
+
+    const std::size_t num_points = points.size();
+
+    // Dense four-point case: gather the barycentric coordinates, then store.
+    if (num_points == 4u && output_stride == 4u) {
+        Real bary[3][4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            bary[1][q] = xi[0];
+            bary[2][q] = xi[1];
+            bary[0][q] = Real(1) - bary[1][q] - bary[2][q];
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            store(q, bary[0][q], bary[1][q], bary[2][q]);
+        }
+        return;
+    }
+
+    // Four points with a different stride: fixed trip count.
+    if (num_points == 4u) {
+        for (std::size_t q = 0; q < 4u; ++q) {
+            store_point(q);
+        }
+        return;
+    }
+
+    for (std::size_t q = 0; q < num_points; ++q) {
+        store_point(q);
+    }
+}
+
+/// Writes the three order-1 triangle shape-function values for every point:
+/// row 0 = 1 - xi[0] - xi[1], row 1 = xi[0], row 2 = xi[1], each stored at
+/// values_out[node * output_stride + q].
+void evaluate_triangle_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* const node0 = values_out;
+    Real* const node1 = values_out + output_stride;
+    Real* const node2 = values_out + 2u * output_stride;
+
+    std::size_t q = 0;
+    for (const auto& xi : points) {
+        node0[q] = Real(1) - xi[0] - xi[1];
+        node1[q] = xi[0];
+        node2[q] = xi[1];
+        ++q;
+    }
+}
+
+/// Writes the gradients of the six order-2 triangle shape functions for every
+/// point into gradients_out[(3 * node + c) * output_stride + q].
+///
+/// The third component is always written as zero. With l1 = xi[0], l2 = xi[1]
+/// and l0 = 1 - l1 - l2 the (x, y) components are
+///   node 0: (1 - 4 l0, 1 - 4 l0)    node 3: (4 (l0 - l1), -4 l1)
+///   node 1: (4 l1 - 1, 0)           node 4: (4 l2, 4 l1)
+///   node 2: (0, 4 l2 - 1)           node 5: (-4 l2, 4 (l0 - l2))
+void evaluate_triangle_order2_gradients_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    // Stores one planar gradient; the out-of-plane component is zero.
+    const auto store = [gradients_out, output_stride](
+                           std::size_t node, std::size_t q, Real gx, Real gy) {
+        Real* block = gradients_out + node * 3u * output_stride;
+        block[0u * output_stride + q] = gx;
+        block[1u * output_stride + q] = gy;
+        block[2u * output_stride + q] = Real(0);
+    };
+
+    const std::size_t num_points = points.size();
+    for (std::size_t q = 0; q < num_points; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+        const Real corner0 = Real(1) - Real(4) * l0;
+
+        store(0u, q, corner0, corner0);
+        store(1u, q, Real(4) * l1 - Real(1), Real(0));
+        store(2u, q, Real(0), Real(4) * l2 - Real(1));
+        store(3u, q, Real(4) * (l0 - l1), Real(-4) * l1);
+        store(4u, q, Real(4) * l2, Real(4) * l1);
+        store(5u, q, Real(-4) * l2, Real(4) * (l0 - l2));
+    }
+}
+
+/// Broadcasts one 3x3 Hessian, given in row-major order, to exactly four
+/// consecutive point slots: entry k is written to
+/// row[k * output_stride + q] for q = 0..3.
+inline void write_constant_hessian_q4(Real* SVMP_RESTRICT row,
+                                      std::size_t output_stride,
+                                      Real h00,
+                                      Real h01,
+                                      Real h02,
+                                      Real h10,
+                                      Real h11,
+                                      Real h12,
+                                      Real h20,
+                                      Real h21,
+                                      Real h22) {
+    const Real entries[9] = {h00, h01, h02, h10, h11, h12, h20, h21, h22};
+    for (std::size_t component = 0; component < 9u; ++component) {
+        Real* dst = row + component * output_stride;
+        for (std::size_t q = 0; q < 4u; ++q) {
+            dst[q] = entries[component];
+        }
+    }
+}
+
+/// Writes the constant Hessians of the six order-2 triangle shape functions for
+/// four points. Node n occupies the nine rows starting at
+/// hessians_out + n * 9 * output_stride.
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    // One row-major 3x3 Hessian per node.
+    static constexpr Real kHessians[6][9] = {
+        { 4,  4,  0,    4,  4,  0,    0,  0,  0},
+        { 4,  0,  0,    0,  0,  0,    0,  0,  0},
+        { 0,  0,  0,    0,  4,  0,    0,  0,  0},
+        {-8, -4,  0,   -4,  0,  0,    0,  0,  0},
+        { 0,  4,  0,    4,  0,  0,    0,  0,  0},
+        { 0, -4,  0,   -4, -8,  0,    0,  0,  0},
+    };
+
+    for (std::size_t node = 0; node < 6u; ++node) {
+        const Real* h = kHessians[node];
+        write_constant_hessian_q4(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  h[0], h[1], h[2],
+                                  h[3], h[4], h[5],
+                                  h[6], h[7], h[8]);
+    }
+}
+
+/// Writes the constant Hessians of the ten order-2 tetrahedron shape functions
+/// for four points. Node n occupies the nine rows starting at
+/// hessians_out + n * 9 * output_stride.
+SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    // One row-major 3x3 Hessian per node.
+    static constexpr Real kHessians[10][9] = {
+        { 4,  4,  4,    4,  4,  4,    4,  4,  4},
+        { 4,  0,  0,    0,  0,  0,    0,  0,  0},
+        { 0,  0,  0,    0,  4,  0,    0,  0,  0},
+        { 0,  0,  0,    0,  0,  0,    0,  0,  4},
+        {-8, -4, -4,   -4,  0,  0,   -4,  0,  0},
+        { 0,  4,  0,    4,  0,  0,    0,  0,  0},
+        { 0, -4,  0,   -4, -8, -4,    0, -4,  0},
+        { 0,  0, -4,    0,  0, -4,   -4, -4, -8},
+        { 0,  0,  4,    0,  0,  0,    4,  0,  0},
+        { 0,  0,  0,    0,  0,  4,    0,  4,  0},
+    };
+
+    for (std::size_t node = 0; node < 10u; ++node) {
+        const Real* h = kHessians[node];
+        write_constant_hessian_q4(hessians_out + node * 9u * output_stride,
+                                  output_stride,
+                                  h[0], h[1], h[2],
+                                  h[3], h[4], h[5],
+                                  h[6], h[7], h[8]);
+    }
+}
+
+/// Writes the four order-1 tetrahedron shape-function values for every point:
+/// row 0 = 1 - xi[0] - xi[1] - xi[2], rows 1..3 = xi[0], xi[1], xi[2], each
+/// stored at values_out[node * output_stride + q].
+void evaluate_tet_order1_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* const node0 = values_out;
+    Real* const node1 = values_out + output_stride;
+    Real* const node2 = values_out + 2u * output_stride;
+    Real* const node3 = values_out + 3u * output_stride;
+
+    std::size_t q = 0;
+    for (const auto& xi : points) {
+        node0[q] = Real(1) - xi[0] - xi[1] - xi[2];
+        node1[q] = xi[0];
+        node2[q] = xi[1];
+        node3[q] = xi[2];
+        ++q;
+    }
+}
+
+/// Writes the point-independent gradients of the four order-1 tetrahedron shape
+/// functions for `num_qpts` points into
+/// gradients_out[(3 * node + c) * output_stride + q].
+void evaluate_tet_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    // Gradient of each node's shape function: (-1,-1,-1), e_x, e_y, e_z.
+    static constexpr Real kGradients[4][3] = {
+        {-1, -1, -1},
+        { 1,  0,  0},
+        { 0,  1,  0},
+        { 0,  0,  1},
+    };
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        for (std::size_t node = 0; node < 4u; ++node) {
+            Real* block = gradients_out + node * 3u * output_stride;
+            for (std::size_t c = 0; c < 3u; ++c) {
+                block[c * output_stride + q] = kGradients[node][c];
+            }
+        }
+    }
+}
+
+/// Zero-fills the strided Hessian storage of `num_nodes` basis functions at
+/// `num_qpts` points. Entry k of node n at point q lives at
+/// hessians_out[(9 * n + k) * output_stride + q].
+///
+/// Exactly four points are delegated to write_constant_hessian_q4.
+void evaluate_zero_hessians_strided(
+    std::size_t num_nodes,
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    const bool four_points = (num_qpts == 4u);
+    const Real zero = Real(0);
+
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        Real* block = hessians_out + node * 9u * output_stride;
+
+        if (four_points) {
+            write_constant_hessian_q4(block,
+                                      output_stride,
+                                      zero, zero, zero,
+                                      zero, zero, zero,
+                                      zero, zero, zero);
+            continue;
+        }
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            for (std::size_t component = 0; component < 9u; ++component) {
+                block[component * output_stride + q] = zero;
+            }
+        }
+    }
+}
+
+/// Writes the ten order-2 tetrahedron shape-function values for every point into
+/// values_out[node * output_stride + q].
+///
+/// With l1 = xi[0], l2 = xi[1], l3 = xi[2] and l0 = 1 - l1 - l2 - l3:
+///   rows 0..3: l_k * (2 * l_k - 1)
+///   row 4: 4 l0 l1   row 5: 4 l1 l2   row 6: 4 l0 l2
+///   row 7: 4 l0 l3   row 8: 4 l1 l3   row 9: 4 l2 l3
+void evaluate_tet_order2_values_strided(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr std::size_t kNodes = 10u;
+    Real* rows[kNodes];
+    for (std::size_t node = 0; node < kNodes; ++node) {
+        rows[node] = values_out + node * output_stride;
+    }
+
+    const auto corner = [](Real l) { return l * (Real(2) * l - Real(1)); };
+    const auto store = [&rows, &corner](std::size_t q, Real l0, Real l1, Real l2, Real l3) {
+        rows[0][q] = corner(l0);
+        rows[1][q] = corner(l1);
+        rows[2][q] = corner(l2);
+        rows[3][q] = corner(l3);
+        rows[4][q] = Real(4) * l0 * l1;
+        rows[5][q] = Real(4) * l1 * l2;
+        rows[6][q] = Real(4) * l0 * l2;
+        rows[7][q] = Real(4) * l0 * l3;
+        rows[8][q] = Real(4) * l1 * l3;
+        rows[9][q] = Real(4) * l2 * l3;
+    };
+    const auto store_point = [&points, &store](std::size_t q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        store(q, Real(1) - l1 - l2 - l3, l1, l2, l3);
+    };
+
+    const std::size_t num_points = points.size();
+
+    // Dense four-point case: gather the barycentric coordinates, then store.
+    if (num_points == 4u && output_stride == 4u) {
+        Real bary[4][4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            bary[1][q] = xi[0];
+            bary[2][q] = xi[1];
+            bary[3][q] = xi[2];
+            bary[0][q] = Real(1) - bary[1][q] - bary[2][q] - bary[3][q];
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            store(q, bary[0][q], bary[1][q], bary[2][q], bary[3][q]);
+        }
+        return;
+    }
+
+    // Four points with a different stride: fixed trip count.
+    if (num_points == 4u) {
+        for (std::size_t q = 0; q < 4u; ++q) {
+            store_point(q);
+        }
+        return;
+    }
+
+    for (std::size_t q = 0; q < num_points; ++q) {
+        store_point(q);
+    }
+}
+
+inline void write_tet_order2_gradient_q(Real* SVMP_RESTRICT row,  // Stores one gradient (gx, gy, gz) for point q; row is the base of the three component rows being written
+                                        std::size_t output_stride,  // offset in Reals between consecutive component rows
+                                        std::size_t q,  // column (point) index written inside each component row
+                                        Real gx,
+                                        Real gy,
+                                        Real gz) {
+    row[0u * output_stride + q] = gx;
+    row[1u * output_stride + q] = gy;
+    row[2u * output_stride + q] = gz;
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_gradients_strided(  // Fills gradients_out with 10 nodes x 3 components of per-point gradients built from l0..l3
+    const std::vector<math::Vector<Real, 3>>& points,  // one entry per point; components 0, 1, 2 are read as l1, l2, l3
+    std::size_t output_stride,  // offset in Reals between consecutive component rows; point q lands in column q of each row
+    Real* SVMP_RESTRICT gradients_out) {  // node n, component c, point q is written at (3 * n + c) * output_stride + q
+    Real* row0 = gradients_out + 0u * 3u * output_stride;  // rowN is the base of node N's block of three component rows
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+    Real* row9 = gradients_out + 9u * 3u * output_stride;
+
+    auto write_q = [&](std::size_t q) {  // computes and stores all 10 node gradients for point q
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;  // l0 depends on every coordinate, so d(l0)/d(xi[k]) is -1 for k = 0, 1, 2
+        const Real four = Real(4);
+        const Real g0 = Real(1) - four * l0;  // equals -(4 * l0 - 1): the derivative of l0 * (2 * l0 - 1) taken through l0
+
+        write_tet_order2_gradient_q(row0, output_stride, q, g0, g0, g0);
+        write_tet_order2_gradient_q(row1, output_stride, q, four * l1 - Real(1), Real(0), Real(0));  // rows 1-3: derivative of lk * (2 * lk - 1), nonzero only along its own coordinate
+        write_tet_order2_gradient_q(row2, output_stride, q, Real(0), four * l2 - Real(1), Real(0));
+        write_tet_order2_gradient_q(row3, output_stride, q, Real(0), Real(0), four * l3 - Real(1));
+        write_tet_order2_gradient_q(row4, output_stride, q, four * (l0 - l1), -four * l1, -four * l1);  // rows 4-9: partial derivatives of 4 * la * lb (row4 l0*l1, row5 l1*l2, row6 l0*l2, row7 l0*l3, row8 l1*l3, row9 l2*l3)
+        write_tet_order2_gradient_q(row5, output_stride, q, four * l2, four * l1, Real(0));
+        write_tet_order2_gradient_q(row6, output_stride, q, -four * l2, four * (l0 - l2), -four * l2);
+        write_tet_order2_gradient_q(row7, output_stride, q, -four * l3, -four * l3, four * (l0 - l3));
+        write_tet_order2_gradient_q(row8, output_stride, q, four * l3, Real(0), four * l1);
+        write_tet_order2_gradient_q(row9, output_stride, q, Real(0), four * l3, four * l2);
+    };
+
+    if (points.size() == 4u) {  // explicitly unrolled path for exactly four points; stores the same results as the loop below
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {  // general path; performs no writes when points is empty
+        write_q(q);
+    }
+}
+
+inline void fill_simplex_order3_factor_values(Real lambda, Real* SVMP_RESTRICT phi) {  // Writes phi[0..3] as functions of t = 3 * lambda; phi must have room for four Reals
+    const Real t = Real(3) * lambda;
+    phi[0] = Real(1);
+    phi[1] = t;
+    phi[2] = phi[1] * (t - Real(1)) * Real(0.5);  // t * (t - 1) / 2
+    phi[3] = phi[2] * (t - Real(2)) / Real(3);  // t * (t - 1) * (t - 2) / 6
+}
+
+void evaluate_tet_order3_values_strided(  // Writes 20 value rows per point, each a product of factors produced by fill_simplex_order3_factor_values
+    const std::vector<math::Vector<Real, 3>>& points,  // components 0, 1, 2 of each point are read as l1, l2, l3
+    std::size_t output_stride,  // offset in Reals between consecutive node rows
+    Real* SVMP_RESTRICT values_out) {  // node n, point q is written at n * output_stride + q
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+    Real* row16 = values_out + 16u * output_stride;
+    Real* row17 = values_out + 17u * output_stride;
+    Real* row18 = values_out + 18u * output_stride;
+    Real* row19 = values_out + 19u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        Real p0[4];  // pK[d] is the index-d factor for coordinate lK: pK[0] = 1, pK[1] = 3 * lK, pK[2] and pK[3] the higher products
+        Real p1[4];
+        Real p2[4];
+        Real p3[4];
+        fill_simplex_order3_factor_values(l0, p0);
+        fill_simplex_order3_factor_values(l1, p1);
+        fill_simplex_order3_factor_values(l2, p2);
+        fill_simplex_order3_factor_values(l3, p3);
+
+        row0[q] = p0[3];  // rows 0-3: the index-3 factor of a single coordinate
+        row1[q] = p1[3];
+        row2[q] = p2[3];
+        row3[q] = p3[3];
+        row4[q] = p0[2] * p1[1];  // rows 4-15: one index-2 factor times one index-1 factor, two rows for each pair of coordinates
+        row5[q] = p0[1] * p1[2];
+        row6[q] = p1[2] * p2[1];
+        row7[q] = p1[1] * p2[2];
+        row8[q] = p0[1] * p2[2];  // for the (l0, l2) pair the row weighted toward l2 comes first, unlike the (l0, l1) and (l0, l3) pairs
+        row9[q] = p0[2] * p2[1];
+        row10[q] = p0[2] * p3[1];
+        row11[q] = p0[1] * p3[2];
+        row12[q] = p1[2] * p3[1];
+        row13[q] = p1[1] * p3[2];
+        row14[q] = p2[2] * p3[1];
+        row15[q] = p2[1] * p3[2];
+        row16[q] = p0[1] * p1[1] * p2[1];  // rows 16-19: product of three index-1 factors, one row per coordinate triple
+        row17[q] = p0[1] * p1[1] * p3[1];
+        row18[q] = p1[1] * p2[1] * p3[1];
+        row19[q] = p0[1] * p2[1] * p3[1];
+    }
+}
+
+void evaluate_triangle_order3_gradients_strided(  // Writes 10 nodes x 3 gradient components per point; the third component is always stored as zero
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 are read (as l1 and l2); component 2 is ignored
+    std::size_t output_stride,  // offset in Reals between consecutive component rows
+    Real* SVMP_RESTRICT gradients_out) {  // node n, component c, point q is written at (3 * n + c) * output_stride + q
+    Real* rows[10] = {  // rows[n] is the base of node n's block of three component rows
+        gradients_out + 0u * 3u * output_stride,
+        gradients_out + 1u * 3u * output_stride,
+        gradients_out + 2u * 3u * output_stride,
+        gradients_out + 3u * 3u * output_stride,
+        gradients_out + 4u * 3u * output_stride,
+        gradients_out + 5u * 3u * output_stride,
+        gradients_out + 6u * 3u * output_stride,
+        gradients_out + 7u * 3u * output_stride,
+        gradients_out + 8u * 3u * output_stride,
+        gradients_out + 9u * 3u * output_stride,
+    };
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        const Real p10 = Real(3) * l0;  // p1k = 3 * lk (degree-1 factor of coordinate lk)
+        const Real p11 = Real(3) * l1;
+        const Real p12 = Real(3) * l2;
+        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));  // p2k = p1k * (p1k - 1) / 2 (degree-2 factor)
+        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
+        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
+        const Real d10 = Real(3);  // d1k = d(p1k)/d(lk)
+        const Real d11 = Real(3);
+        const Real d12 = Real(3);
+        const Real d20 = Real(3) * p10 - Real(1.5);  // d2k = d(p2k)/d(lk) = 0.5 * (2 * p1k - 1) * 3
+        const Real d21 = Real(3) * p11 - Real(1.5);
+        const Real d22 = Real(3) * p12 - Real(1.5);
+        const Real d30 = Real(1.5) * p10 * p10 - Real(3) * p10 + Real(1);  // d3k = d/d(lk) of t * (t - 1) * (t - 2) / 6 with t = p1k
+        const Real d31 = Real(1.5) * p11 * p11 - Real(3) * p11 + Real(1);
+        const Real d32 = Real(1.5) * p12 * p12 - Real(3) * p12 + Real(1);
+
+        const Real dl0[10] = {  // per-node partial derivative with respect to l0, treating l0, l1, l2 as independent
+            d30,
+            Real(0),
+            Real(0),
+            d20 * p11,
+            d10 * p21,
+            Real(0),
+            Real(0),
+            d10 * p22,
+            d20 * p12,
+            d10 * p11 * p12,
+        };
+        const Real dl1[10] = {  // per-node partial derivative with respect to l1
+            Real(0),
+            d31,
+            Real(0),
+            p20 * d11,
+            p10 * d21,
+            d21 * p12,
+            d11 * p22,
+            Real(0),
+            Real(0),
+            p10 * d11 * p12,
+        };
+        const Real dl2[10] = {  // per-node partial derivative with respect to l2
+            Real(0),
+            Real(0),
+            d32,
+            Real(0),
+            Real(0),
+            p21 * d12,
+            p11 * d22,
+            p10 * d22,
+            p20 * d12,
+            p10 * p11 * d12,
+        };
+
+        for (std::size_t node = 0; node < 10u; ++node) {
+            Real* g = rows[node];
+            g[0u * output_stride + q] = dl1[node] - dl0[node];  // chain rule: l0 = 1 - l1 - l2, so d(l0)/d(xi[0]) = -1
+            g[1u * output_stride + q] = dl2[node] - dl0[node];  // likewise d(l0)/d(xi[1]) = -1
+            g[2u * output_stride + q] = Real(0);
+        }
+    }
+}
+
+void evaluate_hex_order1_values_strided(  // Writes 8 value rows per point, each a product of one x, one y and one z linear factor
+    const std::vector<math::Vector<Real, 3>>& points,  // all three components of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive node rows
+    Real* SVMP_RESTRICT values_out) {  // node n, point q is written at n * output_stride + q
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+
+    const auto write_q = [&](std::size_t q) {  // computes and stores all 8 node values for point q
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);  // l* factors are (1 - c) / 2, u* factors are (1 + c) / 2
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real lz = (Real(1) - xi[2]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        const Real uz = (Real(1) + xi[2]) * Real(0.5);
+        const Real lxly = lx * ly;  // the four x-y products are shared between the lz rows (0-3) and the uz rows (4-7)
+        const Real uxly = ux * ly;
+        const Real uxuy = ux * uy;
+        const Real lxuy = lx * uy;
+        row0[q] = lxly * lz;
+        row1[q] = uxly * lz;
+        row2[q] = uxuy * lz;
+        row3[q] = lxuy * lz;
+        row4[q] = lxly * uz;
+        row5[q] = uxly * uz;
+        row6[q] = uxuy * uz;
+        row7[q] = lxuy * uz;
+    };
+    if (points.size() == 4u) {  // explicitly unrolled path for exactly four points; stores the same results as the loop below
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        write_q(q);
+    }
+}
+
+template <bool NeedValues, bool NeedGradients, bool NeedHessians>  // compile-time switches: an output buffer is only dereferenced when its flag is true
+void evaluate_hex_order1_outputs_strided(  // Writes any combination of values, gradients and Hessians for 8 nodes at every point
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,  // offset in Reals between consecutive rows in all three output buffers
+    Real* SVMP_RESTRICT values_out,  // node n, point q at n * output_stride + q
+    Real* SVMP_RESTRICT gradients_out,  // node n, component c, point q at (3 * n + c) * output_stride + q
+    Real* SVMP_RESTRICT hessians_out) {  // node n, entry e in 0..8, point q at (9 * n + e) * output_stride + q
+    constexpr Real half = Real(0.5);
+    constexpr std::array<Real, 8> dx{{-half, half, half, -half, -half, half, half, -half}};  // per-node derivative of the x factor: -0.5 where xval uses lx, +0.5 where it uses ux
+    constexpr std::array<Real, 8> dy{{-half, -half, half, half, -half, -half, half, half}};
+    constexpr std::array<Real, 8> dz{{-half, -half, -half, -half, half, half, half, half}};
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * half;
+        const Real ly = (Real(1) - xi[1]) * half;
+        const Real lz = (Real(1) - xi[2]) * half;
+        const Real ux = (Real(1) + xi[0]) * half;
+        const Real uy = (Real(1) + xi[1]) * half;
+        const Real uz = (Real(1) + xi[2]) * half;
+        const Real xval[8] = {lx, ux, ux, lx, lx, ux, ux, lx};  // per-node choice of lower/upper factor, in the same node order as dx/dy/dz
+        const Real yval[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
+        const Real zval[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
+
+        for (std::size_t node = 0; node < 8u; ++node) {
+            if constexpr (NeedValues) {
+                values_out[node * output_stride + q] =
+                    xval[node] * yval[node] * zval[node];
+            }
+            if constexpr (NeedGradients) {
+                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dx[node] * yval[node] * zval[node];
+                g[1u * output_stride + q] = xval[node] * dy[node] * zval[node];
+                g[2u * output_stride + q] = xval[node] * yval[node] * dz[node];
+            }
+            if constexpr (NeedHessians) {
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                const Real hxy = dx[node] * dy[node] * zval[node];  // each factor is linear in its own coordinate, so only mixed second derivatives are nonzero
+                const Real hxz = dx[node] * yval[node] * dz[node];
+                const Real hyz = xval[node] * dy[node] * dz[node];
+                H[0u * output_stride + q] = Real(0);  // entries 0, 4, 8 (the 3x3 diagonal) are stored as zero
+                H[1u * output_stride + q] = hxy;  // symmetric fill: entries 1/3 share hxy, 2/6 share hxz, 5/7 share hyz
+                H[2u * output_stride + q] = hxz;
+                H[3u * output_stride + q] = hxy;
+                H[4u * output_stride + q] = Real(0);
+                H[5u * output_stride + q] = hyz;
+                H[6u * output_stride + q] = hxz;
+                H[7u * output_stride + q] = hyz;
+                H[8u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+
+void evaluate_quad_order1_values_strided(  // Writes 4 value rows per point, each a product of one x and one y linear factor
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive node rows
+    Real* SVMP_RESTRICT values_out) {  // node n, point q is written at n * output_stride + q
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+
+    if (points.size() == 4u && output_stride == 4u) {  // fixed-size path, taken only when both the point count and the stride are 4
+        Real lx[4];  // factors are staged per point first, then the 16 products are written with constant indices
+        Real ux[4];
+        Real ly[4];
+        Real uy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            lx[q] = (Real(1) - xi[0]) * Real(0.5);
+            ux[q] = (Real(1) + xi[0]) * Real(0.5);
+            ly[q] = (Real(1) - xi[1]) * Real(0.5);
+            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+        }
+        row0[0] = lx[0] * ly[0];
+        row0[1] = lx[1] * ly[1];
+        row0[2] = lx[2] * ly[2];
+        row0[3] = lx[3] * ly[3];
+        row1[0] = ux[0] * ly[0];
+        row1[1] = ux[1] * ly[1];
+        row1[2] = ux[2] * ly[2];
+        row1[3] = ux[3] * ly[3];
+        row2[0] = ux[0] * uy[0];
+        row2[1] = ux[1] * uy[1];
+        row2[2] = ux[2] * uy[2];
+        row2[3] = ux[3] * uy[3];
+        row3[0] = lx[0] * uy[0];
+        row3[1] = lx[1] * uy[1];
+        row3[2] = lx[2] * uy[2];
+        row3[3] = lx[3] * uy[3];
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {  // general path; uses the same factor expressions and products as the fixed-size path
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        row0[q] = lx * ly;
+        row1[q] = ux * ly;
+        row2[q] = ux * uy;
+        row3[q] = lx * uy;
+    }
+}
+
+void evaluate_quad_order1_gradients_strided(  // Writes 4 nodes x 3 gradient components per point; the third component is always stored as zero
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive component rows
+    Real* SVMP_RESTRICT gradients_out) {  // node n, component c, point q is written at (3 * n + c) * output_stride + q
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+
+    if (points.size() == 4u) {  // fixed-size path for exactly four points; unlike the values routine it does not test output_stride
+        Real lx[4];
+        Real ly[4];
+        Real ux[4];
+        Real uy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            lx[q] = (Real(1) - xi[0]) * Real(0.5);
+            ly[q] = (Real(1) - xi[1]) * Real(0.5);
+            ux[q] = (Real(1) + xi[0]) * Real(0.5);
+            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+        }
+
+        auto write_component = [](Real* SVMP_RESTRICT row,  // stores four consecutive Reals (one per point) starting at row
+                                  Real a0,
+                                  Real a1,
+                                  Real a2,
+                                  Real a3) {
+            row[0] = a0;
+            row[1] = a1;
+            row[2] = a2;
+            row[3] = a3;
+        };
+
+        write_component(row0, Real(-0.5) * ly[0], Real(-0.5) * ly[1],  // node 0 (lx * ly): x-derivative is -0.5 * ly
+                        Real(-0.5) * ly[2], Real(-0.5) * ly[3]);
+        write_component(row0 + output_stride, Real(-0.5) * lx[0], Real(-0.5) * lx[1],
+                        Real(-0.5) * lx[2], Real(-0.5) * lx[3]);
+        write_component(row0 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row1, Real(0.5) * ly[0], Real(0.5) * ly[1],  // node 1 (ux * ly)
+                        Real(0.5) * ly[2], Real(0.5) * ly[3]);
+        write_component(row1 + output_stride, Real(-0.5) * ux[0], Real(-0.5) * ux[1],
+                        Real(-0.5) * ux[2], Real(-0.5) * ux[3]);
+        write_component(row1 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row2, Real(0.5) * uy[0], Real(0.5) * uy[1],  // node 2 (ux * uy)
+                        Real(0.5) * uy[2], Real(0.5) * uy[3]);
+        write_component(row2 + output_stride, Real(0.5) * ux[0], Real(0.5) * ux[1],
+                        Real(0.5) * ux[2], Real(0.5) * ux[3]);
+        write_component(row2 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+
+        write_component(row3, Real(-0.5) * uy[0], Real(-0.5) * uy[1],  // node 3 (lx * uy)
+                        Real(-0.5) * uy[2], Real(-0.5) * uy[3]);
+        write_component(row3 + output_stride, Real(0.5) * lx[0], Real(0.5) * lx[1],
+                        Real(0.5) * lx[2], Real(0.5) * lx[3]);
+        write_component(row3 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
+        return;
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {  // general path; stores the same expressions as the fixed-size path, one point at a time
+        const auto& xi = points[q];
+        const Real lx = (Real(1) - xi[0]) * Real(0.5);
+        const Real ly = (Real(1) - xi[1]) * Real(0.5);
+        const Real ux = (Real(1) + xi[0]) * Real(0.5);
+        const Real uy = (Real(1) + xi[1]) * Real(0.5);
+        row0[0u * output_stride + q] = Real(-0.5) * ly;
+        row0[1u * output_stride + q] = Real(-0.5) * lx;
+        row0[2u * output_stride + q] = Real(0);
+        row1[0u * output_stride + q] = Real( 0.5) * ly;
+        row1[1u * output_stride + q] = Real(-0.5) * ux;
+        row1[2u * output_stride + q] = Real(0);
+        row2[0u * output_stride + q] = Real( 0.5) * uy;
+        row2[1u * output_stride + q] = Real( 0.5) * ux;
+        row2[2u * output_stride + q] = Real(0);
+        row3[0u * output_stride + q] = Real(-0.5) * uy;
+        row3[1u * output_stride + q] = Real( 0.5) * lx;
+        row3[2u * output_stride + q] = Real(0);
+    }
+}
+
+inline void write_quad_order1_hessian_q(  // Stores a 9-entry Hessian for point q whose only nonzero entries are 1 and 3, both set to xy
+    Real* SVMP_RESTRICT row,  // base of the nine entry rows being written
+    std::size_t output_stride,  // offset in Reals between consecutive entry rows
+    std::size_t q,  // column (point) index written inside each entry row
+    Real xy) {  // value of the mixed entry
+    row[0u * output_stride + q] = Real(0);
+    row[1u * output_stride + q] = xy;
+    row[2u * output_stride + q] = Real(0);
+    row[3u * output_stride + q] = xy;
+    row[4u * output_stride + q] = Real(0);
+    row[5u * output_stride + q] = Real(0);
+    row[6u * output_stride + q] = Real(0);
+    row[7u * output_stride + q] = Real(0);
+    row[8u * output_stride + q] = Real(0);
+}
+
+void evaluate_quad_order1_hessians_strided(  // Writes constant Hessians for 4 nodes at num_qpts points; no point coordinates are needed
+    std::size_t num_qpts,  // number of columns filled in every entry row
+    std::size_t output_stride,  // offset in Reals between consecutive entry rows
+    Real* SVMP_RESTRICT hessians_out) {  // node n, entry e in 0..8, point q is written at (9 * n + e) * output_stride + q
+    Real* row0 = hessians_out + 0u * 9u * output_stride;
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+
+    constexpr Real positive = Real(0.25);  // product of two factor derivatives of equal sign (each is +/-0.5)
+    constexpr Real negative = Real(-0.25);  // product of two factor derivatives of opposite sign
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        write_quad_order1_hessian_q(row0, output_stride, q, positive);
+        write_quad_order1_hessian_q(row1, output_stride, q, negative);
+        write_quad_order1_hessian_q(row2, output_stride, q, positive);
+        write_quad_order1_hessian_q(row3, output_stride, q, negative);
+    }
+}
+
+template <std::size_t Q>  // Q is the compile-time point index (column) being written
+inline void write_quad_order1_all_q4(  // Stores value, gradient and Hessian of one node at point Q from precomputed axis factors
+    std::size_t output_stride,  // offset in Reals between consecutive rows of grad_row and hess_row
+    std::size_t i,  // selects the x factor lx[Q][i]; 0 also selects derivative -0.5, anything else +0.5
+    std::size_t j,  // selects the y factor ly[Q][j] and its derivative in the same way
+    const Real lx[4][2],
+    const Real ly[4][2],
+    Real* SVMP_RESTRICT value_row,  // this node's value row; only element Q is written
+    Real* SVMP_RESTRICT grad_row,  // base of this node's three gradient component rows
+    Real* SVMP_RESTRICT hess_row) {  // base of this node's nine Hessian entry rows
+    const Real xv = lx[Q][i];
+    const Real yv = ly[Q][j];
+    const Real xd = (i == 0u) ? Real(-0.5) : Real(0.5);
+    const Real yd = (j == 0u) ? Real(-0.5) : Real(0.5);
+    const Real hxy = xd * yd;
+
+    value_row[Q] = xv * yv;
+    grad_row[0u * output_stride + Q] = xd * yv;
+    grad_row[1u * output_stride + Q] = xv * yd;
+    grad_row[2u * output_stride + Q] = Real(0);
+    hess_row[0u * output_stride + Q] = Real(0);  // all nine entries are written; only entries 1 and 3 (hxy) are nonzero
+    hess_row[4u * output_stride + Q] = Real(0);
+    hess_row[8u * output_stride + Q] = Real(0);
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = Real(0);
+    hess_row[6u * output_stride + Q] = Real(0);
+    hess_row[5u * output_stride + Q] = Real(0);
+    hess_row[7u * output_stride + Q] = Real(0);
+}
+
+void evaluate_quad_order1_all_q4(  // Writes values, gradients and Hessians for every node at exactly four points in one pass
+    const std::vector<math::Vector<Real, 3>>& points,  // points[0..3] are read without a size check, so the caller must supply at least four
+    std::size_t output_stride,  // offset in Reals between consecutive rows in all three output buffers
+    Real* SVMP_RESTRICT values_out,  // written unconditionally: node n, point q at n * output_stride + q
+    Real* SVMP_RESTRICT gradients_out,  // written unconditionally: node n at offset n * 3 * output_stride
+    Real* SVMP_RESTRICT hessians_out) {  // written unconditionally: node n at offset n * 9 * output_stride
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<1>();  // NOTE(review): defined outside this view; presumably one (i, j) factor-index pair per node, confirm
+
+    Real lx[4][2];  // lx[q][0] = (1 - x) / 2 and lx[q][1] = (1 + x) / 2 for point q; ly likewise for y
+    Real ly[4][2];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        lx[q][0] = (Real(1) - xi[0]) * Real(0.5);
+        lx[q][1] = (Real(1) + xi[0]) * Real(0.5);
+        ly[q][0] = (Real(1) - xi[1]) * Real(0.5);
+        ly[q][1] = (Real(1) + xi[1]) * Real(0.5);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        const std::size_t i = axes[0];
+        const std::size_t j = axes[1];
+        Real* value_row = values_out + node * output_stride;
+        Real* grad_row = gradients_out + node * 3u * output_stride;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_quad_order1_all_q4<0u>(  // one call per point, with the point index passed as a template argument
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<1u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<2u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+        write_quad_order1_all_q4<3u>(
+            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+    }
+}
+
+void evaluate_quad_order2_values_strided(  // Writes 9 value rows per point, each a product of one quadratic x factor and one quadratic y factor
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive node rows
+    Real* SVMP_RESTRICT values_out) {  // node n, point q is written at n * output_stride + q
+    Real* row0 = values_out + 0u * output_stride;
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);  // equals 1 at x = -1 and 0 at x = 0, +1
+        const Real x1 = x * (x + Real(1)) * Real(0.5);  // equals 1 at x = +1 and 0 at x = -1, 0
+        const Real x2 = Real(1) - x * x;  // equals 1 at x = 0 and 0 at x = -1, +1
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+
+        row0[q] = x0 * y0;  // rows 0-3 combine only the index-0/1 factors
+        row1[q] = x1 * y0;
+        row2[q] = x1 * y1;
+        row3[q] = x0 * y1;
+        row4[q] = x2 * y0;  // rows 4-7 use exactly one index-2 factor
+        row5[q] = x1 * y2;
+        row6[q] = x2 * y1;
+        row7[q] = x0 * y2;
+        row8[q] = x2 * y2;  // row 8 uses the index-2 factor on both axes
+    }
+}
+
+inline void write_quad_order2_gradient_q(  // Stores one gradient (dx, dy, 0) for point q
+    Real* SVMP_RESTRICT row,  // base of the three component rows being written
+    std::size_t output_stride,  // offset in Reals between consecutive component rows
+    std::size_t q,  // column (point) index written inside each component row
+    Real dx,
+    Real dy) {
+    row[0u * output_stride + q] = dx;
+    row[1u * output_stride + q] = dy;
+    row[2u * output_stride + q] = Real(0);  // third component is always stored as zero
+}
+
+void evaluate_quad_order2_gradients_strided(  // Writes 9 nodes x 3 gradient components per point; the third component is always stored as zero
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive component rows
+    Real* SVMP_RESTRICT gradients_out) {  // node n, component c, point q is written at (3 * n + c) * output_stride + q
+    if (points.size() == 4u) {  // fixed-size path for exactly four points: factors are tabulated first, then written per node
+        Real xv[4][3];  // xv[q][k] / yv[q][k]: value of axis factor k at point q
+        Real yv[4][3];
+        Real xd[4][3];  // xd[q][k] / yd[q][k]: derivative of axis factor k at point q
+        Real yd[4][3];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const auto& xi = points[q];
+            const Real x = xi[0];
+            const Real y = xi[1];
+            xv[q][0] = x * (x - Real(1)) * Real(0.5);
+            xv[q][1] = x * (x + Real(1)) * Real(0.5);
+            xv[q][2] = Real(1) - x * x;
+            yv[q][0] = y * (y - Real(1)) * Real(0.5);
+            yv[q][1] = y * (y + Real(1)) * Real(0.5);
+            yv[q][2] = Real(1) - y * y;
+            xd[q][0] = x - Real(0.5);
+            xd[q][1] = x + Real(0.5);
+            xd[q][2] = Real(-2) * x;
+            yd[q][0] = y - Real(0.5);
+            yd[q][1] = y + Real(0.5);
+            yd[q][2] = Real(-2) * y;
+        }
+
+        auto write_node = [&](std::size_t node, std::size_t i, std::size_t j) {  // writes the gradient of (x factor i) * (y factor j) for all four points
+            Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
+            row[0u] = xd[0][i] * yv[0][j];
+            row[1u] = xd[1][i] * yv[1][j];
+            row[2u] = xd[2][i] * yv[2][j];
+            row[3u] = xd[3][i] * yv[3][j];
+            row[output_stride + 0u] = xv[0][i] * yd[0][j];
+            row[output_stride + 1u] = xv[1][i] * yd[1][j];
+            row[output_stride + 2u] = xv[2][i] * yd[2][j];
+            row[output_stride + 3u] = xv[3][i] * yd[3][j];
+            row[2u * output_stride + 0u] = Real(0);
+            row[2u * output_stride + 1u] = Real(0);
+            row[2u * output_stride + 2u] = Real(0);
+            row[2u * output_stride + 3u] = Real(0);
+        };
+
+        write_node(0u, 0u, 0u);  // the (i, j) pairs pick the same factor combinations as rows 0-8 of the general path below
+        write_node(1u, 1u, 0u);
+        write_node(2u, 1u, 1u);
+        write_node(3u, 0u, 1u);
+        write_node(4u, 2u, 0u);
+        write_node(5u, 1u, 2u);
+        write_node(6u, 2u, 1u);
+        write_node(7u, 0u, 2u);
+        write_node(8u, 2u, 2u);
+        return;
+    }
+
+    Real* row0 = gradients_out + 0u * 3u * output_stride;
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {  // general path, one point at a time
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real dx0 = x - Real(0.5);  // dxk / dyk are the first derivatives of xk / yk
+        const Real dx1 = x + Real(0.5);
+        const Real dx2 = Real(-2) * x;
+        const Real dy0 = y - Real(0.5);
+        const Real dy1 = y + Real(0.5);
+        const Real dy2 = Real(-2) * y;
+
+        write_quad_order2_gradient_q(row0, output_stride, q, dx0 * y0, x0 * dy0);
+        write_quad_order2_gradient_q(row1, output_stride, q, dx1 * y0, x1 * dy0);
+        write_quad_order2_gradient_q(row2, output_stride, q, dx1 * y1, x1 * dy1);
+        write_quad_order2_gradient_q(row3, output_stride, q, dx0 * y1, x0 * dy1);
+        write_quad_order2_gradient_q(row4, output_stride, q, dx2 * y0, x2 * dy0);
+        write_quad_order2_gradient_q(row5, output_stride, q, dx1 * y2, x1 * dy2);
+        write_quad_order2_gradient_q(row6, output_stride, q, dx2 * y1, x2 * dy1);
+        write_quad_order2_gradient_q(row7, output_stride, q, dx0 * y2, x0 * dy2);
+        write_quad_order2_gradient_q(row8, output_stride, q, dx2 * y2, x2 * dy2);
+    }
+}
+
+inline void write_quad_order2_hessian_q(  // Stores a 9-entry Hessian for point q with a 2x2 nonzero part (entries 0, 1, 3, 4)
+    Real* SVMP_RESTRICT row,  // base of the nine entry rows being written
+    std::size_t output_stride,  // offset in Reals between consecutive entry rows
+    std::size_t q,  // column (point) index written inside each entry row
+    Real hxx,
+    Real hxy,
+    Real hyy) {
+    row[0u * output_stride + q] = hxx;
+    row[1u * output_stride + q] = hxy;  // the mixed term is stored twice, in entries 1 and 3
+    row[2u * output_stride + q] = Real(0);
+    row[3u * output_stride + q] = hxy;
+    row[4u * output_stride + q] = hyy;
+    row[5u * output_stride + q] = Real(0);  // entries 2 and 5-8 are always stored as zero
+    row[6u * output_stride + q] = Real(0);
+    row[7u * output_stride + q] = Real(0);
+    row[8u * output_stride + q] = Real(0);
+}
+
+void evaluate_quad_order2_hessians_strided(  // Writes 9 nodes x 9 Hessian entries per point from the quadratic axis factors and their derivatives
+    const std::vector<math::Vector<Real, 3>>& points,  // only components 0 and 1 of each point are read
+    std::size_t output_stride,  // offset in Reals between consecutive entry rows
+    Real* SVMP_RESTRICT hessians_out) {  // node n, entry e in 0..8, point q is written at (9 * n + e) * output_stride + q
+    Real* row0 = hessians_out + 0u * 9u * output_stride;
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+    Real* row4 = hessians_out + 4u * 9u * output_stride;
+    Real* row5 = hessians_out + 5u * 9u * output_stride;
+    Real* row6 = hessians_out + 6u * 9u * output_stride;
+    Real* row7 = hessians_out + 7u * 9u * output_stride;
+    Real* row8 = hessians_out + 8u * 9u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);
+        const Real x1 = x * (x + Real(1)) * Real(0.5);
+        const Real x2 = Real(1) - x * x;
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real dx0 = x - Real(0.5);
+        const Real dx1 = x + Real(0.5);
+        const Real dx2 = Real(-2) * x;
+        const Real dy0 = y - Real(0.5);
+        const Real dy1 = y + Real(0.5);
+        const Real dy2 = Real(-2) * y;
+
+        write_quad_order2_hessian_q(row0, output_stride, q, y0, dx0 * dy0, x0);  // second derivative of the index-0 and index-1 factors is 1, so hxx is the y factor and hyy the x factor
+        write_quad_order2_hessian_q(row1, output_stride, q, y0, dx1 * dy0, x1);
+        write_quad_order2_hessian_q(row2, output_stride, q, y1, dx1 * dy1, x1);
+        write_quad_order2_hessian_q(row3, output_stride, q, y1, dx0 * dy1, x0);
+        write_quad_order2_hessian_q(row4, output_stride, q, Real(-2) * y0, dx2 * dy0, x2);  // second derivative of the index-2 factor (1 - c * c) is -2, hence the Real(-2) scale
+        write_quad_order2_hessian_q(row5, output_stride, q, y2, dx1 * dy2, Real(-2) * x1);
+        write_quad_order2_hessian_q(row6, output_stride, q, Real(-2) * y1, dx2 * dy1, x2);
+        write_quad_order2_hessian_q(row7, output_stride, q, y2, dx0 * dy2, Real(-2) * x0);
+        write_quad_order2_hessian_q(row8, output_stride, q, Real(-2) * y2, dx2 * dy2, Real(-2) * x2);
+    }
+}
+
+inline void fill_order3_axis_values(Real x, Real* SVMP_RESTRICT values) {  // Writes four cubic polynomials in x to values[0..3]; values must have room for four Reals
+    const Real x2 = x * x;
+    values[0] = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));  // equals 1 at x = -1; zero at x = +1 and x = +/-1/3
+    values[1] = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));  // equals 1 at x = +1; zero at x = -1 and x = +/-1/3
+    values[2] = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));  // equals 1 at x = -1/3; zero at x = +/-1 and x = +1/3
+    values[3] = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));  // equals 1 at x = +1/3; zero at x = +/-1 and x = -1/3
+}
+
+// Same cubic 1D Lagrange values as the array-based helper, delivered through
+// four scalar references: v0/v1 equal 1 at x = -1/+1, v2/v3 at x = -1/3 and +1/3.
+inline void fill_order3_axis_value_scalars(Real x, Real& v0, Real& v1, Real& v2, Real& v3) {
+    const Real sq = x * x;
+    const Real edge = sq - Real(1.0 / 9.0);  // vanishes at the two interior nodes
+    const Real mid = sq - Real(1);           // vanishes at the two end nodes
+    v0 = Real(-9.0 / 16.0) * (x - Real(1)) * edge;
+    v1 = Real( 9.0 / 16.0) * (x + Real(1)) * edge;
+    v2 = Real(27.0 / 16.0) * mid * (x - Real(1.0 / 3.0));
+    v3 = Real(-27.0 / 16.0) * mid * (x + Real(1.0 / 3.0));
+}
+
+// Linear line element at the first four points: values_out[n * output_stride + q]
+// is (1 - x) / 2 for node n = 0 and (1 + x) / 2 for n = 1, where x = points[q][0].
+void evaluate_line_order1_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const Real xi = points[pt][0];
+        values_out[pt] = (Real(1) - xi) * Real(0.5);
+        values_out[output_stride + pt] = (Real(1) + xi) * Real(0.5);
+    }
+}
+
+// Quadratic line element at the first four points: row n of values_out
+// (offset n * output_stride) holds shape function n; node order is -1, +1, 0.
+void evaluate_line_order2_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* const mid_row = values_out + 2u * output_stride;
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const Real xi = points[pt][0];
+        values_out[pt] = xi * (xi - Real(1)) * Real(0.5);
+        values_out[output_stride + pt] = xi * (xi + Real(1)) * Real(0.5);
+        mid_row[pt] = Real(1) - xi * xi;
+    }
+}
+
+// Cubic line element at the first four points. The four nodal values of
+// each point come from fill_order3_axis_values and are scattered into
+// node-major storage: values_out[n * output_stride + q].
+void evaluate_line_order3_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        Real shape[4];
+        fill_order3_axis_values(points[pt][0], shape);
+        Real* const column = values_out + pt;  // entry of point pt within row 0
+        // Consecutive nodes of the same point are output_stride entries apart.
+        for (std::size_t node = 0; node < 4u; ++node) {
+            column[node * output_stride] = shape[node];
+        }
+    }
+}
+
+inline void fill_order3_axis_values_first(Real x,  // forward declaration: cubic 1D values + first derivatives
+                                          Real* SVMP_RESTRICT values,  // receives 4 values
+                                          Real* SVMP_RESTRICT first);  // receives 4 first derivatives
+
+inline void fill_order3_axis_values_first_second(Real x,  // forward declaration: also yields second derivatives
+                                                 Real* SVMP_RESTRICT values,
+                                                 Real* SVMP_RESTRICT first,
+                                                 Real* SVMP_RESTRICT second);  // receives 4 second derivatives
+
+// Writes one node's gradient block for a 1D element evaluated at four
+// points. Component 0 receives g0..g3; components 1 and 2, located
+// output_stride and 2 * output_stride entries further on, are set to zero.
+inline void write_line_gradient_q4_row(Real* SVMP_RESTRICT row,
+                                       std::size_t output_stride,
+                                       Real g0,
+                                       Real g1,
+                                       Real g2,
+                                       Real g3) {
+    const Real along_x[4] = {g0, g1, g2, g3};
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[pt] = along_x[pt];
+    }
+    // The remaining two components are always zero here.
+    for (std::size_t component = 1u; component < 3u; ++component) {
+        Real* const slot = row + component * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) slot[pt] = Real(0);
+    }
+}
+
+// Writes one node's nine-component Hessian block for a 1D element at four
+// points: component 0 receives h0..h3 and components 1..8 are cleared.
+inline void write_line_hessian_q4_row(Real* SVMP_RESTRICT row,
+                                      std::size_t output_stride,
+                                      Real h0,
+                                      Real h1,
+                                      Real h2,
+                                      Real h3) {
+    const Real second[4] = {h0, h1, h2, h3};
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[pt] = second[pt];
+    }
+    Real* slot = row;
+    for (std::size_t component = 1u; component < 9u; ++component) {
+        slot += output_stride;  // step to the next component slot
+        for (std::size_t pt = 0; pt < 4u; ++pt) slot[pt] = Real(0);
+    }
+}
+
+// Linear line gradients are constant: -1/2 for node 0 and +1/2 for node 1 at
+// each of the four points; each node owns 3 * output_stride entries.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_gradients_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    const Real lo = Real(-0.5);
+    const Real hi = Real(0.5);
+    write_line_gradient_q4_row(gradients_out, output_stride, lo, lo, lo, lo);
+    write_line_gradient_q4_row(gradients_out + 3u * output_stride, output_stride, hi, hi, hi, hi);
+}
+
+// Linear shapes have zero second derivatives: clears both nodes' Hessian blocks.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    const Real zero = Real(0);
+    write_line_hessian_q4_row(hessians_out, output_stride, zero, zero, zero, zero);
+    write_line_hessian_q4_row(hessians_out + 9u * output_stride, output_stride, zero, zero, zero, zero);
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_all_q4(  // fused linear-line evaluation at four points
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_line_order1_values_q4(points, output_stride, values_out);  // only the values read `points`
+    evaluate_line_order1_gradients_q4(output_stride, gradients_out);    // constant -1/2 and +1/2 slopes
+    evaluate_line_order1_hessians_q4(output_stride, hessians_out);      // all zeros
+}
+
+// Quadratic line gradients at the first four points (node order -1, +1, 0):
+// dN0/dx = x - 1/2, dN1/dx = x + 1/2, dN2/dx = -2x. Each node's block spans
+// 3 * output_stride entries; the row helper zeroes the unused components.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    const std::size_t node_span = 3u * output_stride;
+    const Real half = Real(0.5);
+    Real xs[4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        xs[pt] = points[pt][0];
+    }
+    write_line_gradient_q4_row(gradients_out, output_stride,
+                               xs[0] - half, xs[1] - half, xs[2] - half, xs[3] - half);
+    write_line_gradient_q4_row(gradients_out + node_span, output_stride,
+                               xs[0] + half, xs[1] + half, xs[2] + half, xs[3] + half);
+    write_line_gradient_q4_row(gradients_out + 2u * node_span, output_stride,
+                               Real(-2) * xs[0], Real(-2) * xs[1],
+                               Real(-2) * xs[2], Real(-2) * xs[3]);
+}
+
+// Quadratic line Hessians are constant: d2N/dx2 = 1, 1, -2 for nodes 0, 1, 2.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_hessians_q4(
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    const Real curvature[3] = {Real(1), Real(1), Real(-2)};
+    for (std::size_t node = 0; node < 3u; ++node) {
+        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride, output_stride,
+                                  curvature[node], curvature[node], curvature[node], curvature[node]);
+    }
+}
+
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_all_q4(  // fused quadratic-line evaluation at four points
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_line_order2_values_q4(points, output_stride, values_out);
+    evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
+    evaluate_line_order2_hessians_q4(output_stride, hessians_out);  // constant curvatures, so no points needed
+}
+
+// Cubic line gradients at the first four points. slope[q][n] holds dN_n/dx
+// at point q; the shape values produced alongside are not used here.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    Real slope[4][4];
+    Real discarded[4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        fill_order3_axis_values_first(points[pt][0], discarded, slope[pt]);
+    }
+    for (std::size_t n = 0; n < 4u; ++n) {
+        write_line_gradient_q4_row(gradients_out + n * 3u * output_stride, output_stride,
+                                   slope[0][n], slope[1][n], slope[2][n], slope[3][n]);
+    }
+}
+
+// Cubic line Hessians at the first four points. curvature[q][n] holds
+// d2N_n/dx2 at point q; values and first derivatives are unused by-products.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_hessians_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real curvature[4][4];
+    Real unused_val[4];
+    Real unused_der[4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        fill_order3_axis_values_first_second(points[pt][0], unused_val, unused_der, curvature[pt]);
+    }
+    for (std::size_t n = 0; n < 4u; ++n) {
+        write_line_hessian_q4_row(hessians_out + n * 9u * output_stride, output_stride,
+                                  curvature[0][n], curvature[1][n], curvature[2][n], curvature[3][n]);
+    }
+}
+
+// Fused cubic-line evaluation at the first four points. Per-point values,
+// first and second derivatives are computed once, then transposed into the
+// node-major value / gradient / Hessian buffers.
+SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    // The three tables are indexed [point][node].
+    Real shape[4][4];
+    Real slope[4][4];
+    Real curvature[4][4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        fill_order3_axis_values_first_second(points[pt][0], shape[pt], slope[pt], curvature[pt]);
+    }
+    // Per node: one value row, then its gradient block, then its Hessian block.
+    for (std::size_t n = 0; n < 4u; ++n) {
+        Real* const value_row = values_out + n * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            value_row[pt] = shape[pt][n];
+        }
+        write_line_gradient_q4_row(gradients_out + n * 3u * output_stride, output_stride,
+                                   slope[0][n], slope[1][n], slope[2][n], slope[3][n]);
+        write_line_hessian_q4_row(hessians_out + n * 9u * output_stride, output_stride,
+                                  curvature[0][n], curvature[1][n], curvature[2][n], curvature[3][n]);
+    }
+}
+
+inline void fill_order3_axis_values_first(Real x,  // cubic 1D values plus their d/dx
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first) {
+    fill_order3_axis_values(x, values);  // shape values come from the value-only helper
+    const Real sq = x * x, ninth = Real(1.0 / 9.0), one = Real(1);
+    first[0] = Real(-9.0 / 16.0) * (Real(3) * sq - Real(2) * x - ninth);         // d/dx of values[0]
+    first[1] = Real( 9.0 / 16.0) * (Real(3) * sq + Real(2) * x - ninth);         // d/dx of values[1]
+    first[2] = Real(27.0 / 16.0) * (Real(3) * sq - Real(2.0 / 3.0) * x - one);   // d/dx of values[2]
+    first[3] = Real(-27.0 / 16.0) * (Real(3) * sq + Real(2.0 / 3.0) * x - one);  // d/dx of values[3]
+}
+
+inline void fill_order3_axis_values_first_second(Real x, Real* SVMP_RESTRICT values,
+                                                 Real* SVMP_RESTRICT first,
+                                                 Real* SVMP_RESTRICT second) {
+    fill_order3_axis_values_first(x, values, first);  // values and slopes come from the helper
+    const Real two = Real(2), two_thirds = Real(2.0 / 3.0);  // constant terms of the linear curvatures
+    second[0] = Real(-9.0 / 16.0) * (Real(6) * x - two);          // d2/dx2 of values[0]
+    second[1] = Real( 9.0 / 16.0) * (Real(6) * x + two);          // d2/dx2 of values[1]
+    second[2] = Real(27.0 / 16.0) * (Real(6) * x - two_thirds);   // d2/dx2 of values[2]
+    second[3] = Real(-27.0 / 16.0) * (Real(6) * x + two_thirds);  // d2/dx2 of values[3]
+}
+
+// Writes one node's values at four points: row[q] = lx[q][i] * ly[q][j].
+inline void write_quad_order3_value_row_q4(Real* SVMP_RESTRICT row,
+                                           const Real lx[4][4],
+                                           const Real ly[4][4],
+                                           std::size_t i,
+                                           std::size_t j) {
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[pt] = lx[pt][i] * ly[pt][j];
+    }
+}
+
+void evaluate_quad_order3_values_q4(  // cubic quad shape values at the first four points
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (output_stride == 4u) {  // fast path: rows are exactly four entries apart, so offsets are literals
+        Real* row0 = values_out + 0u * 4u;
+        Real* row1 = values_out + 1u * 4u;
+        Real* row2 = values_out + 2u * 4u;
+        Real* row3 = values_out + 3u * 4u;
+        Real* row4 = values_out + 4u * 4u;
+        Real* row5 = values_out + 5u * 4u;
+        Real* row6 = values_out + 6u * 4u;
+        Real* row7 = values_out + 7u * 4u;
+        Real* row8 = values_out + 8u * 4u;
+        Real* row9 = values_out + 9u * 4u;
+        Real* row10 = values_out + 10u * 4u;
+        Real* row11 = values_out + 11u * 4u;
+        Real* row12 = values_out + 12u * 4u;
+        Real* row13 = values_out + 13u * 4u;
+        Real* row14 = values_out + 14u * 4u;
+        Real* row15 = values_out + 15u * 4u;
+
+        auto write_q = [&](std::size_t q) {  // fills column q of all 16 node rows
+            const auto& xi = points[q];
+            Real x0;
+            Real x1;
+            Real x2;
+            Real x3;
+            Real y0;
+            Real y1;
+            Real y2;
+            Real y3;
+            fill_order3_axis_value_scalars(xi[0], x0, x1, x2, x3);  // 1D cubic values along x
+            fill_order3_axis_value_scalars(xi[1], y0, y1, y2, y3);  // 1D cubic values along y
+            row0[q] = x0 * y0;  // each node value is the product of one x factor and one y factor
+            row1[q] = x1 * y0;
+            row2[q] = x1 * y1;
+            row3[q] = x0 * y1;
+            row4[q] = x2 * y0;
+            row5[q] = x3 * y0;
+            row6[q] = x1 * y2;
+            row7[q] = x1 * y3;
+            row8[q] = x3 * y1;
+            row9[q] = x2 * y1;
+            row10[q] = x0 * y3;
+            row11[q] = x0 * y2;
+            row12[q] = x2 * y2;
+            row13[q] = x3 * y2;
+            row14[q] = x2 * y3;
+            row15[q] = x3 * y3;
+        };
+
+        write_q(0u);
+        write_q(1u);
+        write_q(2u);
+        write_q(3u);
+        return;
+    }
+
+    Real lx[4][4];  // general stride: tabulate 1D values as [point][1D function] first
+    Real ly[4][4];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        fill_order3_axis_values(xi[0], lx[q]);
+        fill_order3_axis_values(xi[1], ly[q]);
+    }
+
+    write_quad_order3_value_row_q4(values_out + 0u * output_stride, lx, ly, 0u, 0u);  // (i, j) pairs match the fast path
+    write_quad_order3_value_row_q4(values_out + 1u * output_stride, lx, ly, 1u, 0u);
+    write_quad_order3_value_row_q4(values_out + 2u * output_stride, lx, ly, 1u, 1u);
+    write_quad_order3_value_row_q4(values_out + 3u * output_stride, lx, ly, 0u, 1u);
+    write_quad_order3_value_row_q4(values_out + 4u * output_stride, lx, ly, 2u, 0u);
+    write_quad_order3_value_row_q4(values_out + 5u * output_stride, lx, ly, 3u, 0u);
+    write_quad_order3_value_row_q4(values_out + 6u * output_stride, lx, ly, 1u, 2u);
+    write_quad_order3_value_row_q4(values_out + 7u * output_stride, lx, ly, 1u, 3u);
+    write_quad_order3_value_row_q4(values_out + 8u * output_stride, lx, ly, 3u, 1u);
+    write_quad_order3_value_row_q4(values_out + 9u * output_stride, lx, ly, 2u, 1u);
+    write_quad_order3_value_row_q4(values_out + 10u * output_stride, lx, ly, 0u, 3u);
+    write_quad_order3_value_row_q4(values_out + 11u * output_stride, lx, ly, 0u, 2u);
+    write_quad_order3_value_row_q4(values_out + 12u * output_stride, lx, ly, 2u, 2u);
+    write_quad_order3_value_row_q4(values_out + 13u * output_stride, lx, ly, 3u, 2u);
+    write_quad_order3_value_row_q4(values_out + 14u * output_stride, lx, ly, 2u, 3u);
+    write_quad_order3_value_row_q4(values_out + 15u * output_stride, lx, ly, 3u, 3u);
+}
+
+void evaluate_quad_order3_values_strided(  // cubic quad shape values for any number of points
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (points.size() == 4u) {  // exactly four points: hand over to the dedicated q4 routine
+        evaluate_quad_order3_values_q4(points, output_stride, values_out);
+        return;
+    }
+
+    Real* row0 = values_out + 0u * output_stride;  // row n of the node-major output starts at n * output_stride
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];  // 1D cubic values along x at this point
+        Real ly[4];  // 1D cubic values along y at this point
+        fill_order3_axis_values(xi[0], lx);
+        fill_order3_axis_values(xi[1], ly);
+        row0[q] = lx[0] * ly[0];  // node value = lx[i] * ly[j]; the (i, j) order matches the q4 routine
+        row1[q] = lx[1] * ly[0];
+        row2[q] = lx[1] * ly[1];
+        row3[q] = lx[0] * ly[1];
+        row4[q] = lx[2] * ly[0];
+        row5[q] = lx[3] * ly[0];
+        row6[q] = lx[1] * ly[2];
+        row7[q] = lx[1] * ly[3];
+        row8[q] = lx[3] * ly[1];
+        row9[q] = lx[2] * ly[1];
+        row10[q] = lx[0] * ly[3];
+        row11[q] = lx[0] * ly[2];
+        row12[q] = lx[2] * ly[2];
+        row13[q] = lx[3] * ly[2];
+        row14[q] = lx[2] * ly[3];
+        row15[q] = lx[3] * ly[3];
+    }
+}
+
+// Writes one node's gradient block of a tensor-product quadrilateral at four
+// points. N is the number of 1D shape functions per axis and (i, j) select the
+// node's pair: component 0 = dx * ly, component 1 = lx * dy, component 2 = 0.
+template <std::size_t N>
+inline void write_quad_gradient_row_q4(
+    Real* SVMP_RESTRICT row,
+    std::size_t output_stride,
+    const Real (&lx)[4][N],
+    const Real (&ly)[4][N],
+    const Real (&dx)[4][N],
+    const Real (&dy)[4][N],
+    std::size_t i,
+    std::size_t j) {
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[pt] = dx[pt][i] * ly[pt][j];
+    }
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[output_stride + pt] = lx[pt][i] * dy[pt][j];
+    }
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        row[2u * output_stride + pt] = Real(0);
+    }
+}
+
+// Quartic 1D Lagrange values and d/dx at x. In the scaled coordinate
+// t = 2 (x + 1) the five nodes sit at t = 0, 4, 1, 2, 3 in output order;
+// the derivative polynomials already include the factor dt/dx = 2.
+inline void fill_order4_axis_values_first(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first) {
+    const Real t = (x + Real(1)) * Real(2);
+    const Real t_sq = t * t;
+    const Real t_cu = t_sq * t;
+    const Real m1 = t - Real(1);
+    const Real m2 = t - Real(2);
+    const Real m3 = t - Real(3);
+    const Real m4 = t - Real(4);
+    const Real p01 = t * m1;
+    const Real p12 = m1 * m2;
+    const Real p23 = m2 * m3;
+    const Real p34 = m3 * m4;
+    values[0] = (p12 * p34) / Real(24);
+    values[1] = (p01 * p23) / Real(24);
+    values[2] = -(t * m2 * p34) / Real(6);
+    values[3] = (p01 * p34) / Real(4);
+    values[4] = -(p01 * m2 * m4) / Real(6);
+    first[0] = (Real(4) * t_cu - Real(30) * t_sq + Real(70) * t - Real(50)) / Real(12);
+    first[1] = (Real(4) * t_cu - Real(18) * t_sq + Real(22) * t - Real(6)) / Real(12);
+    first[2] = (-Real(4) * t_cu + Real(27) * t_sq - Real(52) * t + Real(24)) / Real(3);
+    first[3] = Real(2) * t_cu - Real(12) * t_sq + Real(19) * t - Real(6);
+    first[4] = (-Real(4) * t_cu + Real(21) * t_sq - Real(28) * t + Real(8)) / Real(3);
+}
+
+// Cubic quadrilateral gradients at the first four points. Per-point 1D
+// values and derivatives are tabulated for both axes, then each node's
+// block is assembled from the axis indices listed in node_axes.
+SVMP_LAGRANGE_NOINLINE void evaluate_quad_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
+
+    // All tables are indexed [point][1D shape function].
+    Real shape_x[4][4];
+    Real shape_y[4][4];
+    Real slope_x[4][4];
+    Real slope_y[4][4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& xi = points[pt];
+        fill_order3_axis_values_first(xi[0], shape_x[pt], slope_x[pt]);
+        fill_order3_axis_values_first(xi[1], shape_y[pt], slope_y[pt]);
+    }
+
+    // Each node owns three component slots of output_stride entries.
+    const std::size_t node_count = node_axes.size();
+    for (std::size_t node = 0; node < node_count; ++node) {
+        Real* const block = gradients_out + node * 3u * output_stride;
+        write_quad_gradient_row_q4(block, output_stride,
+                                   shape_x, shape_y, slope_x, slope_y,
+                                   node_axes[node][0], node_axes[node][1]);
+    }
+}
+
+// Quartic quadrilateral gradients at the first four points. Per-point 1D
+// values and derivatives (five per axis) are tabulated, then each node's
+// block is assembled from the axis indices listed in node_axes.
+SVMP_LAGRANGE_NOINLINE void evaluate_quad_order4_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<4>();
+
+    // All tables are indexed [point][1D shape function].
+    Real shape_x[4][5];
+    Real shape_y[4][5];
+    Real slope_x[4][5];
+    Real slope_y[4][5];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& xi = points[pt];
+        fill_order4_axis_values_first(xi[0], shape_x[pt], slope_x[pt]);
+        fill_order4_axis_values_first(xi[1], shape_y[pt], slope_y[pt]);
+    }
+
+    // Each node owns three component slots of output_stride entries.
+    const std::size_t node_count = node_axes.size();
+    for (std::size_t node = 0; node < node_count; ++node) {
+        Real* const block = gradients_out + node * 3u * output_stride;
+        write_quad_gradient_row_q4(block, output_stride,
+                                   shape_x, shape_y, slope_x, slope_y,
+                                   node_axes[node][0], node_axes[node][1]);
+    }
+}
+
+void evaluate_quad_order3_gradients_strided(  // cubic quad gradients for any number of points
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    if (points.size() == 4u) {  // exactly four points: hand over to the dedicated q4 routine
+        evaluate_quad_order3_gradients_q4(points, output_stride, gradients_out);  // NOTE(review): q4 takes its node order from detail::make_quad_tensor_node_axes<3>(); confirm it matches the rows below
+        return;
+    }
+
+    Real* row0 = gradients_out + 0u * 3u * output_stride;  // each node owns 3 * output_stride entries
+    Real* row1 = gradients_out + 1u * 3u * output_stride;
+    Real* row2 = gradients_out + 2u * 3u * output_stride;
+    Real* row3 = gradients_out + 3u * 3u * output_stride;
+    Real* row4 = gradients_out + 4u * 3u * output_stride;
+    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    Real* row6 = gradients_out + 6u * 3u * output_stride;
+    Real* row7 = gradients_out + 7u * 3u * output_stride;
+    Real* row8 = gradients_out + 8u * 3u * output_stride;
+    Real* row9 = gradients_out + 9u * 3u * output_stride;
+    Real* row10 = gradients_out + 10u * 3u * output_stride;
+    Real* row11 = gradients_out + 11u * 3u * output_stride;
+    Real* row12 = gradients_out + 12u * 3u * output_stride;
+    Real* row13 = gradients_out + 13u * 3u * output_stride;
+    Real* row14 = gradients_out + 14u * 3u * output_stride;
+    Real* row15 = gradients_out + 15u * 3u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];  // 1D cubic values along x
+        Real ly[4];  // 1D cubic values along y
+        Real dx[4];  // 1D cubic first derivatives along x
+        Real dy[4];  // 1D cubic first derivatives along y
+        fill_order3_axis_values_first(xi[0], lx, dx);
+        fill_order3_axis_values_first(xi[1], ly, dy);
+        write_quad_order2_gradient_q(row0, output_stride, q, dx[0] * ly[0], lx[0] * dy[0]);  // reuses the order-2 per-point writer; passes dx[i] * ly[j] and lx[i] * dy[j]
+        write_quad_order2_gradient_q(row1, output_stride, q, dx[1] * ly[0], lx[1] * dy[0]);
+        write_quad_order2_gradient_q(row2, output_stride, q, dx[1] * ly[1], lx[1] * dy[1]);
+        write_quad_order2_gradient_q(row3, output_stride, q, dx[0] * ly[1], lx[0] * dy[1]);
+        write_quad_order2_gradient_q(row4, output_stride, q, dx[2] * ly[0], lx[2] * dy[0]);
+        write_quad_order2_gradient_q(row5, output_stride, q, dx[3] * ly[0], lx[3] * dy[0]);
+        write_quad_order2_gradient_q(row6, output_stride, q, dx[1] * ly[2], lx[1] * dy[2]);
+        write_quad_order2_gradient_q(row7, output_stride, q, dx[1] * ly[3], lx[1] * dy[3]);
+        write_quad_order2_gradient_q(row8, output_stride, q, dx[3] * ly[1], lx[3] * dy[1]);
+        write_quad_order2_gradient_q(row9, output_stride, q, dx[2] * ly[1], lx[2] * dy[1]);
+        write_quad_order2_gradient_q(row10, output_stride, q, dx[0] * ly[3], lx[0] * dy[3]);
+        write_quad_order2_gradient_q(row11, output_stride, q, dx[0] * ly[2], lx[0] * dy[2]);
+        write_quad_order2_gradient_q(row12, output_stride, q, dx[2] * ly[2], lx[2] * dy[2]);
+        write_quad_order2_gradient_q(row13, output_stride, q, dx[3] * ly[2], lx[3] * dy[2]);
+        write_quad_order2_gradient_q(row14, output_stride, q, dx[2] * ly[3], lx[2] * dy[3]);
+        write_quad_order2_gradient_q(row15, output_stride, q, dx[3] * ly[3], lx[3] * dy[3]);
+    }
+}
+
+void evaluate_quad_order3_hessians_strided(  // cubic quad Hessians for any number of points (no q4 shortcut in this routine)
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real* row0 = hessians_out + 0u * 9u * output_stride;  // each node owns 9 * output_stride entries
+    Real* row1 = hessians_out + 1u * 9u * output_stride;
+    Real* row2 = hessians_out + 2u * 9u * output_stride;
+    Real* row3 = hessians_out + 3u * 9u * output_stride;
+    Real* row4 = hessians_out + 4u * 9u * output_stride;
+    Real* row5 = hessians_out + 5u * 9u * output_stride;
+    Real* row6 = hessians_out + 6u * 9u * output_stride;
+    Real* row7 = hessians_out + 7u * 9u * output_stride;
+    Real* row8 = hessians_out + 8u * 9u * output_stride;
+    Real* row9 = hessians_out + 9u * 9u * output_stride;
+    Real* row10 = hessians_out + 10u * 9u * output_stride;
+    Real* row11 = hessians_out + 11u * 9u * output_stride;
+    Real* row12 = hessians_out + 12u * 9u * output_stride;
+    Real* row13 = hessians_out + 13u * 9u * output_stride;
+    Real* row14 = hessians_out + 14u * 9u * output_stride;
+    Real* row15 = hessians_out + 15u * 9u * output_stride;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        Real lx[4];  // 1D cubic values along x
+        Real ly[4];  // 1D cubic values along y
+        Real dx[4];  // first derivatives along x
+        Real dy[4];  // first derivatives along y
+        Real hx[4];  // second derivatives along x
+        Real hy[4];  // second derivatives along y
+        fill_order3_axis_values_first_second(xi[0], lx, dx, hx);
+        fill_order3_axis_values_first_second(xi[1], ly, dy, hy);
+        write_quad_order2_hessian_q(row0, output_stride, q, hx[0] * ly[0], dx[0] * dy[0], lx[0] * hy[0]);  // passes hx[i] * ly[j], dx[i] * dy[j], lx[i] * hy[j] (presumably xx, xy, yy; helper is outside this view)
+        write_quad_order2_hessian_q(row1, output_stride, q, hx[1] * ly[0], dx[1] * dy[0], lx[1] * hy[0]);
+        write_quad_order2_hessian_q(row2, output_stride, q, hx[1] * ly[1], dx[1] * dy[1], lx[1] * hy[1]);
+        write_quad_order2_hessian_q(row3, output_stride, q, hx[0] * ly[1], dx[0] * dy[1], lx[0] * hy[1]);
+        write_quad_order2_hessian_q(row4, output_stride, q, hx[2] * ly[0], dx[2] * dy[0], lx[2] * hy[0]);
+        write_quad_order2_hessian_q(row5, output_stride, q, hx[3] * ly[0], dx[3] * dy[0], lx[3] * hy[0]);
+        write_quad_order2_hessian_q(row6, output_stride, q, hx[1] * ly[2], dx[1] * dy[2], lx[1] * hy[2]);
+        write_quad_order2_hessian_q(row7, output_stride, q, hx[1] * ly[3], dx[1] * dy[3], lx[1] * hy[3]);
+        write_quad_order2_hessian_q(row8, output_stride, q, hx[3] * ly[1], dx[3] * dy[1], lx[3] * hy[1]);
+        write_quad_order2_hessian_q(row9, output_stride, q, hx[2] * ly[1], dx[2] * dy[1], lx[2] * hy[1]);
+        write_quad_order2_hessian_q(row10, output_stride, q, hx[0] * ly[3], dx[0] * dy[3], lx[0] * hy[3]);
+        write_quad_order2_hessian_q(row11, output_stride, q, hx[0] * ly[2], dx[0] * dy[2], lx[0] * hy[2]);
+        write_quad_order2_hessian_q(row12, output_stride, q, hx[2] * ly[2], dx[2] * dy[2], lx[2] * hy[2]);
+        write_quad_order2_hessian_q(row13, output_stride, q, hx[3] * ly[2], dx[3] * dy[2], lx[3] * hy[2]);
+        write_quad_order2_hessian_q(row14, output_stride, q, hx[2] * ly[3], dx[2] * dy[3], lx[2] * hy[3]);
+        write_quad_order2_hessian_q(row15, output_stride, q, hx[3] * ly[3], dx[3] * dy[3], lx[3] * hy[3]);
+    }
+}
+
+// Writes value, gradient and Hessian of one quad node at point Q (a
+// compile-time index). Hessian components 0 and 4 take the pure second
+// derivatives in x and y, 1 and 3 the mixed derivative; the rest are zero.
+template <std::size_t Q>
+inline void write_quad_order3_all_q4(
+    std::size_t output_stride, std::size_t i, std::size_t j,
+    const Real lx[4][4], const Real ly[4][4],
+    const Real dx[4][4], const Real dy[4][4],
+    const Real hx[4][4], const Real hy[4][4],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real shape_x = lx[Q][i];
+    const Real shape_y = ly[Q][j];
+    const Real slope_x = dx[Q][i];
+    const Real slope_y = dy[Q][j];
+    const Real mixed = slope_x * slope_y;
+    Real* const grad = grad_row + Q;  // component c lives at grad[c * output_stride]
+    Real* const hess = hess_row + Q;  // component c lives at hess[c * output_stride]
+
+    value_row[Q] = shape_x * shape_y;
+    grad[0u * output_stride] = slope_x * shape_y;
+    grad[1u * output_stride] = shape_x * slope_y;
+    grad[2u * output_stride] = Real(0);
+    hess[0u * output_stride] = hx[Q][i] * shape_y;
+    hess[4u * output_stride] = shape_x * hy[Q][j];
+    hess[8u * output_stride] = Real(0);
+    hess[1u * output_stride] = mixed;
+    hess[3u * output_stride] = mixed;
+    hess[2u * output_stride] = Real(0);
+    hess[6u * output_stride] = Real(0);
+    hess[5u * output_stride] = Real(0);
+    hess[7u * output_stride] = Real(0);
+}
+
+// Fused cubic-quadrilateral evaluation at the first four points: tabulates 1D
+// values and first/second derivatives per axis, then fills every node's outputs.
+void evaluate_quad_order3_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
+
+    Real shape_x[4][4];
+    Real shape_y[4][4];
+    Real slope_x[4][4];
+    Real slope_y[4][4];
+    Real curv_x[4][4];
+    Real curv_y[4][4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& xi = points[pt];
+        fill_order3_axis_values_first_second(xi[0], shape_x[pt], slope_x[pt], curv_x[pt]);
+        fill_order3_axis_values_first_second(xi[1], shape_y[pt], slope_y[pt], curv_y[pt]);
+    }
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const std::size_t i = node_axes[node][0];
+        const std::size_t j = node_axes[node][1];
+        Real* value_row = values_out + node * output_stride;
+        Real* grad_row = gradients_out + node * 3u * output_stride;
+        Real* hess_row = hessians_out + node * 9u * output_stride;
+        write_quad_order3_all_q4<0u>(output_stride, i, j, shape_x, shape_y, slope_x, slope_y,
+                                     curv_x, curv_y, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<1u>(output_stride, i, j, shape_x, shape_y, slope_x, slope_y,
+                                     curv_x, curv_y, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<2u>(output_stride, i, j, shape_x, shape_y, slope_x, slope_y,
+                                     curv_x, curv_y, value_row, grad_row, hess_row);
+        write_quad_order3_all_q4<3u>(output_stride, i, j, shape_x, shape_y, slope_x, slope_y,
+                                     curv_x, curv_y, value_row, grad_row, hess_row);
+    }
+}
+
+// Cubic hexahedron shape values at the first four points: row `node` of
+// values_out holds lx[i] * ly[j] * lz[k] for the node's axis indices (i, j, k).
+void evaluate_hex_order3_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real shape_x[4][4];
+    Real shape_y[4][4];
+    Real shape_z[4][4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& xi = points[pt];
+        fill_order3_axis_values(xi[0], shape_x[pt]);
+        fill_order3_axis_values(xi[1], shape_y[pt]);
+        fill_order3_axis_values(xi[2], shape_z[pt]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const std::size_t i = node_axes[node][0];
+        const std::size_t j = node_axes[node][1];
+        const std::size_t k = node_axes[node][2];
+        Real* const row = values_out + node * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            row[pt] = shape_x[pt][i] * shape_y[pt][j] * shape_z[pt][k];
+        }
+    }
+}
+
+// Cubic hexahedron gradients at the first four points. For a node with axis
+// indices (i, j, k) the three components are dx*ly*lz, lx*dy*lz and lx*ly*dz,
+// stored output_stride entries apart inside the node's block.
+void evaluate_hex_order3_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real shape_x[4][4];
+    Real shape_y[4][4];
+    Real shape_z[4][4];
+    Real slope_x[4][4];
+    Real slope_y[4][4];
+    Real slope_z[4][4];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& xi = points[pt];
+        fill_order3_axis_values_first(xi[0], shape_x[pt], slope_x[pt]);
+        fill_order3_axis_values_first(xi[1], shape_y[pt], slope_y[pt]);
+        fill_order3_axis_values_first(xi[2], shape_z[pt], slope_z[pt]);
+    }
+
+    // Components are written one after another, four points each.
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const std::size_t i = node_axes[node][0];
+        const std::size_t j = node_axes[node][1];
+        const std::size_t k = node_axes[node][2];
+        Real* const row = gradients_out + node * 3u * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            row[pt] = slope_x[pt][i] * shape_y[pt][j] * shape_z[pt][k];
+        }
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            row[output_stride + pt] = shape_x[pt][i] * slope_y[pt][j] * shape_z[pt][k];
+        }
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            row[2u * output_stride + pt] = shape_x[pt][i] * shape_y[pt][j] * slope_z[pt][k];
+        }
+    }
+}
+
+// Writes column Q of one node's outputs. The nine Hessian rows are always
+// written; value_row / grad_row are only touched when the matching flag is true.
+template <std::size_t Q, bool WriteValue, bool WriteGradient>
+inline void write_hex_order3_q4_hessian_outputs(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    std::size_t k,
+    const Real lx[4][4],
+    const Real ly[4][4],
+    const Real lz[4][4],
+    const Real dx[4][4],
+    const Real dy[4][4],
+    const Real dz[4][4],
+    const Real hx[4][4],
+    const Real hy[4][4],
+    const Real hz[4][4],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real fx = lx[Q][i];
+    const Real fy = ly[Q][j];
+    const Real fz = lz[Q][k];
+    const Real fy_fz = fy * fz;
+    if constexpr (WriteValue) {
+        value_row[Q] = fx * fy_fz;
+    }
+
+    const Real gx = dx[Q][i];
+    const Real gy = dy[Q][j];
+    const Real gz = dz[Q][k];
+    const Real gy_fz = gy * fz;
+    const Real fy_gz = fy * gz;
+    if constexpr (WriteGradient) {
+        grad_row[Q] = gx * fy_fz;
+        grad_row[output_stride + Q] = fx * gy_fz;
+        grad_row[2u * output_stride + Q] = fx * fy_gz;
+    }
+    // Row-major 3x3: the diagonal first, then each symmetric off-diagonal pair.
+    hess_row[Q] = hx[Q][i] * fy_fz;
+    hess_row[4u * output_stride + Q] = fx * hy[Q][j] * fz;
+    hess_row[8u * output_stride + Q] = fx * fy * hz[Q][k];
+    const Real mixed_xy = gx * gy_fz;
+    hess_row[1u * output_stride + Q] = mixed_xy;
+    hess_row[3u * output_stride + Q] = mixed_xy;
+    const Real mixed_xz = gx * fy_gz;
+    hess_row[2u * output_stride + Q] = mixed_xz;
+    hess_row[6u * output_stride + Q] = mixed_xz;
+    const Real mixed_yz = fx * gy * gz;
+    hess_row[5u * output_stride + Q] = mixed_yz;
+    hess_row[7u * output_stride + Q] = mixed_yz;
+}
+
+// Shared order-3 hexahedron kernel for exactly four points (reads points[0..3]).
+// Hessians are always written; values_out / gradients_out are only used when
+// WriteValue / WriteGradient is true and may be null otherwise.
+template <bool WriteValue, bool WriteGradient>
+void evaluate_hex_order3_q4_hessian_outputs(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
+
+    Real lx[4][4];
+    Real ly[4][4];
+    Real lz[4][4];
+    Real dx[4][4];
+    Real dy[4][4];
+    Real dz[4][4];
+    Real hx[4][4];
+    Real hy[4][4];
+    Real hz[4][4];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        fill_order3_axis_values_first_second(pt[0], lx[p], dx[p], hx[p]);
+        fill_order3_axis_values_first_second(pt[1], ly[p], dy[p], hy[p]);
+        fill_order3_axis_values_first_second(pt[2], lz[p], dz[p], hz[p]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        Real* v_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* g_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* h_row = hessians_out + node * 9u * output_stride;
+        write_hex_order3_q4_hessian_outputs<0u, WriteValue, WriteGradient>(
+            output_stride, axes[0], axes[1], axes[2],
+            lx, ly, lz, dx, dy, dz, hx, hy, hz, v_row, g_row, h_row);
+        write_hex_order3_q4_hessian_outputs<1u, WriteValue, WriteGradient>(
+            output_stride, axes[0], axes[1], axes[2],
+            lx, ly, lz, dx, dy, dz, hx, hy, hz, v_row, g_row, h_row);
+        write_hex_order3_q4_hessian_outputs<2u, WriteValue, WriteGradient>(
+            output_stride, axes[0], axes[1], axes[2],
+            lx, ly, lz, dx, dy, dz, hx, hy, hz, v_row, g_row, h_row);
+        write_hex_order3_q4_hessian_outputs<3u, WriteValue, WriteGradient>(
+            output_stride, axes[0], axes[1], axes[2],
+            lx, ly, lz, dx, dy, dz, hx, hy, hz, v_row, g_row, h_row);
+    }
+}
+
+void evaluate_hex_order3_hessians_q4(  // Hessians-only entry point for four points.
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_hex_order3_q4_hessian_outputs</*WriteValue=*/false, /*WriteGradient=*/false>(
+        points, output_stride, /*values_out=*/nullptr, /*gradients_out=*/nullptr, hessians_out);
+}
+
+void evaluate_hex_order3_all_q4(  // Values, gradients and Hessians from one kernel call.
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    evaluate_hex_order3_q4_hessian_outputs</*WriteValue=*/true, /*WriteGradient=*/true>(
+        points, output_stride, values_out, gradients_out, hessians_out);
+}
+
+void evaluate_hex_order2_values_strided(  // Order-2 hexahedron values for every entry of points.
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* row0 = values_out + 0u * output_stride;  // Row n (n = 0..26) receives basis function n.
+    Real* row1 = values_out + 1u * output_stride;
+    Real* row2 = values_out + 2u * output_stride;
+    Real* row3 = values_out + 3u * output_stride;
+    Real* row4 = values_out + 4u * output_stride;
+    Real* row5 = values_out + 5u * output_stride;
+    Real* row6 = values_out + 6u * output_stride;
+    Real* row7 = values_out + 7u * output_stride;
+    Real* row8 = values_out + 8u * output_stride;
+    Real* row9 = values_out + 9u * output_stride;
+    Real* row10 = values_out + 10u * output_stride;
+    Real* row11 = values_out + 11u * output_stride;
+    Real* row12 = values_out + 12u * output_stride;
+    Real* row13 = values_out + 13u * output_stride;
+    Real* row14 = values_out + 14u * output_stride;
+    Real* row15 = values_out + 15u * output_stride;
+    Real* row16 = values_out + 16u * output_stride;
+    Real* row17 = values_out + 17u * output_stride;
+    Real* row18 = values_out + 18u * output_stride;
+    Real* row19 = values_out + 19u * output_stride;
+    Real* row20 = values_out + 20u * output_stride;
+    Real* row21 = values_out + 21u * output_stride;
+    Real* row22 = values_out + 22u * output_stride;
+    Real* row23 = values_out + 23u * output_stride;
+    Real* row24 = values_out + 24u * output_stride;
+    Real* row25 = values_out + 25u * output_stride;
+    Real* row26 = values_out + 26u * output_stride;
+    // Column q of every row is filled from points[q].
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        const Real x0 = x * (x - Real(1)) * Real(0.5);  // suffix 0: zero at 0 and +1
+        const Real x1 = x * (x + Real(1)) * Real(0.5);  // suffix 1: zero at 0 and -1
+        const Real x2 = Real(1) - x * x;                // suffix 2: zero at -1 and +1
+        const Real y0 = y * (y - Real(1)) * Real(0.5);
+        const Real y1 = y * (y + Real(1)) * Real(0.5);
+        const Real y2 = Real(1) - y * y;
+        const Real z0 = z * (z - Real(1)) * Real(0.5);
+        const Real z1 = z * (z + Real(1)) * Real(0.5);
+        const Real z2 = Real(1) - z * z;
+        const Real x0y0 = x0 * y0;
+        const Real x1y0 = x1 * y0;
+        const Real x1y1 = x1 * y1;
+        const Real x0y1 = x0 * y1;
+        const Real x2y0 = x2 * y0;
+        const Real x1y2 = x1 * y2;
+        const Real x2y1 = x2 * y1;
+        const Real x0y2 = x0 * y2;
+        const Real x2y2 = x2 * y2;
+        // Each output is one of the nine x-y products times one z factor.
+        row0[q] = x0y0 * z0;
+        row1[q] = x1y0 * z0;
+        row2[q] = x1y1 * z0;
+        row3[q] = x0y1 * z0;
+        row4[q] = x0y0 * z1;
+        row5[q] = x1y0 * z1;
+        row6[q] = x1y1 * z1;
+        row7[q] = x0y1 * z1;
+        row8[q] = x2y0 * z0;
+        row9[q] = x1y2 * z0;
+        row10[q] = x2y1 * z0;
+        row11[q] = x0y2 * z0;
+        row12[q] = x2y0 * z1;
+        row13[q] = x1y2 * z1;
+        row14[q] = x2y1 * z1;
+        row15[q] = x0y2 * z1;
+        row16[q] = x0y0 * z2;
+        row17[q] = x1y0 * z2;
+        row18[q] = x1y1 * z2;
+        row19[q] = x0y1 * z2;
+        row20[q] = x2y2 * z0;
+        row21[q] = x2y2 * z1;
+        row22[q] = x2y0 * z2;
+        row23[q] = x1y2 * z2;
+        row24[q] = x2y1 * z2;
+        row25[q] = x0y2 * z2;
+        row26[q] = x2y2 * z2;
+    }
+}
+
+// Quadratic 1D shape functions (nodes -1, +1, 0 at indices 0, 1, 2) and d/dx.
+inline void fill_order2_axis_values_first(
+    Real x, Real* SVMP_RESTRICT values, Real* SVMP_RESTRICT first) {
+    values[0] = Real(0.5) * (x * (x - Real(1)));
+    values[1] = Real(0.5) * (x * (x + Real(1)));
+    values[2] = Real(1) - x * x;
+    first[0] = x - Real(0.5);
+    first[1] = x + Real(0.5);
+    first[2] = Real(-2) * x;
+}
+
+// Order-2 1D values and first derivatives at x, plus the second derivatives.
+inline void fill_order2_axis_values_first_second(
+    Real x, Real* SVMP_RESTRICT values,
+    Real* SVMP_RESTRICT first, Real* SVMP_RESTRICT second) {
+    fill_order2_axis_values_first(x, values, first);
+    // The factors are quadratics, so their second derivatives are constants.
+    second[0] = second[1] = Real(1);
+    second[2] = Real(-2);
+}
+
+// Writes column Q of one node's nine Hessian rows (row-major 3x3) for the
+// order-2 hexahedron. (i, j, k) select the x/y/z entries of the value (l*),
+// first-derivative (d*) and second-derivative (h*) tables, indexed [point][node].
+template <std::size_t Q>
+inline void write_hex_order2_hessian_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    std::size_t k,
+    const Real lx[4][3],
+    const Real ly[4][3],
+    const Real lz[4][3],
+    const Real dx[4][3],
+    const Real dy[4][3],
+    const Real dz[4][3],
+    const Real hx[4][3],
+    const Real hy[4][3],
+    const Real hz[4][3],
+    Real* SVMP_RESTRICT hess_row) {
+    const Real fx = lx[Q][i];
+    const Real fy = ly[Q][j];
+    const Real fz = lz[Q][k];
+    const Real gx = dx[Q][i];
+    const Real gy = dy[Q][j];
+    const Real gz = dz[Q][k];
+    const Real mixed_xy = gx * (gy * fz);
+    const Real mixed_xz = gx * (fy * gz);
+    const Real mixed_yz = fx * gy * gz;
+    hess_row[Q] = hx[Q][i] * (fy * fz);
+    hess_row[4u * output_stride + Q] = fx * hy[Q][j] * fz;
+    hess_row[8u * output_stride + Q] = fx * fy * hz[Q][k];
+    hess_row[1u * output_stride + Q] = mixed_xy;
+    hess_row[3u * output_stride + Q] = mixed_xy;
+    hess_row[2u * output_stride + Q] = mixed_xz;
+    hess_row[6u * output_stride + Q] = mixed_xz;
+    hess_row[5u * output_stride + Q] = mixed_yz;
+    hess_row[7u * output_stride + Q] = mixed_yz;
+}
+
+// Order-2 hexahedron Hessians at exactly four points (reads points[0..3]).
+// Node n owns nine consecutive rows of hessians_out, each output_stride long;
+// columns 0..3 of every row are the four points.
+void evaluate_hex_order2_hessians_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_hex_tensor_node_axes<2>();
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real lz[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real dz[4][3];
+    Real hx[4][3];
+    Real hy[4][3];
+    Real hz[4][3];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        fill_order2_axis_values_first_second(pt[0], lx[p], dx[p], hx[p]);
+        fill_order2_axis_values_first_second(pt[1], ly[p], dy[p], hy[p]);
+        fill_order2_axis_values_first_second(pt[2], lz[p], dz[p], hz[p]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        Real* out = hessians_out + node * 9u * output_stride;
+        write_hex_order2_hessian_q4<0u>(
+            output_stride, axes[0], axes[1], axes[2], lx, ly, lz, dx, dy, dz, hx, hy, hz, out);
+        write_hex_order2_hessian_q4<1u>(
+            output_stride, axes[0], axes[1], axes[2], lx, ly, lz, dx, dy, dz, hx, hy, hz, out);
+        write_hex_order2_hessian_q4<2u>(
+            output_stride, axes[0], axes[1], axes[2], lx, ly, lz, dx, dy, dz, hx, hy, hz, out);
+        write_hex_order2_hessian_q4<3u>(
+            output_stride, axes[0], axes[1], axes[2], lx, ly, lz, dx, dy, dz, hx, hy, hz, out);
+    }
+}
+
+// Writes column Q of one order-2 quad node: value, 3-component gradient, 3x3 Hessian.
+template <std::size_t Q>
+inline void write_quad_order2_all_q4(
+    std::size_t output_stride,
+    std::size_t i,
+    std::size_t j,
+    const Real lx[4][3],
+    const Real ly[4][3],
+    const Real dx[4][3],
+    const Real dy[4][3],
+    const Real hx[4][3],
+    const Real hy[4][3],
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const Real fx = lx[Q][i];
+    const Real fy = ly[Q][j];
+    const Real gx = dx[Q][i];
+    const Real gy = dy[Q][j];
+    const Real mixed_xy = gx * gy;
+    value_row[Q] = fx * fy;
+    grad_row[Q] = gx * fy;
+    grad_row[output_stride + Q] = fx * gy;
+    grad_row[2u * output_stride + Q] = Real(0);  // every z-related entry is stored as zero
+    hess_row[Q] = hx[Q][i] * fy;
+    hess_row[4u * output_stride + Q] = fx * hy[Q][j];
+    hess_row[8u * output_stride + Q] = Real(0);
+    hess_row[1u * output_stride + Q] = mixed_xy;
+    hess_row[3u * output_stride + Q] = mixed_xy;
+    hess_row[2u * output_stride + Q] = Real(0);
+    hess_row[6u * output_stride + Q] = Real(0);
+    hess_row[5u * output_stride + Q] = Real(0);
+    hess_row[7u * output_stride + Q] = Real(0);
+}
+
+// Order-2 quad values, gradients and Hessians at exactly four points
+// (reads points[0..3]; only the first two coordinates of each point are used).
+void evaluate_quad_order2_all_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    constexpr auto node_axes = detail::make_quad_tensor_node_axes<2>();
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real hx[4][3];
+    Real hy[4][3];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        fill_order2_axis_values_first_second(pt[0], lx[p], dx[p], hx[p]);
+        fill_order2_axis_values_first_second(pt[1], ly[p], dy[p], hy[p]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const auto& axes = node_axes[node];
+        Real* v_row = values_out + node * output_stride;
+        Real* g_row = gradients_out + node * 3u * output_stride;
+        Real* h_row = hessians_out + node * 9u * output_stride;
+        write_quad_order2_all_q4<0u>(
+            output_stride, axes[0], axes[1], lx, ly, dx, dy, hx, hy, v_row, g_row, h_row);
+        write_quad_order2_all_q4<1u>(
+            output_stride, axes[0], axes[1], lx, ly, dx, dy, hx, hy, v_row, g_row, h_row);
+        write_quad_order2_all_q4<2u>(
+            output_stride, axes[0], axes[1], lx, ly, dx, dy, hx, hy, v_row, g_row, h_row);
+        write_quad_order2_all_q4<3u>(
+            output_stride, axes[0], axes[1], lx, ly, dx, dy, hx, hy, v_row, g_row, h_row);
+    }
+}
+
+// Order-2 hexahedron gradients at exactly four points (reads points[0..3]).
+// Node n owns three rows of gradients_out (d/dx, d/dy, d/dz), output_stride apart.
+void evaluate_hex_order2_gradients_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr std::array<std::array<std::size_t, 3>, 27> node_axes = {{
+        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
+        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
+        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
+        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
+        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
+        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
+        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
+    }};
+
+    Real lx[4][3];
+    Real ly[4][3];
+    Real lz[4][3];
+    Real dx[4][3];
+    Real dy[4][3];
+    Real dz[4][3];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        fill_order2_axis_values_first(pt[0], lx[p], dx[p]);
+        fill_order2_axis_values_first(pt[1], ly[p], dy[p]);
+        fill_order2_axis_values_first(pt[2], lz[p], dz[p]);
+    }
+
+    for (std::size_t node = 0; node < node_axes.size(); ++node) {
+        const std::size_t ax = node_axes[node][0];
+        const std::size_t ay = node_axes[node][1];
+        const std::size_t az = node_axes[node][2];
+        Real* gx_row = gradients_out + node * 3u * output_stride;
+        Real* gy_row = gx_row + output_stride;
+        Real* gz_row = gy_row + output_stride;
+        for (std::size_t p = 0; p < 4u; ++p) {
+            gx_row[p] = dx[p][ax] * ly[p][ay] * lz[p][az];
+        }
+        for (std::size_t p = 0; p < 4u; ++p) {
+            gy_row[p] = lx[p][ax] * dy[p][ay] * lz[p][az];
+        }
+        for (std::size_t p = 0; p < 4u; ++p) {
+            gz_row[p] = lx[p][ax] * ly[p][ay] * dz[p][az];
+        }
+    }
+}
+
+// Fills the strided Hessian output for a basis whose Hessians are treated as
+// independent of the evaluation point: they are evaluated once, at a
+// value-initialized point, and the result is copied to columns
+// 0..num_qpts-1.
+// Dof i owns nine consecutive rows of hessians_out (row-major 3x3 entries),
+// each output_stride long.
+template<typename FastBasis>
+void evaluate_constant_fast_hessians_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
+    FastBasis::evaluate_hessians(math::Vector<Real, 3>{}, fast_hessians);
+
+    for (std::size_t dof = 0; dof < fast_hessians.size(); ++dof) {
+        const Hessian& hessian = fast_hessians[dof];
+        // Row-major flattening: entries[3 * r + c] == hessian(r, c).
+        const Real entries[9] = {
+            hessian(0, 0), hessian(0, 1), hessian(0, 2),
+            hessian(1, 0), hessian(1, 1), hessian(1, 2),
+            hessian(2, 0), hessian(2, 1), hessian(2, 2),
+        };
+        Real* block = hessians_out + dof * 9u * output_stride;
+
+        // Point-major order: all nine entries of column q, then column q + 1.
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            for (std::size_t e = 0; e < 9u; ++e) {
+                block[e * output_stride + q] = entries[e];
+            }
+        }
+    }
+}
+
+// Writes point-dependent values/gradients per column, then the constant Hessians.
+// Every output pointer is optional: a null pointer skips that output.
+template<typename FastBasis>
+void evaluate_fast_outputs_with_constant_hessians_strided(
+    const std::vector<math::Vector<Real, 3>>& points, std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out, Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        if (values_out != nullptr) {
+            std::array<Real, FastBasis::n_dofs> fast_values{};
+            FastBasis::evaluate(points[q], fast_values);
+            for (std::size_t i = 0; i < fast_values.size(); ++i) {
+                values_out[i * output_stride + q] = fast_values[i];
+            }
+        }
+        if (gradients_out != nullptr) {
+            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
+            FastBasis::evaluate_gradients(points[q], fast_gradients);
+            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
+                gradients_out[(i * 3u + 0u) * output_stride + q] = fast_gradients[i][0];
+                gradients_out[(i * 3u + 1u) * output_stride + q] = fast_gradients[i][1];
+                gradients_out[(i * 3u + 2u) * output_stride + q] = fast_gradients[i][2];
+            }
+        }
+    }
+    if (hessians_out != nullptr) {
+        evaluate_constant_fast_hessians_strided<FastBasis>(
+            points.size(), output_stride, hessians_out);
+    }
+}
+
+// Wedge (triangle x line) basis outputs for Order 1 or 2 at any number of points.
+// wedge_indices[node] = {triangle dof, axis dof}; null output pointers are skipped.
+template<int Order>
+void evaluate_wedge_fast_outputs_strided(
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 1 && Order <= 2,
+                  "wedge fast outputs rely on low-order public triangle ordering");
+    using TriFast = LagrangeTriFast<Order>;
+    constexpr std::size_t axis_size = static_cast<std::size_t>(Order + 1);
+    const bool want_values = values_out != nullptr;
+    const bool want_grad = gradients_out != nullptr;
+    const bool want_hess = hessians_out != nullptr;
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& pt = points[q];
+        std::array<Real, TriFast::n_dofs> tri_values{};
+        std::array<Gradient, TriFast::n_dofs> tri_gradients{};
+        std::array<Hessian, TriFast::n_dofs> tri_hessians{};
+        std::array<Real, axis_size> z_values{};
+        std::array<Real, axis_size> z_first{};
+        std::array<Real, axis_size> z_second{};
+
+        TriFast::evaluate(pt, tri_values);
+        if (want_grad || want_hess) {
+            TriFast::evaluate_gradients(pt, tri_gradients);
+        }
+        // Fill only the axis tables that the requested outputs read.
+        if (want_hess) {
+            TriFast::evaluate_hessians(pt, tri_hessians);
+            detail::fill_axis_values_first_second<Order>(pt[2], z_values, z_first, z_second);
+        } else if (want_grad) {
+            detail::fill_axis_values_first<Order>(pt[2], z_values, z_first);
+        } else {
+            detail::fill_axis_values<Order>(pt[2], z_values);
+        }
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const std::size_t t = wedge_indices[node][0];
+            const std::size_t a = wedge_indices[node][1];
+            const Real tv = tri_values[t];
+            const Real av = z_values[a];
+            if (want_values) {
+                values_out[node * output_stride + q] = tv * av;
+            }
+            if (want_grad) {
+                const Gradient& tg = tri_gradients[t];
+                Real* g = gradients_out + node * 3u * output_stride + q;
+                g[0] = tg[0] * av;
+                g[output_stride] = tg[1] * av;
+                g[2u * output_stride] = tv * z_first[a];
+            }
+            if (want_hess) {
+                const Gradient& tg = tri_gradients[t];
+                const Hessian& tH = tri_hessians[t];
+                const Real ad = z_first[a];
+                const Real mixed_xy = tH(0, 1) * av;
+                const Real mixed_xz = tg[0] * ad;
+                const Real mixed_yz = tg[1] * ad;
+                Real* H = hessians_out + node * 9u * output_stride + q;
+                H[0] = tH(0, 0) * av;
+                H[1u * output_stride] = mixed_xy;
+                H[2u * output_stride] = mixed_xz;
+                H[3u * output_stride] = mixed_xy;
+                H[4u * output_stride] = tH(1, 1) * av;
+                H[5u * output_stride] = mixed_yz;
+                H[6u * output_stride] = mixed_xz;
+                H[7u * output_stride] = mixed_yz;
+                H[8u * output_stride] = tv * z_second[a];
+            }
+        }
+    }
+}
+
+// With t = Order * lambda: factors[0] = 1, factors[a] = factors[a-1] * (t - (a-1)) / a.
+template <int Order>
+inline void fill_triangle_simplex_product_factors(Real lambda, Real* SVMP_RESTRICT factors) {
+    const Real scaled = static_cast<Real>(Order) * lambda;
+    Real running = Real(1);
+    factors[0] = running;
+    for (int a = 1; a <= Order; ++a) {
+        running = running * (scaled - static_cast<Real>(a - 1)) / static_cast<Real>(a);
+        factors[a] = running;
+    }
+}
+
+// Values-only wedge kernel for exactly four points, built from per-coordinate
+// product factors. Returns false, writing nothing, unless points.size() == 4 and
+// simplex_exponents has (Order + 1)(Order + 2) / 2 entries.
+template <int Order>
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool evaluate_wedge_values_product_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    constexpr std::size_t tri_count =
+        static_cast<std::size_t>((Order + 1) * (Order + 2) / 2);
+    if (points.size() != 4u || simplex_exponents.size() != tri_count) {
+        return false;
+    }
+
+    Real tri_values[4][tri_count];
+    std::array<Real, Order + 1> z_values[4];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        // pt[0] and pt[1] are used directly; the third coordinate is the remainder.
+        const Real l0 = Real(1) - pt[0] - pt[1];
+        Real f0[Order + 1];
+        Real f1[Order + 1];
+        Real f2[Order + 1];
+        fill_triangle_simplex_product_factors<Order>(l0, f0);
+        fill_triangle_simplex_product_factors<Order>(pt[0], f1);
+        fill_triangle_simplex_product_factors<Order>(pt[1], f2);
+        detail::fill_axis_values<Order>(pt[2], z_values[p]);
+
+        for (std::size_t t = 0; t < tri_count; ++t) {
+            const auto& e = simplex_exponents[t];
+            const Real a0 = f0[static_cast<std::size_t>(e[0])];
+            const Real a1 = f1[static_cast<std::size_t>(e[1])];
+            const Real a2 = f2[static_cast<std::size_t>(e[2])];
+            tri_values[p][t] = a0 * a1 * a2;
+        }
+    }
+
+    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+        const std::size_t t = wedge_indices[node][0];
+        const std::size_t a = wedge_indices[node][1];
+        Real* SVMP_RESTRICT row = values_out + node * output_stride;
+        for (std::size_t p = 0; p < 4u; ++p) {
+            row[p] = tri_values[p][t] * z_values[p][a];
+        }
+    }
+    return true;
+}
+
+bool try_evaluate_wedge_values_product_q4(  // Maps the runtime order onto the templated kernel.
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    switch (order) {  // Only orders 4 through 8 are dispatched; each case returns the kernel's result.
+        case 4:
+            return evaluate_wedge_values_product_q4<4>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 5:
+            return evaluate_wedge_values_product_q4<5>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 6:
+            return evaluate_wedge_values_product_q4<6>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 7:
+            return evaluate_wedge_values_product_q4<7>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        case 8:
+            return evaluate_wedge_values_product_q4<8>(
+                simplex_exponents, wedge_indices, points, output_stride, values_out);
+        default:
+            return false;  // Any other order: no kernel is called and nothing is written.
+    }
+}
+
+// Order-1 wedge values at exactly four points (reads points[0..3]).
+// Each value is a linear triangle factor times a linear factor in the third coordinate.
+void evaluate_wedge_order1_values_q4(
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real tri[4][3];
+    Real axis[4][2];
+    for (std::size_t p = 0; p < 4u; ++p) {
+        const auto& pt = points[p];
+        tri[p][0] = Real(1) - pt[0] - pt[1];
+        tri[p][1] = pt[0];
+        tri[p][2] = pt[1];
+        axis[p][0] = (Real(1) - pt[2]) * Real(0.5);
+        axis[p][1] = (Real(1) + pt[2]) * Real(0.5);
+    }
+
+    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+        const std::size_t t = wedge_indices[node][0];
+        const std::size_t a = wedge_indices[node][1];
+        Real* out = values_out + node * output_stride;
+        for (std::size_t p = 0; p < 4u; ++p) {
+            out[p] = tri[p][t] * axis[p][a];
+        }
+    }
+}
+
+// Runtime-order dispatch for the wedge fast paths. Returns true after writing
+// the requested outputs (orders 1 and 2) and false, writing nothing, for any
+// other order.
+bool evaluate_wedge_fast_strided(
+    int order,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (order == 1) {
+        // Values-only requests at four points use the dedicated kernel.
+        const bool values_only =
+            values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr;
+        if (values_only && points.size() == 4u) {
+            evaluate_wedge_order1_values_q4(wedge_indices, points, output_stride, values_out);
+            return true;
+        }
+        evaluate_wedge_fast_outputs_strided<1>(
+            wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+
+    if (order == 2) {
+        evaluate_wedge_fast_outputs_strided<2>(
+            wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
+        return true;
+    }
+
+    // Order 3 and all remaining orders are not handled by this function.
+    return false;
+}
+
+// Single-point dispatch on the runtime order to the order-templated evaluator.
+// Orders 1 to 3 are forwarded and their result returned; any other order
+// returns false without calling an evaluator.
+bool evaluate_fixed_lagrange_fast(LagrangeTopology topology,
+                                  int order,
+                                  const math::Vector<Real, 3>& xi,
+                                  std::vector<Real>* values,
+                                  std::vector<Gradient>* gradients,
+                                  std::vector<Hessian>* hessians) {
+    if (order == 1) {
+        return evaluate_fixed_lagrange_fast_order<1>(topology, xi, values, gradients, hessians);
+    }
+    if (order == 2) {
+        return evaluate_fixed_lagrange_fast_order<2>(topology, xi, values, gradients, hessians);
+    }
+    if (order == 3) {
+        return evaluate_fixed_lagrange_fast_order<3>(topology, xi, values, gradients, hessians);
+    }
+    return false;
+}
+
+// Dispatches a multi-point ("strided") fixed-order Lagrange evaluation to a
+// specialized kernel chosen from the topology, the order, the number of points
+// and the set of non-null output pointers.
+//
+// A null output pointer means that quantity is not requested. Most kernels are
+// selected only for one of four request shapes: values only, gradients only,
+// Hessians only, or all three together.
+//
+// Returns true after one of the kernels named below has been invoked. Returns
+// false for the combinations that are explicitly rejected below and for any
+// order other than 1, 2 or 3. Every remaining combination is forwarded to
+// evaluate_fixed_lagrange_fast_strided_order<order>, whose result is returned
+// unchanged.
+//
+// The checks run top to bottom with early returns, so their relative order is
+// part of the behavior: a rejection placed before a kernel call wins over it.
+//
+// NOTE(review): the memory layout implied by `output_stride` is defined by the
+// individual kernels, which are not visible here — confirm before relying on it.
+// NOTE(review): presumably a false return makes the caller use a slower generic
+// path — verify against the call sites.
 bool evaluate_fixed_lagrange_fast_strided(LagrangeTopology topology,
                                           int order,
                                           const std::vector<math::Vector<Real, 3>>& points,
                                           std::size_t output_stride,
                                           Real* SVMP_RESTRICT values_out,
                                           Real* SVMP_RESTRICT gradients_out,
                                           Real* SVMP_RESTRICT hessians_out) {
+    // Line elements evaluated at exactly four points use the *_q4 kernels.
+    // Request shapes not matched here (for example values + gradients) fall
+    // through to the generic checks further down.
     if (topology == LagrangeTopology::Line &&
         points.size() == 4u) {
         const bool values_only =
             values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr;
         const bool gradients_only =
             values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr;
         const bool hessians_only =
             values_out == nullptr && gradients_out == nullptr && hessians_out != nullptr;
         const bool all_outputs =
             values_out != nullptr && gradients_out != nullptr && hessians_out != nullptr;
         if (values_only) {
             if (order == 1) {
                 evaluate_line_order1_values_q4(points, output_stride, values_out);
                 return true;
             }
             if (order == 2) {
                 evaluate_line_order2_values_q4(points, output_stride, values_out);
                 return true;
             }
             if (order == 3) {
                 evaluate_line_order3_values_q4(points, output_stride, values_out);
                 return true;
             }
         }
         if (order == 1) {
             if (gradients_only) {
+                // This kernel takes no points argument.
                 evaluate_line_order1_gradients_q4(output_stride, gradients_out);
                 return true;
             }
             if (hessians_only) {
                 evaluate_line_order1_hessians_q4(output_stride, hessians_out);
                 return true;
             }
             if (all_outputs) {
                 evaluate_line_order1_all_q4(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
         }
         if (order == 2) {
             if (gradients_only) {
                 evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
                 return true;
             }
             if (hessians_only) {
                 evaluate_line_order2_hessians_q4(output_stride, hessians_out);
                 return true;
             }
             if (all_outputs) {
                 evaluate_line_order2_all_q4(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
         }
         if (order == 3) {
             if (gradients_only) {
                 evaluate_line_order3_gradients_q4(points, output_stride, gradients_out);
                 return true;
             }
             if (hessians_only) {
                 evaluate_line_order3_hessians_q4(points, output_stride, hessians_out);
                 return true;
             }
             if (all_outputs) {
                 evaluate_line_order3_all_q4(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
         }
     }
 
+    // Up-front rejections: order-3 tetrahedra with any derivative request and
+    // order-3 triangles with a Hessian request are not served by this function.
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 3 &&
         (gradients_out != nullptr || hessians_out != nullptr)) {
         return false;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 3 &&
         hessians_out != nullptr) {
         return false;
     }
+    // Order-1 simplices (triangle, tetrahedron): values only, gradients only,
+    // Hessians only (written by evaluate_zero_hessians_strided), and all three.
     if (topology == LagrangeTopology::Triangle &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order1_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_tet_order1_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
+        // First argument 3u: presumably the triangle's order-1 node count — confirm.
         evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
+        // First argument 4u: presumably the tetrahedron's order-1 node count — confirm.
         evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_triangle_order1_values_strided(points, output_stride, values_out);
         evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
         evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_tet_order1_values_strided(points, output_stride, values_out);
         evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
         evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
         return true;
     }
+    // Order-2 triangles. The Hessian kernel is a *_q4 kernel, so the two
+    // Hessian-producing cases additionally require exactly four points.
     if (topology == LagrangeTopology::Triangle &&
         order == 2 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order2_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 2 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 2 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 2 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_triangle_order2_values_strided(points, output_stride, values_out);
         evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
         evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
         return true;
     }
+    // Order-2 tetrahedra, mirroring the order-2 triangle cases above.
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 2 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_tet_order2_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 2 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 2 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 2 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_tet_order2_values_strided(points, output_stride, values_out);
         evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
         evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
         return true;
     }
+    // Order-3 simplices: values only for both; gradients only for triangles.
+    // (Derivative requests on order-3 tetrahedra were rejected earlier.)
     if (topology == LagrangeTopology::Tetrahedron &&
         order == 3 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_tet_order3_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 3 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order3_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Triangle &&
         order == 3 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_triangle_order3_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
+    // Order-1 hexahedra. The derivative cases share one kernel whose template
+    // flags are passed in the order <values, gradients, hessians>, matching the
+    // non-null outputs of each case.
     if (topology == LagrangeTopology::Hexahedron &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order1_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order1_outputs_strided<false, true, false>(
             points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order1_outputs_strided<false, false, true>(
             points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order1_outputs_strided<true, true, true>(
             points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
+    // Quadrilaterals, orders 1 to 3. The single-output kernels accept any point
+    // count; the combined "all outputs" kernel is a *_q4 kernel and is only
+    // selected for exactly four points.
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 1 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order1_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order1_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 1 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order1_hessians_strided(points.size(), output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 1 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order1_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 2 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order2_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 2 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order2_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 2 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order2_hessians_strided(points, output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 2 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order2_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 3 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order3_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 3 &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_quad_order3_gradients_strided(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 3 &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order3_hessians_strided(points, output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Quadrilateral &&
         order == 3 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_quad_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
+    // Order-2 hexahedra: values for any point count; every derivative case
+    // uses *_q4 kernels and therefore requires exactly four points.
     if (topology == LagrangeTopology::Hexahedron &&
         order == 2 &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order2_values_strided(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 2 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 2 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 2 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order2_values_strided(points, output_stride, values_out);
         evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
         evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
         return true;
     }
+    // Order-3 hexahedra at four points. A Hessian request with
+    // output_stride == 4 is rejected before the q4 kernels below are
+    // considered, so those Hessian kernels only run for other strides.
+    // NOTE(review): the reason for excluding stride 4 is not visible here.
     if (topology == LagrangeTopology::Hexahedron &&
         order == 3 &&
         points.size() == 4u &&
         output_stride == 4u &&
         hessians_out != nullptr) {
         return false;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 3 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out == nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order3_values_q4(points, output_stride, values_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 3 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out != nullptr &&
         hessians_out == nullptr) {
         evaluate_hex_order3_gradients_q4(points, output_stride, gradients_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 3 &&
         points.size() == 4u &&
         values_out == nullptr &&
         gradients_out == nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order3_hessians_q4(points, output_stride, hessians_out);
         return true;
     }
     if (topology == LagrangeTopology::Hexahedron &&
         order == 3 &&
         points.size() == 4u &&
         values_out != nullptr &&
         gradients_out != nullptr &&
         hessians_out != nullptr) {
         evaluate_hex_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
         return true;
     }
+    // Any Quadrilateral/Hexahedron Hessian request of order > 1 that was not
+    // served by a kernel above is rejected rather than forwarded.
     if (hessians_out != nullptr && order > 1 &&
         (topology == LagrangeTopology::Quadrilateral ||
          topology == LagrangeTopology::Hexahedron)) {
         return false;
     }
+    // Remaining Hessian requests on order-1/order-2 simplices. Tetrahedra are
+    // accepted with any combination of the other outputs; triangles only when
+    // Hessians are the sole output.
     if (hessians_out != nullptr) {
         const bool hessians_only = values_out == nullptr && gradients_out == nullptr;
         if (order == 1) {
+            // NOTE(review): Triangle / order 1 / Hessians-only already returned
+            // earlier via evaluate_zero_hessians_strided, so this branch looks
+            // unreachable — confirm and consider removing it.
             if (topology == LagrangeTopology::Triangle && hessians_only) {
                 evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<1>>(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
             if (topology == LagrangeTopology::Tetrahedron) {
                 evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<1>>(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
         } else if (order == 2) {
             if (topology == LagrangeTopology::Triangle && hessians_only) {
                 evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<2>>(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
             if (topology == LagrangeTopology::Tetrahedron) {
                 evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<2>>(
                     points, output_stride, values_out, gradients_out, hessians_out);
                 return true;
             }
         }
     }
 
+    // Nothing specialized matched: forward to the order-templated dispatcher.
+    // Orders other than 1, 2 and 3 are unsupported and yield false.
     switch (order) {
         case 1:
             return evaluate_fixed_lagrange_fast_strided_order<1>(
                 topology, points, output_stride, values_out, gradients_out, hessians_out);
         case 2:
             return evaluate_fixed_lagrange_fast_strided_order<2>(
                 topology, points, output_stride, values_out, gradients_out, hessians_out);
         case 3:
             return evaluate_fixed_lagrange_fast_strided_order<3>(
                 topology, points, output_stride, values_out, gradients_out, hessians_out);
         default:
             return false;
     }
 }
+
+// Evaluates a fixed-order Lagrange basis at the single point `xi`, writing into
+// the raw output buffers, by forwarding to the
+// evaluate_fixed_lagrange_fast_to_order<N> instantiation that matches `order`.
+//
+// Only orders 1, 2 and 3 are forwarded; the forwarded call's result is returned
+// unchanged. Any other order returns false without touching the outputs.
+bool evaluate_fixed_lagrange_fast_to(LagrangeTopology topology,
+                                     int order,
+                                     const math::Vector<Real, 3>& xi,
+                                     Real* SVMP_RESTRICT values_out,
+                                     Real* SVMP_RESTRICT gradients_out,
+                                     Real* SVMP_RESTRICT hessians_out) {
+    if (order == 1) {
+        return evaluate_fixed_lagrange_fast_to_order<1>(
+            topology, xi, values_out, gradients_out, hessians_out);
+    }
+    if (order == 2) {
+        return evaluate_fixed_lagrange_fast_to_order<2>(
+            topology, xi, values_out, gradients_out, hessians_out);
+    }
+    if (order == 3) {
+        return evaluate_fixed_lagrange_fast_to_order<3>(
+            topology, xi, values_out, gradients_out, hessians_out);
+    }
+    // No fixed-order instantiation exists for this order.
+    return false;
+}
+
+// Flat coefficient storage for N one-dimensional polynomials and their first
+// and second derivatives.
+//   values: N * N entries
+//   first:  N * (N - 1) entries, empty when N <= 1
+//   second: N * (N - 2) entries, empty when N <= 2
+// All three arrays are value-initialized.
+template<std::size_t N>
+struct AxisMonomialCoefficientTable {
+    std::array<Real, N * N> values{};
+    std::array<Real, (N > 1) ? N * (N - 1) : 0> first{};
+    std::array<Real, (N > 2) ? N * (N - 2) : 0> second{};
+};
+
+// Builds the monomial coefficient table of the N Lagrange polynomials whose
+// nodes are detail::equispaced_pm_one_coord(i, N - 1) for i = 0..N-1.
+//
+// Row i of `values` holds c_i0..c_i(N-1) such that L_i(x) = sum_k c_ik * x^k.
+// Rows of `first` and `second` hold the coefficients obtained by
+// differentiating that polynomial term by term once and twice.
+// The function is constexpr so the table can be produced at compile time.
+template<std::size_t N>
+constexpr AxisMonomialCoefficientTable<N> make_axis_monomial_coefficient_table() {
+    constexpr int order = static_cast<int>(N) - 1;
+
+    std::array<Real, N> nodes{};
+    for (std::size_t n = 0; n < N; ++n) {
+        nodes[n] = detail::equispaced_pm_one_coord(static_cast<int>(n), order);
+    }
+
+    AxisMonomialCoefficientTable<N> table{};
+    for (std::size_t row = 0; row < N; ++row) {
+        // Expand prod_{j != row} (x - x_j) one linear factor at a time;
+        // poly[k] is the coefficient of x^k in the running product.
+        std::array<Real, N> poly{};
+        poly[0] = Real(1);
+        std::size_t poly_degree = 0;
+        for (std::size_t j = 0; j < N; ++j) {
+            if (j != row) {
+                std::array<Real, N> product{};
+                for (std::size_t k = 0; k <= poly_degree; ++k) {
+                    product[k] -= nodes[j] * poly[k];
+                    product[k + 1] += poly[k];
+                }
+                poly = product;
+                ++poly_degree;
+            }
+        }
+
+        // Normalization: prod_{j != row} (x_row - x_j), so that L_row(x_row) = 1.
+        Real scale = Real(1);
+        for (std::size_t j = 0; j < N; ++j) {
+            if (j == row) {
+                continue;
+            }
+            scale *= nodes[row] - nodes[j];
+        }
+        const Real inv_scale = Real(1) / scale;
+
+        for (std::size_t k = 0; k < N; ++k) {
+            const Real coefficient = poly[k] * inv_scale;
+            table.values[row * N + k] = coefficient;
+            if constexpr (N >= 2) {
+                // d/dx of c * x^k contributes k * c to the x^(k-1) slot.
+                if (k >= 1) {
+                    table.first[row * (N - 1) + (k - 1)] =
+                        static_cast<Real>(k) * coefficient;
+                }
+            }
+            if constexpr (N >= 3) {
+                // d2/dx2 of c * x^k contributes k * (k-1) * c to the x^(k-2) slot.
+                if (k >= 2) {
+                    table.second[row * (N - 2) + (k - 2)] =
+                        static_cast<Real>(k * (k - 1)) * coefficient;
+                }
+            }
+        }
+    }
+
+    return table;
+}
+
+// Transfers the three coefficient arrays of `table` into the caller's vectors
+// by calling assign_array once per array, in the order values, first, second.
+// NOTE(review): assign_array is defined elsewhere; presumably it replaces the
+// vector contents with a copy of the array — confirm.
 template<std::size_t N>
 void assign_axis_coefficient_table(const AxisMonomialCoefficientTable<N>& table,
                                    std::vector<Real>& values,
                                    std::vector<Real>& first,
                                    std::vector<Real>& second) {
     assign_array(values, table.values);
     assign_array(first, table.first);
     assign_array(second, table.second);
 }
+
+// Fills `values`, `first` and `second` from a compile-time coefficient table
+// for an axis with `n_axis` nodes.
+//
+// Tables exist for n_axis = 1..5. Returns true when one of them was assigned;
+// returns false, leaving the vectors untouched, for any other n_axis.
+bool assign_precomputed_axis_coefficients(int n_axis,
+                                          std::vector<Real>& values,
+                                          std::vector<Real>& first,
+                                          std::vector<Real>& second) {
+    static constexpr auto kAxisCoefficients1 = make_axis_monomial_coefficient_table<1>();
+    static constexpr auto kAxisCoefficients2 = make_axis_monomial_coefficient_table<2>();
+    static constexpr auto kAxisCoefficients3 = make_axis_monomial_coefficient_table<3>();
+    static constexpr auto kAxisCoefficients4 = make_axis_monomial_coefficient_table<4>();
+    static constexpr auto kAxisCoefficients5 = make_axis_monomial_coefficient_table<5>();
+
+    if (n_axis == 1) {
+        assign_axis_coefficient_table(kAxisCoefficients1, values, first, second);
+        return true;
+    }
+    if (n_axis == 2) {
+        assign_axis_coefficient_table(kAxisCoefficients2, values, first, second);
+        return true;
+    }
+    if (n_axis == 3) {
+        assign_axis_coefficient_table(kAxisCoefficients3, values, first, second);
+        return true;
+    }
+    if (n_axis == 4) {
+        assign_axis_coefficient_table(kAxisCoefficients4, values, first, second);
+        return true;
+    }
+    if (n_axis == 5) {
+        assign_axis_coefficient_table(kAxisCoefficients5, values, first, second);
+        return true;
+    }
+    // No precomputed table for this node count.
+    return false;
+}
+
+// Maps an element type to its Lagrange topology paired with
+// reference_dimension(type).
+// Throws BasisElementCompatibilityException when topology(type) reports
+// LagrangeTopology::Unknown.
+LagrangeTopologyTraits lagrange_topology_traits(ElementType type) {
+    const auto resolved = topology(type);
+    if (resolved == LagrangeTopology::Unknown) {
+        throw BasisElementCompatibilityException("Unsupported element type for LagrangeBasis",
+                                                 __FILE__, __LINE__, __func__);
+    }
+
+    return {resolved, reference_dimension(type)};
+}
+
+// Converts a coordinate on [-1, 1] into the index of the matching node of an
+// equispaced lattice with `order` intervals.
+//
+// For order <= 0 the only accepted coordinate is 0 and the index is 0.
+// Otherwise the coordinate is mapped to the nearest lattice index, which must
+// lie in [0, order] and must reproduce `coord` according to
+// coordinate_matches_expected.
+// Throws BasisNodeOrderingException carrying `context` when no node matches.
+std::size_t lattice_index_pm_one(Real coord, int order, const char* context) {
+    if (order > 0) {
+        // Affine map [-1, 1] -> [0, order], then round to the nearest integer.
+        const Real position = (coord + Real(1)) * static_cast<Real>(order) / Real(2);
+        const long nearest = std::lround(position);
+        const bool in_range = nearest >= 0 && nearest <= order;
+        // The node lookup is only attempted for an in-range index.
+        if (in_range &&
+            coordinate_matches_expected(
+                coord,
+                detail::equispaced_pm_one_coord(static_cast<int>(nearest), order))) {
+            return static_cast<std::size_t>(nearest);
+        }
+        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+    }
+
+    if (coordinate_matches_expected(coord, Real(0))) {
+        return 0;
+    }
+    throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+}
+
+// Converts a simplex reference coordinate into its lattice index k, where the
+// node coordinate is expected to equal k / order with k in [0, order].
+//
+// For order <= 0 the coordinates 0, 0.25 and 1/3 are accepted and the index
+// is 0. NOTE(review): 1/3 and 0.25 look like triangle/tetrahedron centroid
+// coordinates — confirm.
+// Throws BasisNodeOrderingException carrying `context` when the coordinate is
+// not on the lattice.
+int simplex_lattice_index(Real coord, int order, const char* context) {
+    if (order > 0) {
+        const Real position = coord * static_cast<Real>(order);
+        const long nearest = std::lround(position);
+        // Coordinate the rounded index would stand for; it must match `coord`.
+        const Real expected = static_cast<Real>(nearest) / static_cast<Real>(order);
+        const bool in_range = nearest >= 0 && nearest <= order;
+        if (in_range && coordinate_matches_expected(coord, expected)) {
+            return static_cast<int>(nearest);
+        }
+        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+    }
+
+    if (coordinate_matches_expected(coord, Real(0)) ||
+        coordinate_matches_expected(coord, Real(0.25)) ||
+        coordinate_matches_expected(coord, Real(1) / Real(3))) {
+        return 0;
+    }
+    throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
+}
+
+// Recovers the integer lattice exponents {i, j, k, 0} of a triangle node from
+// its reference coordinates, where j and k are the lattice indices of node[0]
+// and node[1] and i = order - j - k.
+//
+// Order 0 always yields {0, 0, 0, 0}. Throws BasisNodeOrderingException when a
+// coordinate is off the lattice or when i would be negative.
+std::array<int, 4> triangle_exponents_from_public_node(const math::Vector<Real, 3>& node,
+                                                       int order) {
+    if (order == 0) {
+        return {0, 0, 0, 0};
+    }
+
+    const char* const bad_coordinate =
+        "LagrangeBasis: invalid triangle node coordinate for public ordering";
+    const int index_x = simplex_lattice_index(node[0], order, bad_coordinate);
+    const int index_y = simplex_lattice_index(node[1], order, bad_coordinate);
+
+    // Whatever is left of `order` belongs to the remaining barycentric direction.
+    const int remainder = order - index_x - index_y;
+    if (remainder >= 0) {
+        return {remainder, index_x, index_y, 0};
+    }
+    throw BasisNodeOrderingException("LagrangeBasis: invalid triangle barycentric coordinates for public ordering",
+                                     __FILE__, __LINE__, __func__);
+}
+
+// Recovers the integer lattice exponents {i, j, k, l} of a tetrahedron node
+// from its reference coordinates, where j, k and l are the lattice indices of
+// node[0], node[1] and node[2] and i = order - j - k - l.
+//
+// Order 0 always yields {0, 0, 0, 0}. Throws BasisNodeOrderingException when a
+// coordinate is off the lattice or when i would be negative.
+std::array<int, 4> tetrahedron_exponents_from_public_node(const math::Vector<Real, 3>& node,
+                                                          int order) {
+    if (order == 0) {
+        return {0, 0, 0, 0};
+    }
+
+    const int index_x = simplex_lattice_index(
+        node[0], order,
+        "LagrangeBasis: invalid tetrahedron node x-coordinate for public ordering");
+    const int index_y = simplex_lattice_index(
+        node[1], order,
+        "LagrangeBasis: invalid tetrahedron node y-coordinate for public ordering");
+    const int index_z = simplex_lattice_index(
+        node[2], order,
+        "LagrangeBasis: invalid tetrahedron node z-coordinate for public ordering");
+
+    // Whatever is left of `order` belongs to the remaining barycentric direction.
+    const int remainder = order - index_x - index_y - index_z;
+    if (remainder >= 0) {
+        return {remainder, index_x, index_y, index_z};
+    }
+    throw BasisNodeOrderingException("LagrangeBasis: invalid tetrahedron barycentric coordinates for public ordering",
+                                     __FILE__, __LINE__, __func__);
+}
+
+// Plain aggregate pairing an element type with a polynomial order.
+// NOTE(review): presumably the result of normalizing a user-facing
+// (element type, order) request before basis construction — confirm at the
+// point of use. Members have no default initializers.
 struct NormalizedLagrangeRequest {
+    // Element type of the request.
     ElementType element_type;
+    // Polynomial order of the request.
     int order;
 };
+
 // Non-owning view of the per-axis 1D Lagrange basis evaluations
 // (values, first derivative, second derivative), each of length `size`.
+// The struct stores raw pointers only, so the pointed-to storage must outlive
+// every use of the view.
 struct AxisBasisEvaluations {
+    // Basis function values.
     const Real* values;
+    // First derivatives of the basis functions.
     const Real* first;
+    // Second derivatives of the basis functions.
     const Real* second;
+    // Number of entries readable through each of the three pointers.
     std::size_t size;
 };
+
+// Returns the axis view of a single constant basis function: one value equal
+// to 1 whose first and second derivatives are both 0 (they share the same
+// zero storage). The view points at function-local statics, so it stays valid
+// for the lifetime of the program.
+AxisBasisEvaluations constant_axis_basis() {
+    static const Real kUnit = Real(1);
+    static const Real kNull = Real(0);
+
+    AxisBasisEvaluations view{&kUnit, &kNull, &kNull, 1};
+    return view;
+}
+
+// Evaluates N one-dimensional basis polynomials, and optionally their first
+// and second derivatives, at `xi` from monomial coefficients using Horner's
+// scheme (multiply-add only; no divisions).
+//
+// N is a template parameter so that every loop bound is a compile-time
+// constant.
+//
+// Coefficient layout (row i belongs to basis function i, lowest power first):
+//   v_coeffs:  N rows of N entries,     L_i(x)   = sum_k c_ik * x^k
+//   d_coeffs:  N rows of N - 1 entries, coefficients of L_i'(x)
+//   d2_coeffs: N rows of N - 2 entries, coefficients of L_i''(x);
+//              only read when N >= 3
+//
+// Outputs: `values` is always written (N entries). `first` and `second` are
+// optional; a null pointer skips that output. For N == 1 both derivatives are
+// written as 0, and for N == 2 the second derivative is written as 0.
+template<int N>
+inline void evaluate_1d_horner_impl(const Real* v_coeffs,
+                                    const Real* d_coeffs,
+                                    const Real* d2_coeffs,
+                                    Real xi,
+                                    Real* values, Real* first, Real* second) {
+    if constexpr (N == 1) {
+        // A single basis function is the constant stored in v_coeffs[0].
+        values[0] = v_coeffs[0];
+        if (first != nullptr) {
+            first[0] = Real(0);
+        }
+        if (second != nullptr) {
+            second[0] = Real(0);
+        }
+    } else {
+        // Horner evaluation of one coefficient row holding `length` entries,
+        // starting from the highest power.
+        const auto horner = [xi](const Real* row, int length) {
+            Real acc = row[length - 1];
+            for (int k = length - 2; k >= 0; --k) {
+                acc = acc * xi + row[k];
+            }
+            return acc;
+        };
+
+        // Values: polynomials of degree N - 1.
+        for (int i = 0; i < N; ++i) {
+            values[i] = horner(v_coeffs + i * N, N);
+        }
+
+        // First derivatives: polynomials of degree N - 2.
+        if (first != nullptr) {
+            for (int i = 0; i < N; ++i) {
+                first[i] = horner(d_coeffs + i * (N - 1), N - 1);
+            }
+        }
+
+        if (second != nullptr) {
+            if constexpr (N <= 2) {
+                // Linear basis functions have a vanishing second derivative.
+                for (int i = 0; i < N; ++i) {
+                    second[i] = Real(0);
+                }
+            } else {
+                // Second derivatives: polynomials of degree N - 3.
+                for (int i = 0; i < N; ++i) {
+                    second[i] = horner(d2_coeffs + i * (N - 2), N - 2);
+                }
+            }
+        }
+    }
+}
+
+// Writes `n_axis` barycentric interpolation weights for equispaced nodes into
+// `weights`.
+//
+// With order = n_axis - 1, entry i is proportional to
+// (-1)^(order - i) * binomial(order, i), built by a running product, and the
+// whole set is then scaled so the largest magnitude equals 1.
+// Nothing is written when n_axis <= 0.
+void fill_equispaced_barycentric_weights(int n_axis, Real* weights) {
+    const int order = n_axis - 1;
+    const bool even_order = (order % 2 == 0);
+
+    Real current = even_order ? Real(1) : Real(-1);
+    Real largest = Real(0);
+    for (int i = 0; i < n_axis; ++i) {
+        weights[i] = current;
+        largest = std::max(largest, std::abs(current));
+        if (i < order) {
+            // Recurrence from binomial(order, i) to binomial(order, i + 1),
+            // with the sign flipped at every step.
+            current *= -static_cast<Real>(order - i) / static_cast<Real>(i + 1);
+        }
+    }
+
+    // No positive magnitude was recorded (e.g. empty input): nothing to scale.
+    if (!(largest > Real(0))) {
+        return;
+    }
+
+    const Real inv_scale = Real(1) / largest;
+    for (Real* w = weights; w != weights + n_axis; ++w) {
+        *w *= inv_scale;
+    }
+}
+
+// Reports whether the evaluation coordinate `xi` coincides with the axis node
+// `node`. Delegates entirely to coordinate_matches_expected, so the matching
+// rule is whatever that helper implements.
 bool coordinate_matches_axis_node(Real xi, Real node) {
     return coordinate_matches_expected(xi, node);
 }
+
+struct CompensatedSum {
+    Real sum{Real(0)};
+    Real compensation{Real(0)};
+
+    void add(Real value) noexcept {
+        const Real y = value - compensation;
+        const Real t = sum + y;
+        compensation = (t - sum) - y;
+        sum = t;
+    }
+};
+
+// Adds `residual` to the first `n_axis` entries of `values`, splitting it in
+// proportion to each entry's magnitude.
+//
+// Does nothing when `values` is null, n_axis <= 0 or residual is exactly 0.
+// When the magnitudes sum to zero (or less), the whole residual goes to
+// values[0]. Otherwise, after the proportional pass, the difference between
+// `residual` and the compensated sum of the applied corrections is added to the
+// entry that had the largest magnitude, so rounding leftovers are not dropped.
+void distribute_residual_by_abs(int n_axis, Real* values, Real residual) {
+    const bool nothing_to_do =
+        values == nullptr || n_axis <= 0 || residual == Real(0);
+    if (nothing_to_do) {
+        return;
+    }
+
+    // Pass 1: total magnitude and location of the largest entry
+    // (the first one wins on ties).
+    CompensatedSum total_magnitude;
+    Real peak = Real(0);
+    int peak_index = 0;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real entry_abs = std::abs(values[i]);
+        total_magnitude.add(entry_abs);
+        if (entry_abs > peak) {
+            peak = entry_abs;
+            peak_index = i;
+        }
+    }
+
+    if (total_magnitude.sum <= Real(0)) {
+        values[0] += residual;
+        return;
+    }
+
+    // Pass 2: proportional shares, tracking how much was actually applied.
+    const Real inv_total = Real(1) / total_magnitude.sum;
+    CompensatedSum distributed;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real share = residual * std::abs(values[i]) * inv_total;
+        values[i] += share;
+        distributed.add(share);
+    }
+
+    // Hand any rounding leftover to the entry that was largest before pass 2.
+    values[peak_index] += residual - distributed.sum;
+}
+
+void evaluate_1d_barycentric_runtime(int n_axis,  // number of nodes / basis functions on the axis
+                                     Real xi,  // evaluation coordinate
+                                     const Real* weights,  // barycentric weights, indexed by node
+                                     Real* values,  // out: n_axis basis values (written in both branches)
+                                     Real* first,  // out, may be null: n_axis first derivatives
+                                     Real* second) {  // out, may be null: n_axis second derivatives
+    const int order = n_axis - 1;
+    BASIS_CHECK_EVAL(weights != nullptr,
+                     "LagrangeBasis: missing cached barycentric weights for runtime axis evaluation");
+
+    int node_index = -1;  // index of the first node that xi matches, or -1 if none
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        if (coordinate_matches_axis_node(xi, node)) {
+            node_index = i;
+            break;
+        }
+    }
+
+    if (node_index >= 0) {  // on-node branch: nodal formulas, never divides by (xi - node)
+        std::fill(values, values + n_axis, Real(0));
+        values[node_index] = Real(1);  // basis k is 1 at its own node, all others are 0
+        if (!first && !second) {
+            return;
+        }
+
+        const Real xk = detail::equispaced_pm_one_coord(node_index, order);
+        const Real wk = weights[static_cast<std::size_t>(node_index)];
+        Real reciprocal_sum = Real(0);  // sum over m != k of 1 / (xk - xm); used only for second derivatives
+        if (second) {
+            for (int m = 0; m < n_axis; ++m) {
+                if (m == node_index) {
+                    continue;
+                }
+                const Real xm = detail::equispaced_pm_one_coord(m, order);
+                reciprocal_sum += Real(1) / (xk - xm);
+            }
+        }
+
+        Real first_diagonal = Real(0);  // accumulates minus the sum of off-diagonal first derivatives
+        Real second_diagonal = Real(0);  // accumulates minus the sum of off-diagonal second derivatives
+        if (first) {
+            std::fill(first, first + n_axis, Real(0));
+        }
+        if (second) {
+            std::fill(second, second + n_axis, Real(0));
+        }
+
+        for (int j = 0; j < n_axis; ++j) {
+            if (j == node_index) {
+                continue;
+            }
+            const Real xj = detail::equispaced_pm_one_coord(j, order);
+            const Real distance = xk - xj;
+            const Real offdiag_first = weights[static_cast<std::size_t>(j)] / (wk * distance);  // (w_j / w_k) / (x_k - x_j)
+            first_diagonal -= offdiag_first;
+            if (first) {
+                first[j] = offdiag_first;
+            }
+            if (second) {
+                const Real offdiag_second =
+                    Real(2) * offdiag_first * (reciprocal_sum - Real(1) / distance);
+                second[j] = offdiag_second;
+                second_diagonal -= offdiag_second;
+            }
+        }
+        if (first) {
+            first[node_index] = first_diagonal;  // chosen so the first derivatives sum to zero
+        }
+        if (second) {
+            second[node_index] = second_diagonal;  // chosen so the second derivatives sum to zero
+        }
+        return;
+    }
+
+    Real sum0 = Real(0);  // sum of w_i / (xi - x_i)
+    Real sum1 = Real(0);  // sum of w_i / (xi - x_i)^2
+    Real sum2 = Real(0);  // sum of w_i / (xi - x_i)^3
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        const Real inv_distance = Real(1) / (xi - node);
+        const Real weighted = weights[static_cast<std::size_t>(i)] * inv_distance;
+        sum0 += weighted;
+        sum1 += weighted * inv_distance;
+        sum2 += weighted * inv_distance * inv_distance;
+    }
+
+    const Real inv_sum0 = Real(1) / sum0;
+    const Real first_ratio = sum1 * inv_sum0;
+    const Real second_ratio = sum2 * inv_sum0;
+    const Real first_ratio_sq = first_ratio * first_ratio;
+
+    CompensatedSum value_sum;  // the three sums feed the residual corrections after the loop
+    CompensatedSum first_sum;
+    CompensatedSum second_sum;
+    for (int i = 0; i < n_axis; ++i) {
+        const Real node = detail::equispaced_pm_one_coord(i, order);
+        const Real inv_distance = Real(1) / (xi - node);
+        const Real value = weights[static_cast<std::size_t>(i)] * inv_distance * inv_sum0;  // barycentric basis value
+        values[i] = value;
+        value_sum.add(value);
+        if (first || second) {
+            const Real derivative_factor = first_ratio - inv_distance;  // first derivative divided by the value
+            if (first) {
+                first[i] = value * derivative_factor;
+                first_sum.add(first[i]);
+            }
+            if (second) {
+                second[i] = value * (derivative_factor * derivative_factor +
+                                     inv_distance * inv_distance -
+                                     Real(2) * second_ratio +
+                                     first_ratio_sq);
+                second_sum.add(second[i]);
+            }
+        }
+    }
+
+    distribute_residual_by_abs(n_axis, values, Real(1) - value_sum.sum);  // push the values toward summing to 1
+    if (first) {
+        distribute_residual_by_abs(n_axis, first, -first_sum.sum);  // push the first derivatives toward summing to 0
+    }
+    if (second) {
+        distribute_residual_by_abs(n_axis, second, -second_sum.sum);  // push the second derivatives toward summing to 0
+    }
+}
+
+// Evaluates the 1D Lagrange basis at xi, writing n_axis entries into every
+// non-null output buffer. Axis sizes 1..9 (orders 0..8) are routed to the
+// compile-time Horner kernels; every other size goes to the runtime
+// barycentric evaluator, which avoids the conditioning problems of high-order
+// monomial coefficients.
+void evaluate_1d_basis_to(const Real* v_coeffs,
+                          const Real* d_coeffs,
+                          const Real* d2_coeffs,
+                          const Real* barycentric_weights,
+                          int n_axis, Real xi,
+                          Real* values, Real* first, Real* second) {
+    switch (n_axis) {
+        case 1: evaluate_1d_horner_impl<1>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 2: evaluate_1d_horner_impl<2>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 3: evaluate_1d_horner_impl<3>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 4: evaluate_1d_horner_impl<4>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 5: evaluate_1d_horner_impl<5>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 6: evaluate_1d_horner_impl<6>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 7: evaluate_1d_horner_impl<7>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 8: evaluate_1d_horner_impl<8>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        case 9: evaluate_1d_horner_impl<9>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); break;
+        default:
+            evaluate_1d_barycentric_runtime(n_axis, xi, barycentric_weights, values, first, second);
+            break;
+    }
+}
+
+// Derivative level requested from the per-axis 1D evaluators; each level includes the ones listed before it.
+enum class AxisDeriv {
+    ValuesOnly,              // basis values only; first and second derivatives are skipped
+    ValuesAndFirst,          // values and first derivatives (gradient evaluation)
+    ValuesAndFirstAndSecond, // values, first and second derivatives (hessians or fused evaluate_all)
+};
+
+// Scratch buffers for one axis: basis values plus first and second
+// derivatives. The buffers only ever grow, so repeated use at the same or a
+// smaller size performs no further resizing.
+struct AxisScratch {
+    std::vector<Real> values;
+    std::vector<Real> first;
+    std::vector<Real> second;
+    // Grows all three buffers to at least n entries (resizes, despite the name).
+    void reserveFor(std::size_t n) {
+        for (std::vector<Real>* buffer : {&values, &first, &second}) {
+            if (buffer->size() < n) buffer->resize(n);
+        }
+    }
+};
+
+struct AxisBatchScratch {  // batched per-axis buffers; grow-only, derivative buffers sized per requested level
+    std::vector<Real> values;
+    std::vector<Real> first;
+    std::vector<Real> second;
+    void resizeFor(std::size_t count, AxisDeriv level) {
+        const auto grow = [count](std::vector<Real>& b) { if (b.size() < count) b.resize(count); };
+        grow(values);
+        if (level != AxisDeriv::ValuesOnly) grow(first);
+        if (level == AxisDeriv::ValuesAndFirstAndSecond) grow(second);
+    }
+};
+
+// Fills phi[a] for a = 0..Order, where phi[0] = 1 and
+//   phi[a] = phi[a-1] * (Order * lambda - (a - 1)) / a,
+// plus (when requested) dphi/dlambda and d2phi/dlambda2. The recurrence runs
+// in t = Order * lambda; derivatives are rescaled by dt/dlambda = Order, once
+// for dphi and twice for d2phi. Unrequested output pointers are never touched.
+template<int Order, bool NeedFirst, bool NeedSecond>
+inline void fill_simplex_factor_sequence_fixed(Real lambda,
+                                               Real* SVMP_RESTRICT phi,
+                                               Real* SVMP_RESTRICT dphi,
+                                               Real* SVMP_RESTRICT d2phi) {
+    static_assert(!NeedSecond || NeedFirst,
+                  "second derivative factors require first-derivative recurrence state");
+    const Real scale = static_cast<Real>(Order);
+    const Real t = static_cast<Real>(Order) * lambda;
+
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) dphi[0] = Real(0);
+    if constexpr (NeedSecond) d2phi[0] = Real(0);
+
+    Real d1_t = Real(0);  // d(phi[a-1])/dt carried into the next step
+    Real d2_t = Real(0);  // d2(phi[a-1])/dt2 carried into the next step
+    for (int a = 1; a <= Order; ++a) {
+        const Real prev_phi = phi[a - 1];
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
+        phi[a] = s * prev_phi;
+
+        if constexpr (NeedFirst) {
+            const Real d1_new = inv_a * prev_phi + s * d1_t;
+            dphi[a] = scale * d1_new;
+            if constexpr (NeedSecond) {
+                const Real d2_new = Real(2) * inv_a * d1_t + s * d2_t;
+                d2phi[a] = scale * scale * d2_new;
+                d2_t = d2_new;
+            }
+            d1_t = d1_new;
+        }
+    }
+}
+
+// For each of the four points, forms the barycentric coordinates
+// l1 = xi[0], l2 = xi[1], l0 = 1 - l1 - l2 and fills row q of the factor
+// tables. Values and first derivatives are always written; the d2phi tables
+// are written only when NeedSecond is true and are left untouched otherwise.
+template<int Order, bool NeedSecond>
+inline void fill_triangle_factors_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    Real (&phi0)[4][Order + 1],
+    Real (&phi1)[4][Order + 1],
+    Real (&phi2)[4][Order + 1],
+    Real (&dphi0)[4][Order + 1],
+    Real (&dphi1)[4][Order + 1],
+    Real (&dphi2)[4][Order + 1],
+    Real (&d2phi0)[4][Order + 1],
+    Real (&d2phi1)[4][Order + 1],
+    Real (&d2phi2)[4][Order + 1]) {
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const Real b1 = points[q][0];
+        const Real b2 = points[q][1];
+        const Real b0 = Real(1) - b1 - b2;
+
+        // A null second-derivative row is passed when it is not requested.
+        Real* const second0 = NeedSecond ? d2phi0[q] : nullptr;
+        Real* const second1 = NeedSecond ? d2phi1[q] : nullptr;
+        Real* const second2 = NeedSecond ? d2phi2[q] : nullptr;
+
+        fill_simplex_factor_sequence_fixed<Order, true, NeedSecond>(
+            b0, phi0[q], dphi0[q], second0);
+        fill_simplex_factor_sequence_fixed<Order, true, NeedSecond>(
+            b1, phi1[q], dphi1[q], second1);
+        fill_simplex_factor_sequence_fixed<Order, true, NeedSecond>(
+            b2, phi2[q], dphi2[q], second2);
+    }
+}
+
+// Gradient at batch slot Q, component c stored at g[c * output_stride + Q]:
+// c = 0, 1 -> tri_g component * axis value; c = 2 -> tri value * axis first derivative.
+template<std::size_t Q>
+inline void write_wedge_gradient_strided_q(std::size_t tri_stride, std::size_t axis_stride,
+                                           std::size_t tri, std::size_t z,
+                                           std::size_t output_stride,
+                                           const Real* SVMP_RESTRICT tri_values,
+                                           const Real* SVMP_RESTRICT tri_g,
+                                           const AxisBatchScratch& axis_batch,
+                                           Real* SVMP_RESTRICT g) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    g[Q] = tri_g[Q] * axis_value;
+    g[output_stride + Q] = tri_g[tri_stride + Q] * axis_value;
+    g[2u * output_stride + Q] = planar_value * axis_slope;
+}
+
+// Stride-4 variant: gradient component c of batch slot Q is stored at g[4 * c + Q].
+// c = 0, 1 -> tri_g component * axis value; c = 2 -> tri value * axis first derivative.
+template<std::size_t Q>
+inline void write_wedge_gradient_stride4_q(std::size_t tri_stride, std::size_t axis_stride,
+                                           std::size_t tri, std::size_t z,
+                                           const Real* SVMP_RESTRICT tri_values,
+                                           const Real* SVMP_RESTRICT tri_g,
+                                           const AxisBatchScratch& axis_batch,
+                                           Real* SVMP_RESTRICT g) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    g[Q] = tri_g[Q] * axis_value;
+    g[4u + Q] = tri_g[tri_stride + Q] * axis_value;
+    g[8u + Q] = planar_value * axis_slope;
+}
+
+// Hessian of one wedge node at batch slot Q: the nine entries of the symmetric
+// 3x3 are stored with entry e at H[e * output_stride + Q]. In-plane entries
+// are triangle hessian * axis value, mixed z entries are triangle gradient *
+// axis first derivative, and zz is triangle value * axis second derivative.
+template<std::size_t Q>
+inline void write_wedge_hessian_strided_q(std::size_t tri_stride, std::size_t axis_stride,
+                                          std::size_t tri, std::size_t z,
+                                          std::size_t output_stride,
+                                          const Real* SVMP_RESTRICT tri_values,
+                                          const Real* SVMP_RESTRICT tri_g,
+                                          const Real* SVMP_RESTRICT tri_H,
+                                          const AxisBatchScratch& axis_batch,
+                                          Real* SVMP_RESTRICT H) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real axis_curve = axis_batch.second[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    const Real grad_x = tri_g[Q];
+    const Real grad_y = tri_g[tri_stride + Q];
+    const Real mixed_xy = tri_H[tri_stride + Q] * axis_value;
+    const Real mixed_xz = grad_x * axis_slope;
+    const Real mixed_yz = grad_y * axis_slope;
+
+    // Entry order: [xx xy xz | xy yy yz | xz yz zz].
+    H[Q] = tri_H[Q] * axis_value;
+    H[1u * output_stride + Q] = mixed_xy;
+    H[2u * output_stride + Q] = mixed_xz;
+    H[3u * output_stride + Q] = mixed_xy;
+    H[4u * output_stride + Q] = tri_H[2u * tri_stride + Q] * axis_value;
+    H[5u * output_stride + Q] = mixed_yz;
+    H[6u * output_stride + Q] = mixed_xz;
+    H[7u * output_stride + Q] = mixed_yz;
+    H[8u * output_stride + Q] = planar_value * axis_curve;
+}
+
+// Stride-4 variant of the wedge hessian writer: entry e of the symmetric 3x3
+// for batch slot Q is stored at H[4 * e + Q]. In-plane entries are triangle
+// hessian * axis value, mixed z entries are triangle gradient * axis first
+// derivative, and zz is triangle value * axis second derivative.
+template<std::size_t Q>
+inline void write_wedge_hessian_stride4_q(std::size_t tri_stride, std::size_t axis_stride,
+                                          std::size_t tri, std::size_t z,
+                                          const Real* SVMP_RESTRICT tri_values,
+                                          const Real* SVMP_RESTRICT tri_g,
+                                          const Real* SVMP_RESTRICT tri_H,
+                                          const AxisBatchScratch& axis_batch,
+                                          Real* SVMP_RESTRICT H) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real axis_curve = axis_batch.second[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    const Real grad_x = tri_g[Q];
+    const Real grad_y = tri_g[tri_stride + Q];
+    const Real mixed_xy = tri_H[tri_stride + Q] * axis_value;
+    const Real mixed_xz = grad_x * axis_slope;
+    const Real mixed_yz = grad_y * axis_slope;
+
+    // Entry order: [xx xy xz | xy yy yz | xz yz zz].
+    H[Q] = tri_H[Q] * axis_value;
+    H[4u + Q] = mixed_xy;
+    H[8u + Q] = mixed_xz;
+    H[12u + Q] = mixed_xy;
+    H[16u + Q] = tri_H[2u * tri_stride + Q] * axis_value;
+    H[20u + Q] = mixed_yz;
+    H[24u + Q] = mixed_xz;
+    H[28u + Q] = mixed_yz;
+    H[32u + Q] = planar_value * axis_curve;
+}
+
+// Writes value, gradient and hessian of one wedge node at batch slot Q in a
+// single pass. The value goes to value_row[Q]; gradient component c goes to
+// g[c * output_stride + Q]; hessian entry e goes to H[e * output_stride + Q].
+// Each output is a product of a triangle factor and an axis factor.
+template<std::size_t Q>
+inline void write_wedge_all_strided_q(std::size_t tri_stride, std::size_t axis_stride,
+                                      std::size_t tri, std::size_t z,
+                                      std::size_t output_stride,
+                                      const Real* SVMP_RESTRICT tri_values,
+                                      const Real* SVMP_RESTRICT tri_g,
+                                      const Real* SVMP_RESTRICT tri_H,
+                                      const AxisBatchScratch& axis_batch,
+                                      Real* SVMP_RESTRICT value_row,
+                                      Real* SVMP_RESTRICT g,
+                                      Real* SVMP_RESTRICT H) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real axis_curve = axis_batch.second[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    const Real grad_x = tri_g[Q];
+    const Real grad_y = tri_g[tri_stride + Q];
+    const Real mixed_xy = tri_H[tri_stride + Q] * axis_value;
+    const Real mixed_xz = grad_x * axis_slope;
+    const Real mixed_yz = grad_y * axis_slope;
+
+    value_row[Q] = planar_value * axis_value;
+    g[Q] = grad_x * axis_value;
+    g[output_stride + Q] = grad_y * axis_value;
+    g[2u * output_stride + Q] = planar_value * axis_slope;
+    // Hessian entry order: [xx xy xz | xy yy yz | xz yz zz].
+    H[Q] = tri_H[Q] * axis_value;
+    H[1u * output_stride + Q] = mixed_xy;
+    H[2u * output_stride + Q] = mixed_xz;
+    H[3u * output_stride + Q] = mixed_xy;
+    H[4u * output_stride + Q] = tri_H[2u * tri_stride + Q] * axis_value;
+    H[5u * output_stride + Q] = mixed_yz;
+    H[6u * output_stride + Q] = mixed_xz;
+    H[7u * output_stride + Q] = mixed_yz;
+    H[8u * output_stride + Q] = planar_value * axis_curve;
+}
+
+// Stride-4 variant of the fused wedge writer: the value goes to value_row[Q],
+// gradient component c to g[4 * c + Q] and hessian entry e to H[4 * e + Q].
+// Each output is a product of a triangle factor (value, gradient or hessian)
+// and an axis factor (value, first or second derivative).
+template<std::size_t Q>
+inline void write_wedge_all_stride4_q(std::size_t tri_stride, std::size_t axis_stride,
+                                      std::size_t tri, std::size_t z,
+                                      const Real* SVMP_RESTRICT tri_values,
+                                      const Real* SVMP_RESTRICT tri_g,
+                                      const Real* SVMP_RESTRICT tri_H,
+                                      const AxisBatchScratch& axis_batch,
+                                      Real* SVMP_RESTRICT value_row,
+                                      Real* SVMP_RESTRICT g,
+                                      Real* SVMP_RESTRICT H) {
+    const std::size_t axis_index = Q * axis_stride + z;
+    const Real axis_value = axis_batch.values[axis_index];
+    const Real axis_slope = axis_batch.first[axis_index];
+    const Real axis_curve = axis_batch.second[axis_index];
+    const Real planar_value = tri_values[tri * tri_stride + Q];
+    const Real grad_x = tri_g[Q];
+    const Real grad_y = tri_g[tri_stride + Q];
+    const Real mixed_xy = tri_H[tri_stride + Q] * axis_value;
+    const Real mixed_xz = grad_x * axis_slope;
+    const Real mixed_yz = grad_y * axis_slope;
+
+    value_row[Q] = planar_value * axis_value;
+    g[Q] = grad_x * axis_value;
+    g[4u + Q] = grad_y * axis_value;
+    g[8u + Q] = planar_value * axis_slope;
+    // Hessian entry order: [xx xy xz | xy yy yz | xz yz zz].
+    H[Q] = tri_H[Q] * axis_value;
+    H[4u + Q] = mixed_xy;
+    H[8u + Q] = mixed_xz;
+    H[12u + Q] = mixed_xy;
+    H[16u + Q] = tri_H[2u * tri_stride + Q] * axis_value;
+    H[20u + Q] = mixed_yz;
+    H[24u + Q] = mixed_xz;
+    H[28u + Q] = mixed_yz;
+    H[32u + Q] = planar_value * axis_curve;
+}
+
+// Fused wedge evaluation for exactly four points with a per-node output
+// stride of 4: values use 4 slots per node, gradients 12 and hessians 36.
+// Null output pointers are skipped; hessians are written only when NeedHess.
+// Returns false (nothing written) unless points.size() == 4,
+// n_axis == Order + 1 and wedge_node_by_tri_z has one entry per (tri, z) pair.
+template<int Order, bool NeedHess>
+bool evaluate_wedge_fused_stride4_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    const std::vector<math::Vector<Real, 3>>& points,
+    const AxisBatchScratch& axis_batch,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 3 && Order <= 8, "fused wedge q4 path covers orders 3..8");
+    const std::size_t tri_count = simplex_exponents.size();
+    const std::size_t z_count = static_cast<std::size_t>(n_axis);
+    if (points.size() != 4u ||
+        z_count != static_cast<std::size_t>(Order + 1) ||
+        wedge_node_by_tri_z.size() != tri_count * z_count) {
+        return false;
+    }
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real d2phi0[4][Order + 1];
+    Real d2phi1[4][Order + 1];
+    Real d2phi2[4][Order + 1];
+    fill_triangle_factors_q4<Order, NeedHess>(
+        points, phi0, phi1, phi2, dphi0, dphi1, dphi2, d2phi0, d2phi1, d2phi2);
+
+    for (std::size_t tri = 0; tri < tri_count; ++tri) {
+        const auto& e = simplex_exponents[tri];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+
+        Real tri_v[4], tri_gx[4], tri_gy[4];
+        Real tri_hxx[4], tri_hxy[4], tri_hyy[4];
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real dl0 = D0 * v1 * v2;
+            tri_v[q] = v0 * v1 * v2;
+            tri_gx[q] = v0 * D1 * v2 - dl0;
+            tri_gy[q] = v0 * v1 * D2 - dl0;
+
+            if constexpr (NeedHess) {
+                const Real DD0 = d2phi0[q][i0];
+                const Real DD1 = d2phi1[q][i1];
+                const Real DD2 = d2phi2[q][i2];
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+                tri_hxx[q] = H00 - Real(2) * H01 + H11;
+                tri_hxy[q] = H00 - H01 - H02 + H12;
+                tri_hyy[q] = H00 - Real(2) * H02 + H22;
+            }
+        }
+
+        for (std::size_t z = 0; z < z_count; ++z) {
+            const std::size_t node = wedge_node_by_tri_z[tri * z_count + z];
+            Real* SVMP_RESTRICT value_row = values_out != nullptr ? values_out + node * 4u : nullptr;
+            Real* SVMP_RESTRICT g = gradients_out != nullptr ? gradients_out + node * 12u : nullptr;
+            Real* SVMP_RESTRICT H = hessians_out != nullptr ? hessians_out + node * 36u : nullptr;
+
+            const Real z0 = axis_batch.values[z];
+            const Real z1 = axis_batch.values[z_count + z];
+            const Real z2 = axis_batch.values[2u * z_count + z];
+            const Real z3 = axis_batch.values[3u * z_count + z];
+            // Read axis first derivatives only when an output consumes them: `first` may be unsized for values-only use.
+            const bool need_dz = g != nullptr || (NeedHess && H != nullptr);
+            const Real dz0 = need_dz ? axis_batch.first[z] : Real(0);
+            const Real dz1 = need_dz ? axis_batch.first[z_count + z] : Real(0);
+            const Real dz2 = need_dz ? axis_batch.first[2u * z_count + z] : Real(0);
+            const Real dz3 = need_dz ? axis_batch.first[3u * z_count + z] : Real(0);
+
+            if (value_row != nullptr) {
+                value_row[0] = tri_v[0] * z0;
+                value_row[1] = tri_v[1] * z1;
+                value_row[2] = tri_v[2] * z2;
+                value_row[3] = tri_v[3] * z3;
+            }
+            if (g != nullptr) {
+                g[0] = tri_gx[0] * z0;
+                g[1] = tri_gx[1] * z1;
+                g[2] = tri_gx[2] * z2;
+                g[3] = tri_gx[3] * z3;
+                g[4] = tri_gy[0] * z0;
+                g[5] = tri_gy[1] * z1;
+                g[6] = tri_gy[2] * z2;
+                g[7] = tri_gy[3] * z3;
+                g[8] = tri_v[0] * dz0;
+                g[9] = tri_v[1] * dz1;
+                g[10] = tri_v[2] * dz2;
+                g[11] = tri_v[3] * dz3;
+            }
+            if constexpr (NeedHess) {
+                if (H != nullptr) {
+                    const Real d2z0 = axis_batch.second[z];
+                    const Real d2z1 = axis_batch.second[z_count + z];
+                    const Real d2z2 = axis_batch.second[2u * z_count + z];
+                    const Real d2z3 = axis_batch.second[3u * z_count + z];
+                    const Real hxz0 = tri_gx[0] * dz0;
+                    const Real hxz1 = tri_gx[1] * dz1;
+                    const Real hxz2 = tri_gx[2] * dz2;
+                    const Real hxz3 = tri_gx[3] * dz3;
+                    const Real hyz0 = tri_gy[0] * dz0;
+                    const Real hyz1 = tri_gy[1] * dz1;
+                    const Real hyz2 = tri_gy[2] * dz2;
+                    const Real hyz3 = tri_gy[3] * dz3;
+                    H[0] = tri_hxx[0] * z0;
+                    H[1] = tri_hxx[1] * z1;
+                    H[2] = tri_hxx[2] * z2;
+                    H[3] = tri_hxx[3] * z3;
+                    H[4] = tri_hxy[0] * z0;
+                    H[5] = tri_hxy[1] * z1;
+                    H[6] = tri_hxy[2] * z2;
+                    H[7] = tri_hxy[3] * z3;
+                    H[8] = hxz0;
+                    H[9] = hxz1;
+                    H[10] = hxz2;
+                    H[11] = hxz3;
+                    H[12] = H[4];
+                    H[13] = H[5];
+                    H[14] = H[6];
+                    H[15] = H[7];
+                    H[16] = tri_hyy[0] * z0;
+                    H[17] = tri_hyy[1] * z1;
+                    H[18] = tri_hyy[2] * z2;
+                    H[19] = tri_hyy[3] * z3;
+                    H[20] = hyz0;
+                    H[21] = hyz1;
+                    H[22] = hyz2;
+                    H[23] = hyz3;
+                    H[24] = hxz0;
+                    H[25] = hxz1;
+                    H[26] = hxz2;
+                    H[27] = hxz3;
+                    H[28] = hyz0;
+                    H[29] = hyz1;
+                    H[30] = hyz2;
+                    H[31] = hyz3;
+                    H[32] = tri_v[0] * d2z0;
+                    H[33] = tri_v[1] * d2z1;
+                    H[34] = tri_v[2] * d2z2;
+                    H[35] = tri_v[3] * d2z3;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+// Runtime-to-compile-time dispatch: forwards to the fused kernel instantiated
+// for `order` when it lies in 3..8 and returns that kernel's result. Any other
+// order returns false without evaluating anything.
+template<bool NeedHess>
+bool try_evaluate_wedge_fused_stride4_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    const AxisBatchScratch& axis_batch,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (order == 3)
+        return evaluate_wedge_fused_stride4_q4<3, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    if (order == 4)
+        return evaluate_wedge_fused_stride4_q4<4, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    if (order == 5)
+        return evaluate_wedge_fused_stride4_q4<5, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    if (order == 6)
+        return evaluate_wedge_fused_stride4_q4<6, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    if (order == 7)
+        return evaluate_wedge_fused_stride4_q4<7, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    if (order == 8)
+        return evaluate_wedge_fused_stride4_q4<8, NeedHess>(
+            simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
+            values_out, gradients_out, hessians_out);
+    return false;
+}
+
+// Six scratch tables that are always grown together to a common minimum size.
+// NOTE(review): names suggest value/derivative products of two axes — confirm.
+struct TensorProductTableScratch {
+    std::vector<Real> vv;
+    std::vector<Real> dv;
+    std::vector<Real> vd;
+    std::vector<Real> d2v;
+    std::vector<Real> vd2;
+    std::vector<Real> dd;
+
+    // Grows every table to at least `count` entries; never shrinks.
+    void resizeFor(std::size_t count) {
+        for (std::vector<Real>* table : {&vv, &dv, &vd, &d2v, &vd2, &dd}) {
+            if (table->size() < count) table->resize(count);
+        }
+    }
+};
+
+// Scratch buffers supplied to tensor-product evaluation: three independent
+// axes (single-point and batched) plus reusable simplex/wedge intermediates.
+struct LagrangeEvaluateScratch {
+    AxisScratch axis_x;
+    AxisScratch axis_y;
+    AxisScratch axis_z;
+    AxisBatchScratch axis_x_batch;
+    AxisBatchScratch axis_y_batch;
+    AxisBatchScratch axis_z_batch;
+    TensorProductTableScratch tensor_tables;
+
+    std::vector<Real> tri_values;
+    std::vector<Gradient> tri_gradients;
+    std::vector<Hessian> tri_hessians;
+    std::vector<Real> tri_gradient_components;
+    std::vector<Real> tri_hessian_components;
+    std::vector<Real> wedge_tri_values_batch;
+    std::vector<Real> wedge_tri_gradient_batch;
+    std::vector<Real> wedge_tri_hessian_batch;
+
+    std::vector<Real> strided_values_tmp;
+    std::vector<Real> strided_gradients_tmp;
+    std::vector<Real> strided_hessians_tmp;
+
+    // Pre-sizes the axis/table buffers and reserves capacity in the rest for
+    // orders up to max_order (negative treated as 0) and max_qpts points.
+    void prewarm(int max_order, std::size_t max_qpts) {
+        const std::size_t nodes = static_cast<std::size_t>(std::max(max_order, 0)) + 1u;
+        const std::size_t batch_entries = nodes * max_qpts;
+        const std::size_t table_entries = nodes * nodes * std::max<std::size_t>(max_qpts, 1u);
+        const std::size_t dof_entries = table_entries * nodes;
+        const std::size_t tri_nodes = nodes * (nodes + 1u) / 2u;
+
+        for (AxisScratch* axis : {&axis_x, &axis_y, &axis_z}) {
+            axis->reserveFor(nodes);
+        }
+        for (AxisBatchScratch* batch : {&axis_x_batch, &axis_y_batch, &axis_z_batch}) {
+            batch->resizeFor(batch_entries, AxisDeriv::ValuesAndFirstAndSecond);
+        }
+        tensor_tables.resizeFor(table_entries);
+        tri_values.reserve(tri_nodes);
+        tri_gradients.reserve(tri_nodes);
+        tri_hessians.reserve(tri_nodes);
+        tri_gradient_components.reserve(tri_nodes * 3u);
+        tri_hessian_components.reserve(tri_nodes * 9u);
+        wedge_tri_values_batch.reserve(tri_nodes * max_qpts);
+        wedge_tri_gradient_batch.reserve(tri_nodes * 3u * max_qpts);
+        wedge_tri_hessian_batch.reserve(tri_nodes * 9u * max_qpts);
+        strided_values_tmp.reserve(dof_entries);
+        strided_gradients_tmp.reserve(dof_entries * 3u);
+        strided_hessians_tmp.reserve(dof_entries * 9u);
+    }
+};
+
+LagrangeEvaluateScratch& evaluate_scratch() {  // Returns the calling thread's own scratch instance.
+    // Scratch is intentionally thread-local: assembly and benchmark callers run
+    // evaluation on persistent worker threads, so capacity is reused by thread.
+    static thread_local LagrangeEvaluateScratch s;  // one instance per thread; the returned reference must not be shared across threads
+    return s;
+}
+
+// Fill axis scratch and return a non-owning view. Uncomputed slots still have
+// valid pointers to scratch storage (they may hold stale data) — callers must
+// only read the slots they requested via `level`. Common low orders use
+// precomputed Horner coefficients; high orders use barycentric axis evaluation.
+AxisBasisEvaluations fill_axis_scratch(AxisScratch& s,  // s owns the storage that the returned view points into
+                                       const Real* v_coeffs,
+                                       const Real* d_coeffs,
+                                       const Real* d2_coeffs,
+                                       const Real* barycentric_weights,
+                                       int n_axis, Real xi,  // n_axis entries are evaluated at coordinate xi
+                                       AxisDeriv level) {  // selects which of values / first / second get computed
+    const std::size_t n = static_cast<std::size_t>(n_axis);
+    s.reserveFor(n);  // presumably guarantees room for n entries in values/first/second — confirm in AxisScratch
+    Real* first  = (level == AxisDeriv::ValuesOnly) ? nullptr : s.first.data();  // nullptr when first derivatives are not requested
+    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? s.second.data() : nullptr;  // non-null only at the highest level
+    evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights,
+                         n_axis, xi, s.values.data(), first, second);
+    return AxisBasisEvaluations{s.values.data(), s.first.data(), s.second.data(), n};  // all three pointers are exposed regardless of level
+}
+
+void fill_axis_batch(AxisBatchScratch& scratch,  // Evaluates the 1D basis at one coordinate of every point, one row of n_axis entries per point.
+                     const std::vector<math::Vector<Real, 3>>& points,
+                     std::size_t component,  // index of the coordinate read from each point
+                     const Real* v_coeffs,
+                     const Real* d_coeffs,
+                     const Real* d2_coeffs,
+                     const Real* barycentric_weights,
+                     int n_axis,
+                     AxisDeriv level) {  // selects which of values / first / second get computed
+    const std::size_t count = points.size() * static_cast<std::size_t>(n_axis);  // n_axis entries for each point
+    scratch.resizeFor(count, level);
+    Real* first = (level == AxisDeriv::ValuesOnly) ? nullptr : scratch.first.data();  // nullptr when first derivatives are not requested
+    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? scratch.second.data() : nullptr;  // non-null only at the highest level
+    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);  // distance between consecutive point rows
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights, n_axis,
+                             points[q][component],
+                             scratch.values.data() + q * axis_stride,  // row q starts at offset q * axis_stride
+                             first ? first + q * axis_stride : nullptr,  // null stays null for every row
+                             second ? second + q * axis_stride : nullptr);
+    }
+}
+
+// Maximum yz-table footprint that fits comfortably on the stack for the
+// Lagrange performance sweep. Order-8 hex q=4 needs 4*(9x9) entries per table.
+// Higher orders fall back to thread_local heap buffers.
+inline constexpr std::size_t kMaxStackYZ = 384;  // counted in Real entries per table, not bytes
+
+struct TensorProductVectorSink {  // Output sink that stores per-node results in optional std::vector buffers.
+    std::vector<Real>* values;  // nullptr when values are not requested
+    std::vector<Gradient>* gradients;  // nullptr when gradients are not requested
+    std::vector<Hessian>* hessians;  // nullptr when hessians are not requested
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t n_nodes) const {  // resizes every supplied vector to n_nodes entries
+        if (values)    values->resize(n_nodes);
+        if (gradients) gradients->resize(n_nodes);
+        if (hessians)  hessians->resize(n_nodes);
+    }
+
+    void write_value(std::size_t n, Real value) const {  // no null check: call only when wants_values() is true
+        (*values)[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {  // no null check: call only when wants_gradients() is true
+        auto& g = (*gradients)[n];
+        g[0] = dx;
+        g[1] = dy;
+        g[2] = dz;
+    }
+
+    void write_hessian(std::size_t n,  // no null check: call only when wants_hessians() is true
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        (*hessians)[n] = make_symmetric_hessian(xx, yy, zz, xy, xz, yz);  // presumably mirrors xy/xz/yz across the diagonal — confirm in make_symmetric_hessian
+    }
+};
+
+struct TensorProductRawSink {  // Output sink that stores per-node results in flat Real arrays.
+    Real* values;  // 1 Real per node; nullptr when not requested
+    Real* gradients;  // 3 Reals per node; nullptr when not requested
+    Real* hessians;  // 9 Reals per node; nullptr when not requested
+
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    void prepare(std::size_t) const {}  // no-op: raw buffers are used as given and never resized here
+
+    void write_value(std::size_t n, Real value) const {  // no null check: call only when wants_values() is true
+        values[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {  // no null check: call only when wants_gradients() is true
+        Real* g = gradients + n * 3u;  // node n occupies entries [3n, 3n + 3)
+        g[0] = dx;
+        g[1] = dy;
+        g[2] = dz;
+    }
+
+    void write_hessian(std::size_t n,  // no null check: call only when wants_hessians() is true
+                       Real xx,
+                       Real yy,
+                       Real zz,
+                       Real xy,
+                       Real xz,
+                       Real yz) const {
+        Real* H = hessians + n * 9u;  // node n occupies entries [9n, 9n + 9), indexed as 3 * row + col
+        H[0] = xx;
+        H[4] = yy;
+        H[8] = zz;
+        H[1] = xy; H[3] = xy;  // each off-diagonal value is stored at both mirrored positions
+        H[2] = xz; H[6] = xz;
+        H[5] = yz; H[7] = yz;
+    }
+};
+
+// Fused sum-factorized tensor-product evaluator.
+//
+// Precomputes one to six (ny x nz)-shaped tables of partial products
+// `M_xy[j*nz + k]` so that the inner per-node loop performs at most one
+// multiplication per output instead of two. With all three output buffers
+// supplied, this is the fused values + gradients + hessians path that shares
+// every per-axis evaluation.
+//
+// Per-node multiply count (vs. the unfactored variants):
+//   values only       : 1  (was 2)
+//   gradients only    : 3  (was 6)
+//   hessians only     : 6  (was 12)
+//   all three         : 10 (was 20)
+//
+// Dimensional scope: works uniformly for Line/Quadrilateral/Hexahedron with
+// the unused axes' size folded to 1 via constant_axis_basis().
+template <typename Sink>  // Sink supplies wants_*(), prepare() and write_*() (see the two sink structs)
+void evaluate_tensor_product_factorized_impl(
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,  // per node: {x index, y index, z index}
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    const Sink& sink) {
+    const std::size_t ny = y_axis.size;
+    const std::size_t nz = z_axis.size;
+    const std::size_t nyz = ny * nz;  // entries per yz table
+    const bool need_values = sink.wants_values();
+    const bool need_grad = sink.wants_gradients();
+    const bool need_hess = sink.wants_hessians();
+
+    Real Mvv_stack[kMaxStackYZ];  // all six stack tables are declared up front and left uninitialized until filled
+    Real Mdv_stack[kMaxStackYZ];
+    Real Mvd_stack[kMaxStackYZ];
+    Real Md2v_stack[kMaxStackYZ];
+    Real Mvd2_stack[kMaxStackYZ];
+    Real Mdd_stack[kMaxStackYZ];
+
+    Real* Mvv;  // y value       * z value
+    Real* Mdv;  // y first deriv * z value
+    Real* Mvd;  // y value       * z first deriv
+    Real* Md2v;  // y second deriv * z value
+    Real* Mvd2;  // y value        * z second deriv
+    Real* Mdd;  // y first deriv  * z first deriv
+    if (nyz <= kMaxStackYZ) {  // small enough: use the stack tables
+        Mvv = Mvv_stack;
+        Mdv = Mdv_stack;
+        Mvd = Mvd_stack;
+        Md2v = Md2v_stack;
+        Mvd2 = Mvd2_stack;
+        Mdd = Mdd_stack;
+    } else {  // too large for the stack: use the calling thread's scratch tables
+        auto& tables = evaluate_scratch().tensor_tables;
+        tables.resizeFor(nyz);
+        Mvv = tables.vv.data();
+        Mdv = tables.dv.data();
+        Mvd = tables.vd.data();
+        Md2v = tables.d2v.data();
+        Mvd2 = tables.vd2.data();
+        Mdd = tables.dd.data();
+    }
+
+    // M_vv is required by every output (values, ∂ξ, ∂ξ²).
+    for (std::size_t j = 0; j < ny; ++j) {
+        const Real yv = y_axis.values[j];
+        for (std::size_t k = 0; k < nz; ++k) {
+            Mvv[j * nz + k] = yv * z_axis.values[k];
+        }
+    }
+
+    if (need_grad || need_hess) {  // the hessian cross terms xy and xz also read Mdv and Mvd
+        for (std::size_t j = 0; j < ny; ++j) {
+            const Real yv = y_axis.values[j];
+            const Real yd = y_axis.first[j];
+            for (std::size_t k = 0; k < nz; ++k) {
+                Mdv[j * nz + k] = yd * z_axis.values[k];
+                Mvd[j * nz + k] = yv * z_axis.first[k];
+            }
+        }
+    }
+
+    if (need_hess) {  // second-derivative tables are built only for hessian output
+        for (std::size_t j = 0; j < ny; ++j) {
+            const Real yv = y_axis.values[j];
+            const Real yd = y_axis.first[j];
+            const Real yd2 = y_axis.second[j];
+            for (std::size_t k = 0; k < nz; ++k) {
+                Md2v[j * nz + k] = yd2 * z_axis.values[k];
+                Mvd2[j * nz + k] = yv  * z_axis.second[k];
+                Mdd[j * nz + k]  = yd  * z_axis.first[k];
+            }
+        }
+    }
+
+    const std::size_t n_nodes = tensor_indices.size();
+    sink.prepare(n_nodes);  // lets a vector-backed sink size its outputs; a no-op for raw buffers
+
+    for (std::size_t n = 0; n < n_nodes; ++n) {
+        const auto& idx = tensor_indices[n];
+        const std::size_t i = idx[0];
+        const std::size_t jk = idx[1] * nz + idx[2];  // flattened (y, z) table slot; indices are not range-checked
+
+        const Real Lx = x_axis.values[i];
+
+        if (need_values) {
+            sink.write_value(n, Lx * Mvv[jk]);
+        }
+
+        if (need_grad) {
+            const Real dLx = x_axis.first[i];
+            sink.write_gradient(n,
+                                dLx * Mvv[jk],  // d/dx
+                                Lx  * Mdv[jk],  // d/dy
+                                Lx  * Mvd[jk]);  // d/dz
+        }
+
+        if (need_hess) {
+            const Real dLx  = x_axis.first[i];
+            const Real d2Lx = x_axis.second[i];
+            sink.write_hessian(n,
+                               d2Lx * Mvv[jk],  // xx
+                               Lx   * Md2v[jk],  // yy
+                               Lx   * Mvd2[jk],  // zz
+                               dLx  * Mdv[jk],  // xy
+                               dLx  * Mvd[jk],  // xz
+                               Lx   * Mdd[jk]);  // yz
+        }
+    }
+}
+
+void evaluate_tensor_product_factorized(  // std::vector-output front end for the factorized evaluator
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    std::vector<Real>* values_out,  // pass nullptr to skip values
+    std::vector<Gradient>* gradients_out,  // pass nullptr to skip gradients
+    std::vector<Hessian>* hessians_out) {  // pass nullptr to skip hessians
+    const TensorProductVectorSink sink{values_out, gradients_out, hessians_out};  // supplied vectors are resized to the node count
+    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
+}
+
+void evaluate_tensor_product_factorized_to(  // raw-pointer-output front end for the factorized evaluator
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const AxisBasisEvaluations& x_axis,
+    const AxisBasisEvaluations& y_axis,
+    const AxisBasisEvaluations& z_axis,
+    Real* SVMP_RESTRICT values_out,  // 1 Real per node, or nullptr to skip
+    Real* SVMP_RESTRICT gradients_out,  // 3 Reals per node, or nullptr to skip
+    Real* SVMP_RESTRICT hessians_out) {  // 9 Reals per node, or nullptr to skip
+    const TensorProductRawSink sink{values_out, gradients_out, hessians_out};  // buffers are written in place and never resized
+    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_value_strided_q(  // writes the value of one node at point Q
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    Real* SVMP_RESTRICT value_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz table starts at Q * nyz
+    value_row[Q] = x_batch.values[q_axis + i] * Mvv[slot];
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_hessian_strided_q(  // writes the 9 hessian components of one node at point Q
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,  // distance between consecutive components in hess_row
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];  // each mixed term is computed once and stored twice below
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];  // component c = 3 * row + col lives at c * output_stride + Q
+    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
+    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_hessian_stride4_q(  // hessian writer with the component stride fixed at 4
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];  // each mixed term is computed once and stored twice below
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[Q] = x2 * Mvv[slot];  // component c = 3 * row + col lives at 4 * c + Q
+    hess_row[16u + Q] = xv * Md2v[slot];  // c = 4 (yy)
+    hess_row[32u + Q] = xv * Mvd2[slot];  // c = 8 (zz)
+    hess_row[4u + Q] = hxy;  // c = 1
+    hess_row[12u + Q] = hxy;  // c = 3
+    hess_row[8u + Q] = hxz;  // c = 2
+    hess_row[24u + Q] = hxz;  // c = 6
+    hess_row[20u + Q] = hyz;  // c = 5
+    hess_row[28u + Q] = hyz;  // c = 7
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_gradient_strided_q(  // writes the 3 gradient components of one node at point Q
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,  // distance between consecutive components in grad_row
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    Real* SVMP_RESTRICT grad_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    grad_row[0u * output_stride + Q] = xd * Mvv[slot];  // d/dx
+    grad_row[1u * output_stride + Q] = xv * Mdv[slot];  // d/dy
+    grad_row[2u * output_stride + Q] = xv * Mvd[slot];  // d/dz
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_gradient_stride4_q(  // gradient writer with the component stride fixed at 4
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    Real* SVMP_RESTRICT grad_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    grad_row[Q] = xd * Mvv[slot];  // d/dx at offset 0 + Q
+    grad_row[4u + Q] = xv * Mdv[slot];  // d/dy at offset 4 + Q
+    grad_row[8u + Q] = xv * Mvd[slot];  // d/dz at offset 8 + Q
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_all_strided_q(  // writes value, gradient and hessian of one node at point Q
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    std::size_t output_stride,  // distance between consecutive components in grad_row and hess_row
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    value_row[Q] = xv * Mvv[slot];
+    grad_row[0u * output_stride + Q] = xd * Mvv[slot];  // d/dx
+    grad_row[1u * output_stride + Q] = xv * Mdv[slot];  // d/dy
+    grad_row[2u * output_stride + Q] = xv * Mvd[slot];  // d/dz
+
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];  // each mixed term is computed once and stored twice below
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];  // component c = 3 * row + col lives at c * output_stride + Q
+    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
+    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
+    hess_row[1u * output_stride + Q] = hxy;
+    hess_row[3u * output_stride + Q] = hxy;
+    hess_row[2u * output_stride + Q] = hxz;
+    hess_row[6u * output_stride + Q] = hxz;
+    hess_row[5u * output_stride + Q] = hyz;
+    hess_row[7u * output_stride + Q] = hyz;
+}
+
+template <std::size_t Q>  // Q: compile-time index of the point within the batch
+inline void write_tensor_product_all_stride4_q(  // value + gradient + hessian writer with the component stride fixed at 4
+    std::size_t axis_stride,
+    std::size_t nyz,
+    std::size_t i,
+    std::size_t jk,
+    const AxisBatchScratch& x_batch,
+    const Real* SVMP_RESTRICT Mvv,
+    const Real* SVMP_RESTRICT Mdv,
+    const Real* SVMP_RESTRICT Mvd,
+    const Real* SVMP_RESTRICT Md2v,
+    const Real* SVMP_RESTRICT Mvd2,
+    const Real* SVMP_RESTRICT Mdd,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT grad_row,
+    Real* SVMP_RESTRICT hess_row) {
+    const std::size_t q_axis = Q * axis_stride;  // start of point Q's row in the x batch
+    const std::size_t slot = Q * nyz + jk;  // point Q's yz tables start at Q * nyz
+    const Real xv = x_batch.values[q_axis + i];
+    const Real xd = x_batch.first[q_axis + i];
+    value_row[Q] = xv * Mvv[slot];
+    grad_row[Q] = xd * Mvv[slot];  // d/dx at offset 0 + Q
+    grad_row[4u + Q] = xv * Mdv[slot];  // d/dy at offset 4 + Q
+    grad_row[8u + Q] = xv * Mvd[slot];  // d/dz at offset 8 + Q
+
+    const Real x2 = x_batch.second[q_axis + i];
+    const Real hxy = xd * Mdv[slot];  // each mixed term is computed once and stored twice below
+    const Real hxz = xd * Mvd[slot];
+    const Real hyz = xv * Mdd[slot];
+    hess_row[Q] = x2 * Mvv[slot];  // component c = 3 * row + col lives at 4 * c + Q
+    hess_row[16u + Q] = xv * Md2v[slot];  // c = 4 (yy)
+    hess_row[32u + Q] = xv * Mvd2[slot];  // c = 8 (zz)
+    hess_row[4u + Q] = hxy;  // c = 1
+    hess_row[12u + Q] = hxy;  // c = 3
+    hess_row[8u + Q] = hxz;  // c = 2
+    hess_row[24u + Q] = hxz;  // c = 6
+    hess_row[20u + Q] = hyz;  // c = 5
+    hess_row[28u + Q] = hyz;  // c = 7
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when this fast path does not apply
+evaluate_tensor_product_values_stride4_q4_transposed(  // values-only kernel for exactly four points per batch
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,  // entries per axis; also the distance between point rows in each batch
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT values_out) {  // written as values_out[node * 4 + q]
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;  // four interleaved points per (j, k) slot
+    if (table_count > kMaxStackYZ || values_out == nullptr) {  // table would overflow the stack array, or no output buffer
+        return false;
+    }
+
+    Real Mvv_stack[kMaxStackYZ];  // Mvv_stack[(j * axis_stride + k) * 4 + q] = y_q[j] * z_q[k]
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];  // point q's row starts at q * axis_stride; batch lengths are not checked here
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            Mvv_stack[base + 0u] = yv0 * z_batch.values[k];
+            Mvv_stack[base + 1u] = yv1 * z_batch.values[axis_stride + k];
+            Mvv_stack[base + 2u] = yv2 * z_batch.values[2u * axis_stride + k];
+            Mvv_stack[base + 3u] = yv3 * z_batch.values[3u * axis_stride + k];
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // first of the four interleaved entries for this (y, z) pair
+        Real* SVMP_RESTRICT value_row = values_out + node * 4u;  // the four point values of a node are contiguous
+        value_row[0u] = x_batch.values[i] * Mvv_stack[jk + 0u];
+        value_row[1u] = x_batch.values[axis_stride + i] * Mvv_stack[jk + 1u];
+        value_row[2u] = x_batch.values[2u * axis_stride + i] * Mvv_stack[jk + 2u];
+        value_row[3u] = x_batch.values[3u * axis_stride + i] * Mvv_stack[jk + 3u];
+    }
+
+    return true;
+}
+
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when this fast path does not apply
+evaluate_tensor_product_gradients_stride4_q4_transposed(  // gradients-only kernel for exactly four points per batch
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,  // entries per axis; also the distance between point rows in each batch
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT gradients_out) {  // written as gradients_out[node * 12 + component * 4 + q]
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;  // four interleaved points per (j, k) slot
+    if (table_count > kMaxStackYZ || gradients_out == nullptr) {  // tables would overflow the stack arrays, or no output buffer
+        return false;
+    }
+
+    Real Mvv_stack[kMaxStackYZ];  // y value       * z value,       indexed (j * axis_stride + k) * 4 + q
+    Real Mdv_stack[kMaxStackYZ];  // y first deriv * z value
+    Real Mvd_stack[kMaxStackYZ];  // y value       * z first deriv
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];  // point q's row starts at q * axis_stride; batch lengths are not checked here
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        const Real yd0 = y_batch.first[j];
+        const Real yd1 = y_batch.first[axis_stride + j];
+        const Real yd2 = y_batch.first[2u * axis_stride + j];
+        const Real yd3 = y_batch.first[3u * axis_stride + j];
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            const Real zv0 = z_batch.values[k];
+            const Real zv1 = z_batch.values[axis_stride + k];
+            const Real zv2 = z_batch.values[2u * axis_stride + k];
+            const Real zv3 = z_batch.values[3u * axis_stride + k];
+            const Real zd0 = z_batch.first[k];
+            const Real zd1 = z_batch.first[axis_stride + k];
+            const Real zd2 = z_batch.first[2u * axis_stride + k];
+            const Real zd3 = z_batch.first[3u * axis_stride + k];
+
+            Mvv_stack[base + 0u] = yv0 * zv0;
+            Mvv_stack[base + 1u] = yv1 * zv1;
+            Mvv_stack[base + 2u] = yv2 * zv2;
+            Mvv_stack[base + 3u] = yv3 * zv3;
+            Mdv_stack[base + 0u] = yd0 * zv0;
+            Mdv_stack[base + 1u] = yd1 * zv1;
+            Mdv_stack[base + 2u] = yd2 * zv2;
+            Mdv_stack[base + 3u] = yd3 * zv3;
+            Mvd_stack[base + 0u] = yv0 * zd0;
+            Mvd_stack[base + 1u] = yv1 * zd1;
+            Mvd_stack[base + 2u] = yv2 * zd2;
+            Mvd_stack[base + 3u] = yv3 * zd3;
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // first of the four interleaved entries for this (y, z) pair
+
+        const Real xv0 = x_batch.values[i];
+        const Real xv1 = x_batch.values[axis_stride + i];
+        const Real xv2 = x_batch.values[2u * axis_stride + i];
+        const Real xv3 = x_batch.values[3u * axis_stride + i];
+        const Real xd0 = x_batch.first[i];
+        const Real xd1 = x_batch.first[axis_stride + i];
+        const Real xd2 = x_batch.first[2u * axis_stride + i];
+        const Real xd3 = x_batch.first[3u * axis_stride + i];
+
+        Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;  // 3 components x 4 points per node
+        grad_row[0u] = xd0 * Mvv_stack[jk + 0u];  // entries 0-3: d/dx at points 0..3
+        grad_row[1u] = xd1 * Mvv_stack[jk + 1u];
+        grad_row[2u] = xd2 * Mvv_stack[jk + 2u];
+        grad_row[3u] = xd3 * Mvv_stack[jk + 3u];
+        grad_row[4u] = xv0 * Mdv_stack[jk + 0u];  // entries 4-7: d/dy at points 0..3
+        grad_row[5u] = xv1 * Mdv_stack[jk + 1u];
+        grad_row[6u] = xv2 * Mdv_stack[jk + 2u];
+        grad_row[7u] = xv3 * Mdv_stack[jk + 3u];
+        grad_row[8u] = xv0 * Mvd_stack[jk + 0u];  // entries 8-11: d/dz at points 0..3
+        grad_row[9u] = xv1 * Mvd_stack[jk + 1u];
+        grad_row[10u] = xv2 * Mvd_stack[jk + 2u];
+        grad_row[11u] = xv3 * Mvd_stack[jk + 3u];
+    }
+
+    return true;
+}
+
+template<bool NeedAllOutputs>  // true: write values, gradients and hessians; false: hessians only
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when this fast path does not apply
+evaluate_tensor_product_second_stride4_q4_transposed(  // hessian kernel for exactly four points per batch
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    std::size_t axis_stride,  // entries per axis; also the distance between point rows in each batch
+    const AxisBatchScratch& x_batch,
+    const AxisBatchScratch& y_batch,
+    const AxisBatchScratch& z_batch,
+    Real* SVMP_RESTRICT values_out,  // values_out[node * 4 + q]; never touched when NeedAllOutputs is false
+    Real* SVMP_RESTRICT gradients_out,  // gradients_out[node * 12 + component * 4 + q]; never touched when NeedAllOutputs is false
+    Real* SVMP_RESTRICT hessians_out) {  // hessians_out[node * 36 + component * 4 + q]; always required
+    const std::size_t nyz = axis_stride * axis_stride;
+    const std::size_t table_count = 4u * nyz;  // four interleaved points per (j, k) slot
+    if (table_count > kMaxStackYZ || hessians_out == nullptr) {  // tables would overflow the stack arrays, or no hessian buffer
+        return false;
+    }
+    if constexpr (NeedAllOutputs) {
+        if (values_out == nullptr || gradients_out == nullptr) {  // the fused variant needs all three buffers
+            return false;
+        }
+    }
+
+    Real Mvv_stack[kMaxStackYZ];  // y value        * z value,       indexed (j * axis_stride + k) * 4 + q
+    Real Mdv_stack[kMaxStackYZ];  // y first deriv  * z value
+    Real Mvd_stack[kMaxStackYZ];  // y value        * z first deriv
+    Real Md2v_stack[kMaxStackYZ];  // y second deriv * z value
+    Real Mvd2_stack[kMaxStackYZ];  // y value        * z second deriv
+    Real Mdd_stack[kMaxStackYZ];  // y first deriv  * z first deriv
+
+    for (std::size_t j = 0; j < axis_stride; ++j) {
+        const Real yv0 = y_batch.values[j];  // point q's row starts at q * axis_stride; batch lengths are not checked here
+        const Real yv1 = y_batch.values[axis_stride + j];
+        const Real yv2 = y_batch.values[2u * axis_stride + j];
+        const Real yv3 = y_batch.values[3u * axis_stride + j];
+        const Real yd0 = y_batch.first[j];
+        const Real yd1 = y_batch.first[axis_stride + j];
+        const Real yd2 = y_batch.first[2u * axis_stride + j];
+        const Real yd3 = y_batch.first[3u * axis_stride + j];
+        const Real y20 = y_batch.second[j];
+        const Real y21 = y_batch.second[axis_stride + j];
+        const Real y22 = y_batch.second[2u * axis_stride + j];
+        const Real y23 = y_batch.second[3u * axis_stride + j];
+
+        for (std::size_t k = 0; k < axis_stride; ++k) {
+            const std::size_t base = (j * axis_stride + k) * 4u;
+            const Real zv0 = z_batch.values[k];
+            const Real zv1 = z_batch.values[axis_stride + k];
+            const Real zv2 = z_batch.values[2u * axis_stride + k];
+            const Real zv3 = z_batch.values[3u * axis_stride + k];
+            const Real zd0 = z_batch.first[k];
+            const Real zd1 = z_batch.first[axis_stride + k];
+            const Real zd2 = z_batch.first[2u * axis_stride + k];
+            const Real zd3 = z_batch.first[3u * axis_stride + k];
+            const Real z20 = z_batch.second[k];
+            const Real z21 = z_batch.second[axis_stride + k];
+            const Real z22 = z_batch.second[2u * axis_stride + k];
+            const Real z23 = z_batch.second[3u * axis_stride + k];
+
+            Mvv_stack[base + 0u] = yv0 * zv0;
+            Mvv_stack[base + 1u] = yv1 * zv1;
+            Mvv_stack[base + 2u] = yv2 * zv2;
+            Mvv_stack[base + 3u] = yv3 * zv3;
+            Mdv_stack[base + 0u] = yd0 * zv0;
+            Mdv_stack[base + 1u] = yd1 * zv1;
+            Mdv_stack[base + 2u] = yd2 * zv2;
+            Mdv_stack[base + 3u] = yd3 * zv3;
+            Mvd_stack[base + 0u] = yv0 * zd0;
+            Mvd_stack[base + 1u] = yv1 * zd1;
+            Mvd_stack[base + 2u] = yv2 * zd2;
+            Mvd_stack[base + 3u] = yv3 * zd3;
+            Md2v_stack[base + 0u] = y20 * zv0;
+            Md2v_stack[base + 1u] = y21 * zv1;
+            Md2v_stack[base + 2u] = y22 * zv2;
+            Md2v_stack[base + 3u] = y23 * zv3;
+            Mvd2_stack[base + 0u] = yv0 * z20;
+            Mvd2_stack[base + 1u] = yv1 * z21;
+            Mvd2_stack[base + 2u] = yv2 * z22;
+            Mvd2_stack[base + 3u] = yv3 * z23;
+            Mdd_stack[base + 0u] = yd0 * zd0;
+            Mdd_stack[base + 1u] = yd1 * zd1;
+            Mdd_stack[base + 2u] = yd2 * zd2;
+            Mdd_stack[base + 3u] = yd3 * zd3;
+        }
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // first of the four interleaved entries for this (y, z) pair
+
+        const Real xv0 = x_batch.values[i];
+        const Real xv1 = x_batch.values[axis_stride + i];
+        const Real xv2 = x_batch.values[2u * axis_stride + i];
+        const Real xv3 = x_batch.values[3u * axis_stride + i];
+        const Real xd0 = x_batch.first[i];
+        const Real xd1 = x_batch.first[axis_stride + i];
+        const Real xd2 = x_batch.first[2u * axis_stride + i];
+        const Real xd3 = x_batch.first[3u * axis_stride + i];
+        const Real x20 = x_batch.second[i];
+        const Real x21 = x_batch.second[axis_stride + i];
+        const Real x22 = x_batch.second[2u * axis_stride + i];
+        const Real x23 = x_batch.second[3u * axis_stride + i];
+
+        const Real mvv0 = Mvv_stack[jk + 0u];  // table entries are loaded once per node and reused by every output below
+        const Real mvv1 = Mvv_stack[jk + 1u];
+        const Real mvv2 = Mvv_stack[jk + 2u];
+        const Real mvv3 = Mvv_stack[jk + 3u];
+        const Real mdv0 = Mdv_stack[jk + 0u];
+        const Real mdv1 = Mdv_stack[jk + 1u];
+        const Real mdv2 = Mdv_stack[jk + 2u];
+        const Real mdv3 = Mdv_stack[jk + 3u];
+        const Real mvd0 = Mvd_stack[jk + 0u];
+        const Real mvd1 = Mvd_stack[jk + 1u];
+        const Real mvd2 = Mvd_stack[jk + 2u];
+        const Real mvd3 = Mvd_stack[jk + 3u];
+        const Real md2v0 = Md2v_stack[jk + 0u];
+        const Real md2v1 = Md2v_stack[jk + 1u];
+        const Real md2v2 = Md2v_stack[jk + 2u];
+        const Real md2v3 = Md2v_stack[jk + 3u];
+        const Real mvd20 = Mvd2_stack[jk + 0u];
+        const Real mvd21 = Mvd2_stack[jk + 1u];
+        const Real mvd22 = Mvd2_stack[jk + 2u];
+        const Real mvd23 = Mvd2_stack[jk + 3u];
+        const Real mdd0 = Mdd_stack[jk + 0u];
+        const Real mdd1 = Mdd_stack[jk + 1u];
+        const Real mdd2 = Mdd_stack[jk + 2u];
+        const Real mdd3 = Mdd_stack[jk + 3u];
+
+        if constexpr (NeedAllOutputs) {  // compiled out entirely for the hessian-only instantiation
+            Real* SVMP_RESTRICT value_row = values_out + node * 4u;
+            value_row[0u] = xv0 * mvv0;
+            value_row[1u] = xv1 * mvv1;
+            value_row[2u] = xv2 * mvv2;
+            value_row[3u] = xv3 * mvv3;
+
+            Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;  // 3 components x 4 points per node
+            grad_row[0u] = xd0 * mvv0;  // entries 0-3: d/dx at points 0..3
+            grad_row[1u] = xd1 * mvv1;
+            grad_row[2u] = xd2 * mvv2;
+            grad_row[3u] = xd3 * mvv3;
+            grad_row[4u] = xv0 * mdv0;  // entries 4-7: d/dy at points 0..3
+            grad_row[5u] = xv1 * mdv1;
+            grad_row[6u] = xv2 * mdv2;
+            grad_row[7u] = xv3 * mdv3;
+            grad_row[8u] = xv0 * mvd0;  // entries 8-11: d/dz at points 0..3
+            grad_row[9u] = xv1 * mvd1;
+            grad_row[10u] = xv2 * mvd2;
+            grad_row[11u] = xv3 * mvd3;
+        }
+
+        const Real hxy0 = xd0 * mdv0;  // each mixed term is computed once and stored at both mirrored components
+        const Real hxy1 = xd1 * mdv1;
+        const Real hxy2 = xd2 * mdv2;
+        const Real hxy3 = xd3 * mdv3;
+        const Real hxz0 = xd0 * mvd0;
+        const Real hxz1 = xd1 * mvd1;
+        const Real hxz2 = xd2 * mvd2;
+        const Real hxz3 = xd3 * mvd3;
+        const Real hyz0 = xv0 * mdd0;
+        const Real hyz1 = xv1 * mdd1;
+        const Real hyz2 = xv2 * mdd2;
+        const Real hyz3 = xv3 * mdd3;
+
+        Real* SVMP_RESTRICT hess_row = hessians_out + node * 36u;  // 9 components x 4 points per node
+        hess_row[0u] = x20 * mvv0;  // entries 0-3: component 0 (xx)
+        hess_row[1u] = x21 * mvv1;
+        hess_row[2u] = x22 * mvv2;
+        hess_row[3u] = x23 * mvv3;
+        hess_row[4u] = hxy0;  // entries 4-7: component 1 (xy)
+        hess_row[5u] = hxy1;
+        hess_row[6u] = hxy2;
+        hess_row[7u] = hxy3;
+        hess_row[8u] = hxz0;  // entries 8-11: component 2 (xz)
+        hess_row[9u] = hxz1;
+        hess_row[10u] = hxz2;
+        hess_row[11u] = hxz3;
+        hess_row[12u] = hxy0;  // entries 12-15: component 3 (mirror of xy)
+        hess_row[13u] = hxy1;
+        hess_row[14u] = hxy2;
+        hess_row[15u] = hxy3;
+        hess_row[16u] = xv0 * md2v0;  // entries 16-19: component 4 (yy)
+        hess_row[17u] = xv1 * md2v1;
+        hess_row[18u] = xv2 * md2v2;
+        hess_row[19u] = xv3 * md2v3;
+        hess_row[20u] = hyz0;  // entries 20-23: component 5 (yz)
+        hess_row[21u] = hyz1;
+        hess_row[22u] = hyz2;
+        hess_row[23u] = hyz3;
+        hess_row[24u] = hxz0;  // entries 24-27: component 6 (mirror of xz)
+        hess_row[25u] = hxz1;
+        hess_row[26u] = hxz2;
+        hess_row[27u] = hxz3;
+        hess_row[28u] = hyz0;  // entries 28-31: component 7 (mirror of yz)
+        hess_row[29u] = hyz1;
+        hess_row[30u] = hyz2;
+        hess_row[31u] = hyz3;
+        hess_row[32u] = xv0 * mvd20;  // entries 32-35: component 8 (zz)
+        hess_row[33u] = xv1 * mvd21;
+        hess_row[34u] = xv2 * mvd22;
+        hess_row[35u] = xv3 * mvd23;
+    }
+
+    return true;
+}
+
+template<int N>  // public line order: node 0 -> slot 0, node 1 -> slot N - 1, node k -> slot k - 1
+constexpr std::size_t line_public_axis_index(std::size_t node) noexcept {
+    return node > 1u ? node - 1u : (node == 1u ? static_cast<std::size_t>(N - 1) : std::size_t{0});
+}
+
+// Reciprocal Lagrange denominators for the N equispaced integer nodes 0..N-1:
+// entry i holds 1 / prod_{j != i} (i - j), accumulated in ascending j order.
+template<int N>
+constexpr std::array<Real, N> make_line_axis_inv_denoms() noexcept {
+    std::array<Real, N> reciprocals{};
+    for (int node = 0; node < N; ++node) {
+        Real product = Real(1);
+        for (int other = 0; other < N; ++other) {
+            product *= (other == node) ? Real(1) : static_cast<Real>(node - other);
+        }
+        reciprocals[static_cast<std::size_t>(node)] = Real(1) / product;
+    }
+    return reciprocals;
+}
+
+// Writes the N equispaced Lagrange values at reference coordinate x (x = -1 maps to
+// node 0, x = +1 to node N - 1) into values[0..N-1], ordered by axis slot.
+template<int N>
+void fill_line_values_product(Real x, Real* SVMP_RESTRICT values) {
+    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
+    const Real order = static_cast<Real>(N - 1);
+    const Real r = (x + Real(1)) * order * Real(0.5);
+    Real below[N];  // below[k] = prod_{m < k} (r - m)
+    Real above[N];  // above[k] = prod_{m > k} (r - m)
+    below[0] = Real(1);
+    above[N - 1] = Real(1);
+    for (int k = 1; k < N; ++k) {
+        below[k] = below[k - 1] * (r - static_cast<Real>(k - 1));
+        above[N - 1 - k] = above[N - k] * (r - static_cast<Real>(N - k));
+    }
+    for (int k = 0; k < N; ++k) {
+        const std::size_t slot = static_cast<std::size_t>(k);
+        values[slot] = below[k] * above[k] * inv_denoms[slot];
+    }
+}
+
+// Evaluates the N equispaced Lagrange line functions at x, optionally with first and
+// second x-derivatives. `values` is always written; `first` and `second` may be null.
+// With r = (x + 1) * (N - 1) / 2, each x-derivative carries one factor of dr/dx.
+template<int N>
+void fill_line_values_product_derivatives(Real x,
+                                          Real* SVMP_RESTRICT values,
+                                          Real* SVMP_RESTRICT first,
+                                          Real* SVMP_RESTRICT second) {
+    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
+    const Real drdx = static_cast<Real>(N - 1) * Real(0.5);
+    const Real drdx_sq = drdx * drdx;
+    const Real r = (x + Real(1)) * drdx;
+
+    // lo[k] is the product of (r - m) over m < k and hi[k] the product over m >= k;
+    // the *_d1 / *_d2 arrays hold their first / second derivatives with respect to r.
+    Real lo[N + 1];
+    Real lo_d1[N + 1];
+    Real lo_d2[N + 1];
+    Real hi[N + 1];
+    Real hi_d1[N + 1];
+    Real hi_d2[N + 1];
+
+    lo[0] = Real(1);
+    lo_d1[0] = Real(0);
+    hi[N] = Real(1);
+    hi_d1[N] = Real(0);
+    for (int k = 0; k < N; ++k) {
+        const Real up = r - static_cast<Real>(k);
+        lo[k + 1] = lo[k] * up;
+        lo_d1[k + 1] = lo_d1[k] * up + lo[k];
+
+        const int m = N - 1 - k;  // the descending sweep runs in lockstep
+        const Real down = r - static_cast<Real>(m);
+        hi[m] = hi[m + 1] * down;
+        hi_d1[m] = hi_d1[m + 1] * down + hi[m + 1];
+    }
+
+    if (second != nullptr) {
+        // The second-derivative recurrences run only when they will be consumed.
+        lo_d2[0] = Real(0);
+        hi_d2[N] = Real(0);
+        for (int k = 0; k < N; ++k) {
+            const int m = N - 1 - k;
+            lo_d2[k + 1] = lo_d2[k] * (r - static_cast<Real>(k)) + Real(2) * lo_d1[k];
+            hi_d2[m] = hi_d2[m + 1] * (r - static_cast<Real>(m)) + Real(2) * hi_d1[m + 1];
+        }
+    }
+
+    for (std::size_t slot = 0; slot < static_cast<std::size_t>(N); ++slot) {
+        // Function `slot` omits its own factor, so it pairs lo[slot] with hi[slot + 1].
+        const Real scale = inv_denoms[slot];
+        const Real left = lo[slot];
+        const Real right = hi[slot + 1];
+        const Real left_d1 = lo_d1[slot];
+        const Real right_d1 = hi_d1[slot + 1];
+
+        values[slot] = left * right * scale;
+        if (first != nullptr) {
+            first[slot] = (left_d1 * right + left * right_d1) * scale * drdx;
+        }
+        if (second == nullptr) {
+            continue;
+        }
+        const Real curvature =
+            lo_d2[slot] * right + Real(2) * left_d1 * right_d1 + left * hi_d2[slot + 1];
+        second[slot] = curvature * scale * drdx_sq;
+    }
+}
+
+// Fills `scratch` for exactly four points; point q is written at offset q * N.
+template<int N>
+void fill_axis_batch_product_q4(
+    AxisBatchScratch& scratch,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t component,
+    AxisDeriv level) {
+    constexpr std::size_t per_point = static_cast<std::size_t>(N);
+    scratch.resizeFor(4u * per_point, level);
+    const bool with_second = level == AxisDeriv::ValuesAndFirstAndSecond;
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const std::size_t offset = q * per_point;
+        Real* const values = scratch.values.data() + offset;
+        if (level == AxisDeriv::ValuesOnly) {
+            fill_line_values_product<N>(points[q][component], values);
+            continue;
+        }
+        Real* const second = with_second ? scratch.second.data() + offset : nullptr;
+        fill_line_values_product_derivatives<N>(
+            points[q][component], values, scratch.first.data() + offset, second);
+    }
+}
+
+// Runtime dispatch onto fill_axis_batch_product_q4<N>.
+// Only n_axis in [5, 9] has an instantiation here; every argument other than
+// n_axis is forwarded unchanged. Returns true when a kernel ran and false, with
+// `scratch` left untouched, for any other n_axis.
+bool try_fill_axis_batch_product_q4(
+    AxisBatchScratch& scratch,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t component,
+    int n_axis,
+    AxisDeriv level) {
+    if (n_axis == 5) {
+        fill_axis_batch_product_q4<5>(scratch, points, component, level);
+    } else if (n_axis == 6) {
+        fill_axis_batch_product_q4<6>(scratch, points, component, level);
+    } else if (n_axis == 7) {
+        fill_axis_batch_product_q4<7>(scratch, points, component, level);
+    } else if (n_axis == 8) {
+        fill_axis_batch_product_q4<8>(scratch, points, component, level);
+    } else if (n_axis == 9) {
+        fill_axis_batch_product_q4<9>(scratch, points, component, level);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Line basis values at exactly four points (x is component 0 of each point).
+// Row `node` of values_out starts at node * output_stride and stores the value of
+// public node `node` at points 0..3; the axis-to-public reordering goes through
+// line_public_axis_index.
+template<int N>
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    // axis[q] holds the N axis-ordered values at point q.
+    Real axis[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product<N>(points[q][0], axis[q]);
+    }
+
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t slot = line_public_axis_index<N>(node);
+        Real* const row = values_out + node * output_stride;
+        for (std::size_t q = 0; q < 4u; ++q) {
+            row[q] = axis[q][slot];
+        }
+    }
+}
+
+// Five-node (quartic) Lagrange line values at x, stored in column q of five rows.
+// Row order is the public one: row0 / row1 take axis nodes 0 and 4, row2..row4 take
+// axis nodes 1..3.
+FE_ALWAYS_INLINE void write_line_order4_values_q(
+    Real x,
+    std::size_t q,
+    Real* SVMP_RESTRICT row0,
+    Real* SVMP_RESTRICT row1,
+    Real* SVMP_RESTRICT row2,
+    Real* SVMP_RESTRICT row3,
+    Real* SVMP_RESTRICT row4) {
+    const Real r = (x + Real(1)) * Real(2);  // x in [-1, 1] -> r in [0, 4]
+    // d<k> = r - k is the factor that vanishes at axis node k.
+    const Real d0 = r;
+    const Real d1 = r - Real(1);
+    const Real d2 = r - Real(2);
+    const Real d3 = r - Real(3);
+    const Real d4 = r - Real(4);
+    const Real d01 = d0 * d1;
+    const Real d12 = d1 * d2;
+    const Real d23 = d2 * d3;
+    const Real d34 = d3 * d4;
+
+    row0[q] = (d12 * d34) / Real(24);        // axis node 0
+    row1[q] = (d01 * d23) / Real(24);        // axis node 4
+    row2[q] = -(d0 * d2 * d34) / Real(6);    // axis node 1
+    row3[q] = (d01 * d34) / Real(4);         // axis node 2
+    row4[q] = -(d01 * d2 * d4) / Real(6);    // axis node 3
+}
+
+// Quartic line values at exactly four points; rows 0..4 of values_out are spaced by
+// output_stride and column q holds point q (x is component 0 of each point).
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_order4_values_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real* rows[5];
+    for (std::size_t node = 0; node < 5u; ++node) {
+        rows[node] = values_out + node * output_stride;
+    }
+    for (std::size_t q = 0; q < 4u; ++q) {
+        write_line_order4_values_q(points[q][0], q, rows[0], rows[1], rows[2], rows[3], rows[4]);
+    }
+}
+
+// Constant gradients of the three order-1 triangle functions, written for q in
+// [0, num_qpts) at gradients_out[(node * 3 + component) * output_stride + q].
+SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
+    std::size_t num_qpts,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    // Rows are nodes, columns are gradient components 0, 1, 2.
+    const Real table[3][3] = {
+        {Real(-1), Real(-1), Real(0)},
+        {Real(1), Real(0), Real(0)},
+        {Real(0), Real(1), Real(0)},
+    };
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        for (std::size_t node = 0; node < 3u; ++node) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                gradients_out[(node * 3u + c) * output_stride + q] = table[node][c];
+            }
+        }
+    }
+}
+
+// Line Hessians for exactly four points. Per-point second derivatives are computed on
+// the axis and forwarded, node by node in public order, to write_line_hessian_q4_row.
+template<int N>
+SVMP_LAGRANGE_NOINLINE void evaluate_line_hessians_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real scratch_values[4][N];  // the axis helper always writes values; unused here
+    Real d2[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(points[q][0], scratch_values[q], nullptr, d2[q]);
+    }
+    const std::size_t node_pitch = 9u * output_stride;
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t slot = line_public_axis_index<N>(node);
+        write_line_hessian_q4_row(hessians_out + node * node_pitch, output_stride,
+                                  d2[0][slot], d2[1][slot], d2[2][slot], d2[3][slot]);
+    }
+}
+
+// Values, gradients and Hessians of the line basis at exactly four points. Value rows
+// are spaced by output_stride; gradient / Hessian blocks by 3 / 9 times output_stride.
+template<int N>
+SVMP_LAGRANGE_NOINLINE void evaluate_line_all_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    Real d0[4][N];
+    Real d1[4][N];
+    Real d2[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(points[q][0], d0[q], d1[q], d2[q]);
+    }
+
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const std::size_t slot = line_public_axis_index<N>(node);
+        Real* const value_row = values_out + node * output_stride;
+        for (std::size_t q = 0; q < 4u; ++q) {
+            value_row[q] = d0[q][slot];
+        }
+
+        Real* const grad_block = gradients_out + node * 3u * output_stride;
+        write_line_gradient_q4_row(grad_block, output_stride,
+                                   d1[0][slot], d1[1][slot], d1[2][slot], d1[3][slot]);
+        Real* const hess_block = hessians_out + node * 9u * output_stride;
+        write_line_hessian_q4_row(hess_block, output_stride,
+                                  d2[0][slot], d2[1][slot], d2[2][slot], d2[3][slot]);
+    }
+}
+
+// Writes one four-point value row of a tensor-product quad basis function:
+// row[q] = x_q[i] * y_q[j] for q = 0..3, where x_q / y_q are the per-point axis
+// value arrays and (i, j) selects the axis slots. Stores happen in ascending q
+// order.
+inline void write_quad_product_value_row_q4(
+    Real* SVMP_RESTRICT row,
+    const Real* SVMP_RESTRICT x0, const Real* SVMP_RESTRICT x1,
+    const Real* SVMP_RESTRICT x2, const Real* SVMP_RESTRICT x3,
+    const Real* SVMP_RESTRICT y0, const Real* SVMP_RESTRICT y1,
+    const Real* SVMP_RESTRICT y2, const Real* SVMP_RESTRICT y3,
+    std::size_t i, std::size_t j) {
+    const Real* const xs[4] = {x0, x1, x2, x3};
+    const Real* const ys[4] = {y0, y1, y2, y3};
+    for (std::size_t q = 0; q < 4u; ++q) {
+        row[q] = xs[q][i] * ys[q][j];
+    }
+}
+
+// Quadrilateral tensor-product values at exactly four points.
+// Output rows (row pitch output_stride) follow this node order: the corners
+// (0,0), (p,0), (p,p), (0,p); then the interior nodes of the edges j = 0, i = p,
+// j = p and i = 0, the last two walked with a descending index; then the interior
+// grid with i varying fastest. Here p = N - 1 and (i, j) are the x / y axis slots.
+template<int N>
+void evaluate_quad_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    Real xs[4][N];
+    Real ys[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product<N>(points[q][0], xs[q]);
+    }
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product<N>(points[q][1], ys[q]);
+    }
+
+    constexpr std::size_t last = static_cast<std::size_t>(N - 1);
+    std::size_t next_row = 0u;
+    // Writes the row for axis slots (i, j) and advances to the next output row.
+    const auto emit = [&](std::size_t i, std::size_t j) {
+        write_quad_product_value_row_q4(values_out + next_row * output_stride,
+                                        xs[0], xs[1], xs[2], xs[3],
+                                        ys[0], ys[1], ys[2], ys[3], i, j);
+        ++next_row;
+    };
+
+    // Corners.
+    emit(0u, 0u);
+    emit(last, 0u);
+    emit(last, last);
+    emit(0u, last);
+
+    // Edge interiors, one edge at a time.
+    for (std::size_t i = 1u; i < last; ++i) {
+        emit(i, 0u);
+    }
+    for (std::size_t j = 1u; j < last; ++j) {
+        emit(last, j);
+    }
+    for (std::size_t i = last - 1u; i > 0u; --i) {
+        emit(i, last);
+    }
+    for (std::size_t j = last - 1u; j > 0u; --j) {
+        emit(0u, j);
+    }
+
+    // Interior nodes, row by row.
+    for (std::size_t j = 1u; j < last; ++j) {
+        for (std::size_t i = 1u; i < last; ++i) {
+            emit(i, j);
+        }
+    }
+}
+
+// Quadrilateral tensor-product values, gradients and Hessians at exactly four points.
+// Every output pointer is optional; a null pointer skips that quantity. Node blocks
+// are spaced by output_stride (values), 3 * output_stride (gradients) and
+// 9 * output_stride (Hessians), with one output_stride-long row per component and
+// point q at column q. Components involving the third coordinate are set to zero.
+// Nodes are emitted corners first, then edge interiors, then the interior grid.
+template<int N>
+void evaluate_quad_derivatives_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const bool want_grad = gradients_out != nullptr;
+    const bool want_hess = hessians_out != nullptr;
+    // The mixed Hessian entry is built from first derivatives, so they are needed
+    // whenever either derivative output is requested.
+    const bool want_first = want_grad || want_hess;
+
+    Real xv[4][N];
+    Real xd[4][N];
+    Real x2[4][N];
+    Real yv[4][N];
+    Real yd[4][N];
+    Real y2[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        Real* const xd_q = want_first ? xd[q] : nullptr;
+        Real* const yd_q = want_first ? yd[q] : nullptr;
+        Real* const x2_q = want_hess ? x2[q] : nullptr;
+        Real* const y2_q = want_hess ? y2[q] : nullptr;
+        fill_line_values_product_derivatives<N>(points[q][0], xv[q], xd_q, x2_q);
+        fill_line_values_product_derivatives<N>(points[q][1], yv[q], yd_q, y2_q);
+    }
+
+    constexpr std::size_t last = static_cast<std::size_t>(N - 1);
+    std::size_t node = 0u;
+    const auto emit = [&](std::size_t i, std::size_t j) {
+        Real* const value_row = values_out != nullptr ? values_out + node * output_stride : nullptr;
+        Real* const grad_block = want_grad ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* const hess_block = want_hess ? hessians_out + node * 9u * output_stride : nullptr;
+        // Zero the out-of-plane entries first, before any in-plane entry is stored.
+        if (grad_block != nullptr) {
+            for (std::size_t q = 0; q < 4u; ++q) {
+                grad_block[2u * output_stride + q] = Real(0);
+            }
+        }
+        if (hess_block != nullptr) {
+            // Row-major 3x3 components xz, yz, zx, zy, zz.
+            constexpr std::size_t out_of_plane[5] = {2u, 5u, 6u, 7u, 8u};
+            for (const std::size_t component : out_of_plane) {
+                for (std::size_t q = 0; q < 4u; ++q) {
+                    hess_block[component * output_stride + q] = Real(0);
+                }
+            }
+        }
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real fx = xv[q][i];
+            const Real fy = yv[q][j];
+            if (value_row != nullptr) {
+                value_row[q] = fx * fy;
+            }
+            if (grad_block != nullptr) {
+                grad_block[q] = xd[q][i] * fy;
+                grad_block[output_stride + q] = fx * yd[q][j];
+            }
+            if (hess_block != nullptr) {
+                const Real mixed = xd[q][i] * yd[q][j];
+                hess_block[q] = x2[q][i] * fy;
+                hess_block[output_stride + q] = mixed;
+                hess_block[3u * output_stride + q] = mixed;
+                hess_block[4u * output_stride + q] = fx * y2[q][j];
+            }
+        }
+        ++node;
+    };
+
+    // Corners.
+    emit(0u, 0u);
+    emit(last, 0u);
+    emit(last, last);
+    emit(0u, last);
+    // Edge interiors, one edge at a time.
+    for (std::size_t i = 1u; i < last; ++i) {
+        emit(i, 0u);
+    }
+    for (std::size_t j = 1u; j < last; ++j) {
+        emit(last, j);
+    }
+    for (std::size_t i = last - 1u; i > 0u; --i) {
+        emit(i, last);
+    }
+    for (std::size_t j = last - 1u; j > 0u; --j) {
+        emit(0u, j);
+    }
+    // Interior nodes, row by row.
+    for (std::size_t j = 1u; j < last; ++j) {
+        for (std::size_t i = 1u; i < last; ++i) {
+            emit(i, j);
+        }
+    }
+}
+
+// Gradient-only path for the 9 x 9 node (order 8) quadrilateral at exactly four points.
+// Each node owns three consecutive output_stride-long rows: d/dx, d/dy and a zero row,
+// with point q in column q.
+SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_quad_order8_gradients_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr int N = 9;
+    constexpr std::size_t last = 8u;
+    Real xv[4][N];
+    Real xd[4][N];
+    Real yv[4][N];
+    Real yd[4][N];
+    for (std::size_t q = 0; q < 4u; ++q) {
+        fill_line_values_product_derivatives<N>(points[q][0], xv[q], xd[q], nullptr);
+        fill_line_values_product_derivatives<N>(points[q][1], yv[q], yd[q], nullptr);
+    }
+
+    std::size_t node = 0u;
+    const auto emit = [&](std::size_t i, std::size_t j) {
+        Real* SVMP_RESTRICT block = gradients_out + node * 3u * output_stride;
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[q] = xd[q][i] * yv[q][j];
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[output_stride + q] = xv[q][i] * yd[q][j];
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[2u * output_stride + q] = Real(0);
+        }
+        ++node;
+    };
+
+    // Corners, edge interiors (one edge at a time), then the interior grid.
+    emit(0u, 0u);
+    emit(last, 0u);
+    emit(last, last);
+    emit(0u, last);
+    for (std::size_t i = 1u; i < last; ++i) {
+        emit(i, 0u);
+    }
+    for (std::size_t j = 1u; j < last; ++j) {
+        emit(last, j);
+    }
+    for (std::size_t i = last - 1u; i > 0u; --i) {
+        emit(i, last);
+    }
+    for (std::size_t j = last - 1u; j > 0u; --j) {
+        emit(0u, j);
+    }
+    for (std::size_t j = 1u; j < last; ++j) {
+        for (std::size_t i = 1u; i < last; ++i) {
+            emit(i, j);
+        }
+    }
+}
+
+// Line-basis gradients at exactly four points via Horner evaluation in x (component 0
+// of each point). d_coeffs stores, for each axis slot, N - 1 coefficients in
+// ascending power order, i.e. a polynomial of degree N - 2. For every public node
+// three output_stride-long rows are written: the x-derivative followed by two zero
+// rows, with point q at column q.
+template<int N>
+void evaluate_line_gradients_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT d_coeffs,
+    Real* SVMP_RESTRICT gradients_out) {
+    constexpr std::size_t coeff_count = static_cast<std::size_t>(N - 1);
+    const Real xs[4] = {points[0][0], points[1][0], points[2][0], points[3][0]};
+
+    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
+        const Real* const coeffs = d_coeffs + line_public_axis_index<N>(node) * coeff_count;
+
+        // Start from the leading coefficient and fold in the lower ones.
+        Real acc[4] = {coeffs[N - 2], coeffs[N - 2], coeffs[N - 2], coeffs[N - 2]};
+        for (int k = N - 3; k >= 0; --k) {
+            const Real ck = coeffs[k];
+            for (std::size_t q = 0; q < 4u; ++q) {
+                acc[q] = acc[q] * xs[q] + ck;
+            }
+        }
+
+        Real* const block = gradients_out + node * 3u * output_stride;
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[q] = acc[q];
+        }
+        // Components 1 and 2 of a line gradient are written as zero.
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[output_stride + q] = Real(0);
+        }
+        for (std::size_t q = 0; q < 4u; ++q) {
+            block[2u * output_stride + q] = Real(0);
+        }
+    }
+}
+
+
+// Four-point line value dispatch. Despite the "horner" in the name, every supported
+// case calls a product-form kernel, so v_coeffs is accepted but never read.
+// n_axis == 5 uses the hand-unrolled quartic kernel; 6..9 use the generic template.
+// Returns true when a kernel ran and false, with nothing written, otherwise.
+bool try_evaluate_line_values_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT v_coeffs,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out) {
+    (void)v_coeffs;
+    if (n_axis == 5) {
+        evaluate_line_order4_values_q4(points, output_stride, values_out);
+    } else if (n_axis == 6) {
+        evaluate_line_values_product_q4<6>(points, output_stride, values_out);
+    } else if (n_axis == 7) {
+        evaluate_line_values_product_q4<7>(points, output_stride, values_out);
+    } else if (n_axis == 8) {
+        evaluate_line_values_product_q4<8>(points, output_stride, values_out);
+    } else if (n_axis == 9) {
+        evaluate_line_values_product_q4<9>(points, output_stride, values_out);
+    } else {
+        // No kernel for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Four-point line gradient dispatch onto evaluate_line_gradients_horner_q4<N>.
+// Only n_axis in [5, 9] has an instantiation here; points, output_stride, d_coeffs
+// and gradients_out are forwarded unchanged. Returns true when a kernel ran and
+// false, with nothing written, for any other n_axis.
+bool try_evaluate_line_gradients_horner_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* SVMP_RESTRICT d_coeffs,
+    int n_axis,
+    Real* SVMP_RESTRICT gradients_out) {
+    if (n_axis == 5) {
+        evaluate_line_gradients_horner_q4<5>(points, output_stride, d_coeffs, gradients_out);
+    } else if (n_axis == 6) {
+        evaluate_line_gradients_horner_q4<6>(points, output_stride, d_coeffs, gradients_out);
+    } else if (n_axis == 7) {
+        evaluate_line_gradients_horner_q4<7>(points, output_stride, d_coeffs, gradients_out);
+    } else if (n_axis == 8) {
+        evaluate_line_gradients_horner_q4<8>(points, output_stride, d_coeffs, gradients_out);
+    } else if (n_axis == 9) {
+        evaluate_line_gradients_horner_q4<9>(points, output_stride, d_coeffs, gradients_out);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Four-point line Hessian dispatch onto evaluate_line_hessians_product_q4<N>.
+// Only n_axis in [5, 9] has an instantiation here; the remaining arguments are
+// forwarded unchanged. Returns true when a kernel ran and false, with nothing
+// written, for any other n_axis.
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_hessians_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (n_axis == 5) {
+        evaluate_line_hessians_product_q4<5>(points, output_stride, hessians_out);
+    } else if (n_axis == 6) {
+        evaluate_line_hessians_product_q4<6>(points, output_stride, hessians_out);
+    } else if (n_axis == 7) {
+        evaluate_line_hessians_product_q4<7>(points, output_stride, hessians_out);
+    } else if (n_axis == 8) {
+        evaluate_line_hessians_product_q4<8>(points, output_stride, hessians_out);
+    } else if (n_axis == 9) {
+        evaluate_line_hessians_product_q4<9>(points, output_stride, hessians_out);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Four-point line dispatch for values + gradients + Hessians together, onto
+// evaluate_line_all_product_q4<N>. Only n_axis in [5, 9] has an instantiation here;
+// the remaining arguments are forwarded unchanged. Returns true when a kernel ran
+// and false, with nothing written, for any other n_axis.
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_all_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (n_axis == 5) {
+        evaluate_line_all_product_q4<5>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 6) {
+        evaluate_line_all_product_q4<6>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 7) {
+        evaluate_line_all_product_q4<7>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 8) {
+        evaluate_line_all_product_q4<8>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 9) {
+        evaluate_line_all_product_q4<9>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Four-point quadrilateral value dispatch onto evaluate_quad_values_product_q4<N>.
+// Only n_axis in [5, 9] has an instantiation here; the remaining arguments are
+// forwarded unchanged. Returns true when a kernel ran and false, with nothing
+// written, for any other n_axis.
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_values_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out) {
+    if (n_axis == 5) {
+        evaluate_quad_values_product_q4<5>(points, output_stride, values_out);
+    } else if (n_axis == 6) {
+        evaluate_quad_values_product_q4<6>(points, output_stride, values_out);
+    } else if (n_axis == 7) {
+        evaluate_quad_values_product_q4<7>(points, output_stride, values_out);
+    } else if (n_axis == 8) {
+        evaluate_quad_values_product_q4<8>(points, output_stride, values_out);
+    } else if (n_axis == 9) {
+        evaluate_quad_values_product_q4<9>(points, output_stride, values_out);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+// Four-point quadrilateral dispatch onto evaluate_quad_derivatives_product_q4<N>.
+// Only n_axis in [5, 9] has an instantiation here; the three output pointers are
+// forwarded as given (including nulls). Returns true when a kernel ran and false,
+// with nothing written, for any other n_axis.
+SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_derivatives_product_q4(
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (n_axis == 5) {
+        evaluate_quad_derivatives_product_q4<5>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 6) {
+        evaluate_quad_derivatives_product_q4<6>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 7) {
+        evaluate_quad_derivatives_product_q4<7>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 8) {
+        evaluate_quad_derivatives_product_q4<8>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (n_axis == 9) {
+        evaluate_quad_derivatives_product_q4<9>(
+            points, output_stride, values_out, gradients_out, hessians_out);
+    } else {
+        // No instantiation for this node count.
+        return false;
+    }
+    return true;
+}
+
+void evaluate_tensor_product_points_strided(
+    LagrangeTopology topology,
+    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* v_coeffs,
+    const Real* d_coeffs,
+    const Real* d2_coeffs,
+    const Real* barycentric_weights,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_qpts = points.size();
+    if (num_qpts == 0 || tensor_indices.empty()) {
+        return;
+    }
+
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+    const bool values_only = values_out != nullptr && !need_grad && !need_hess;
+    const bool gradients_only = values_out == nullptr && need_grad && !need_hess;
+    const bool hessians_only = values_out == nullptr && gradients_out == nullptr && need_hess;
+    const bool all_outputs = values_out != nullptr && need_grad && need_hess;
+    const AxisDeriv level = need_hess
+        ? AxisDeriv::ValuesAndFirstAndSecond
+        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
+
+    if (topology == LagrangeTopology::Line && num_qpts == 4u) {
+        if (values_only &&
+            try_evaluate_line_values_horner_q4(
+                points, output_stride, v_coeffs, n_axis, values_out)) {
+            return;
+        }
+        if (gradients_only &&
+            try_evaluate_line_gradients_horner_q4(
+                points, output_stride, d_coeffs, n_axis, gradients_out)) {
+            return;
+        }
+        if (hessians_only &&
+            try_evaluate_line_hessians_product_q4(
+                points, output_stride, n_axis, hessians_out)) {
+            return;
+        }
+        if (all_outputs &&
+            try_evaluate_line_all_product_q4(
+                points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
+            return;
+        }
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        values_only &&
+        num_qpts == 4u &&
+        try_evaluate_quad_values_product_q4(points, output_stride, n_axis, values_out)) {
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis == 5) {
+        evaluate_quad_order4_gradients_q4(points, output_stride, gradients_out);
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis == 9) {
+        evaluate_quad_order8_gradients_product_q4(points, output_stride, gradients_out);
+        return;
+    }
+    if (topology == LagrangeTopology::Quadrilateral &&
+        (gradients_only || hessians_only || all_outputs) &&
+        num_qpts == 4u &&
+        try_evaluate_quad_derivatives_product_q4(
+            points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
+        return;
+    }
+
+    auto& scratch = evaluate_scratch();
+    AxisBatchScratch& x_batch = scratch.axis_x_batch;
+    AxisBatchScratch& y_batch = scratch.axis_y_batch;
+    AxisBatchScratch& z_batch = scratch.axis_z_batch;
+
+    const bool has_y = topology != LagrangeTopology::Line;
+    const bool has_z = topology == LagrangeTopology::Hexahedron;
+    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
+    const bool use_product_axis_batch =
+        has_z &&
+        gradients_only &&
+        num_qpts == 4u &&
+        n_axis >= 5 &&
+        n_axis <= 9;
+    auto fill_tensor_axis_batch = [&](AxisBatchScratch& batch, std::size_t component) {
+        if (use_product_axis_batch &&
+            try_fill_axis_batch_product_q4(batch, points, component, n_axis, level)) {
+            return;
+        }
+        fill_axis_batch(batch, points, component, v_coeffs, d_coeffs, d2_coeffs,
+                        barycentric_weights, n_axis, level);
+    };
+
+    fill_tensor_axis_batch(x_batch, 0u);
+    if (!has_y) {
+        if (values_only) {
+            if (num_qpts == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const std::size_t i = tensor_indices[node][0];
+                    Real* value_row = values_out + node * output_stride;
+                    value_row[0] = x_batch.values[i];
+                    value_row[1] = x_batch.values[axis_stride + i];
+                    value_row[2] = x_batch.values[2u * axis_stride + i];
+                    value_row[3] = x_batch.values[3u * axis_stride + i];
+                }
+                return;
+            }
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const std::size_t i = tensor_indices[node][0];
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    value_row[q] = x_batch.values[q * axis_stride + i];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const std::size_t i = tensor_indices[node][0];
+                Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    grad_row[0u * output_stride + q] =
+                        x_batch.first[q * axis_stride + i];
+                    grad_row[1u * output_stride + q] = Real(0);
+                    grad_row[2u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+            const std::size_t i = tensor_indices[node][0];
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t q_axis = q * axis_stride + i;
+                if (value_row != nullptr) {
+                    value_row[q] = x_batch.values[q_axis];
+                }
+                if (need_grad) {
+                    grad_row[0u * output_stride + q] = x_batch.first[q_axis];
+                    grad_row[1u * output_stride + q] = Real(0);
+                    grad_row[2u * output_stride + q] = Real(0);
+                }
+                if (need_hess) {
+                    hess_row[0u * output_stride + q] = x_batch.second[q_axis];
+                    hess_row[1u * output_stride + q] = Real(0);
+                    hess_row[2u * output_stride + q] = Real(0);
+                    hess_row[3u * output_stride + q] = Real(0);
+                    hess_row[4u * output_stride + q] = Real(0);
+                    hess_row[5u * output_stride + q] = Real(0);
+                    hess_row[6u * output_stride + q] = Real(0);
+                    hess_row[7u * output_stride + q] = Real(0);
+                    hess_row[8u * output_stride + q] = Real(0);
+                }
+            }
+        }
+        return;
+    }
+    const bool use_tensor_tables =
+        has_z ||
+        (axis_stride == 8u && !(need_hess && values_out == nullptr && gradients_out == nullptr));
+    if (use_tensor_tables) {
+        fill_tensor_axis_batch(y_batch, 1u);
+    } else if (has_y) {
+        fill_tensor_axis_batch(y_batch, 1u);
+    }
+    if (has_z) {
+        fill_tensor_axis_batch(z_batch, 2u);
+    }
+
+    if (use_tensor_tables) {
+        const std::size_t ny = axis_stride;
+        const std::size_t nz = has_z ? axis_stride : 1u;
+        const std::size_t nyz = ny * nz;
+        const std::size_t table_count = num_qpts * nyz;
+
+        if (has_z && num_qpts == 4u && output_stride == 4u) {
+            if (values_only &&
+                evaluate_tensor_product_values_stride4_q4_transposed(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, values_out)) {
+                return;
+            }
+            if (gradients_only &&
+                evaluate_tensor_product_gradients_stride4_q4_transposed(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, gradients_out)) {
+                return;
+            }
+            if (hessians_only &&
+                evaluate_tensor_product_second_stride4_q4_transposed<false>(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
+                    nullptr, nullptr, hessians_out)) {
+                return;
+            }
+            if (all_outputs &&
+                evaluate_tensor_product_second_stride4_q4_transposed<true>(
+                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
+                    values_out, gradients_out, hessians_out)) {
+                return;
+            }
+        }
+
+        Real Mvv_stack[kMaxStackYZ];
+        Real Mdv_stack[kMaxStackYZ];
+        Real Mvd_stack[kMaxStackYZ];
+        Real Md2v_stack[kMaxStackYZ];
+        Real Mvd2_stack[kMaxStackYZ];
+        Real Mdd_stack[kMaxStackYZ];
+
+        Real* Mvv;
+        Real* Mdv;
+        Real* Mvd;
+        Real* Md2v;
+        Real* Mvd2;
+        Real* Mdd;
+        if (table_count <= kMaxStackYZ) {
+            Mvv = Mvv_stack;
+            Mdv = Mdv_stack;
+            Mvd = Mvd_stack;
+            Md2v = Md2v_stack;
+            Mvd2 = Mvd2_stack;
+            Mdd = Mdd_stack;
+        } else {
+            auto& tables = scratch.tensor_tables;
+            tables.resizeFor(table_count);
+            Mvv = tables.vv.data();
+            Mdv = tables.dv.data();
+            Mvd = tables.vd.data();
+            Md2v = tables.d2v.data();
+            Mvd2 = tables.vd2.data();
+            Mdd = tables.dd.data();
+        }
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const std::size_t q_axis = q * axis_stride;
+            const std::size_t q_table = q * nyz;
+            for (std::size_t j = 0; j < ny; ++j) {
+                const Real yv = y_batch.values[q_axis + j];
+                const Real yd = (need_grad || need_hess) ? y_batch.first[q_axis + j] : Real(0);
+                const Real y2 = need_hess ? y_batch.second[q_axis + j] : Real(0);
+                for (std::size_t k = 0; k < nz; ++k) {
+                    const std::size_t slot = q_table + j * nz + k;
+                    const Real zv = has_z ? z_batch.values[q_axis + k] : Real(1);
+                    Mvv[slot] = yv * zv;
+                    if (need_grad || need_hess) {
+                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
+                        Mdv[slot] = yd * zv;
+                        Mvd[slot] = yv * zd;
+                    }
+                    if (need_hess) {
+                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
+                        const Real z2 = has_z ? z_batch.second[q_axis + k] : Real(0);
+                        Md2v[slot] = y2 * zv;
+                        Mvd2[slot] = yv * z2;
+                        Mdd[slot] = yd * zd;
+                    }
+                }
+            }
+        }
+
+        if (values_only) {
+            if (has_z && num_qpts == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+
+                    write_tensor_product_value_strided_q<0>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<1>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<2>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                    write_tensor_product_value_strided_q<3>(
+                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
+                }
+                return;
+            }
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const auto& idx = tensor_indices[node];
+                const std::size_t i = idx[0];
+                const std::size_t jk = idx[1] * nz + idx[2];
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t q_axis = q * axis_stride;
+                    const std::size_t slot = q * nyz + jk;
+                    value_row[q] = x_batch.values[q_axis + i] * Mvv[slot];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            if (has_z && num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                        const auto& idx = tensor_indices[node];
+                        const std::size_t i = idx[0];
+                        const std::size_t jk = idx[1] * nz + idx[2];
+                        Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                        write_tensor_product_gradient_stride4_q<0>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<1>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<2>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_stride4_q<3>(
+                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                        const auto& idx = tensor_indices[node];
+                        const std::size_t i = idx[0];
+                        const std::size_t jk = idx[1] * nz + idx[2];
+                        Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                        write_tensor_product_gradient_strided_q<0>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<1>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<2>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                        write_tensor_product_gradient_strided_q<3>(
+                            axis_stride, nyz, i, jk, output_stride, x_batch,
+                            Mvv, Mdv, Mvd, grad_row);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                const auto& idx = tensor_indices[node];
+                const std::size_t i = idx[0];
+                const std::size_t jk = idx[1] * nz + idx[2];
+                Real* grad_row = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t q_axis = q * axis_stride;
+                    const std::size_t slot = q * nyz + jk;
+                    const Real xv = x_batch.values[q_axis + i];
+                    const Real xd = x_batch.first[q_axis + i];
+                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
+                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
+                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
+                }
+            }
+            return;
+        }
+
+        if (has_z && num_qpts == 4u && hessians_only) {
+            if (output_stride == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_hessian_stride4_q<0>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<1>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<2>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_stride4_q<3>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                }
+            } else {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_hessian_strided_q<0>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<1>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<2>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                    write_tensor_product_hessian_strided_q<3>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
+                }
+            }
+            return;
+        }
+
+        if (has_z && num_qpts == 4u && all_outputs) {
+            if (output_stride == 4u) {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+                    Real* grad_row = gradients_out + node * 3u * output_stride;
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_all_stride4_q<0>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<1>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<2>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_stride4_q<3>(
+                        axis_stride, nyz, i, jk, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                }
+            } else {
+                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+                    const auto& idx = tensor_indices[node];
+                    const std::size_t i = idx[0];
+                    const std::size_t jk = idx[1] * nz + idx[2];
+                    Real* value_row = values_out + node * output_stride;
+                    Real* grad_row = gradients_out + node * 3u * output_stride;
+                    Real* hess_row = hessians_out + node * 9u * output_stride;
+
+                    write_tensor_product_all_strided_q<0>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<1>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<2>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                    write_tensor_product_all_strided_q<3>(
+                        axis_stride, nyz, i, jk, output_stride, x_batch,
+                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+            const auto& idx = tensor_indices[node];
+            const std::size_t i = idx[0];
+            const std::size_t jk = idx[1] * nz + idx[2];
+
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t q_axis = q * axis_stride;
+                const std::size_t slot = q * nyz + jk;
+                const Real xv = x_batch.values[q_axis + i];
+
+                if (value_row != nullptr) {
+                    value_row[q] = xv * Mvv[slot];
+                }
+
+                if (need_grad) {
+                    const Real xd = x_batch.first[q_axis + i];
+                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
+                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
+                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
+                }
+
+                if (need_hess) {
+                    const Real xd = x_batch.first[q_axis + i];
+                    const Real x2 = x_batch.second[q_axis + i];
+                    const Real hxy = xd * Mdv[slot];
+                    const Real hxz = xd * Mvd[slot];
+                    const Real hyz = xv * Mdd[slot];
+                    hess_row[0u * output_stride + q] = x2 * Mvv[slot];
+                    hess_row[4u * output_stride + q] = xv * Md2v[slot];
+                    hess_row[8u * output_stride + q] = xv * Mvd2[slot];
+                    hess_row[1u * output_stride + q] = hxy;
+                    hess_row[3u * output_stride + q] = hxy;
+                    hess_row[2u * output_stride + q] = hxz;
+                    hess_row[6u * output_stride + q] = hxz;
+                    hess_row[5u * output_stride + q] = hyz;
+                    hess_row[7u * output_stride + q] = hyz;
+                }
+            }
+        }
+        return;
+    }
+
+    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
+        const auto& idx = tensor_indices[node];
+        const std::size_t i = idx[0];
+        const std::size_t j = idx[1];
+
+        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const std::size_t q_axis = q * axis_stride;
+            const Real xv = x_batch.values[q_axis + i];
+            const Real yv = y_batch.values[q_axis + j];
+
+            if (value_row != nullptr) {
+                value_row[q] = xv * yv;
+            }
+
+            if (need_grad) {
+                const Real xd = x_batch.first[q_axis + i];
+                const Real yd = y_batch.first[q_axis + j];
+                grad_row[0u * output_stride + q] = xd * yv;
+                grad_row[1u * output_stride + q] = xv * yd;
+                grad_row[2u * output_stride + q] = Real(0);
+            }
+
+            if (need_hess) {
+                const Real xd = x_batch.first[q_axis + i];
+                const Real yd = y_batch.first[q_axis + j];
+                const Real x2 = x_batch.second[q_axis + i];
+                const Real y2 = y_batch.second[q_axis + j];
+                const Real hxy = xd * yd;
+
+                hess_row[0u * output_stride + q] = x2 * yv;
+                hess_row[4u * output_stride + q] = xv * y2;
+                hess_row[8u * output_stride + q] = Real(0);
+                hess_row[1u * output_stride + q] = hxy;
+                hess_row[3u * output_stride + q] = hxy;
+                hess_row[2u * output_stride + q] = Real(0);
+                hess_row[6u * output_stride + q] = Real(0);
+                hess_row[5u * output_stride + q] = Real(0);
+                hess_row[7u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+
+void evaluate_wedge_points_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
+    const std::vector<std::size_t>& wedge_node_by_tri_z,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    const Real* v_coeffs,
+    const Real* d_coeffs,
+    const Real* d2_coeffs,
+    const Real* barycentric_weights,
+    int n_axis,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (points.empty() || wedge_indices.empty()) {
+        return;
+    }
+
+    const bool want_values = values_out != nullptr;
+    const bool need_grad = gradients_out != nullptr;
+    const bool need_hess = hessians_out != nullptr;
+    const bool values_only = want_values && !need_grad && !need_hess;
+    const bool gradients_only = !want_values && need_grad && !need_hess;
+    const bool hessians_only = !want_values && !need_grad && need_hess;
+    const bool all_outputs = want_values && need_grad && need_hess;
+    const bool use_batched_wedge =
+        (values_only && order <= 3) ||
+        (gradients_only && order >= 2) ||
+        (hessians_only && order >= 3) ||
+        (all_outputs && order >= 3);
+    if (values_only &&
+        order >= 4 &&
+        order <= 8 &&
+        try_evaluate_wedge_values_product_q4(
+            simplex_exponents, wedge_indices, order, points, output_stride, values_out)) {
+        return;
+    }
+    const AxisDeriv level = need_hess
+        ? AxisDeriv::ValuesAndFirstAndSecond
+        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
+
+    LagrangeEvaluateScratch& scratch = evaluate_scratch();
+    const std::size_t tri_count = simplex_exponents.size();
+    if (use_batched_wedge) {
+        const std::size_t num_qpts = points.size();
+        const std::size_t tri_stride = num_qpts;
+        if (num_qpts == 4u &&
+            output_stride == 4u &&
+            (gradients_only || hessians_only || all_outputs) &&
+            order >= 3 &&
+            order <= 8 &&
+            wedge_node_by_tri_z.size() == tri_count * static_cast<std::size_t>(n_axis)) {
+            const bool use_product_axis_batch =
+                gradients_only &&
+                n_axis >= 5 &&
+                n_axis <= 9;
+            if (!use_product_axis_batch ||
+                !try_fill_axis_batch_product_q4(
+                    scratch.axis_z_batch, points, 2u, n_axis, level)) {
+                fill_axis_batch(scratch.axis_z_batch,
+                                points,
+                                2u,
+                                v_coeffs,
+                                d_coeffs,
+                                d2_coeffs,
+                                barycentric_weights,
+                                n_axis,
+                                level);
+            }
+            if (need_hess) {
+                if (try_evaluate_wedge_fused_stride4_q4<true>(
+                        simplex_exponents, wedge_node_by_tri_z, order, points,
+                        scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
+                    return;
+                }
+            } else if (try_evaluate_wedge_fused_stride4_q4<false>(
+                           simplex_exponents, wedge_node_by_tri_z, order, points,
+                           scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
+                return;
+            }
+        }
+
+        const std::size_t tri_values_size = tri_count * tri_stride;
+        scratch.wedge_tri_values_batch.resize(tri_values_size);
+        if (need_grad || need_hess) {
+            scratch.wedge_tri_gradient_batch.resize(tri_count * 2u * tri_stride);
+        }
+        if (need_hess) {
+            scratch.wedge_tri_hessian_batch.resize(tri_count * 3u * tri_stride);
+        }
+
+        detail::evaluate_triangle_simplex_basis_wedge_components_strided(
+            simplex_exponents,
+            order,
+            points,
+            tri_stride,
+            scratch.wedge_tri_values_batch.data(),
+            (need_grad || need_hess) ? scratch.wedge_tri_gradient_batch.data() : nullptr,
+            need_hess ? scratch.wedge_tri_hessian_batch.data() : nullptr);
+
+        const bool use_product_axis_batch =
+            gradients_only &&
+            points.size() == 4u &&
+            n_axis >= 5 &&
+            n_axis <= 9;
+        if (!use_product_axis_batch ||
+            !try_fill_axis_batch_product_q4(
+                scratch.axis_z_batch, points, 2u, n_axis, level)) {
+            fill_axis_batch(scratch.axis_z_batch,
+                            points,
+                            2u,
+                            v_coeffs,
+                            d_coeffs,
+                            d2_coeffs,
+                            barycentric_weights,
+                            n_axis,
+                            level);
+        }
+
+        const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
+        if (all_outputs) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* value_row = values_out + node * output_stride;
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_all_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* value_row = values_out + node * output_stride;
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_all_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                        write_wedge_all_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* value_row = values_out + node * output_stride;
+                Real* g = gradients_out + node * 3u * output_stride;
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real tri_gx = tri_g[0u * tri_stride + q];
+                    const Real tri_gy = tri_g[1u * tri_stride + q];
+                    const Real tri_hxx = tri_H[0u * tri_stride + q];
+                    const Real tri_hxy = tri_H[1u * tri_stride + q];
+                    const Real tri_hyy = tri_H[2u * tri_stride + q];
+                    const Real hxz = tri_gx * zd;
+                    const Real hxy = tri_hxy * zv;
+                    const Real hyz = tri_gy * zd;
+
+                    value_row[q] = tri_v * zv;
+                    g[0u * output_stride + q] = tri_gx * zv;
+                    g[1u * output_stride + q] = tri_gy * zv;
+                    g[2u * output_stride + q] = tri_v * zd;
+                    H[0u * output_stride + q] = tri_hxx * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_hyy * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+            return;
+        }
+
+        if (hessians_only) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_hessian_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* H = hessians_out + node * 9u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_hessian_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                        write_wedge_hessian_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real tri_gx = tri_g[0u * tri_stride + q];
+                    const Real tri_gy = tri_g[1u * tri_stride + q];
+                    const Real tri_hxx = tri_H[0u * tri_stride + q];
+                    const Real tri_hxy = tri_H[1u * tri_stride + q];
+                    const Real tri_hyy = tri_H[2u * tri_stride + q];
+                    const Real hxz = tri_gx * zd;
+                    const Real hxy = tri_hxy * zv;
+                    const Real hyz = tri_gy * zd;
+
+                    H[0u * output_stride + q] = tri_hxx * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_hyy * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+            return;
+        }
+
+        if (gradients_only) {
+            if (num_qpts == 4u) {
+                if (output_stride == 4u) {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_gradient_stride4_q<0>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<1>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<2>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_stride4_q<3>(
+                            tri_stride, axis_stride, tri, z,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                    }
+                } else {
+                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                        const auto& index = wedge_indices[node];
+                        const std::size_t tri = index[0];
+                        const std::size_t z = index[1];
+                        Real* g = gradients_out + node * 3u * output_stride;
+                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
+
+                        write_wedge_gradient_strided_q<0>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<1>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<2>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                        write_wedge_gradient_strided_q<3>(
+                            tri_stride, axis_stride, tri, z, output_stride,
+                            tri_values, tri_g, scratch.axis_z_batch, g);
+                    }
+                }
+                return;
+            }
+
+            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+                const auto& index = wedge_indices[node];
+                const std::size_t tri = index[0];
+                const std::size_t z = index[1];
+                Real* g = gradients_out + node * 3u * output_stride;
+                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t tri_q = tri * tri_stride + q;
+                    const std::size_t z_q = q * axis_stride + z;
+                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                    const Real zv = scratch.axis_z_batch.values[z_q];
+                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
+                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
+                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
+                }
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const auto& index = wedge_indices[node];
+            const std::size_t tri = index[0];
+            const std::size_t z = index[1];
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t tri_q = tri * tri_stride + q;
+                const std::size_t z_q = q * axis_stride + z;
+                const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
+                const Real zv = scratch.axis_z_batch.values[z_q];
+                if (values_out != nullptr) {
+                    value_row[q] = tri_v * zv;
+                }
+
+                if (need_grad) {
+                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
+                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
+                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
+                }
+
+                if (need_hess) {
+                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
+                    const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
+                    const Real zd = scratch.axis_z_batch.first[z_q];
+                    const Real hxz = tri_g[0u * tri_stride + q] * zd;
+                    const Real hxy = tri_H[1u * tri_stride + q] * zv;
+                    const Real hyz = tri_g[1u * tri_stride + q] * zd;
+                    H[0u * output_stride + q] = tri_H[0u * tri_stride + q] * zv;
+                    H[1u * output_stride + q] = hxy;
+                    H[2u * output_stride + q] = hxz;
+                    H[3u * output_stride + q] = hxy;
+                    H[4u * output_stride + q] = tri_H[2u * tri_stride + q] * zv;
+                    H[5u * output_stride + q] = hyz;
+                    H[6u * output_stride + q] = hxz;
+                    H[7u * output_stride + q] = hyz;
+                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
+                }
+            }
+        }
+
+        return;
+    }
+
+    scratch.tri_values.resize(tri_count);
+    if (need_grad || need_hess) {
+        scratch.tri_gradient_components.resize(tri_count * 3u);
+    }
+    if (need_hess) {
+        scratch.tri_hessian_components.resize(tri_count * 9u);
+    }
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const AxisBasisEvaluations z_axis =
+            fill_axis_scratch(scratch.axis_z,
+                              v_coeffs,
+                              d_coeffs,
+                              d2_coeffs,
+                              barycentric_weights,
+                              n_axis,
+                              xi[2],
+                              level);
+        detail::evaluate_triangle_simplex_basis_to(
+            simplex_exponents,
+            order,
+            xi,
+            scratch.tri_values.data(),
+            (need_grad || need_hess) ? scratch.tri_gradient_components.data() : nullptr,
+            need_hess ? scratch.tri_hessian_components.data() : nullptr);
+
+        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
+            const auto& index = wedge_indices[node];
+            const std::size_t tri = index[0];
+            const std::size_t z = index[1];
+            const Real tri_v = scratch.tri_values[tri];
+            const Real zv = z_axis.values[z];
+
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = tri_v * zv;
+            }
+
+            if (need_grad) {
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = tri_g[0] * zv;
+                g[1u * output_stride + q] = tri_g[1] * zv;
+                g[2u * output_stride + q] = tri_v * z_axis.first[z];
+            }
+
+            if (need_hess) {
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real zd = z_axis.first[z];
+                const Real hxz = tri_g[0] * zd;
+                const Real hxy = tri_H[1] * zv;
+                const Real hyz = tri_g[1] * zd;
+                Real* H = hessians_out + node * 9u * output_stride;
+                H[0u * output_stride + q] = tri_H[0] * zv;
+                H[1u * output_stride + q] = hxy;
+                H[2u * output_stride + q] = hxz;
+                H[3u * output_stride + q] = hxy;
+                H[4u * output_stride + q] = tri_H[4] * zv;
+                H[5u * output_stride + q] = hyz;
+                H[6u * output_stride + q] = hxz;
+                H[7u * output_stride + q] = hyz;
+                H[8u * output_stride + q] = tri_v * z_axis.second[z];
+            }
+        }
+    }
+}
+
+// Canonicalizes an (element type, order) request before a LagrangeBasis is built.
+//
+// Fixed-node-count complete elements (Line3, Triangle6, Quad9, Tetra10, Hex27,
+// Wedge18, Pyramid14) are rewritten to their lowest-order element type with the
+// order raised to at least 2. Quad8, Hex20, Wedge15 and Pyramid13 are rejected
+// with BasisElementCompatibilityException. Any other element type is returned
+// unchanged together with the caller's order.
+NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
+    // Order used whenever a quadratic element type is mapped to its base element.
+    const int promoted_order = std::max(order, 2);
+
+    switch (element_type) {
+        // --- Complete-family elements: base element type, order >= 2. ---
+        case ElementType::Line3:
+            return {ElementType::Line2, promoted_order};
+        case ElementType::Triangle6:
+            return {ElementType::Triangle3, promoted_order};
+        case ElementType::Quad9:
+            return {ElementType::Quad4, promoted_order};
+        case ElementType::Tetra10:
+            return {ElementType::Tetra4, promoted_order};
+        case ElementType::Hex27:
+            return {ElementType::Hex8, promoted_order};
+        case ElementType::Wedge18:
+            return {ElementType::Wedge6, promoted_order};
+        case ElementType::Pyramid14:
+            return {ElementType::Pyramid5, promoted_order};
+
+        // --- Serendipity elements: not representable by LagrangeBasis. ---
+        case ElementType::Quad8:
+            throw BasisElementCompatibilityException(
+                "Quad8 is a serendipity element; use SerendipityBasis for Quad8",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Hex20:
+            throw BasisElementCompatibilityException(
+                "Hex20 is a serendipity element; use SerendipityBasis for Hex20",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Wedge15:
+            throw BasisElementCompatibilityException(
+                "Wedge15 is a serendipity element; use SerendipityBasis for Wedge15",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Pyramid13:
+            throw BasisElementCompatibilityException(
+                "Pyramid13 is a serendipity variant; use SerendipityBasis (Pyramid13) or the complete-family Lagrange path via LagrangeBasis (Pyramid5, order >= 2)",
+                __FILE__, __LINE__, __func__);
+
+        // --- Everything else passes through untouched. ---
+        default:
+            return {element_type, order};
+    }
+}
+
+} // namespace
+
+// Forwards the requested maximum order and quadrature-point count to the
+// prewarm() routine of the scratch object returned by evaluate_scratch().
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
+    auto& scratch = evaluate_scratch();
+    scratch.prewarm(max_order, max_qpts);
+}
+
+// Constructs a Lagrange basis for the given element type and polynomial order.
+//
+// The request is first canonicalized by normalize_lagrange_request(), so the
+// stored element type/order may differ from the arguments. Throws
+// BasisConfigurationException when the normalized order is negative; exceptions
+// raised by normalization or node construction propagate to the caller.
+LagrangeBasis::LagrangeBasis(ElementType type, int order)
+    : element_type_(type), dimension_(0), order_(order) {
+    const NormalizedLagrangeRequest request = normalize_lagrange_request(type, order);
+
+    if (request.order < 0) {
+        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
+                                          __FILE__, __LINE__, __func__);
+    }
+
+    // Commit the canonical request, then derive everything else from it.
+    element_type_ = request.element_type;
+    order_ = request.order;
+    dimension_ = lagrange_topology_traits(element_type_).dimension;
+
+    init_nodes();
+    init_evaluation_dispatch();
+}
+
+// Rebuilds all node-related tables for the current element_type_/order_.
+//
+// Every cached table is cleared first, the topology is looked up from the
+// element type and stored in topology_id_, and the matching builder is run.
+// Line, quadrilateral, hexahedron and wedge topologies additionally get their
+// 1D axis coefficient tables computed. Throws BasisElementCompatibilityException
+// when the topology is not one of the handled cases.
+void LagrangeBasis::init_nodes() {
+    // Reset node tables.
+    nodes_.clear();
+    nodes_1d_.clear();
+    tensor_indices_.clear();
+    simplex_exponents_.clear();
+    wedge_indices_.clear();
+    wedge_node_by_tri_z_.clear();
+    // Reset 1D axis tables.
+    axis_v_coeffs_.clear();
+    axis_d_coeffs_.clear();
+    axis_d2_coeffs_.clear();
+    axis_barycentric_weights_.clear();
+
+    const auto topology = lagrange_topology_traits(element_type_).topology;
+    topology_id_ = static_cast<int>(topology);
+
+    bool recognized = false;               // set by every handled topology
+    bool needs_axis_coefficients = false;  // set by topologies with a 1D axis factor
+    switch (topology) {
+        case LagrangeTopology::Point:
+            build_point_nodes();
+            recognized = true;
+            break;
+        case LagrangeTopology::Line:
+            build_tensor_product_nodes(1);
+            recognized = true;
+            needs_axis_coefficients = true;
+            break;
+        case LagrangeTopology::Quadrilateral:
+            build_tensor_product_nodes(2);
+            recognized = true;
+            needs_axis_coefficients = true;
+            break;
+        case LagrangeTopology::Hexahedron:
+            build_tensor_product_nodes(3);
+            recognized = true;
+            needs_axis_coefficients = true;
+            break;
+        case LagrangeTopology::Triangle:
+        case LagrangeTopology::Tetrahedron:
+            build_simplex_nodes();
+            recognized = true;
+            break;
+        case LagrangeTopology::Wedge:
+            build_wedge_nodes();
+            recognized = true;
+            needs_axis_coefficients = true;
+            break;
+        case LagrangeTopology::Pyramid:
+            build_pyramid_nodes();
+            recognized = true;
+            break;
+        case LagrangeTopology::Unknown:
+            break;
+    }
+
+    if (!recognized) {
+        throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
+                                                 __FILE__, __LINE__, __func__);
+    }
+    if (needs_axis_coefficients) {
+        compute_axis_monomial_coefficients();
+    }
+}
+
+// Selects the member function used for single-point vector evaluation, based
+// on the topology recorded in topology_id_. Topologies without a dedicated
+// evaluator (including Unknown) keep the evaluate_unsupported_vectors fallback.
+void LagrangeBasis::init_evaluation_dispatch() {
+    // Install the fallback first; recognized topologies override it below.
+    vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_unsupported_vectors;
+
+    switch (static_cast<LagrangeTopology>(topology_id_)) {
+        case LagrangeTopology::Point:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_point_vectors;
+            break;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron:
+            // All tensor-product shapes share one evaluator.
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tensor_product_vectors;
+            break;
+        case LagrangeTopology::Triangle:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_triangle_vectors;
+            break;
+        case LagrangeTopology::Tetrahedron:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tetrahedron_vectors;
+            break;
+        case LagrangeTopology::Wedge:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_wedge_vectors;
+            break;
+        case LagrangeTopology::Pyramid:
+            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_pyramid_vectors;
+            break;
+        case LagrangeTopology::Unknown:
+            break;
+    }
+}
+
+// Builds the 1D per-axis tables for the nodes currently stored in nodes_1d_.
+//
+// With N = nodes_1d_.size():
+//   - axis_barycentric_weights_ gets N entries from
+//     fill_equispaced_barycentric_weights();
+//   - axis_v_coeffs_  is N x N,     row i = monomial coefficients of L_i,
+//     ascending powers (entry k multiplies x^k);
+//   - axis_d_coeffs_  is N x (N-1), row i = coefficients of L_i';
+//   - axis_d2_coeffs_ is N x (N-2), row i = coefficients of L_i''.
+// If assign_precomputed_axis_coefficients() reports success the coefficient
+// tables are taken from there and nothing else is computed. Does nothing when
+// there are no 1D nodes.
+void LagrangeBasis::compute_axis_monomial_coefficients() {
+    const int N = static_cast<int>(nodes_1d_.size());
+    if (N == 0) return;
+
+    const std::size_t n = static_cast<std::size_t>(N);
+
+    axis_barycentric_weights_.resize(n);
+    fill_equispaced_barycentric_weights(N, axis_barycentric_weights_.data());
+
+    if (assign_precomputed_axis_coefficients(N, axis_v_coeffs_, axis_d_coeffs_, axis_d2_coeffs_)) {
+        return;
+    }
+
+    axis_v_coeffs_.assign(n * n, Real(0));
+    if (N >= 2) {
+        axis_d_coeffs_.assign(n * (n - 1u), Real(0));
+    }
+    if (N >= 3) {
+        axis_d2_coeffs_.assign(n * (n - 2u), Real(0));
+    }
+
+    if (N == 1) {
+        // Single node: the only basis function is the constant 1.
+        axis_v_coeffs_[0] = Real(1);
+        return;
+    }
+
+    // For each L_i, compute monomial coefficients of P_i(x) = prod_{j != i} (x - x_j),
+    // then divide by w_i = prod_{j != i} (x_i - x_j).
+    //
+    // Two buffers are reserved once and ping-ponged via swap(), so the
+    // polynomial products below do not allocate inside the loops.
+    std::vector<Real> coeffs;
+    std::vector<Real> next;
+    coeffs.reserve(n);
+    next.reserve(n);
+
+    for (std::size_t i = 0; i < n; ++i) {
+        const auto x_i = nodes_1d_[i];
+
+        coeffs.assign(1, Real(1));  // start with constant polynomial 1
+        Real denom = Real(1);       // accumulates w_i in the same j order
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j == i) continue;
+            const auto x_j = nodes_1d_[j];
+
+            // next(x) = coeffs(x) * (x - x_j)
+            next.assign(coeffs.size() + 1, Real(0));
+            for (std::size_t k = 0; k < coeffs.size(); ++k) {
+                next[k]     -= x_j * coeffs[k];
+                next[k + 1] += coeffs[k];
+            }
+            coeffs.swap(next);
+
+            denom *= (x_i - x_j);
+        }
+
+        // Divide by w_i.
+        const Real inv_denom = Real(1) / denom;
+        const std::size_t v_row = i * n;
+        for (std::size_t k = 0; k < n; ++k) {
+            axis_v_coeffs_[v_row + k] = coeffs[k] * inv_denom;
+        }
+
+        // First derivative coefficients: d/dx (sum_k c_ik * x^k) = sum_{k>=1} k*c_ik * x^(k-1).
+        const std::size_t d_row = i * (n - 1u);
+        for (std::size_t k = 1; k < n; ++k) {
+            axis_d_coeffs_[d_row + (k - 1u)] = static_cast<Real>(k) * axis_v_coeffs_[v_row + k];
+        }
+
+        // Second derivative coefficients: d^2/dx^2 = sum_{k>=2} k*(k-1)*c_ik * x^(k-2).
+        // The loop is empty for N == 2, where axis_d2_coeffs_ stays empty.
+        const std::size_t d2_row = i * (n - 2u);
+        for (std::size_t k = 2; k < n; ++k) {
+            axis_d2_coeffs_[d2_row + (k - 2u)] =
+                static_cast<Real>(k * (k - 1u)) * axis_v_coeffs_[v_row + k];
+        }
+    }
+}
+
+// Appends the single node of a point element, located at the origin.
+void LagrangeBasis::build_point_nodes() {
+    const math::Vector<Real, 3> origin{Real(0), Real(0), Real(0)};
+    nodes_.push_back(origin);
+}
+
+// Refills nodes_1d_ with one coordinate per lattice index 0..max(order_, 0),
+// each obtained from detail::equispaced_pm_one_coord(index, order_).
+void LagrangeBasis::init_equispaced_1d_nodes() {
+    nodes_1d_.clear();
+
+    const int last_index = std::max(order_, 0);
+    for (int index = 0; index <= last_index; ++index) {
+        const auto coordinate = detail::equispaced_pm_one_coord(index, order_);
+        nodes_1d_.push_back(coordinate);
+    }
+}
+
+// Builds the node list and per-node lattice indices for a tensor-product
+// element with the given number of active axes (1, 2 or 3).
+//
+// Nodes come from ReferenceNodeLayout in its public ordering; each active
+// coordinate is converted to a 1D lattice index with lattice_index_pm_one().
+// Inactive axes keep the index 0 they were initialized with. Throws
+// BasisConfigurationException when `dimensions` is outside [1, 3].
+void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
+    // Note: the 1D node table is rebuilt before the argument is validated.
+    init_equispaced_1d_nodes();
+
+    const bool dimension_supported = (dimensions >= 1) && (dimensions <= 3);
+    if (!dimension_supported) {
+        throw BasisConfigurationException("LagrangeBasis::build_tensor_product_nodes requires dimension 1, 2, or 3",
+                                          __FILE__, __LINE__, __func__);
+    }
+
+    // Error text reported by lattice_index_pm_one(), one entry per axis.
+    static constexpr const char* kAxisErrors[3] = {
+        "LagrangeBasis: invalid tensor-product x-coordinate in public node ordering",
+        "LagrangeBasis: invalid tensor-product y-coordinate in public node ordering",
+        "LagrangeBasis: invalid tensor-product z-coordinate in public node ordering",
+    };
+
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    tensor_indices_.resize(nodes_.size(), TensorNodeIndex{0u, 0u, 0u});
+
+    const std::size_t active_axes = static_cast<std::size_t>(dimensions);
+    for (std::size_t n = 0; n < nodes_.size(); ++n) {
+        for (std::size_t axis = 0; axis < active_axes; ++axis) {
+            tensor_indices_[n][axis] =
+                lattice_index_pm_one(nodes_[n][axis], order_, kAxisErrors[axis]);
+        }
+    }
+}
+
+// Builds the node list for a triangle or tetrahedron and records, for every
+// node in public ordering, the exponent descriptor derived from its
+// coordinates. Throws BasisElementCompatibilityException while processing the
+// first node if the stored topology is neither Triangle nor Tetrahedron.
+void LagrangeBasis::build_simplex_nodes() {
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    const bool is_triangle = (topology == LagrangeTopology::Triangle);
+    const bool is_tetrahedron = (topology == LagrangeTopology::Tetrahedron);
+
+    simplex_exponents_.clear();
+    simplex_exponents_.reserve(nodes_.size());
+    for (std::size_t n = 0; n < nodes_.size(); ++n) {
+        if (is_triangle) {
+            simplex_exponents_.push_back(triangle_exponents_from_public_node(nodes_[n], order_));
+        } else if (is_tetrahedron) {
+            simplex_exponents_.push_back(tetrahedron_exponents_from_public_node(nodes_[n], order_));
+        } else {
+            throw BasisElementCompatibilityException("LagrangeBasis::build_simplex_nodes requires simplex topology",
+                                                     __FILE__, __LINE__, __func__);
+        }
+    }
+}
+
+// Builds the wedge node tables as a (triangle node) x (z lattice index) product.
+//
+// Outputs:
+//   - nodes_1d_            : 1D lattice coordinates (via init_equispaced_1d_nodes);
+//   - simplex_exponents_   : exponent descriptor of every Triangle3 node at order_;
+//   - nodes_               : wedge nodes in public ordering;
+//   - wedge_indices_       : (tri, z) pair for each wedge node;
+//   - wedge_node_by_tri_z_ : inverse map, entry tri * (order_ + 1) + z -> node index.
+//
+// Throws BasisNodeOrderingException when two triangle nodes share a
+// descriptor, when a wedge node's triangle descriptor is unknown, when two
+// wedge nodes resolve to the same (tri, z) slot, or when some (tri, z) slot is
+// left without a node.
+void LagrangeBasis::build_wedge_nodes() {
+    init_equispaced_1d_nodes();
+
+    // --- Triangle factor: descriptor per triangle node, plus reverse lookup. ---
+    const auto triangle_nodes = ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
+    simplex_exponents_.clear();
+    simplex_exponents_.reserve(triangle_nodes.size());
+    std::unordered_map<std::array<int, 4>, std::size_t, SimplexExponentHash> triangle_index_by_exponent;
+    triangle_index_by_exponent.reserve(triangle_nodes.size());
+    for (std::size_t tri = 0; tri < triangle_nodes.size(); ++tri) {
+        const auto exponents = triangle_exponents_from_public_node(triangle_nodes[tri], order_);
+        simplex_exponents_.push_back(exponents);
+        const auto inserted = triangle_index_by_exponent.emplace(exponents, tri);
+        if (!inserted.second) {
+            throw BasisNodeOrderingException("LagrangeBasis: duplicate wedge triangle descriptor",
+                                             __FILE__, __LINE__, __func__);
+        }
+    }
+
+    // --- Wedge nodes: resolve each public node to its (tri, z) pair. ---
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    wedge_indices_.clear();
+    wedge_indices_.reserve(nodes_.size());
+    const std::size_t z_count = static_cast<std::size_t>(order_ + 1);
+    // nodes_.size() is never a valid node index, so it marks "slot not filled".
+    const std::size_t missing_node = nodes_.size();
+    wedge_node_by_tri_z_.assign(triangle_nodes.size() * z_count, missing_node);
+    for (std::size_t node_index = 0; node_index < nodes_.size(); ++node_index) {
+        const auto& node = nodes_[node_index];
+        const auto exponents = triangle_exponents_from_public_node(node, order_);
+        const auto found = triangle_index_by_exponent.find(exponents);
+        if (found == triangle_index_by_exponent.end()) {
+            throw BasisNodeOrderingException("LagrangeBasis: failed to resolve wedge triangle descriptor in public ordering",
+                                             __FILE__, __LINE__, __func__);
+        }
+        const std::size_t tri = found->second;
+        const std::size_t z =
+            lattice_index_pm_one(node[2], order_,
+                                 "LagrangeBasis: invalid wedge z-coordinate in public node ordering");
+
+        // Reject a second node landing on an already-filled slot instead of
+        // silently overwriting the earlier node index.
+        const std::size_t slot = tri * z_count + z;
+        if (wedge_node_by_tri_z_[slot] != missing_node) {
+            throw BasisNodeOrderingException("LagrangeBasis: duplicate wedge node descriptor in public ordering",
+                                             __FILE__, __LINE__, __func__);
+        }
+        wedge_indices_.push_back(WedgeNodeIndex{tri, z});
+        wedge_node_by_tri_z_[slot] = node_index;
+    }
+
+    // --- Every (tri, z) combination must have received exactly one node. ---
+    for (std::size_t entry = 0; entry < wedge_node_by_tri_z_.size(); ++entry) {
+        if (wedge_node_by_tri_z_[entry] == missing_node) {
+            throw BasisNodeOrderingException("LagrangeBasis: incomplete wedge tensor-product node map",
+                                             __FILE__, __LINE__, __func__);
+        }
+    }
+}
+
+// Replaces nodes_ with the pyramid node list that
+// detail::lagrange_pyramid::nodes() produces for the current order.
+void LagrangeBasis::build_pyramid_nodes() {
+    const int pyramid_order = order_;
+    nodes_ = detail::lagrange_pyramid::nodes(pyramid_order);
+}
+
+// Evaluates the single basis function of a point element. The evaluation
+// point is ignored: the value is 1 and the gradient and Hessian are
+// value-initialized. Each output is written only when its pointer is non-null
+// and always ends up with exactly one entry.
+void LagrangeBasis::evaluate_point_vectors(const math::Vector<Real, 3>&,
+                                           std::vector<Real>* values,
+                                           std::vector<Gradient>* gradients,
+                                           std::vector<Hessian>* hessians) const {
+    if (values != nullptr) {
+        values->assign(1u, Real(1));
+    }
+    if (gradients != nullptr) {
+        gradients->assign(1u, Gradient{});
+    }
+    if (hessians != nullptr) {
+        hessians->assign(1u, Hessian{});
+    }
+}
+
+// Evaluates all basis functions of a line/quadrilateral/hexahedron element at
+// the reference point `xi`.
+//
+// evaluate_fixed_lagrange_fast() is tried first; if it declines, each active
+// axis is evaluated in 1D into the shared scratch and the results are combined
+// by evaluate_tensor_product_factorized(). Null output pointers mean "not
+// requested"; the highest requested derivative decides how much 1D data is
+// computed per axis.
+void LagrangeBasis::evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
+                                                    std::vector<Real>* values,
+                                                    std::vector<Gradient>* gradients,
+                                                    std::vector<Hessian>* hessians) const {
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
+        return;
+    }
+
+    // Derivative level needed from the 1D evaluations.
+    AxisDeriv level = AxisDeriv::ValuesOnly;
+    if (hessians != nullptr) {
+        level = AxisDeriv::ValuesAndFirstAndSecond;
+    } else if (gradients != nullptr) {
+        level = AxisDeriv::ValuesAndFirst;
+    }
+
+    // Shared 1D tables (identical for every axis).
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* value_coeffs = axis_v_coeffs_.data();
+    const Real* first_coeffs = axis_d_coeffs_.data();
+    const Real* second_coeffs = axis_d2_coeffs_.data();
+    const Real* bary_weights = axis_barycentric_weights_.data();
+
+    LagrangeEvaluateScratch& scratch = evaluate_scratch();
+
+    const AxisBasisEvaluations x_axis =
+        fill_axis_scratch(scratch.axis_x, value_coeffs, first_coeffs, second_coeffs,
+                          bary_weights, n_axis, xi[0], level);
+
+    // Axes the element does not have keep the constant placeholder basis.
+    AxisBasisEvaluations y_axis = constant_axis_basis();
+    AxisBasisEvaluations z_axis = constant_axis_basis();
+
+    const bool has_y_axis = (topology != LagrangeTopology::Line);
+    const bool has_z_axis = (topology == LagrangeTopology::Hexahedron);
+    if (has_y_axis) {
+        y_axis = fill_axis_scratch(scratch.axis_y, value_coeffs, first_coeffs, second_coeffs,
+                                   bary_weights, n_axis, xi[1], level);
+    }
+    if (has_z_axis) {
+        z_axis = fill_axis_scratch(scratch.axis_z, value_coeffs, first_coeffs, second_coeffs,
+                                   bary_weights, n_axis, xi[2], level);
+    }
+
+    evaluate_tensor_product_factorized(tensor_indices_, x_axis, y_axis, z_axis,
+                                       values, gradients, hessians);
+}
+
+void LagrangeBasis::evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
+                                              std::vector<Real>* values,
+                                              std::vector<Gradient>* gradients,
+                                              std::vector<Hessian>* hessians) const {
+    // Try a fixed-order kernel first; if it declines, use the simplex evaluator driven by simplex_exponents_.
+    const bool handled = evaluate_fixed_lagrange_fast(
+        static_cast<LagrangeTopology>(topology_id_), order_, xi, values, gradients, hessians);
+    if (!handled) {
+        detail::evaluate_triangle_simplex_basis(simplex_exponents_, order_, xi, values, gradients, hessians);
+    }
+}
+
+void LagrangeBasis::evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
+                                                 std::vector<Real>* values,
+                                                 std::vector<Gradient>* gradients,
+                                                 std::vector<Hessian>* hessians) const {
+    // Try a fixed-order kernel first; if it declines, use the simplex evaluator driven by simplex_exponents_.
+    const bool handled = evaluate_fixed_lagrange_fast(
+        static_cast<LagrangeTopology>(topology_id_), order_, xi, values, gradients, hessians);
+    if (!handled) {
+        detail::evaluate_tetrahedron_simplex_basis(simplex_exponents_, order_, xi, values, gradients, hessians);
+    }
+}
+
+void LagrangeBasis::evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
+                                           std::vector<Real>* values,
+                                           std::vector<Gradient>* gradients,
+                                           std::vector<Hessian>* hessians) const {
+    // Each wedge function is a triangle factor (evaluated from xi) times a 1D factor in xi[2];
+    // the derivative entries assembled below are the product rule applied to those two factors.
+    const bool want_second = hessians != nullptr;
+    const bool want_first = want_second || gradients != nullptr;
+    AxisDeriv depth = AxisDeriv::ValuesOnly;
+    if (want_second) {
+        depth = AxisDeriv::ValuesAndFirstAndSecond;
+    } else if (want_first) {
+        depth = AxisDeriv::ValuesAndFirst;
+    }
+    LagrangeEvaluateScratch& work = evaluate_scratch();
+    const AxisBasisEvaluations line = fill_axis_scratch(
+        work.axis_z, axis_v_coeffs_.data(), axis_d_coeffs_.data(), axis_d2_coeffs_.data(),
+        axis_barycentric_weights_.data(), static_cast<int>(nodes_1d_.size()), xi[2], depth);
+
+    // Triangle values are always needed; triangle derivatives only as deep as the request.
+    auto* const tri_first = want_first ? &work.tri_gradients : nullptr;
+    auto* const tri_second = want_second ? &work.tri_hessians : nullptr;
+    detail::evaluate_triangle_simplex_basis(simplex_exponents_, order_, xi,
+                                            &work.tri_values, tri_first, tri_second);
+
+    const std::size_t count = wedge_indices_.size();
+    if (values != nullptr) {
+        values->resize(count);
+    }
+    if (gradients != nullptr) {
+        gradients->resize(count);
+    }
+    if (hessians != nullptr) {
+        hessians->resize(count);
+    }
+
+    for (std::size_t node = 0; node < count; ++node) {
+        const auto& pair = wedge_indices_[node];
+        const std::size_t t = pair[0];  // index into the triangle factor
+        const std::size_t k = pair[1];  // index into the 1D factor
+        const Real line_v = line.values[k];
+        const Real tri_v = work.tri_values[t];
+
+        if (values != nullptr) {
+            (*values)[node] = tri_v * line_v;
+        }
+        if (gradients != nullptr) {
+            const Real line_d = line.first[k];
+            auto& grad = (*gradients)[node];
+            grad[0] = work.tri_gradients[t][0] * line_v;
+            grad[1] = work.tri_gradients[t][1] * line_v;
+            grad[2] = tri_v * line_d;
+        }
+        if (hessians != nullptr) {
+            const Real line_d = line.first[k];
+            const Real line_d2 = line.second[k];
+            const auto& tri_g = work.tri_gradients[t];
+            const auto& tri_h = work.tri_hessians[t];
+            const Real h_xy = tri_h(0, 1) * line_v;
+            const Real h_xz = tri_g[0] * line_d;
+            const Real h_yz = tri_g[1] * line_d;
+            Hessian H{};
+            H(0, 0) = tri_h(0, 0) * line_v;
+            H(1, 1) = tri_h(1, 1) * line_v;
+            H(2, 2) = tri_v * line_d2;
+            H(0, 1) = h_xy;
+            H(1, 0) = h_xy;
+            H(0, 2) = h_xz;
+            H(2, 0) = h_xz;
+            H(1, 2) = h_yz;
+            H(2, 1) = h_yz;
+            (*hessians)[node] = H;
+        }
+    }
+}
+
+void LagrangeBasis::evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
+                                             std::vector<Real>* values,
+                                             std::vector<Gradient>* gradients,
+                                             std::vector<Hessian>* hessians) const {
+    // The pyramid helpers receive dereferenced outputs, so each one is called
+    // only when its pointer is non-null. A single fused call serves the case
+    // where values, gradients and Hessians are all requested.
+    const bool all_requested =
+        values != nullptr && gradients != nullptr && hessians != nullptr;
+    if (all_requested) {
+        detail::lagrange_pyramid::evaluate_all(order_, xi, *values, *gradients, *hessians);
+    } else {
+        // Partial request: run just the helpers that were asked for.
+        if (values != nullptr) detail::lagrange_pyramid::evaluate_values(order_, xi, *values);
+        if (gradients != nullptr) detail::lagrange_pyramid::evaluate_gradients(order_, xi, *gradients);
+        if (hessians != nullptr) detail::lagrange_pyramid::evaluate_hessians(order_, xi, *hessians);
+    }
+}
+
+void LagrangeBasis::evaluate_unsupported_vectors(const math::Vector<Real, 3>&,  // Same signature as the other evaluate_*_vectors members.
+                                                 std::vector<Real>*,  // every argument is unnamed and unused
+                                                 std::vector<Gradient>*,
+                                                 std::vector<Hessian>*) const {
+    throw BasisEvaluationException("Unsupported element in LagrangeBasis vector evaluation",  // thrown unconditionally
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_values(const math::Vector<Real, 3>& xi, std::vector<Real>& values) const {
+    const auto evaluator = vector_evaluation_dispatch_;  // member-function pointer assigned outside this block
+    (this->*evaluator)(xi, &values, nullptr, nullptr);  // request values only
+}
+
+void LagrangeBasis::evaluate_gradients(const math::Vector<Real, 3>& xi, std::vector<Gradient>& gradients) const {
+    const auto evaluator = vector_evaluation_dispatch_;  // member-function pointer assigned outside this block
+    (this->*evaluator)(xi, nullptr, &gradients, nullptr);  // request gradients only
+}
+
+void LagrangeBasis::evaluate_hessians(const math::Vector<Real, 3>& xi, std::vector<Hessian>& hessians) const {
+    const auto evaluator = vector_evaluation_dispatch_;  // member-function pointer assigned outside this block
+    (this->*evaluator)(xi, nullptr, nullptr, &hessians);  // request Hessians only
+}
+
+void LagrangeBasis::evaluate_all(const math::Vector<Real, 3>& xi, std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients, std::vector<Hessian>& hessians) const {
+    // A single call through the dispatched evaluator requests all three outputs at once.
+    const auto evaluator = vector_evaluation_dispatch_;
+    (this->*evaluator)(xi, &values, &gradients, &hessians);
+}
+
+void LagrangeBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out) const {
+    // Writes basis values at xi into the caller-owned buffer; no size check is done here.
+    const auto kind = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(kind, order_, xi, values_out, nullptr, nullptr)) {
+        return;  // a fixed-order kernel reported that it handled the request
+    }
+    if (kind == LagrangeTopology::Point) {
+        values_out[0] = Real(1);
+        return;
+    }
+    if (kind == LagrangeTopology::Triangle) {
+        detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                   values_out, nullptr, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Tetrahedron) {
+        detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                      values_out, nullptr, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Pyramid) {
+        detail::lagrange_pyramid::evaluate_values_to(order_, xi, values_out);
+        return;
+    }
+
+    // The remaining supported topologies all read the cached 1D coefficient tables.
+    const int count_1d = static_cast<int>(nodes_1d_.size());
+    const Real* const val_c = axis_v_coeffs_.data();
+    const Real* const d1_c = axis_d_coeffs_.data();
+    const Real* const d2_c = axis_d2_coeffs_.data();
+    const Real* const bary_w = axis_barycentric_weights_.data();
+    const AxisDeriv depth = AxisDeriv::ValuesOnly;
+
+    const bool tensor_product = kind == LagrangeTopology::Line || kind == LagrangeTopology::Quadrilateral ||
+                                kind == LagrangeTopology::Hexahedron;
+    if (tensor_product) {
+        LagrangeEvaluateScratch& work = evaluate_scratch();
+        const AxisBasisEvaluations ax = fill_axis_scratch(work.axis_x, val_c, d1_c, d2_c, bary_w, count_1d, xi[0], depth);
+        AxisBasisEvaluations ay = constant_axis_basis();
+        AxisBasisEvaluations az = constant_axis_basis();
+        if (kind != LagrangeTopology::Line) {
+            ay = fill_axis_scratch(work.axis_y, val_c, d1_c, d2_c, bary_w, count_1d, xi[1], depth);
+        }
+        if (kind == LagrangeTopology::Hexahedron) {
+            az = fill_axis_scratch(work.axis_z, val_c, d1_c, d2_c, bary_w, count_1d, xi[2], depth);
+        }
+        evaluate_tensor_product_factorized_to(tensor_indices_, ax, ay, az, values_out, nullptr, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Wedge) {
+        LagrangeEvaluateScratch& work = evaluate_scratch();
+        const AxisBasisEvaluations line = fill_axis_scratch(work.axis_z, val_c, d1_c, d2_c, bary_w, count_1d, xi[2], depth);
+        work.tri_values.resize(simplex_exponents_.size());
+        detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                   work.tri_values.data(), nullptr, nullptr);
+        std::size_t node = 0;
+        for (const auto& pair : wedge_indices_) {
+            values_out[node++] = work.tri_values[pair[0]] * line.values[pair[1]];
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_values_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    // Writes gradients at xi into the caller-owned buffer; branches visible here use 3 consecutive Reals per function.
+    const auto kind = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(kind, order_, xi, nullptr, gradients_out, nullptr)) {
+        return;  // a fixed-order kernel reported that it handled the request
+    }
+    if (kind == LagrangeTopology::Point) {
+        for (std::size_t c = 0; c < 3; ++c) {
+            gradients_out[c] = Real(0);
+        }
+        return;
+    }
+    if (kind == LagrangeTopology::Triangle) {
+        detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                   nullptr, gradients_out, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Tetrahedron) {
+        detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                      nullptr, gradients_out, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Pyramid) {
+        detail::lagrange_pyramid::evaluate_gradients_to(order_, xi, gradients_out);
+        return;
+    }
+
+    // The remaining supported topologies all read the cached 1D coefficient tables.
+    const int count_1d = static_cast<int>(nodes_1d_.size());
+    const Real* const val_c = axis_v_coeffs_.data();
+    const Real* const d1_c = axis_d_coeffs_.data();
+    const Real* const d2_c = axis_d2_coeffs_.data();
+    const Real* const bary_w = axis_barycentric_weights_.data();
+    const AxisDeriv depth = AxisDeriv::ValuesAndFirst;
+    const bool tensor_product = kind == LagrangeTopology::Line || kind == LagrangeTopology::Quadrilateral ||
+                                kind == LagrangeTopology::Hexahedron;
+    if (tensor_product) {
+        LagrangeEvaluateScratch& work = evaluate_scratch();
+        const AxisBasisEvaluations ax = fill_axis_scratch(work.axis_x, val_c, d1_c, d2_c, bary_w, count_1d, xi[0], depth);
+        AxisBasisEvaluations ay = constant_axis_basis();
+        AxisBasisEvaluations az = constant_axis_basis();
+        if (kind != LagrangeTopology::Line) {
+            ay = fill_axis_scratch(work.axis_y, val_c, d1_c, d2_c, bary_w, count_1d, xi[1], depth);
+        }
+        if (kind == LagrangeTopology::Hexahedron) {
+            az = fill_axis_scratch(work.axis_z, val_c, d1_c, d2_c, bary_w, count_1d, xi[2], depth);
+        }
+        evaluate_tensor_product_factorized_to(tensor_indices_, ax, ay, az, nullptr, gradients_out, nullptr);
+        return;
+    }
+    if (kind == LagrangeTopology::Wedge) {
+        LagrangeEvaluateScratch& work = evaluate_scratch();
+        const AxisBasisEvaluations line = fill_axis_scratch(work.axis_z, val_c, d1_c, d2_c, bary_w, count_1d, xi[2], depth);
+        const std::size_t tri_count = simplex_exponents_.size();
+        work.tri_values.resize(tri_count);
+        work.tri_gradient_components.resize(tri_count * 3u);
+        detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                   work.tri_values.data(),
+                                                   work.tri_gradient_components.data(),
+                                                   nullptr);
+        // Product rule: components 0/1 = triangle gradient * 1D value; component 2 = triangle value * 1D derivative.
+        Real* out = gradients_out;
+        for (const auto& pair : wedge_indices_) {
+            const std::size_t t = pair[0];
+            const std::size_t k = pair[1];
+            const Real* const tri_g = work.tri_gradient_components.data() + t * 3u;
+            out[0] = tri_g[0] * line.values[k];
+            out[1] = tri_g[1] * line.values[k];
+            out[2] = work.tri_values[t] * line.first[k];
+            out += 3;
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_gradients_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,  // Hessians at xi into a caller-owned raw buffer.
+                                         Real* SVMP_RESTRICT hessians_out) const {  // branches visible here write 9 Reals per basis function
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, nullptr, hessians_out)) {
+        return;  // a fixed-order kernel reported that it handled the request
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();  // cached 1D coefficient tables handed to fill_axis_scratch
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:
+            for (std::size_t i = 0; i < 9; ++i) {  // single function: all nine entries are zero
+                hessians_out[i] = Real(0);
+            }
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
+            AxisBasisEvaluations y_axis = constant_axis_basis();  // placeholder for an axis this topology does not use
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  nullptr, nullptr, hessians_out);  // Hessians only
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       nullptr, nullptr, hessians_out);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          nullptr, nullptr, hessians_out);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            const std::size_t tri_count = simplex_exponents_.size();
+            scratch.tri_values.resize(tri_count);  // triangle values and gradients feed the mixed and zz entries below
+            scratch.tri_gradient_components.resize(tri_count * 3u);
+            scratch.tri_hessian_components.resize(tri_count * 9u);
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(),
+                                                       scratch.tri_gradient_components.data(),
+                                                       scratch.tri_hessian_components.data());
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                const std::size_t tri = index[0];  // triangle-factor index
+                const std::size_t z = index[1];  // 1D-factor index
+                const Real zv = z_axis.values[z];
+                const Real zd = z_axis.first[z];
+                const Real zd2 = z_axis.second[z];
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real hxy = tri_H[1] * zv;  // product rule on (triangle factor) * (1D factor)
+                const Real hxz = tri_g[0] * zd;
+                const Real hyz = tri_g[1] * zd;
+                Real* H = hessians_out + n * 9u;  // flattened 3x3 block for wedge function n
+                H[0] = tri_H[0] * zv;
+                H[4] = tri_H[4] * zv;
+                H[1] = hxy;  // symmetric pairs share one value: (1,3), (2,6), (5,7)
+                H[3] = hxy;
+                H[8] = scratch.tri_values[tri] * zd2;
+                H[2] = hxz;
+                H[6] = hxz;
+                H[5] = hyz;
+                H[7] = hyz;
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_hessians_to(order_, xi, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;  // falls through to the exception below
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_hessians_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,  // Values, gradients and Hessians at xi in one pass.
+                                    Real* SVMP_RESTRICT values_out,  // Point and Wedge branches write 1 Real per function
+                                    Real* SVMP_RESTRICT gradients_out,  // Point and Wedge branches write 3 Reals per function
+                                    Real* SVMP_RESTRICT hessians_out) const {  // Point and Wedge branches write 9 Reals per function
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, gradients_out, hessians_out)) {
+        return;  // a fixed-order kernel reported that it handled the request
+    }
+
+    const int n_axis = static_cast<int>(nodes_1d_.size());
+    const Real* vc = axis_v_coeffs_.data();  // cached 1D coefficient tables handed to fill_axis_scratch
+    const Real* dc = axis_d_coeffs_.data();
+    const Real* d2c = axis_d2_coeffs_.data();
+    const Real* bw = axis_barycentric_weights_.data();
+    switch (topology) {
+        case LagrangeTopology::Point:  // NOTE(review): Point and Wedge dereference all three outputs unchecked — callers must pass non-null
+            values_out[0] = Real(1);
+            gradients_out[0] = Real(0);
+            gradients_out[1] = Real(0);
+            gradients_out[2] = Real(0);
+            for (std::size_t i = 0; i < 9; ++i) {
+                hessians_out[i] = Real(0);
+            }
+            return;
+        case LagrangeTopology::Line:
+        case LagrangeTopology::Quadrilateral:
+        case LagrangeTopology::Hexahedron: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations x_axis =
+                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
+            AxisBasisEvaluations y_axis = constant_axis_basis();  // placeholder for an axis this topology does not use
+            AxisBasisEvaluations z_axis = constant_axis_basis();
+            if (topology != LagrangeTopology::Line) {
+                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            if (topology == LagrangeTopology::Hexahedron) {
+                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            }
+            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
+                                                  values_out, gradients_out, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Triangle:
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       values_out, gradients_out, hessians_out);
+            return;
+        case LagrangeTopology::Tetrahedron:
+            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                          values_out, gradients_out, hessians_out);
+            return;
+        case LagrangeTopology::Wedge: {
+            LagrangeEvaluateScratch& scratch = evaluate_scratch();
+            const AxisBasisEvaluations z_axis =
+                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
+            const std::size_t tri_count = simplex_exponents_.size();
+            scratch.tri_values.resize(tri_count);  // triangle factor is staged in scratch, then combined per wedge function
+            scratch.tri_gradient_components.resize(tri_count * 3u);
+            scratch.tri_hessian_components.resize(tri_count * 9u);
+            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
+                                                       scratch.tri_values.data(),
+                                                       scratch.tri_gradient_components.data(),
+                                                       scratch.tri_hessian_components.data());
+            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
+                const auto& index = wedge_indices_[n];
+                const std::size_t tri = index[0];  // triangle-factor index
+                const std::size_t z = index[1];  // 1D-factor index
+                const Real zv = z_axis.values[z];
+                const Real zd = z_axis.first[z];
+                const Real zd2 = z_axis.second[z];
+                const Real tri_v = scratch.tri_values[tri];
+                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
+                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
+                const Real hxy = tri_H[1] * zv;  // product rule on (triangle factor) * (1D factor)
+                const Real hxz = tri_g[0] * zd;
+                const Real hyz = tri_g[1] * zd;
+
+                values_out[n] = tri_v * zv;
+
+                Real* g = gradients_out + n * 3u;  // three gradient components for wedge function n
+                g[0] = tri_g[0] * zv;
+                g[1] = tri_g[1] * zv;
+                g[2] = tri_v * zd;
+
+                Real* H = hessians_out + n * 9u;  // flattened 3x3 block for wedge function n
+                H[0] = tri_H[0] * zv;
+                H[4] = tri_H[4] * zv;
+                H[1] = hxy;  // symmetric pairs share one value: (1,3), (2,6), (5,7)
+                H[3] = hxy;
+                H[8] = tri_v * zd2;
+                H[2] = hxz;
+                H[6] = hxz;
+                H[5] = hyz;
+                H[7] = hyz;
+            }
+            return;
+        }
+        case LagrangeTopology::Pyramid: {
+            detail::lagrange_pyramid::evaluate_all_to(
+                order_, xi, values_out, gradients_out, hessians_out);
+            return;
+        }
+        case LagrangeTopology::Unknown:
+            break;  // falls through to the exception below
+    }
+
+    throw BasisEvaluationException("Unsupported element in evaluate_all_to",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void LagrangeBasis::evaluate_at_quadrature_points(const std::vector<math::Vector<Real, 3>>& points,
+                                                  Real* SVMP_RESTRICT values_out,
+                                                  Real* SVMP_RESTRICT gradients_out,
+                                                  Real* SVMP_RESTRICT hessians_out) const {
+    const std::size_t packed_stride = points.size();  // stride equals the point count, i.e. no padding between rows
+    evaluate_at_quadrature_points_strided(points, packed_stride, values_out, gradients_out, hessians_out);
+}
+
+void LagrangeBasis::evaluate_at_quadrature_points_strided(  // Batch evaluation over many points into strided raw buffers.
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,  // distance between consecutive rows; must be >= points.size()
+    Real* SVMP_RESTRICT values_out,  // each output may be nullptr to skip that quantity
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) const {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = size();
+    if (output_stride < num_qpts) {  // validated before the nothing-requested early return
+        throw BasisConfigurationException("LagrangeBasis strided evaluation requires output_stride >= points.size()",
+                                          __FILE__, __LINE__, __func__);
+    }
+    if (values_out == nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+        return;  // nothing requested
+    }
+
+    const auto topology = static_cast<LagrangeTopology>(topology_id_);
+    if (evaluate_fixed_lagrange_fast_strided(topology,  // fixed-order batch kernels are tried first
+                                             order_,
+                                             points,
+                                             output_stride,
+                                             values_out,
+                                             gradients_out,
+                                             hessians_out)) {
+        return;
+    }
+
+    if (topology == LagrangeTopology::Line ||
+        topology == LagrangeTopology::Quadrilateral ||
+        topology == LagrangeTopology::Hexahedron) {
+        evaluate_tensor_product_points_strided(topology,
+                                               tensor_indices_,
+                                               points,
+                                               output_stride,
+                                               axis_v_coeffs_.data(),
+                                               axis_d_coeffs_.data(),
+                                               axis_d2_coeffs_.data(),
+                                               axis_barycentric_weights_.data(),
+                                               static_cast<int>(nodes_1d_.size()),
+                                               values_out,
+                                               gradients_out,
+                                               hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Triangle) {
+        detail::evaluate_triangle_simplex_basis_strided(
+            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Tetrahedron) {
+        detail::evaluate_tetrahedron_simplex_basis_strided(
+            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Wedge &&  // wedge-specific fast kernel; a false return continues below
+        evaluate_wedge_fast_strided(order_,
+                                    wedge_indices_,
+                                    points,
+                                    output_stride,
+                                    values_out,
+                                    gradients_out,
+                                    hessians_out)) {
+        return;
+    }
+
+    const bool wedge_scalar_hessian_fallback =  // NOTE(review): Hessian-only wedge requests with order_ <= 2 skip the batched wedge path; reason not visible here — confirm
+        topology == LagrangeTopology::Wedge &&
+        values_out == nullptr &&
+        gradients_out == nullptr &&
+        hessians_out != nullptr &&
+        order_ <= 2;
+    if (topology == LagrangeTopology::Wedge && !wedge_scalar_hessian_fallback) {
+        evaluate_wedge_points_strided(simplex_exponents_,
+                                      wedge_indices_,
+                                      wedge_node_by_tri_z_,
+                                      order_,
+                                      points,
+                                      output_stride,
+                                      axis_v_coeffs_.data(),
+                                      axis_d_coeffs_.data(),
+                                      axis_d2_coeffs_.data(),
+                                      axis_barycentric_weights_.data(),
+                                      static_cast<int>(nodes_1d_.size()),
+                                      values_out,
+                                      gradients_out,
+                                      hessians_out);
+        return;
+    }
+
+    if (topology == LagrangeTopology::Pyramid) {
+        detail::lagrange_pyramid::evaluate_at_quadrature_points_strided(
+            order_, points, output_stride, values_out, gradients_out, hessians_out);
+        return;
+    }
+
+    auto& scratch = evaluate_scratch();  // generic fallback: evaluate one point at a time, then scatter
+    auto& v_tmp = scratch.strided_values_tmp;
+    auto& g_tmp = scratch.strided_gradients_tmp;
+    auto& h_tmp = scratch.strided_hessians_tmp;
+
+    if (values_out)    v_tmp.resize(num_dofs);  // staging buffers are sized only for the requested outputs
+    if (gradients_out) g_tmp.resize(num_dofs * 3u);
+    if (hessians_out)  h_tmp.resize(num_dofs * 9u);
+
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out && gradients_out && hessians_out) {
+            evaluate_all_to(points[q], v_tmp.data(), g_tmp.data(), h_tmp.data());  // fused call when everything is requested
+        } else {
+            if (values_out)    evaluate_values_to(points[q], v_tmp.data());
+            if (gradients_out) evaluate_gradients_to(points[q], g_tmp.data());
+            if (hessians_out)  evaluate_hessians_to(points[q], h_tmp.data());
+        }
+
+        if (values_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                values_out[d * output_stride + q] = v_tmp[d];  // row d, column q
+            }
+        }
+        if (gradients_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                gradients_out[(d * 3u + 0u) * output_stride + q] = g_tmp[d * 3u + 0u];  // one row per (function, component)
+                gradients_out[(d * 3u + 1u) * output_stride + q] = g_tmp[d * 3u + 1u];
+                gradients_out[(d * 3u + 2u) * output_stride + q] = g_tmp[d * 3u + 2u];
+            }
+        }
+        if (hessians_out) {
+            for (std::size_t d = 0; d < num_dofs; ++d) {
+                scatter_hessian_components_strided(  // helper places the 9 staged components for function d at column q
+                    h_tmp.data() + d * 9u,
+                    hessians_out + d * 9u * output_stride,
+                    output_stride,
+                    q);
+            }
+        }
+    }
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
new file mode 100644
index 000000000..91f7e379c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -0,0 +1,175 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
+#define SVMP_FE_BASIS_LAGRANGEBASIS_H
+
+/**
+ * @file LagrangeBasis.h
+ * @brief Nodal Lagrange polynomial basis on reference elements
+ */
+
+#include "BasisFunction.h"
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);  // NOTE(review): presumably pre-sizes evaluation scratch buffers for orders up to max_order; defined outside this header — confirm
+
+/**
+ * @brief Complete nodal H1 Lagrange basis on canonical reference topologies
+ *
+ * Supports arbitrary polynomial order on the canonical complete families:
+ * `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`, `Wedge6`, and `Pyramid5`.
+ * Low-order complete-family aliases (`Line3`, `Triangle6`, `Quad9`,
+ * `Tetra10`, `Hex27`, `Wedge18`, `Pyramid14`) normalize to their canonical
+ * topology plus order. Serendipity variants remain intentionally excluded.
+ *
+ * Node locations are generated on canonical reference elements using
+ * equispaced coordinates on tensor-product elements, barycentric grids on
+ * simplices, tensorized triangle-line grids on wedges, and a rational nodal
+ * pyramid construction on `Pyramid5`.
+ *
+ * The evaluator is numerically stabilized for those nodes, but the
+ * interpolation problem itself remains the equispaced Lagrange problem. For
+ * high-order interpolation, especially order >= 4, prefer `SpectralBasis`
+ * (GLL / Warp & Blend nodes) unless exact equispaced nodal placement is part
+ * of the requested discretization.
+ *
+ * For the rational pyramid family, basis values remain exact at the apex.
+ * Gradients and Hessians are analytic on the supported interior reference
+ * domain, but the exact-apex nodal derivative limit is not unique and those
+ * derivative queries throw at the exact apex.
+ */
+class LagrangeBasis : public BasisFunction {
+public:
+    LagrangeBasis(ElementType type, int order);
+    // Identity queries: trivial accessors over the cached members declared below.
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return nodes_.size(); }  // one basis function per stored node
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }  // node coordinates, in the same order as the basis functions
+    // Single-point evaluation; results come back through the std::vector out-parameters.
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const final;
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const final;
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const final;
+    void evaluate_all(const math::Vector<Real, 3>& xi,
+                      std::vector<Real>& values,
+                      std::vector<Gradient>& gradients,
+                      std::vector<Hessian>& hessians) const final;
+    // Batched evaluation over a list of points into caller-provided raw buffers.
+    void evaluate_at_quadrature_points(
+        const std::vector<math::Vector<Real, 3>>& points,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const final;
+    void evaluate_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,  // NOTE(review): the strided loop visible in LagrangeBasis.cpp writes values_out[d * output_stride + q] and skips null outputs — confirm it is this overload
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) const final;
+
+    // Raw-pointer output API. Caller must pre-size the buffers. NOTE(review): the scratch buffers passed to these in LagrangeBasis.cpp hold num_dofs, 3 * num_dofs and 9 * num_dofs Reals, so "size()" alone looks too small for gradients/Hessians — confirm.
+    void evaluate_values_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT values_out) const final;
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT gradients_out) const final;
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT hessians_out) const final;
+
+private:
+    using TensorNodeIndex = std::array<std::size_t, 3>;
+    using WedgeNodeIndex = std::array<std::size_t, 2>;
+    using VectorEvaluationDispatch = void (LagrangeBasis::*)(  // same signature as the evaluate_*_vectors members below
+        const math::Vector<Real, 3>&,
+        std::vector<Real>*,
+        std::vector<Gradient>*,
+        std::vector<Hessian>*) const;
+
+    // Cached topology encoded as int because the topology enum lives in
+    // the .cpp anon namespace. Set once in init_nodes.
+    int topology_id_ = 0;
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+
+    std::vector<Real> nodes_1d_;  // 1D axis nodes; order_ + 1 entries per the layout note below
+    std::vector<math::Vector<Real, 3>> nodes_;  // backing store for nodes() and size()
+    std::vector<TensorNodeIndex> tensor_indices_;
+    std::vector<std::array<int, 4>> simplex_exponents_;
+    std::vector<WedgeNodeIndex> wedge_indices_;
+    std::vector<std::size_t> wedge_node_by_tri_z_;
+
+    // Precomputed Horner-form coefficients of the 1D Lagrange basis.
+    // Layout per axis (n_axis = nodes_1d_.size() = order_+1):
+    //   axis_v_coeffs_[i * n_axis + k] = coeff of x^k in L_i(x), 0 <= i,k < n_axis
+    //   axis_d_coeffs_[i * (n_axis - 1) + k] = coeff of x^k in L_i'(x)
+    //   axis_d2_coeffs_[i * (n_axis - 2) + k] = coeff of x^k in L_i''(x)  (only if n_axis >= 3)
+    // Populated by build_tensor_product_nodes / build_wedge_nodes.
+    std::vector<Real> axis_v_coeffs_;
+    std::vector<Real> axis_d_coeffs_;
+    std::vector<Real> axis_d2_coeffs_;
+    std::vector<Real> axis_barycentric_weights_;  // NOTE(review): presumably barycentric weights for nodes_1d_ — confirm in the .cpp
+    VectorEvaluationDispatch vector_evaluation_dispatch_{nullptr};  // null until assigned; NOTE(review): presumably by init_evaluation_dispatch()
+
+    void init_nodes();
+    void init_evaluation_dispatch();
+    void build_point_nodes();
+    void build_tensor_product_nodes(int dimensions);
+    void build_simplex_nodes();
+    void build_wedge_nodes();
+    void build_pyramid_nodes();
+    void init_equispaced_1d_nodes();
+    void compute_axis_monomial_coefficients();
+    void evaluate_point_vectors(const math::Vector<Real, 3>& xi,
+                                std::vector<Real>* values,
+                                std::vector<Gradient>* gradients,
+                                std::vector<Hessian>* hessians) const;
+    void evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
+                                         std::vector<Real>* values,
+                                         std::vector<Gradient>* gradients,
+                                         std::vector<Hessian>* hessians) const;
+    void evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
+                                   std::vector<Real>* values,
+                                   std::vector<Gradient>* gradients,
+                                   std::vector<Hessian>* hessians) const;
+    void evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
+                                      std::vector<Real>* values,
+                                      std::vector<Gradient>* gradients,
+                                      std::vector<Hessian>* hessians) const;
+    void evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
+                                std::vector<Real>* values,
+                                std::vector<Gradient>* gradients,
+                                std::vector<Hessian>* hessians) const;
+    void evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
+                                  std::vector<Real>* values,
+                                  std::vector<Gradient>* gradients,
+                                  std::vector<Hessian>* hessians) const;
+    void evaluate_unsupported_vectors(const math::Vector<Real, 3>& xi,
+                                      std::vector<Real>* values,
+                                      std::vector<Gradient>* gradients,
+                                      std::vector<Hessian>* hessians) const;
+    void evaluate_all_to(const math::Vector<Real, 3>& xi,  // raw-pointer variant taking all three outputs in one call
+                         Real* SVMP_RESTRICT values_out,
+                         Real* SVMP_RESTRICT gradients_out,
+                         Real* SVMP_RESTRICT hessians_out) const;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASIS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
new file mode 100644
index 000000000..5b9faae04
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
@@ -0,0 +1,1378 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISFAST_H
+#define SVMP_FE_BASIS_LAGRANGEBASISFAST_H
+
+/**
+ * @file LagrangeBasisFast.h
+ * @brief Header-only zero-overhead specializations of the Lagrange basis
+ *
+ * Provides templated static methods for the common nodal Lagrange families
+ * with compile-time-known polynomial order. Callers that know their basis
+ * type and order at compile time use these directly — there is no virtual
+ * dispatch, no std::vector allocation, no scratch lookup, and no topology
+ * switch. The output buffers are stack-allocated std::array, sized at
+ * compile time. The compiler fully unrolls and constant-folds.
+ *
+ * These specializations are an alternative entry point to the runtime path
+ * provided by `LagrangeBasis`. The runtime path remains the canonical API
+ * for generic callers; these specializations serve hot loops that know the
+ * element type.
+ *
+ * Node orderings match `ReferenceNodeLayout::get_lagrange_node_coords(...)` (VTK).
+ */
+
+#include "Types.h"
+#include "Math/Vector.h"
+#include "Math/Matrix.h"
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using Gradient = math::Vector<Real, 3>;     // three Real components, one per coordinate axis
+using Hessian  = math::Matrix<Real, 3, 3>;  // 3x3 Real matrix of second derivatives
+
+namespace detail {
+
+constexpr Gradient scaled_gradient(const Gradient& gradient, Real scale) {  // returns scale * gradient, component by component
+    return Gradient{scale * gradient[0], scale * gradient[1], scale * gradient[2]};
+}
+
+constexpr Gradient p2_edge_gradient(Real left,
+                                    const Gradient& left_gradient,
+                                    Real right,
+                                    const Gradient& right_gradient) {
+    // Per component: 4 * (left_gradient * right + right_gradient * left), the product-rule form for 4 * left * right.
+    const Real gx = Real(4) * (left_gradient[0] * right + right_gradient[0] * left);
+    const Real gy = Real(4) * (left_gradient[1] * right + right_gradient[1] * left);
+    const Real gz = Real(4) * (left_gradient[2] * right + right_gradient[2] * left);
+    return Gradient{gx, gy, gz};
+}
+
+constexpr Hessian p2_vertex_hessian(const Gradient& gradient) {
+    Hessian result{};  // filled with 4 times the outer product of gradient with itself
+    for (std::size_t r = 0; r != 3u; ++r) {
+        for (std::size_t c = 0; c != 3u; ++c) {
+            result(r, c) = Real(4) * gradient[r] * gradient[c];
+        }
+    }
+    return result;
+}
+
+constexpr Hessian p2_edge_hessian(const Gradient& left_gradient,
+                                  const Gradient& right_gradient) {
+    // Symmetrized outer product: H(r, c) = 4 * (L[r] * R[c] + R[r] * L[c]).
+    Hessian result{};
+    for (std::size_t r = 0; r != 3u; ++r) {
+        for (std::size_t c = 0; c != 3u; ++c) {
+            result(r, c) = Real(4) * (left_gradient[r] * right_gradient[c] +
+                                      right_gradient[r] * left_gradient[c]);
+        }
+    }
+    return result;
+}
+
+constexpr std::size_t public_axis_index(int lattice, int order) noexcept {
+    if (lattice == 0) return 0u;                   // first endpoint takes slot 0
+    if (lattice == order) return 1u;               // last endpoint takes slot 1
+    return static_cast<std::size_t>(lattice + 1);  // interior lattice points shift up by one
+}
+
+template<int Order>
+constexpr Real public_axis_coord(std::size_t public_index) noexcept {
+    const bool at_first = (public_index == 0u);  // slot 0 maps back to lattice 0
+    const bool at_last = (public_index == 1u);   // slot 1 maps back to lattice Order
+    const int lattice = at_first ? 0 : (at_last ? Order : static_cast<int>(public_index) - 1);
+    return Real(-1) + Real(2) * static_cast<Real>(lattice) / static_cast<Real>(Order);  // equispaced on [-1, 1]
+}
+
+template<int Order>
+constexpr std::array<Real, Order + 1> make_public_axis_nodes() {
+    std::array<Real, Order + 1> coords{};  // indexed by public slot, not by lattice position
+    for (std::size_t slot = 0; slot != coords.size(); ++slot) {
+        coords[slot] = public_axis_coord<Order>(slot);
+    }
+    return coords;
+}
+
+template<int Order>
+constexpr std::array<Real, Order + 1> make_public_axis_inverse_denominators() {
+    // weights[i] = 1 / prod_{j != i} (nodes[i] - nodes[j]): the Lagrange normalisation per node.
+    constexpr auto nodes = make_public_axis_nodes<Order>();
+    std::array<Real, Order + 1> weights{};
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        Real spread = Real(1);
+        for (std::size_t j = 0; j < nodes.size(); ++j) {
+            if (j == i) continue;
+            spread *= nodes[i] - nodes[j];
+        }
+        weights[i] = Real(1) / spread;
+    }
+    return weights;
+}
+
+template<int Order, bool NeedFirst, bool NeedSecond>  // product-form evaluation of the 1D Lagrange basis on the public axis nodes
+void fill_axis_lagrange(Real x,                                 // evaluation coordinate
+                        std::array<Real, Order + 1>& values,    // out: L_i(x) for every node slot i (always written)
+                        std::array<Real, Order + 1>* first,     // out: L_i'(x); dereferenced only when NeedFirst
+                        std::array<Real, Order + 1>* second) {  // out: L_i''(x); dereferenced only when NeedSecond
+    constexpr auto nodes = make_public_axis_nodes<Order>();
+    constexpr auto inv_denominators = make_public_axis_inverse_denominators<Order>();
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        Real product = Real(1);  // numerator: prod over j != i of (x - nodes[j])
+        for (std::size_t j = 0; j < nodes.size(); ++j) {
+            if (j != i) {
+                product *= x - nodes[j];
+            }
+        }
+        values[i] = product * inv_denominators[i];
+        // First derivative: sum over m != i of the numerator with factors i and m left out.
+        if constexpr (NeedFirst) {
+            Real derivative = Real(0);
+            for (std::size_t m = 0; m < nodes.size(); ++m) {
+                if (m == i) {
+                    continue;
+                }
+                Real term = Real(1);
+                for (std::size_t j = 0; j < nodes.size(); ++j) {
+                    if (j != i && j != m) {
+                        term *= x - nodes[j];
+                    }
+                }
+                derivative += term;
+            }
+            (*first)[i] = derivative * inv_denominators[i];
+        }
+        // Second derivative: sum over ordered pairs (m, l), m != l, of the numerator with i, m and l left out.
+        if constexpr (NeedSecond) {
+            Real curvature = Real(0);
+            for (std::size_t m = 0; m < nodes.size(); ++m) {
+                if (m == i) {
+                    continue;
+                }
+                for (std::size_t l = 0; l < nodes.size(); ++l) {
+                    if (l == i || l == m) {
+                        continue;
+                    }
+                    Real term = Real(1);
+                    for (std::size_t j = 0; j < nodes.size(); ++j) {
+                        if (j != i && j != m && j != l) {
+                            term *= x - nodes[j];
+                        }
+                    }
+                    curvature += term;
+                }
+            }
+            (*second)[i] = curvature * inv_denominators[i];
+        }
+    }
+}
+
+template<int Order>
+void fill_axis_values(Real x, std::array<Real, Order + 1>& values) {  // values only: both derivative pointers are passed as nullptr
+    fill_axis_lagrange<Order, false, false>(x, values, nullptr, nullptr);
+}
+
+template<int Order>
+void fill_axis_values_first(Real x,
+                            std::array<Real, Order + 1>& values,  // out: basis values at x
+                            std::array<Real, Order + 1>& first) {  // out: first derivatives at x
+    fill_axis_lagrange<Order, true, false>(x, values, &first, nullptr);
+}
+
+template<int Order>
+void fill_axis_values_first_second(Real x,
+                                   std::array<Real, Order + 1>& values,   // out: basis values at x
+                                   std::array<Real, Order + 1>& first,    // out: first derivatives at x
+                                   std::array<Real, Order + 1>& second) { // out: second derivatives at x
+    fill_axis_lagrange<Order, true, true>(x, values, &first, &second);
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)>
+make_quad_tensor_node_axes() {
+    std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)> table{};
+    std::size_t cursor = 0;
+    // Corners first, walking the boundary from (0, 0); axis slot 1 is the far endpoint.
+    table[cursor++] = {{0u, 0u}};
+    table[cursor++] = {{1u, 0u}};
+    table[cursor++] = {{1u, 1u}};
+    table[cursor++] = {{0u, 1u}};
+    // Edge interiors along the same circuit: bottom, right, then top and left traversed backwards.
+    for (int a = 1; a < Order; ++a) {
+        table[cursor++] = {{public_axis_index(a, Order), 0u}};
+    }
+    for (int b = 1; b < Order; ++b) {
+        table[cursor++] = {{1u, public_axis_index(b, Order)}};
+    }
+    for (int a = Order - 1; a > 0; --a) {
+        table[cursor++] = {{public_axis_index(a, Order), 1u}};
+    }
+    for (int b = Order - 1; b > 0; --b) {
+        table[cursor++] = {{0u, public_axis_index(b, Order)}};
+    }
+    // Face interior, row by row (second axis outermost).
+    for (int b = 1; b < Order; ++b) {
+        for (int a = 1; a < Order; ++a) {
+            table[cursor++] = {{public_axis_index(a, Order), public_axis_index(b, Order)}};
+        }
+    }
+
+    return table;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)>
+make_hex_tensor_node_axes() {
+    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)> axes{};
+    std::size_t n = 0;
+    // Each entry is {x slot, y slot, z slot}; slots 0 and 1 are what public_axis_index returns for lattice 0 and lattice Order.
+    axes[n++] = {{0u, 0u, 0u}};  // 8 corners: ring at z slot 0, then the same ring at z slot 1
+    axes[n++] = {{1u, 0u, 0u}};
+    axes[n++] = {{1u, 1u, 0u}};
+    axes[n++] = {{0u, 1u, 0u}};
+    axes[n++] = {{0u, 0u, 1u}};
+    axes[n++] = {{1u, 0u, 1u}};
+    axes[n++] = {{1u, 1u, 1u}};
+    axes[n++] = {{0u, 1u, 1u}};
+    // 12 edges with Order - 1 interior nodes each: four around z slot 0, four around z slot 1, four along z.
+    for (int i = 1; i < Order; ++i) {
+        axes[n++] = {{public_axis_index(i, Order), 0u, 0u}};
+    }
+    for (int j = 1; j < Order; ++j) {
+        axes[n++] = {{1u, public_axis_index(j, Order), 0u}};
+    }
+    for (int i = Order - 1; i >= 1; --i) {  // traversed in decreasing x
+        axes[n++] = {{public_axis_index(i, Order), 1u, 0u}};
+    }
+    for (int j = Order - 1; j >= 1; --j) {  // traversed in decreasing y
+        axes[n++] = {{0u, public_axis_index(j, Order), 0u}};
+    }
+    for (int i = 1; i < Order; ++i) {
+        axes[n++] = {{public_axis_index(i, Order), 0u, 1u}};
+    }
+    for (int j = 1; j < Order; ++j) {
+        axes[n++] = {{1u, public_axis_index(j, Order), 1u}};
+    }
+    for (int i = Order - 1; i >= 1; --i) {  // traversed in decreasing x
+        axes[n++] = {{public_axis_index(i, Order), 1u, 1u}};
+    }
+    for (int j = Order - 1; j >= 1; --j) {  // traversed in decreasing y
+        axes[n++] = {{0u, public_axis_index(j, Order), 1u}};
+    }
+    for (int k = 1; k < Order; ++k) {  // vertical edges, in the corner order used above
+        axes[n++] = {{0u, 0u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{1u, 0u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{1u, 1u, public_axis_index(k, Order)}};
+    }
+    for (int k = 1; k < Order; ++k) {
+        axes[n++] = {{0u, 1u, public_axis_index(k, Order)}};
+    }
+    // 6 face interiors, in order: z slot 0, z slot 1, y slot 0, x slot 1, y slot 1, x slot 0.
+    for (int j = 1; j < Order; ++j) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 0u}};
+        }
+    }
+    for (int j = 1; j < Order; ++j) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 1u}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int i = 1; i < Order; ++i) {
+            axes[n++] = {{public_axis_index(i, Order), 0u, public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int j = 1; j < Order; ++j) {
+            axes[n++] = {{1u, public_axis_index(j, Order), public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int i = Order - 1; i >= 1; --i) {  // inner index runs backwards on this face
+            axes[n++] = {{public_axis_index(i, Order), 1u, public_axis_index(k, Order)}};
+        }
+    }
+    for (int k = 1; k < Order; ++k) {
+        for (int j = Order - 1; j >= 1; --j) {  // inner index runs backwards on this face
+            axes[n++] = {{0u, public_axis_index(j, Order), public_axis_index(k, Order)}};
+        }
+    }
+    // Volume interior: x varies fastest, z slowest.
+    for (int k = 1; k < Order; ++k) {
+        for (int j = 1; j < Order; ++j) {
+            for (int i = 1; i < Order; ++i) {
+                axes[n++] = {{public_axis_index(i, Order),
+                              public_axis_index(j, Order),
+                              public_axis_index(k, Order)}};
+            }
+        }
+    }
+
+    return axes;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2>
+make_triangle_simplex_exponents() {
+    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2> table{};
+    std::size_t cursor = 0;
+    constexpr std::size_t full = static_cast<std::size_t>(Order);  // a vertex carries the whole order in one slot
+    table[cursor++] = {{full, 0u, 0u}};
+    table[cursor++] = {{0u, full, 0u}};
+    table[cursor++] = {{0u, 0u, full}};
+    // Edge interiors: weight moves from slot 0 to 1, then from 1 to 2, then from 2 back to 0.
+    for (int step = 1; step < Order; ++step) {
+        table[cursor++] = {{static_cast<std::size_t>(Order - step), static_cast<std::size_t>(step), 0u}};
+    }
+    for (int step = 1; step < Order; ++step) {
+        table[cursor++] = {{0u, static_cast<std::size_t>(Order - step), static_cast<std::size_t>(step)}};
+    }
+    for (int step = 1; step < Order; ++step) {
+        table[cursor++] = {{static_cast<std::size_t>(step), 0u, static_cast<std::size_t>(Order - step)}};
+    }
+    // Interior: every triple of positive integers summing to Order, third slot outermost.
+    for (int third = 1; third <= Order - 2; ++third) {
+        for (int second = 1; second <= Order - third - 1; ++second) {
+            const int first = Order - second - third;
+            table[cursor++] = {{static_cast<std::size_t>(first),
+                                static_cast<std::size_t>(second),
+                                static_cast<std::size_t>(third)}};
+        }
+    }
+
+    return table;
+}
+
+template<int Order>
+constexpr std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6>
+make_tetrahedron_simplex_exponents() {
+    std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6> exponents{};
+    std::size_t n = 0;
+    // Four vertices: the whole order sits in a single slot.
+    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u, 0u}};
+    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u, 0u}};
+    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order), 0u}};
+    exponents[n++] = {{0u, 0u, 0u, static_cast<std::size_t>(Order)}};
+    // Six edges as (start slot, end slot); along each edge m units move from start to end.
+    constexpr int edges[6][2] = {
+        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < Order; ++m) {
+            std::array<std::size_t, 4> e{};  // the two slots not on this edge stay zero
+            e[static_cast<std::size_t>(edge[0])] = static_cast<std::size_t>(Order - m);
+            e[static_cast<std::size_t>(edge[1])] = static_cast<std::size_t>(m);
+            exponents[n++] = e;
+        }
+    }
+    // Four faces as slot triples; each face interior is every positive (a, b, c) with a + b + c == Order.
+    constexpr int faces[4][3] = {
+        {0, 1, 2},
+        {0, 1, 3},
+        {1, 2, 3},
+        {0, 2, 3},
+    };
+    for (const auto& face : faces) {
+        for (int c = 1; c <= Order - 2; ++c) {
+            for (int b = 1; b <= Order - c - 1; ++b) {
+                const int a = Order - b - c;
+                std::array<std::size_t, 4> e{};  // the slot opposite this face stays zero
+                e[static_cast<std::size_t>(face[0])] = static_cast<std::size_t>(a);
+                e[static_cast<std::size_t>(face[1])] = static_cast<std::size_t>(b);
+                e[static_cast<std::size_t>(face[2])] = static_cast<std::size_t>(c);
+                exponents[n++] = e;
+            }
+        }
+    }
+    // Volume interior: every positive (i, j, k, l) with i + j + k + l == Order, l outermost.
+    for (int l = 1; l <= Order - 3; ++l) {
+        for (int k = 1; k <= Order - l - 2; ++k) {
+            for (int j = 1; j <= Order - l - k - 1; ++j) {
+                const int i = Order - j - k - l;
+                exponents[n++] = {{static_cast<std::size_t>(i),
+                                   static_cast<std::size_t>(j),
+                                   static_cast<std::size_t>(k),
+                                   static_cast<std::size_t>(l)}};
+            }
+        }
+    }
+
+    return exponents;
+}
+
+template<int Order, bool NeedFirst, bool NeedSecond>  // fills phi[a] = prod_{k=0}^{a-1} (Order * lambda - k) / (k + 1) for a = 0..Order
+void fill_simplex_factor_sequence(Real lambda,
+                                  std::array<Real, Order + 1>& phi,     // out: factor values, phi[0] = 1
+                                  std::array<Real, Order + 1>* dphi,    // out: d phi / d lambda; dereferenced only when NeedFirst
+                                  std::array<Real, Order + 1>* d2phi) { // out: d2 phi / d lambda2; dereferenced only when NeedSecond
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) {
+        (*dphi)[0] = Real(0);
+    }
+    if constexpr (NeedSecond) {
+        (*d2phi)[0] = Real(0);
+    }
+    // Work in the scaled variable t = Order * lambda; derivatives in t are rescaled by dt_dlambda when stored.
+    const Real t = static_cast<Real>(Order) * lambda;
+    constexpr Real dt_dlambda = static_cast<Real>(Order);
+    Real dphi_dt_prev = Real(0);
+    Real d2phi_dt2_prev = Real(0);
+    // Recurrence phi[a] = s * phi[a - 1] with s = (t - (a - 1)) / a, differentiated by the product rule.
+    for (int a = 1; a <= Order; ++a) {
+        const std::size_t au = static_cast<std::size_t>(a);
+        const Real inv_a = Real(1) / static_cast<Real>(a);
+        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
+        phi[au] = s * phi[au - 1];
+
+        if constexpr (NeedFirst) {
+            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_prev;  // ds/dt == inv_a
+            (*dphi)[au] = dt_dlambda * dphi_dt;
+            // Second derivatives are nested here, so entries above index 0 are written only when NeedFirst is also set.
+            if constexpr (NeedSecond) {
+                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_prev + s * d2phi_dt2_prev;
+                (*d2phi)[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
+                d2phi_dt2_prev = d2phi_dt2;
+            }
+
+            dphi_dt_prev = dphi_dt;  // updated last: the second-derivative step above needs the previous value
+        }
+    }
+}
+
+template<int Order>
+void fill_simplex_factor_values(Real lambda, std::array<Real, Order + 1>& phi) {  // values only: both derivative pointers are passed as nullptr
+    fill_simplex_factor_sequence<Order, false, false>(lambda, phi, nullptr, nullptr);
+}
+
+template<int Order>
+void fill_simplex_factor_values_first(Real lambda,
+                                      std::array<Real, Order + 1>& phi,    // out: factor values
+                                      std::array<Real, Order + 1>& dphi) { // out: first derivatives in lambda
+    fill_simplex_factor_sequence<Order, true, false>(lambda, phi, &dphi, nullptr);
+}
+
+template<int Order>
+void fill_simplex_factor_values_first_second(Real lambda,
+                                             std::array<Real, Order + 1>& phi,     // out: factor values
+                                             std::array<Real, Order + 1>& dphi,    // out: first derivatives in lambda
+                                             std::array<Real, Order + 1>& d2phi) { // out: second derivatives in lambda
+    fill_simplex_factor_sequence<Order, true, true>(lambda, phi, &dphi, &d2phi);
+}
+
+} // namespace detail
+
+// ---------------------------------------------------------------------------
+// LagrangeLineFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeLineFast;
+
+template<>
+struct LagrangeLineFast<1> {
+    static constexpr int n_dofs = 2;
+    // Linear pair on [-1, 1]: out[0] equals 1 at xi[0] = -1, out[1] equals 1 at xi[0] = +1.
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real x = xi[0];
+        out[0] = (Real(1) - x) * Real(0.5);
+        out[1] = (Real(1) + x) * Real(0.5);
+    }
+    // Slopes are the constants -1/2 and +1/2 along the first axis, so the point is unused.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-0.5), Real(0), Real(0)};
+        out[1] = Gradient{Real(0.5), Real(0), Real(0)};
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        for (Hessian& h : out) { h = Hessian{}; }  // every entry is a value-initialized Hessian
+    }
+};
+
+template<>
+struct LagrangeLineFast<2> {
+    static constexpr int n_dofs = 3;
+
+    // Quadratic element on [-1, 1]: slots 0 and 1 are the endpoints -1 and +1, slot 2 is the midpoint.
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real s = xi[0];
+        out[0] = s * (s - Real(1)) * Real(0.5);
+        out[1] = s * (s + Real(1)) * Real(0.5);
+        out[2] = (Real(1) - s) * (Real(1) + s);
+    }
+    // Derivatives of the three quadratics above with respect to the first coordinate.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real s = xi[0];
+        out[0] = Gradient{s - Real(0.5), Real(0), Real(0)};
+        out[1] = Gradient{s + Real(0.5), Real(0), Real(0)};
+        out[2] = Gradient{Real(-2) * s, Real(0), Real(0)};
+    }
+    // Second derivatives are the constants 1, 1 and -2, stored in entry (0, 0) of a fresh Hessian.
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Real curvature[n_dofs] = {Real(1), Real(1), Real(-2)};
+        for (std::size_t i = 0; i < out.size(); ++i) {
+            out[i] = Hessian{};
+            out[i](0, 0) = curvature[i];
+        }
+    }
+};
+
+template<>
+struct LagrangeLineFast<3> {
+    static constexpr int n_dofs = 4;
+    // Cubic case: every evaluator forwards to the generic 1D axis helpers in detail.
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        detail::fill_axis_values<3>(xi[0], out);
+    }
+
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        std::array<Real, n_dofs> shape{};  // filled by the helper but not returned
+        std::array<Real, n_dofs> slope{};
+        detail::fill_axis_values_first<3>(xi[0], shape, slope);
+        for (std::size_t k = 0; k != slope.size(); ++k) {
+            out[k] = Gradient{slope[k], Real(0), Real(0)};
+        }
+    }
+
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        std::array<Real, n_dofs> shape{};
+        std::array<Real, n_dofs> slope{};
+        std::array<Real, n_dofs> curve{};
+        detail::fill_axis_values_first_second<3>(xi[0], shape, slope, curve);
+        // Only entry (0, 0) is set on a fresh Hessian; the element has a single coordinate.
+        for (std::size_t k = 0; k != curve.size(); ++k) {
+            out[k] = Hessian{};
+            out[k](0, 0) = curve[k];
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeQuadFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeQuadFast;
+
+template<>
+struct LagrangeQuadFast<1> {
+    static constexpr int n_dofs = 4;
+
+    // Bilinear quad; corners follow the VTK Quad4 order (-,-), (+,-), (+,+), (-,+).
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real x_lo = (Real(1) - xi[0]) * Real(0.5);
+        const Real y_lo = (Real(1) - xi[1]) * Real(0.5);
+        const Real x_hi = (Real(1) + xi[0]) * Real(0.5);
+        const Real y_hi = (Real(1) + xi[1]) * Real(0.5);
+        out[0] = x_lo * y_lo;
+        out[1] = x_hi * y_lo;
+        out[2] = x_hi * y_hi;
+        out[3] = x_lo * y_hi;
+    }
+
+    // Each 1D factor has slope -1/2 (low side) or +1/2 (high side), multiplied by the other factor.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real x_lo = (Real(1) - xi[0]) * Real(0.5);
+        const Real y_lo = (Real(1) - xi[1]) * Real(0.5);
+        const Real x_hi = (Real(1) + xi[0]) * Real(0.5);
+        const Real y_hi = (Real(1) + xi[1]) * Real(0.5);
+        out[0] = Gradient{Real(-0.5) * y_lo, Real(-0.5) * x_lo, Real(0)};
+        out[1] = Gradient{Real(0.5) * y_lo, Real(-0.5) * x_hi, Real(0)};
+        out[2] = Gradient{Real(0.5) * y_hi, Real(0.5) * x_hi, Real(0)};
+        out[3] = Gradient{Real(-0.5) * y_hi, Real(0.5) * x_lo, Real(0)};
+    }
+
+    // Only the mixed entries (0, 1) and (1, 0) are set: +1/4 on corners 0 and 2, -1/4 on corners 1 and 3.
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Real quarter = Real(0.25);
+        for (std::size_t k = 0; k < out.size(); ++k) {
+            const Real cross = (k % 2u == 0u) ? quarter : -quarter;
+            out[k] = Hessian{};
+            out[k](0, 1) = cross;
+            out[k](1, 0) = cross;
+        }
+    }
+};
+
+template<>
+struct LagrangeQuadFast<2> {
+    static constexpr int n_dofs = 9;
+
+    // node_axes[n] = {ix, iy}: indices of the 1D factors multiplied to form shape function n.
+    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes = {{
+        {{0u, 0u}}, {{1u, 0u}}, {{1u, 1u}}, {{0u, 1u}},
+        {{2u, 0u}}, {{1u, 2u}}, {{2u, 1u}}, {{0u, 2u}},
+        {{2u, 2u}},
+    }};
+
+    // Values: product of the LagrangeLineFast<2> values at xi[0] and xi[1]; xi[2] is not read.
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{};
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            out[n] = bx[ix] * by[iy];
+        }
+    }
+
+    // Reference gradients by the product rule; the third component is always zero.
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{};
+        std::array<Gradient, Line::n_dofs> dbx{}, dby{};
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        Line::evaluate_gradients({xi[0], Real(0), Real(0)}, dbx);
+        Line::evaluate_gradients({xi[1], Real(0), Real(0)}, dby);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            out[n] = Gradient{dbx[ix][0] * by[iy], bx[ix] * dby[iy][0], Real(0)};
+        }
+    }
+
+    // Reference Hessians: xx and yy on the diagonal plus the symmetric mixed xy entry.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{};
+        std::array<Gradient, Line::n_dofs> dbx{}, dby{};
+        std::array<Hessian, Line::n_dofs> ddbx{}, ddby{};
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        Line::evaluate_gradients({xi[0], Real(0), Real(0)}, dbx);
+        Line::evaluate_gradients({xi[1], Real(0), Real(0)}, dby);
+        Line::evaluate_hessians({xi[0], Real(0), Real(0)}, ddbx);
+        Line::evaluate_hessians({xi[1], Real(0), Real(0)}, ddby);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            const Real mixed = dbx[ix][0] * dby[iy][0];
+            out[n] = Hessian{};
+            out[n](0, 0) = ddbx[ix](0, 0) * by[iy];
+            out[n](1, 1) = bx[ix] * ddby[iy](0, 0);
+            out[n](0, 1) = mixed;
+            out[n](1, 0) = mixed;
+        }
+    }
+};
+
+// Order-3 specialization; node_axes and the per-axis 1D tables come from detail:: helpers.
+template<>
+struct LagrangeQuadFast<3> {
+    static constexpr int n_dofs = 16;
+
+    // node_axes[n] = {ix, iy}: indices of the 1D factors multiplied to form shape function n.
+    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes =
+        detail::make_quad_tensor_node_axes<3>();
+
+    // Values: product of the per-axis 1D tables filled at xi[0] and xi[1]; xi[2] is not read.
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> bx{}, by{};
+        detail::fill_axis_values<3>(xi[0], bx);
+        detail::fill_axis_values<3>(xi[1], by);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            out[n] = bx[ix] * by[iy];
+        }
+    }
+
+    // Reference gradients by the product rule; the third component is always zero.
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Axis = std::array<Real, LagrangeLineFast<3>::n_dofs>;
+        Axis bx{}, by{};    // first output of the helper: used as the 1D values
+        Axis dbx{}, dby{};  // second output: used as the 1D first derivatives
+        detail::fill_axis_values_first<3>(xi[0], bx, dbx);
+        detail::fill_axis_values_first<3>(xi[1], by, dby);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            out[n] = Gradient{dbx[ix] * by[iy], bx[ix] * dby[iy], Real(0)};
+        }
+    }
+
+    // Reference Hessians: xx and yy on the diagonal plus the symmetric mixed xy entry.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Axis = std::array<Real, LagrangeLineFast<3>::n_dofs>;
+        Axis bx{}, by{};      // used as the 1D values
+        Axis dbx{}, dby{};    // used as the 1D first derivatives
+        Axis ddbx{}, ddby{};  // used as the 1D second derivatives
+        detail::fill_axis_values_first_second<3>(xi[0], bx, dbx, ddbx);
+        detail::fill_axis_values_first_second<3>(xi[1], by, dby, ddby);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy] = node_axes[n];
+            const Real mixed = dbx[ix] * dby[iy];
+            out[n] = Hessian{};
+            out[n](0, 0) = ddbx[ix] * by[iy];
+            out[n](1, 1) = bx[ix] * ddby[iy];
+            out[n](0, 1) = mixed;
+            out[n](1, 0) = mixed;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeHexFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeHexFast;  // Declared only: orders 1-3 are defined as explicit specializations below.
+
+template<>
+struct LagrangeHexFast<1> {
+    static constexpr int n_dofs = 8;
+
+    // VTK Hex8 corner ordering: (-,-,-), (+,-,-), (+,+,-), (-,+,-),
+    //                           (-,-,+), (+,-,+), (+,+,+), (-,+,+).
+    // Values: every corner function is the product of one linear factor per axis.
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        constexpr Real half = Real(0.5);
+        const Real x_lo = (Real(1) - xi[0]) * half;
+        const Real x_hi = (Real(1) + xi[0]) * half;
+        const Real y_lo = (Real(1) - xi[1]) * half;
+        const Real y_hi = (Real(1) + xi[1]) * half;
+        const Real z_lo = (Real(1) - xi[2]) * half;
+        const Real z_hi = (Real(1) + xi[2]) * half;
+        // In-plane (x, y) products, shared by the lower-z and upper-z corner layers.
+        const Real plane[4] = {x_lo * y_lo, x_hi * y_lo, x_hi * y_hi, x_lo * y_hi};
+        for (std::size_t c = 0; c < 4; ++c) {
+            out[c] = plane[c] * z_lo;
+            out[c + 4] = plane[c] * z_hi;
+        }
+    }
+
+    // Reference gradients: one axis factor is replaced by its constant slope (-0.5 or +0.5).
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        constexpr Real m = Real(-0.5), p = Real(0.5);  // slopes of (1 - t)/2 and (1 + t)/2
+        const Real x_lo = (Real(1) - xi[0]) * p, x_hi = (Real(1) + xi[0]) * p;
+        const Real y_lo = (Real(1) - xi[1]) * p, y_hi = (Real(1) + xi[1]) * p;
+        const Real z_lo = (Real(1) - xi[2]) * p, z_hi = (Real(1) + xi[2]) * p;
+        // Per-corner axis factors (fx, fy, fz) and slopes (dx, dy, dz), in the corner order above.
+        const Real fx[8] = {x_lo, x_hi, x_hi, x_lo, x_lo, x_hi, x_hi, x_lo};
+        const Real fy[8] = {y_lo, y_lo, y_hi, y_hi, y_lo, y_lo, y_hi, y_hi};
+        const Real fz[8] = {z_lo, z_lo, z_lo, z_lo, z_hi, z_hi, z_hi, z_hi};
+        constexpr Real dx[8] = {m, p, p, m, m, p, p, m};
+        constexpr Real dy[8] = {m, m, p, p, m, m, p, p};
+        constexpr Real dz[8] = {m, m, m, m, p, p, p, p};
+        for (std::size_t c = 0; c < 8; ++c) {
+            out[c] = Gradient{dx[c] * fy[c] * fz[c], dy[c] * fx[c] * fz[c], dz[c] * fx[c] * fy[c]};
+        }
+    }
+
+    // Reference Hessians: the diagonal stays zero; mixed entries are +-0.25 times the third factor.
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Real half = Real(0.5);
+        const Real x_lo = (Real(1) - xi[0]) * half, x_hi = (Real(1) + xi[0]) * half;
+        const Real y_lo = (Real(1) - xi[1]) * half, y_hi = (Real(1) + xi[1]) * half;
+        const Real z_lo = (Real(1) - xi[2]) * half, z_hi = (Real(1) + xi[2]) * half;
+        const Real fx[8] = {x_lo, x_hi, x_hi, x_lo, x_lo, x_hi, x_hi, x_lo};
+        const Real fy[8] = {y_lo, y_lo, y_hi, y_hi, y_lo, y_lo, y_hi, y_hi};
+        const Real fz[8] = {z_lo, z_lo, z_lo, z_lo, z_hi, z_hi, z_hi, z_hi};
+        // Corner coordinate signs per axis; their pairwise products give the mixed-term signs.
+        constexpr int sx[8] = {-1, 1, 1, -1, -1, 1, 1, -1};
+        constexpr int sy[8] = {-1, -1, 1, 1, -1, -1, 1, 1};
+        constexpr int sz[8] = {-1, -1, -1, -1, 1, 1, 1, 1};
+        constexpr Real quarter = Real(0.25);
+
+        for (std::size_t c = 0; c < static_cast<std::size_t>(n_dofs); ++c) {
+            const Real hxy = static_cast<Real>(sx[c] * sy[c]) * quarter * fz[c];
+            const Real hxz = static_cast<Real>(sx[c] * sz[c]) * quarter * fy[c];
+            const Real hyz = static_cast<Real>(sy[c] * sz[c]) * quarter * fx[c];
+            Hessian H{};
+            H(0, 1) = hxy;
+            H(1, 0) = hxy;
+            H(0, 2) = hxz;
+            H(2, 0) = hxz;
+            H(1, 2) = hyz;
+            H(2, 1) = hyz;
+            out[c] = H;
+        }
+    }
+};
+
+template<>
+struct LagrangeHexFast<2> {
+    static constexpr int n_dofs = 27;
+
+    // node_axes[n] = {ix, iy, iz}: indices of the 1D factors multiplied to form shape function n.
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes = {{
+        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
+        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
+        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
+        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
+        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
+        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
+        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
+    }};
+
+    // Values: product of the LagrangeLineFast<2> values at xi[0], xi[1] and xi[2].
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{}, bz{};
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        Line::evaluate({xi[2], Real(0), Real(0)}, bz);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            out[n] = bx[ix] * by[iy] * bz[iz];
+        }
+    }
+
+    // Reference gradients: component d differentiates the factor of axis d and keeps the others.
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{}, bz{};
+        std::array<Gradient, Line::n_dofs> dbx{}, dby{}, dbz{};
+
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        Line::evaluate({xi[2], Real(0), Real(0)}, bz);
+        Line::evaluate_gradients({xi[0], Real(0), Real(0)}, dbx);
+        Line::evaluate_gradients({xi[1], Real(0), Real(0)}, dby);
+        Line::evaluate_gradients({xi[2], Real(0), Real(0)}, dbz);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            // Entry 0 of each 1D gradient is the derivative consumed by the product rule.
+            const Real d_dx = dbx[ix][0] * by[iy] * bz[iz];
+            const Real d_dy = bx[ix] * dby[iy][0] * bz[iz];
+            const Real d_dz = bx[ix] * by[iy] * dbz[iz][0];
+            out[n] = Gradient{d_dx, d_dy, d_dz};
+        }
+    }
+
+    // Reference Hessians: pure second derivatives on the diagonal, symmetric mixed entries
+    // built from pairs of first derivatives.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Line = LagrangeLineFast<2>;
+        std::array<Real, Line::n_dofs> bx{}, by{}, bz{};
+        std::array<Gradient, Line::n_dofs> dbx{}, dby{}, dbz{};
+        std::array<Hessian, Line::n_dofs> ddbx{}, ddby{}, ddbz{};
+
+        Line::evaluate({xi[0], Real(0), Real(0)}, bx);
+        Line::evaluate({xi[1], Real(0), Real(0)}, by);
+        Line::evaluate({xi[2], Real(0), Real(0)}, bz);
+        Line::evaluate_gradients({xi[0], Real(0), Real(0)}, dbx);
+        Line::evaluate_gradients({xi[1], Real(0), Real(0)}, dby);
+        Line::evaluate_gradients({xi[2], Real(0), Real(0)}, dbz);
+        Line::evaluate_hessians({xi[0], Real(0), Real(0)}, ddbx);
+        Line::evaluate_hessians({xi[1], Real(0), Real(0)}, ddby);
+        Line::evaluate_hessians({xi[2], Real(0), Real(0)}, ddbz);
+
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            // Mixed second derivatives; each value fills both symmetric entries.
+            const Real hxy = dbx[ix][0] * dby[iy][0] * bz[iz];
+            const Real hxz = dbx[ix][0] * by[iy] * dbz[iz][0];
+            const Real hyz = bx[ix] * dby[iy][0] * dbz[iz][0];
+
+            out[n] = Hessian{};
+            out[n](0, 0) = ddbx[ix](0, 0) * by[iy] * bz[iz];
+            out[n](1, 1) = bx[ix] * ddby[iy](0, 0) * bz[iz];
+            out[n](2, 2) = bx[ix] * by[iy] * ddbz[iz](0, 0);
+            out[n](0, 1) = hxy;
+            out[n](1, 0) = hxy;
+            out[n](0, 2) = hxz;
+            out[n](2, 0) = hxz;
+            out[n](1, 2) = hyz;
+            out[n](2, 1) = hyz;
+        }
+    }
+};
+
+// Order-3 specialization; node_axes and the per-axis 1D tables come from detail:: helpers.
+template<>
+struct LagrangeHexFast<3> {
+    static constexpr int n_dofs = 64;
+
+    // node_axes[n] = {ix, iy, iz}: indices of the 1D factors multiplied to form shape function n.
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes =
+        detail::make_hex_tensor_node_axes<3>();
+
+    // Values: product of the per-axis 1D tables filled at xi[0], xi[1] and xi[2].
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        std::array<Real, LagrangeLineFast<3>::n_dofs> bx{}, by{}, bz{};
+        detail::fill_axis_values<3>(xi[0], bx);
+        detail::fill_axis_values<3>(xi[1], by);
+        detail::fill_axis_values<3>(xi[2], bz);
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            out[n] = bx[ix] * by[iy] * bz[iz];
+        }
+    }
+
+    // Reference gradients: component d differentiates the factor of axis d and keeps the others.
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Axis = std::array<Real, LagrangeLineFast<3>::n_dofs>;
+        Axis bx{}, by{}, bz{};     // first output of the helper: used as the 1D values
+        Axis dbx{}, dby{}, dbz{};  // second output: used as the 1D first derivatives
+
+        detail::fill_axis_values_first<3>(xi[0], bx, dbx);
+        detail::fill_axis_values_first<3>(xi[1], by, dby);
+        detail::fill_axis_values_first<3>(xi[2], bz, dbz);
+
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            const Real d_dx = dbx[ix] * by[iy] * bz[iz];
+            const Real d_dy = bx[ix] * dby[iy] * bz[iz];
+            const Real d_dz = bx[ix] * by[iy] * dbz[iz];
+            out[n] = Gradient{d_dx, d_dy, d_dz};
+        }
+    }
+
+    // Reference Hessians: pure second derivatives on the diagonal, symmetric mixed entries
+    // built from pairs of first derivatives.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Axis = std::array<Real, LagrangeLineFast<3>::n_dofs>;
+        Axis bx{}, by{}, bz{};        // used as the 1D values
+        Axis dbx{}, dby{}, dbz{};     // used as the 1D first derivatives
+        Axis ddbx{}, ddby{}, ddbz{};  // used as the 1D second derivatives
+
+        detail::fill_axis_values_first_second<3>(xi[0], bx, dbx, ddbx);
+        detail::fill_axis_values_first_second<3>(xi[1], by, dby, ddby);
+        detail::fill_axis_values_first_second<3>(xi[2], bz, dbz, ddbz);
+
+        for (std::size_t n = 0; n != node_axes.size(); ++n) {
+            const auto& [ix, iy, iz] = node_axes[n];
+            // Mixed second derivatives; each value fills both symmetric entries.
+            const Real hxy = dbx[ix] * dby[iy] * bz[iz];
+            const Real hxz = dbx[ix] * by[iy] * dbz[iz];
+            const Real hyz = bx[ix] * dby[iy] * dbz[iz];
+
+            out[n] = Hessian{};
+            out[n](0, 0) = ddbx[ix] * by[iy] * bz[iz];
+            out[n](1, 1) = bx[ix] * ddby[iy] * bz[iz];
+            out[n](2, 2) = bx[ix] * by[iy] * ddbz[iz];
+            out[n](0, 1) = hxy;
+            out[n](1, 0) = hxy;
+            out[n](0, 2) = hxz;
+            out[n](2, 0) = hxz;
+            out[n](1, 2) = hyz;
+            out[n](2, 1) = hyz;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeTriFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeTriFast;  // Declared only: orders 1-3 are defined as explicit specializations below.
+
+template<>
+struct LagrangeTriFast<1> {
+    static constexpr int n_dofs = 3;
+
+    // Values are the barycentric coordinates (1 - x - y, x, y); xi[2] is not read.
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real x = xi[0], y = xi[1];
+        out[0] = Real(1) - x - y;
+        out[1] = x;
+        out[2] = y;
+    }
+
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-1), Real(-1), Real(0)};  // gradient of 1 - x - y
+        out[1] = Gradient{Real( 1), Real( 0), Real(0)};  // gradient of x
+        out[2] = Gradient{Real( 0), Real( 1), Real(0)};  // gradient of y
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        for (Hessian& H : out) H = Hessian{};  // linear functions: every entry is a default Hessian
+    }
+};
+
+template<>
+struct LagrangeTriFast<2> {
+    static constexpr int n_dofs = 6;
+
+    // Values: vertex functions L(2L - 1), then edge functions 4 La Lb for pairs (0,1), (1,2), (0,2).
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real L[3] = {Real(1) - xi[0] - xi[1], xi[0], xi[1]};  // barycentric coordinates
+        for (std::size_t v = 0; v < 3; ++v) {
+            out[v] = L[v] * (Real(2) * L[v] - Real(1));
+        }
+        out[3] = Real(4) * L[0] * L[1];
+        out[4] = Real(4) * L[1] * L[2];
+        out[5] = Real(4) * L[0] * L[2];
+    }
+
+    // Gradients through the barycentric coordinates L, whose own gradients dL are constant.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        const Real L[3] = {Real(1) - xi[0] - xi[1], xi[0], xi[1]};
+        constexpr Gradient dL[3] = {
+            Gradient{Real(-1), Real(-1), Real(0)},
+            Gradient{Real( 1), Real( 0), Real(0)},
+            Gradient{Real( 0), Real( 1), Real(0)},
+        };
+        for (std::size_t v = 0; v < 3; ++v) {
+            out[v] = detail::scaled_gradient(dL[v], Real(4) * L[v] - Real(1));
+        }
+        out[3] = detail::p2_edge_gradient(L[0], dL[0], L[1], dL[1]);
+        out[4] = detail::p2_edge_gradient(L[1], dL[1], L[2], dL[2]);
+        out[5] = detail::p2_edge_gradient(L[0], dL[0], L[2], dL[2]);
+    }
+
+    // Hessians are built from the constant gradients dL only, so the evaluation point is unused.
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr Gradient dL[3] = {
+            Gradient{Real(-1), Real(-1), Real(0)},
+            Gradient{Real( 1), Real( 0), Real(0)},
+            Gradient{Real( 0), Real( 1), Real(0)},
+        };
+        for (std::size_t v = 0; v < 3; ++v) {
+            out[v] = detail::p2_vertex_hessian(dL[v]);
+        }
+        out[3] = detail::p2_edge_hessian(dL[0], dL[1]);
+        out[4] = detail::p2_edge_hessian(dL[1], dL[2]);
+        out[5] = detail::p2_edge_hessian(dL[0], dL[2]);
+    }
+};
+
+// Order-3 specialization: shape functions are products of 1D factors in barycentric coordinates.
+template<>
+struct LagrangeTriFast<3> {
+    static constexpr int n_dofs = 10;
+
+    // exponents[n] = {e0, e1, e2}: index into the factor table of each barycentric coordinate.
+    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> exponents =
+        detail::make_triangle_simplex_exponents<3>();
+
+    // Values: one tabulated factor per barycentric coordinate (L0 = 1 - x - y, L1 = x, L2 = y).
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L0 = Real(1) - L1 - L2;
+        Factors f0{}, f1{}, f2{};
+        detail::fill_simplex_factor_values<3>(L0, f0);
+        detail::fill_simplex_factor_values<3>(L1, f1);
+        detail::fill_simplex_factor_values<3>(L2, f2);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2] = exponents[n];
+            out[n] = f0[e0] * f1[e1] * f2[e2];
+        }
+    }
+
+    // Gradients: derivatives with respect to (L0, L1, L2) mapped to (x, y) using
+    // dL0 = (-1, -1), dL1 = (1, 0), dL2 = (0, 1); the third component is zero.
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L0 = Real(1) - L1 - L2;
+        Factors f0{}, f1{}, f2{};     // used as the factor values
+        Factors df0{}, df1{}, df2{};  // used as the factor first derivatives
+        detail::fill_simplex_factor_values_first<3>(L0, f0, df0);
+        detail::fill_simplex_factor_values_first<3>(L1, f1, df1);
+        detail::fill_simplex_factor_values_first<3>(L2, f2, df2);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2] = exponents[n];
+            // Partial derivatives with respect to each barycentric coordinate.
+            const Real d_L0 = df0[e0] * f1[e1] * f2[e2];
+            const Real d_L1 = f0[e0] * df1[e1] * f2[e2];
+            const Real d_L2 = f0[e0] * f1[e1] * df2[e2];
+            out[n] = Gradient{d_L1 - d_L0, d_L2 - d_L0, Real(0)};
+        }
+    }
+
+    // Hessians: second derivatives with respect to barycentric pairs, contracted with the
+    // constant coordinate gradients; only the upper-left 2x2 block is written.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L0 = Real(1) - L1 - L2;
+        Factors f0{}, f1{}, f2{};        // used as the factor values
+        Factors df0{}, df1{}, df2{};     // used as the factor first derivatives
+        Factors ddf0{}, ddf1{}, ddf2{};  // used as the factor second derivatives
+        detail::fill_simplex_factor_values_first_second<3>(L0, f0, df0, ddf0);
+        detail::fill_simplex_factor_values_first_second<3>(L1, f1, df1, ddf1);
+        detail::fill_simplex_factor_values_first_second<3>(L2, f2, df2, ddf2);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2] = exponents[n];
+            const Real v0 = f0[e0];
+            const Real v1 = f1[e1];
+            const Real v2 = f2[e2];
+            const Real d0 = df0[e0];
+            const Real d1 = df1[e1];
+            const Real d2 = df2[e2];
+
+            // b_ab: second derivative with respect to barycentric coordinates a and b.
+            const Real b00 = ddf0[e0] * v1 * v2;
+            const Real b11 = v0 * ddf1[e1] * v2;
+            const Real b22 = v0 * v1 * ddf2[e2];
+            const Real b01 = d0 * d1 * v2;
+            const Real b02 = d0 * v1 * d2;
+            const Real b12 = v0 * d1 * d2;
+
+            // Contract with dL/dx = (-1, 1, 0) and dL/dy = (-1, 0, 1); the mixed entry is
+            // stored in both symmetric positions.
+            const Real hxy = b00 - b01 - b02 + b12;
+            out[n] = Hessian{};
+            out[n](0, 0) = b00 - Real(2) * b01 + b11;
+            out[n](1, 1) = b00 - Real(2) * b02 + b22;
+            out[n](0, 1) = hxy;
+            out[n](1, 0) = hxy;
+        }
+    }
+};
+
+// ---------------------------------------------------------------------------
+// LagrangeTetFast<Order>
+// ---------------------------------------------------------------------------
+template<int Order>
+struct LagrangeTetFast;  // Declared only: orders 1-3 are defined as explicit specializations below.
+
+template<>
+struct LagrangeTetFast<1> {
+    static constexpr int n_dofs = 4;
+
+    // Values are the barycentric coordinates (1 - x - y - z, x, y, z) of the reference point.
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        const Real x = xi[0], y = xi[1], z = xi[2];
+        out[0] = Real(1) - x - y - z;
+        out[1] = x;
+        out[2] = y;
+        out[3] = z;
+    }
+
+    // Gradients of the linear functions are constant, so the evaluation point is unused.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
+                                             std::array<Gradient, n_dofs>& out) {
+        out[0] = Gradient{Real(-1), Real(-1), Real(-1)};  // gradient of 1 - x - y - z
+        out[1] = Gradient{Real( 1), Real( 0), Real( 0)};  // gradient of x
+        out[2] = Gradient{Real( 0), Real( 1), Real( 0)};  // gradient of y
+        out[3] = Gradient{Real( 0), Real( 0), Real( 1)};  // gradient of z
+    }
+
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        for (Hessian& H : out) H = Hessian{};  // linear functions: every entry is a default Hessian
+    }
+};
+
+template<>
+struct LagrangeTetFast<2> {
+    static constexpr int n_dofs = 10;
+
+    // Values: four vertex functions L(2L - 1), then six edge functions 4 La Lb for the vertex
+    // pairs (0,1), (1,2), (0,2), (0,3), (1,3), (2,3).
+    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        constexpr std::size_t edges[6][2] = {{0, 1}, {1, 2}, {0, 2}, {0, 3}, {1, 3}, {2, 3}};
+        // Barycentric coordinates: L[0] = 1 - x - y - z, L[1] = x, L[2] = y, L[3] = z.
+        const Real L[4] = {Real(1) - xi[0] - xi[1] - xi[2], xi[0], xi[1], xi[2]};
+        // Vertex functions occupy out[0..3].
+        for (std::size_t v = 0; v < 4; ++v) {
+            out[v] = L[v] * (Real(2) * L[v] - Real(1));
+        }
+        // Edge functions occupy out[4..9], in the order of the edges table.
+        for (std::size_t e = 0; e < 6; ++e) {
+            out[4 + e] = Real(4) * L[edges[e][0]] * L[edges[e][1]];
+        }
+    }
+
+    // Gradients through the barycentric coordinates L, whose own gradients dL are constant.
+    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                             std::array<Gradient, n_dofs>& out) {
+        constexpr std::size_t edges[6][2] = {{0, 1}, {1, 2}, {0, 2}, {0, 3}, {1, 3}, {2, 3}};
+        // Constant reference gradients of L[0] .. L[3].
+        constexpr Gradient dL[4] = {
+            Gradient{Real(-1), Real(-1), Real(-1)},
+            Gradient{Real( 1), Real( 0), Real( 0)},
+            Gradient{Real( 0), Real( 1), Real( 0)},
+            Gradient{Real( 0), Real( 0), Real( 1)},
+        };
+        const Real L[4] = {Real(1) - xi[0] - xi[1] - xi[2], xi[0], xi[1], xi[2]};
+
+        for (std::size_t v = 0; v < 4; ++v) {
+            out[v] = detail::scaled_gradient(dL[v], Real(4) * L[v] - Real(1));
+        }
+        for (std::size_t e = 0; e < 6; ++e) {
+            const std::size_t a = edges[e][0];
+            const std::size_t b = edges[e][1];
+            out[4 + e] = detail::p2_edge_gradient(L[a], dL[a], L[b], dL[b]);
+        }
+    }
+
+    // Hessians are built from the constant gradients dL only, so the evaluation point is unused.
+    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
+                                            std::array<Hessian, n_dofs>& out) {
+        constexpr std::size_t edges[6][2] = {{0, 1}, {1, 2}, {0, 2}, {0, 3}, {1, 3}, {2, 3}};
+        constexpr Gradient dL[4] = {
+            Gradient{Real(-1), Real(-1), Real(-1)},
+            Gradient{Real( 1), Real( 0), Real( 0)},
+            Gradient{Real( 0), Real( 1), Real( 0)},
+            Gradient{Real( 0), Real( 0), Real( 1)},
+        };
+
+        for (std::size_t v = 0; v < 4; ++v) {
+            out[v] = detail::p2_vertex_hessian(dL[v]);
+        }
+        for (std::size_t e = 0; e < 6; ++e) {
+            const std::size_t a = edges[e][0];
+            const std::size_t b = edges[e][1];
+            out[4 + e] = detail::p2_edge_hessian(dL[a], dL[b]);
+        }
+    }
+};
+
+// Order-3 specialization: shape functions are products of 1D factors in barycentric coordinates.
+template<>
+struct LagrangeTetFast<3> {
+    static constexpr int n_dofs = 20;
+
+    // exponents[n] = {e0, e1, e2, e3}: index into the factor table of each barycentric coordinate.
+    static constexpr std::array<std::array<std::size_t, 4>, n_dofs> exponents =
+        detail::make_tetrahedron_simplex_exponents<3>();
+
+    // Values: one tabulated factor per barycentric coordinate (L0 = 1 - x - y - z, L1..L3 = x, y, z).
+    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L3 = xi[2];
+        const Real L0 = Real(1) - L1 - L2 - L3;
+        Factors f0{}, f1{}, f2{}, f3{};
+        detail::fill_simplex_factor_values<3>(L0, f0);
+        detail::fill_simplex_factor_values<3>(L1, f1);
+        detail::fill_simplex_factor_values<3>(L2, f2);
+        detail::fill_simplex_factor_values<3>(L3, f3);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2, e3] = exponents[n];
+            out[n] = f0[e0] * f1[e1] * f2[e2] * f3[e3];
+        }
+    }
+
+    // Gradients: derivatives with respect to (L0..L3) mapped to (x, y, z) using
+    // dL0 = (-1, -1, -1), dL1 = (1, 0, 0), dL2 = (0, 1, 0), dL3 = (0, 0, 1).
+    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                   std::array<Gradient, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L3 = xi[2];
+        const Real L0 = Real(1) - L1 - L2 - L3;
+        Factors f0{}, f1{}, f2{}, f3{};      // used as the factor values
+        Factors df0{}, df1{}, df2{}, df3{};  // used as the factor first derivatives
+        detail::fill_simplex_factor_values_first<3>(L0, f0, df0);
+        detail::fill_simplex_factor_values_first<3>(L1, f1, df1);
+        detail::fill_simplex_factor_values_first<3>(L2, f2, df2);
+        detail::fill_simplex_factor_values_first<3>(L3, f3, df3);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2, e3] = exponents[n];
+            const Real v0 = f0[e0];
+            const Real v1 = f1[e1];
+            const Real v2 = f2[e2];
+            const Real v3 = f3[e3];
+            // Partial derivatives with respect to each barycentric coordinate.
+            const Real d_L0 = df0[e0] * v1 * v2 * v3;
+            const Real d_L1 = v0 * df1[e1] * v2 * v3;
+            const Real d_L2 = v0 * v1 * df2[e2] * v3;
+            const Real d_L3 = v0 * v1 * v2 * df3[e3];
+            out[n] = Gradient{d_L1 - d_L0, d_L2 - d_L0, d_L3 - d_L0};
+        }
+    }
+
+    // Hessians: second derivatives with respect to barycentric pairs, contracted with the
+    // constant coordinate gradients.
+    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                  std::array<Hessian, n_dofs>& out) {
+        using Factors = std::array<Real, 4>;
+        const Real L1 = xi[0];
+        const Real L2 = xi[1];
+        const Real L3 = xi[2];
+        const Real L0 = Real(1) - L1 - L2 - L3;
+        Factors f0{}, f1{}, f2{}, f3{};          // used as the factor values
+        Factors df0{}, df1{}, df2{}, df3{};      // used as the factor first derivatives
+        Factors ddf0{}, ddf1{}, ddf2{}, ddf3{};  // used as the factor second derivatives
+        detail::fill_simplex_factor_values_first_second<3>(L0, f0, df0, ddf0);
+        detail::fill_simplex_factor_values_first_second<3>(L1, f1, df1, ddf1);
+        detail::fill_simplex_factor_values_first_second<3>(L2, f2, df2, ddf2);
+        detail::fill_simplex_factor_values_first_second<3>(L3, f3, df3, ddf3);
+
+        for (std::size_t n = 0; n != exponents.size(); ++n) {
+            const auto& [e0, e1, e2, e3] = exponents[n];
+            const Real v0 = f0[e0];
+            const Real v1 = f1[e1];
+            const Real v2 = f2[e2];
+            const Real v3 = f3[e3];
+            const Real d0 = df0[e0];
+            const Real d1 = df1[e1];
+            const Real d2 = df2[e2];
+            const Real d3 = df3[e3];
+
+            // b_ab: second derivative with respect to barycentric coordinates a and b.
+            const Real b00 = ddf0[e0] * v1 * v2 * v3;
+            const Real b11 = v0 * ddf1[e1] * v2 * v3;
+            const Real b22 = v0 * v1 * ddf2[e2] * v3;
+            const Real b33 = v0 * v1 * v2 * ddf3[e3];
+
+            const Real b01 = d0 * d1 * v2 * v3;
+            const Real b02 = d0 * v1 * d2 * v3;
+            const Real b03 = d0 * v1 * v2 * d3;
+            const Real b12 = v0 * d1 * d2 * v3;
+            const Real b13 = v0 * d1 * v2 * d3;
+            const Real b23 = v0 * v1 * d2 * d3;
+
+            // Contract with dL/dx = (-1, 1, 0, 0), dL/dy = (-1, 0, 1, 0), dL/dz = (-1, 0, 0, 1);
+            // each mixed entry is stored in both symmetric positions.
+            const Real hxy = b00 - b01 - b02 + b12;
+            const Real hxz = b00 - b01 - b03 + b13;
+            const Real hyz = b00 - b02 - b03 + b23;
+
+            out[n] = Hessian{};
+            out[n](0, 0) = b00 - Real(2) * b01 + b11;
+            out[n](1, 1) = b00 - Real(2) * b02 + b22;
+            out[n](2, 2) = b00 - Real(2) * b03 + b33;
+            out[n](0, 1) = hxy;
+            out[n](1, 0) = hxy;
+            out[n](0, 2) = hxz;
+            out[n](2, 0) = hxz;
+            out[n](1, 2) = hyz;
+            out[n](2, 1) = hyz;
+        }
+    }
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISFAST_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
new file mode 100644
index 000000000..4a332621e
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
@@ -0,0 +1,2069 @@
+#include "LagrangeBasisPyramid.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "Basis/BasisExceptions.h"
+#include "BasisTolerance.h"
+#include "Math/DenseLinearAlgebra.h"
+#include "Math/DenseTransformKernels.h"
+#include "LagrangeBasisUtility.h"
+#include "PyramidModalBasis.h"
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+class PyramidLagrangeCache {
+public:
+    using ModalTerm = pyramid_modal::Term;
+
+    struct UvPolynomial {  // Sparse two-variable polynomial: one coefficient per (pu, pv) exponent pair.
+        using Power = std::pair<int, int>;  // (pu, pv) exponent pair used as the term key
+        std::vector<std::pair<Power, Real>> coeffs;  // add_term inserts at the lower_bound position, keeping this sorted by Power
+
+        void add_term(int pu, int pv, Real coeff, Real tol = Real(1e-14)) {  // Adds coeff to the (pu, pv) term.
+            if (std::abs(coeff) <= tol) {  // contributions at or below tol are ignored entirely
+                return;
+            }
+            const auto key = std::make_pair(pu, pv);
+            const auto found = std::lower_bound(  // first entry whose key is not less than `key`
+                coeffs.begin(),
+                coeffs.end(),
+                key,
+                [](const auto& entry, const Power& value) { return entry.first < value; });
+            if (found == coeffs.end() || found->first != key) {  // no such term yet: insert in sorted position
+                coeffs.insert(found, {key, coeff});
+                return;
+            }
+
+            found->second += coeff;
+            if (std::abs(found->second) <= tol) {  // the sum cancelled to within tol: drop the term
+                coeffs.erase(found);
+            }
+        }
+
+        void add_scaled(const UvPolynomial& other, Real scale, Real tol = Real(1e-14)) {  // this += scale * other, term by term
+            if (std::abs(scale) <= tol) {  // a negligible scale contributes nothing
+                return;
+            }
+            for (const auto& [powers, coeff] : other.coeffs) {
+                add_term(powers.first, powers.second, scale * coeff, tol);
+            }
+        }
+
+        bool empty(Real tol = Real(1e-12)) const {  // true when no stored coefficient exceeds tol in magnitude
+            for (const auto& [powers, coeff] : coeffs) {
+                (void)powers;  // silences the unused structured-binding warning
+                if (std::abs(coeff) > tol) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        bool is_constant(Real tol = Real(1e-12)) const {  // true when every coefficient above tol sits on the (0, 0) term
+            for (const auto& [powers, coeff] : coeffs) {
+                if (std::abs(coeff) <= tol) {  // negligible coefficients do not count against constancy
+                    continue;
+                }
+                if (powers.first != 0 || powers.second != 0) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        Real constant_value(Real tol = Real(1e-12)) const {  // sum of the (0, 0) coefficients that exceed tol; 0 if there are none
+            Real value = Real(0);
+            for (const auto& [powers, coeff] : coeffs) {
+                if (std::abs(coeff) <= tol) {
+                    continue;
+                }
+                if (powers.first == 0 && powers.second == 0) {
+                    value += coeff;
+                }
+            }
+            return value;
+        }
+    };
+
+    struct ApexSeries {  // Collection of UvPolynomial coefficients indexed by an integer power `beta`.
+        std::vector<std::pair<int, UvPolynomial>> by_power;  // find_or_insert keeps entries sorted by beta
+
+        void add_term(int beta, int pu, int pv, Real coeff, Real tol = Real(1e-14)) {  // Adds coeff to term (pu, pv) of the beta entry.
+            const auto found = find_or_insert(beta);
+            found->second.add_term(pu, pv, coeff, tol);
+            if (found->second.empty(tol)) {  // remove the entry if it ended up (or was created) empty
+                by_power.erase(found);
+            }
+        }
+
+        void add_scaled(const ApexSeries& other, Real scale, Real tol = Real(1e-14)) {  // this += scale * other; NOTE(review): other must not alias *this
+            if (std::abs(scale) <= tol) {  // a negligible scale contributes nothing
+                return;
+            }
+            for (const auto& [beta, poly] : other.by_power) {  // by_power is inserted into / erased from while other.by_power is iterated
+                const auto found = find_or_insert(beta);
+                found->second.add_scaled(poly, scale, tol);
+                if (found->second.empty(tol)) {  // cancelled out (or never populated): drop the entry
+                    by_power.erase(found);
+                }
+            }
+        }
+
+    private:
+        std::vector<std::pair<int, UvPolynomial>>::iterator find_or_insert(int beta) {  // Iterator to the beta entry, creating an empty one if absent.
+            const auto found = std::lower_bound(  // first entry whose power is not less than beta
+                by_power.begin(),
+                by_power.end(),
+                beta,
+                [](const auto& entry, int value) { return entry.first < value; });
+            if (found != by_power.end() && found->first == beta) {
+                return found;
+            }
+            return by_power.insert(found, {beta, UvPolynomial{}});  // sorted insert of an empty polynomial
+        }
+    };
+
+    using GradientSeries = std::array<ApexSeries, 3>;  // three ApexSeries, presumably one per gradient component
+    using HessianSeries = std::array<std::array<ApexSeries, 3>, 3>;  // 3x3 grid of ApexSeries, presumably one per Hessian entry
+
+    enum class ApexLimitKind {  // NOTE(review): presumably classifies a quantity's limit at the apex; its use is outside this view
+        Constant,
+        DirectionDependent,
+        Singular,
+    };
+
+    enum class ApexRankStatus {  // Status of cached apex derivative data; the evaluators throw at the apex unless it is Exact.
+        Exact,  // the only status for which the evaluate_* routines return cached apex gradients/Hessians
+        DirectionDependent,
+        Singular,
+    };
+
+    struct ApexClassification {  // NOTE(review): presumably the result of classifying an apex limit; producer is outside this view
+        ApexLimitKind kind{ApexLimitKind::Constant};  // defaults to Constant
+        Real constant_value{0};
+        int leading_power{1};
+    };
+
+    struct ApexData {  // Cached per-basis-function results returned when the query point is the apex.
+        std::vector<Real> values;  // copied out directly by the value evaluators at the apex
+        std::vector<Gradient> gradients;  // only returned when gradient_status is Exact
+        std::vector<Hessian> hessians;  // only returned when hessian_status is Exact
+        ApexRankStatus gradient_status{ApexRankStatus::Exact};  // non-Exact makes apex gradient queries throw
+        ApexRankStatus hessian_status{ApexRankStatus::Exact};  // non-Exact makes apex Hessian queries throw
+    };
+
+    struct OrderData {  // Everything precomputed for one polynomial order (filled in by build_order_data).
+        int order{0};
+        std::vector<math::Vector<Real, 3>> nodes;  // from build_public_nodes(order)
+        std::vector<ModalTerm> modal_terms;  // from pyramid_modal::build_terms(order); same count as nodes
+        std::vector<Real> modal_to_nodal;  // n*n, row-major: row i holds the modal coefficients of nodal function i
+        ApexData apex;  // cached apex results from build_apex_data
+    };
+
+    struct EvaluationScratch {  // Reusable work buffers for modal evaluation (one instance per thread, see evaluation_scratch).
+        std::vector<Real> modal_values;
+        std::vector<Real> modal_gradient_components;  // not used by the single-point evaluators visible here
+        std::vector<Real> modal_hessian_components;  // not used by the single-point evaluators visible here
+        std::vector<Gradient> modal_gradients;
+        std::vector<Hessian> modal_hessians;
+        pyramid_modal::EvaluationPoint modal_point;  // refilled by prepare_evaluation_point on each query
+
+        void prewarm(std::size_t max_size, std::size_t max_qpts) {  // Reserves capacity up front; does not change any size().
+            const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);  // max_qpts == 0 is treated as 1
+            modal_values.reserve(batched_size);
+            modal_gradient_components.reserve(batched_size * 3u);  // 3 components per entry
+            modal_hessian_components.reserve(batched_size * 9u);  // 9 components per entry
+            modal_gradients.reserve(max_size);  // sized per point only, not per batch
+            modal_hessians.reserve(max_size);
+        }
+    };
+
+    static EvaluationScratch& evaluation_scratch() {  // Returns the calling thread's scratch buffers.
+        // Scratch is intentionally thread-local: production assembly uses a
+        // persistent worker-thread team, so buffers stay warm on each worker.
+        static thread_local EvaluationScratch scratch;  // one instance per thread, constructed on first use
+        return scratch;
+    }
+
+    static void prewarm_scratch(std::size_t max_size, std::size_t max_qpts) {  // Reserves scratch capacity for the calling thread only.
+        evaluation_scratch().prewarm(max_size, max_qpts);
+    }
+
+    static bool is_apex_point(const math::Vector<Real, 3>& xi) {  // True when xi is within tolerance of (0, 0, 1).
+        const Real tol = apex_coord_tolerance();  // per-coordinate absolute tolerance, defined elsewhere
+        return std::abs(xi[0]) <= tol &&
+               std::abs(xi[1]) <= tol &&
+               std::abs(Real(1) - xi[2]) <= tol;
+    }
+
+    static bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi) {  // True when 1 - xi[2] is near zero per basis_near_zero.
+        return basis_near_zero(Real(1) - xi[2]);  // tolerance comes from basis_near_zero, not apex_coord_tolerance
+    }
+
+    static void validate_top_plane_query(const math::Vector<Real, 3>& xi) {  // Throws BasisEvaluationException for z=1 points other than the apex.
+        if (on_degenerate_top_plane(xi) && !is_apex_point(xi)) [[unlikely]] {  // the apex itself is allowed through
+            throw BasisEvaluationException(
+                "Pyramid reference evaluation on the degenerate z=1 plane is only defined at the apex",
+                __FILE__, __LINE__, __func__);
+        }
+    }
+
+    static OrderData build_order_data(int order) {  // Builds nodes, modal terms, the modal-to-nodal matrix and apex data for `order`.
+        OrderData data;
+        data.order = order;
+
+        data.nodes = build_public_nodes(order);
+        data.modal_terms = pyramid_modal::build_terms(order);
+
+        const std::size_t n = data.nodes.size();
+        if (data.modal_terms.size() != n) {  // the Vandermonde matrix below must be square
+            throw BasisConstructionException("LagrangeBasis pyramid modal basis size mismatch",
+                                             __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> vandermonde(n * n, Real(0));  // row-major: V[row][col] = modal term `col` evaluated at node `row`
+        for (std::size_t row = 0; row < n; ++row) {
+            pyramid_modal::EvaluationPoint modal_point;
+            pyramid_modal::prepare_evaluation_point(  // prepared once per node, then reused for every term
+                data.modal_terms, data.nodes[row], modal_point);
+            for (std::size_t col = 0; col < n; ++col) {
+                Real value = Real(0);
+                pyramid_modal::evaluate_term(data.modal_terms[col], modal_point, value);
+                vandermonde[row * n + col] = value;
+            }
+        }
+
+        const auto inverse_result = math::invert_dense_matrix_with_diagnostics(  // the matrix buffer is moved into the call
+            std::move(vandermonde),
+            n,
+            "LagrangeBasis pyramid Vandermonde");
+        math::validate_dense_inverse_diagnostics(  // presumably rejects a poorly conditioned inverse; defined elsewhere
+            inverse_result,
+            n,
+            "LagrangeBasis pyramid Vandermonde");
+        const std::vector<Real>& inverse = inverse_result.inverse;
+
+        data.modal_to_nodal.assign(n * n, Real(0));
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                data.modal_to_nodal[basis_i * n + modal_j] =  // stores the transpose of the inverse
+                    inverse[modal_j * n + basis_i];
+            }
+        }
+        data.apex = build_apex_data(data);  // called last, after the other fields of data are filled
+        return data;
+    }
+
+    static bool has_low_order_fast_modal_to_nodal(const OrderData& data) noexcept {  // True for orders 1 and 2, which take the apply_sparse_* path.
+        return data.order == 1 || data.order == 2;
+    }
+
+    static const OrderData& get(int order) {  // Returns the cached OrderData for `order`, building it on first request.
+        constexpr int kMaxOnceCachedOrder = 12;  // orders 0..12 each get a dedicated once-initialized slot
+        if (order >= 0 && order <= kMaxOnceCachedOrder) {
+            static std::array<std::once_flag, kMaxOnceCachedOrder + 1> flags;
+            static std::array<std::unique_ptr<OrderData>, kMaxOnceCachedOrder + 1> cache;
+            const auto idx = static_cast<std::size_t>(order);
+            std::call_once(flags[idx], [idx, order]() {  // std::call_once serializes the build of this slot
+                cache[idx] = std::make_unique<OrderData>(build_order_data(order));
+            });
+            return *cache[idx];
+        }
+
+        static std::mutex fallback_mutex;
+        static std::map<int, std::unique_ptr<OrderData>> fallback_cache;  // every other order; guarded by fallback_mutex
+
+        std::lock_guard<std::mutex> lock(fallback_mutex);  // held across both the lookup and the build
+        const auto found = fallback_cache.find(order);
+        if (found != fallback_cache.end()) {
+            return *found->second;
+        }
+
+        auto data = std::make_unique<OrderData>(build_order_data(order));
+        const auto [it, inserted] = fallback_cache.emplace(order, std::move(data));
+        (void)inserted;  // always true here: the key was just found to be absent under the lock
+        return *it->second;  // the OrderData is heap-allocated, so the reference does not move when the map grows
+    }
+
+    static void evaluate_values(const OrderData& data,  // Evaluates the nodal basis values at xi into `values`.
+                                const math::Vector<Real, 3>& xi,
+                                std::vector<Real>& values) {
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {  // apex: return the cached values
+            values = data.apex.values;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal = scratch.modal_values;
+        auto& modal_point = scratch.modal_point;
+        modal.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal(data, modal, values);
+        } else {
+            apply_modal_to_nodal(data, modal, values);
+        }
+    }
+
+    static void evaluate_gradients(const OrderData& data,  // Evaluates the nodal basis gradients at xi into `gradients`.
+                                   const math::Vector<Real, 3>& xi,
+                                   std::vector<Gradient>& gradients) {
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {  // cached apex gradients are unusable: throw
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            gradients = data.apex.gradients;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_point = scratch.modal_point;
+        modal_gradients.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);  // required by evaluate_term's signature; the value itself is discarded
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal(data, modal_gradients, gradients);
+        } else {
+            apply_modal_to_nodal(data, modal_gradients, gradients);
+        }
+    }
+
+    static void evaluate_hessians(const OrderData& data,  // Evaluates the nodal basis Hessians at xi into `hessians`.
+                                  const math::Vector<Real, 3>& xi,
+                                  std::vector<Hessian>& hessians) {
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {  // cached apex Hessians are unusable: throw
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            hessians = data.apex.hessians;
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_hessians.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);  // required by evaluate_term's signature; the value itself is discarded
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);  // nullptr: no gradient requested
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal(data, modal_hessians, hessians);
+        } else {
+            apply_modal_to_nodal(data, modal_hessians, hessians);
+        }
+    }
+
+    static void evaluate_all(const OrderData& data,  // Evaluates values, gradients and Hessians at xi in one pass.
+                             const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& values,
+                             std::vector<Gradient>& gradients,
+                             std::vector<Hessian>& hessians) {
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {  // both statuses are checked before any output is touched
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            values = data.apex.values;
+            gradients = data.apex.gradients;
+            hessians = data.apex.hessians;
+            return;
+        }
+
+        const std::size_t n = data.modal_terms.size();
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_values.resize(n);
+        modal_gradients.resize(n);
+        modal_hessians.resize(n);
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+
+        for (std::size_t m = 0; m < n; ++m) {  // one call per modal term fills value, gradient and Hessian
+            pyramid_modal::evaluate_term(
+                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
+        }
+
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal_all(
+                data, modal_values, modal_gradients, modal_hessians, values, gradients, hessians);
+            return;
+        }
+
+        values.resize(n);
+        gradients.resize(n);
+        hessians.resize(n);
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {  // dense product: row basis_i of modal_to_nodal times the modal data
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Gradient gradient{};
+            Hessian hessian{};
+            Real value = Real(0);
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = row[modal_j];
+                value += coeff * modal_values[modal_j];
+
+                const Real* modal_gradient = modal_gradients[modal_j].data();
+                gradient[0] += coeff * modal_gradient[0];
+                gradient[1] += coeff * modal_gradient[1];
+                gradient[2] += coeff * modal_gradient[2];
+
+                const Real* modal_hessian = modal_hessians[modal_j].data();
+                Real* hessian_data = hessian.data();
+                hessian_data[0] += coeff * modal_hessian[0];  // only flat entries 0, 1, 2, 4, 5, 8 are accumulated
+                hessian_data[1] += coeff * modal_hessian[1];
+                hessian_data[2] += coeff * modal_hessian[2];
+                hessian_data[4] += coeff * modal_hessian[4];
+                hessian_data[5] += coeff * modal_hessian[5];
+                hessian_data[8] += coeff * modal_hessian[8];
+            }
+            values[basis_i] = value;
+            gradients[basis_i] = gradient;
+            Real* hessian_data = hessian.data();
+            hessian_data[3] = hessian_data[1];  // fill the remaining entries from their transposed counterparts
+            hessian_data[6] = hessian_data[2];
+            hessian_data[7] = hessian_data[5];
+            hessians[basis_i] = hessian;
+        }
+    }
+
+    static void evaluate_values_to(const OrderData& data,  // Raw-pointer variant of evaluate_values.
+                                   const math::Vector<Real, 3>& xi,
+                                   Real* SVMP_RESTRICT values_out) {  // the apex path writes data.apex.values.size() entries
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal = scratch.modal_values;
+        auto& modal_point = scratch.modal_point;
+        modal.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal_to(data, modal, values_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal, values_out);
+        }
+    }
+
+    static void evaluate_gradients_to(const OrderData& data,  // Raw-pointer variant of evaluate_gradients.
+                                      const math::Vector<Real, 3>& xi,
+                                      Real* SVMP_RESTRICT gradients_out) {  // apex path layout: 3 consecutive components per basis function
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {  // cached apex gradients are unusable: throw
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
+                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
+                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
+                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
+            }
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_point = scratch.modal_point;
+        modal_gradients.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);  // required by evaluate_term's signature; the value itself is discarded
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal_to(data, modal_gradients, gradients_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal_gradients, gradients_out);
+        }
+    }
+
+    static void evaluate_hessians_to(const OrderData& data,  // Raw-pointer variant of evaluate_hessians.
+                                     const math::Vector<Real, 3>& xi,
+                                     Real* SVMP_RESTRICT hessians_out) {  // apex path layout: a 9-entry slot per basis function
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {  // cached apex Hessians are unusable: throw
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
+                store_hessian(data.apex.hessians[i], hessians_out + i * 9u);  // store_hessian is defined elsewhere
+            }
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_hessians.resize(data.modal_terms.size());
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
+            Real value = Real(0);  // required by evaluate_term's signature; the value itself is discarded
+            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);  // nullptr: no gradient requested
+        }
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal_to(data, modal_hessians, hessians_out);
+        } else {
+            apply_modal_to_nodal_to(data, modal_hessians, hessians_out);
+        }
+    }
+
+    static void evaluate_all_to(const OrderData& data,  // Raw-pointer variant of evaluate_all; all three outputs are written.
+                                const math::Vector<Real, 3>& xi,
+                                Real* SVMP_RESTRICT values_out,  // one entry per basis function
+                                Real* SVMP_RESTRICT gradients_out,  // 3 consecutive entries per basis function
+                                Real* SVMP_RESTRICT hessians_out) {  // 9 consecutive entries per basis function
+        validate_top_plane_query(xi);  // throws for z=1 points other than the apex
+        if (is_apex_point(xi)) {
+            if (data.apex.gradient_status != ApexRankStatus::Exact) {  // both statuses are checked before any output is touched
+                throw BasisEvaluationException(
+                    apex_status_message("gradient", data.apex.gradient_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            if (data.apex.hessian_status != ApexRankStatus::Exact) {
+                throw BasisEvaluationException(
+                    apex_status_message("Hessian", data.apex.hessian_status),
+                    __FILE__, __LINE__, __func__);
+            }
+            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
+            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
+                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
+                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
+                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
+            }
+            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
+                const Real* hessian = data.apex.hessians[i].data();
+                std::copy(hessian, hessian + 9u, hessians_out + i * 9u);  // copies all 9 stored entries verbatim
+            }
+            return;
+        }
+
+        const std::size_t n = data.modal_terms.size();
+        auto& scratch = evaluation_scratch();  // thread-local buffers
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        modal_values.resize(n);
+        modal_gradients.resize(n);
+        modal_hessians.resize(n);
+        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+
+        for (std::size_t m = 0; m < n; ++m) {  // one call per modal term fills value, gradient and Hessian
+            pyramid_modal::evaluate_term(
+                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
+        }
+
+        if (has_low_order_fast_modal_to_nodal(data)) {  // orders 1 and 2 use the sparse helper (defined elsewhere)
+            apply_sparse_basis_to_nodal_all_to(
+                data, modal_values, modal_gradients, modal_hessians, values_out, gradients_out, hessians_out);
+            return;
+        }
+
+        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {  // dense product: row basis_i of modal_to_nodal times the modal data
+            const Real* row = data.modal_to_nodal.data() + basis_i * n;
+            Real value = Real(0);
+            Real gradient[3] = {Real(0), Real(0), Real(0)};
+            Real hessian[9] = {};  // only entries 0, 1, 2, 4, 5, 8 are accumulated below
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                const Real coeff = row[modal_j];
+                value += coeff * modal_values[modal_j];
+
+                const Real* modal_gradient = modal_gradients[modal_j].data();
+                gradient[0] += coeff * modal_gradient[0];
+                gradient[1] += coeff * modal_gradient[1];
+                gradient[2] += coeff * modal_gradient[2];
+
+                const Real* modal_hessian = modal_hessians[modal_j].data();
+                hessian[0] += coeff * modal_hessian[0];
+                hessian[1] += coeff * modal_hessian[1];
+                hessian[2] += coeff * modal_hessian[2];
+                hessian[4] += coeff * modal_hessian[4];
+                hessian[5] += coeff * modal_hessian[5];
+                hessian[8] += coeff * modal_hessian[8];
+            }
+
+            values_out[basis_i] = value;
+            Real* gradient_out = gradients_out + basis_i * 3u;
+            gradient_out[0] = gradient[0];
+            gradient_out[1] = gradient[1];
+            gradient_out[2] = gradient[2];
+
+            Real* hessian_out = hessians_out + basis_i * 9u;
+            hessian_out[0] = hessian[0];
+            hessian_out[1] = hessian[1];
+            hessian_out[2] = hessian[2];
+            hessian_out[3] = hessian[1];  // entries 3, 6, 7 reuse their transposed counterparts 1, 2, 5
+            hessian_out[4] = hessian[4];
+            hessian_out[5] = hessian[5];
+            hessian_out[6] = hessian[2];
+            hessian_out[7] = hessian[5];
+            hessian_out[8] = hessian[8];
+        }
+    }
+
+    static void evaluate_at_quadrature_points_strided(  // Dispatches to the impl specialization matching the non-null outputs.
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,  // nullptr means values are not requested
+        Real* SVMP_RESTRICT gradients_out,  // nullptr means gradients are not requested
+        Real* SVMP_RESTRICT hessians_out) {  // nullptr means Hessians are not requested
+        const unsigned mask = (values_out != nullptr ? 1u : 0u) |  // bit 0 = values, bit 1 = gradients, bit 2 = Hessians
+                              (gradients_out != nullptr ? 2u : 0u) |
+                              (hessians_out != nullptr ? 4u : 0u);
+        switch (mask) {
+            case 0u:  // nothing requested: the points are still validated (may throw)
+                validate_strided_points(points);
+                return;
+            case 1u:
+                evaluate_at_quadrature_points_strided_impl<true, false, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 2u:
+                evaluate_at_quadrature_points_strided_impl<false, true, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 3u:
+                evaluate_at_quadrature_points_strided_impl<true, true, false>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 4u:
+                evaluate_at_quadrature_points_strided_impl<false, false, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 5u:
+                evaluate_at_quadrature_points_strided_impl<true, false, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 6u:
+                evaluate_at_quadrature_points_strided_impl<false, true, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            case 7u:
+                evaluate_at_quadrature_points_strided_impl<true, true, true>(
+                    data, points, output_stride, values_out, gradients_out, hessians_out);
+                return;
+            default:  // unreachable: mask is built from three bits, so it is always 0..7
+                return;
+        }
+    }
+
+private:
+    static void validate_strided_points(const std::vector<math::Vector<Real, 3>>& points) {  // Throws on the first z=1 point that is not the apex.
+        for (const auto& xi : points) {
+            validate_top_plane_query(xi);
+        }
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>  // Writes the cached apex data for point q into strided outputs.
+    static void write_apex_strided(const OrderData& data,
+                                   std::size_t q,  // index of the quadrature point being filled
+                                   std::size_t output_stride,  // distance between consecutive components in each output
+                                   Real* SVMP_RESTRICT values_out,  // only dereferenced when NeedValues
+                                   Real* SVMP_RESTRICT gradients_out,  // only dereferenced when NeedGradients
+                                   Real* SVMP_RESTRICT hessians_out) {  // only dereferenced when NeedHessians
+        const std::size_t n = data.modal_terms.size();
+        if (NeedGradients && data.apex.gradient_status != ApexRankStatus::Exact) {  // reject before writing, as evaluate_all does
+            throw BasisEvaluationException(
+                apex_status_message("gradient", data.apex.gradient_status),
+                __FILE__, __LINE__, __func__);
+        }
+        if (NeedHessians && data.apex.hessian_status != ApexRankStatus::Exact) {  // a throwing call leaves every output untouched
+            throw BasisEvaluationException(
+                apex_status_message("Hessian", data.apex.hessian_status),
+                __FILE__, __LINE__, __func__);
+        }
+        if constexpr (NeedValues) {
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                values_out[basis_i * output_stride + q] = data.apex.values[basis_i];
+            }
+        }
+        if constexpr (NeedGradients) {
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                Real* g = gradients_out + basis_i * 3u * output_stride;  // start of this basis function's 3 component rows
+                g[0u * output_stride + q] = data.apex.gradients[basis_i][0];
+                g[1u * output_stride + q] = data.apex.gradients[basis_i][1];
+                g[2u * output_stride + q] = data.apex.gradients[basis_i][2];
+            }
+        }
+        if constexpr (NeedHessians) {
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
+                const Real* hessian = data.apex.hessians[basis_i].data();
+                Real* H = hessians_out + basis_i * 9u * output_stride;  // start of this basis function's 9 component rows
+                for (std::size_t component = 0; component < 9u; ++component) {
+                    H[component * output_stride + q] = hessian[component];
+                }
+            }
+        }
+    }
+
+    template <int Px,  // exponent of x; used as an index into xp
+              int Py,  // exponent of y; used as an index into yp
+              int Pz,  // exponent of z; used as an index into zp
+              int DenomPower,  // index into inv_tp (evaluate_low_order_modal_jets passes inv_tp[k] = (1/(1 - z))^k)
+              bool NeedValues,  // when true, writes modal_values[modal_i]
+              bool NeedGradients,  // when true, writes modal_gradients[modal_i][0..2]
+              bool NeedHessians>  // when true, writes modal_hessians[modal_i][0..8]
+    static void fill_low_order_modal_jet(std::size_t modal_i,  // output slot for xp[Px] * yp[Py] * zp[Pz] * inv_tp[DenomPower]
+                                         const Real* SVMP_RESTRICT xp,  // read at indices Px-2..Px
+                                         const Real* SVMP_RESTRICT yp,  // read at indices Py-2..Py
+                                         const Real* SVMP_RESTRICT zp,  // read at indices Pz-2..Pz
+                                         const Real* SVMP_RESTRICT inv_tp,  // read up to index DenomPower + 2 when Hessians are requested
+                                         Real* SVMP_RESTRICT modal_values,
+                                         Real (*SVMP_RESTRICT modal_gradients)[3],
+                                         Real (*SVMP_RESTRICT modal_hessians)[9]) {
+        const Real xy_base = xp[Px] * yp[Py];
+        const Real base = xy_base * zp[Pz];
+        const Real inv_denom = inv_tp[DenomPower];
+        const Real value = base * inv_denom;
+
+        if constexpr (NeedValues) {
+            modal_values[modal_i] = value;
+        }
+        if constexpr (NeedGradients) {
+            Real* g = modal_gradients[modal_i];
+            if constexpr (Px > 0) {
+                g[0] = static_cast<Real>(Px) * xp[Px - 1] * yp[Py] * zp[Pz] * inv_denom;  // d/dx
+            } else {
+                g[0] = Real(0);
+            }
+            if constexpr (Py > 0) {
+                g[1] = static_cast<Real>(Py) * xp[Px] * yp[Py - 1] * zp[Pz] * inv_denom;  // d/dy
+            } else {
+                g[1] = Real(0);
+            }
+            Real gz = Real(0);  // d/dz: product rule over the z factor and the inverse-power factor
+            if constexpr (Pz > 0) {
+                gz += static_cast<Real>(Pz) * xy_base * zp[Pz - 1] * inv_denom;  // derivative of the z factor
+            }
+            if constexpr (DenomPower > 0) {
+                gz += static_cast<Real>(DenomPower) * base * inv_tp[DenomPower + 1];  // with inv_tp[k] = (1 - z)^-k: d/dz inv_tp[D] = D * inv_tp[D + 1]
+            }
+            g[2] = gz;
+        }
+        if constexpr (NeedHessians) {
+            Real* H = modal_hessians[modal_i];  // 9 entries; index pairs written below are consistent with a row-major 3x3 layout
+            if constexpr (Px > 1) {
+                H[0] = static_cast<Real>(Px * (Px - 1)) *  // d2/dx2
+                       xp[Px - 2] * yp[Py] * zp[Pz] * inv_denom;
+            } else {
+                H[0] = Real(0);
+            }
+            if constexpr (Py > 1) {
+                H[4] = static_cast<Real>(Py * (Py - 1)) *  // d2/dy2
+                       xp[Px] * yp[Py - 2] * zp[Pz] * inv_denom;
+            } else {
+                H[4] = Real(0);
+            }
+            Real hxy = Real(0);  // d2/dxdy
+            if constexpr (Px > 0 && Py > 0) {
+                hxy = static_cast<Real>(Px * Py) *
+                      xp[Px - 1] * yp[Py - 1] * zp[Pz] * inv_denom;
+            }
+            H[1] = hxy;
+            H[3] = hxy;  // symmetric counterpart of H[1]
+
+            Real hxz = Real(0);  // d2/dxdz
+            if constexpr (Px > 0) {
+                constexpr Real px_real = static_cast<Real>(Px);
+                const Real x_deriv_y = px_real * xp[Px - 1] * yp[Py];  // d/dx of the x*y part
+                if constexpr (Pz > 0) {
+                    hxz += x_deriv_y * static_cast<Real>(Pz) *
+                           zp[Pz - 1] * inv_denom;
+                }
+                if constexpr (DenomPower > 0) {
+                    hxz += x_deriv_y * static_cast<Real>(DenomPower) *
+                           zp[Pz] * inv_tp[DenomPower + 1];
+                }
+            }
+            H[2] = hxz;
+            H[6] = hxz;  // symmetric counterpart of H[2]
+
+            Real hyz = Real(0);  // d2/dydz
+            if constexpr (Py > 0) {
+                constexpr Real py_real = static_cast<Real>(Py);
+                const Real x_y_deriv = py_real * xp[Px] * yp[Py - 1];  // d/dy of the x*y part
+                if constexpr (Pz > 0) {
+                    hyz += x_y_deriv * static_cast<Real>(Pz) *
+                           zp[Pz - 1] * inv_denom;
+                }
+                if constexpr (DenomPower > 0) {
+                    hyz += x_y_deriv * static_cast<Real>(DenomPower) *
+                           zp[Pz] * inv_tp[DenomPower + 1];
+                }
+            }
+            H[5] = hyz;
+            H[7] = hyz;  // symmetric counterpart of H[5]
+
+            Real hzz = Real(0);  // d2/dz2: three product-rule contributions
+            if constexpr (Pz > 1) {
+                hzz += static_cast<Real>(Pz * (Pz - 1)) *  // z factor differentiated twice
+                       xy_base * zp[Pz - 2] * inv_denom;
+            }
+            if constexpr (Pz > 0 && DenomPower > 0) {
+                hzz += static_cast<Real>(2 * Pz * DenomPower) * xy_base *  // cross term, counted twice
+                       zp[Pz - 1] * inv_tp[DenomPower + 1];
+            }
+            if constexpr (DenomPower > 0) {
+                hzz += static_cast<Real>(DenomPower * (DenomPower + 1)) *  // inverse-power factor differentiated twice
+                       base * inv_tp[DenomPower + 2];
+            }
+            H[8] = hzz;
+        }
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>  // Fills the modal jets at xi: 5 slots when data.order == 1, otherwise 14 slots.
+    static void evaluate_low_order_modal_jets(const OrderData& data,
+                                              const math::Vector<Real, 3>& xi,
+                                              Real* SVMP_RESTRICT modal_values,
+                                              Real (*SVMP_RESTRICT modal_gradients)[3],
+                                              Real (*SVMP_RESTRICT modal_hessians)[9]) {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        const Real inv_t = Real(1) / (Real(1) - z);  // divides by zero at z == 1; try_evaluate_low_order_strided rejects apex points before calling
+        const Real xp[3] = {Real(1), x, x * x};  // xp[k] = x^k
+        const Real yp[3] = {Real(1), y, y * y};  // yp[k] = y^k
+        const Real zp[3] = {Real(1), z, z * z};  // zp[k] = z^k
+        Real inv_tp[5] = {Real(1), inv_t, Real(0), Real(0), Real(0)};  // inv_tp[k] = inv_t^k; entries 2..4 are completed below
+        inv_tp[2] = inv_tp[1] * inv_t;
+        inv_tp[3] = inv_tp[2] * inv_t;
+        inv_tp[4] = inv_tp[3] * inv_t;  // highest power read by the Hessian of a DenomPower == 2 term
+
+        fill_low_order_modal_jet<0, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            0u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 0: 1
+        fill_low_order_modal_jet<1, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            1u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 1: x
+        if (data.order == 1) {  // order-1 table: slots 0..4
+            fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+                2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 2: y
+            fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+                3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 3: x*y/(1-z)
+            fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+                4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 4: z
+            return;
+        }
+
+        fill_low_order_modal_jet<2, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 2: x^2
+        fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 3: y
+        fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 4: x*y/(1-z)
+        fill_low_order_modal_jet<2, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            5u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 5: x^2*y/(1-z)
+        fill_low_order_modal_jet<0, 2, 0, 0, NeedValues, NeedGradients, NeedHessians>(
+            6u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 6: y^2
+        fill_low_order_modal_jet<1, 2, 0, 1, NeedValues, NeedGradients, NeedHessians>(
+            7u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 7: x*y^2/(1-z)
+        fill_low_order_modal_jet<2, 2, 0, 2, NeedValues, NeedGradients, NeedHessians>(
+            8u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 8: x^2*y^2/(1-z)^2
+        fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            9u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 9: z
+        fill_low_order_modal_jet<1, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            10u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 10: x*z
+        fill_low_order_modal_jet<0, 1, 1, 0, NeedValues, NeedGradients, NeedHessians>(
+            11u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 11: y*z
+        fill_low_order_modal_jet<1, 1, 1, 1, NeedValues, NeedGradients, NeedHessians>(
+            12u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 12: x*y*z/(1-z)
+        fill_low_order_modal_jet<0, 0, 2, 0, NeedValues, NeedGradients, NeedHessians>(
+            13u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);  // slot 13: z^2
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>  // Low-order fast path; returns true only if every point was evaluated and written.
+    static bool try_evaluate_low_order_strided(
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,  // distance between consecutive components/basis rows in the outputs (see indexing below)
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) {
+        if (!has_low_order_fast_modal_to_nodal(data)) {
+            return false;  // not applicable: nothing has been written
+        }
+        for (const auto& xi : points) {  // check every point before writing any output
+            validate_top_plane_query(xi);
+            if (is_apex_point(xi)) {
+                return false;  // apex present: leave the whole batch to the caller's general path
+            }
+        }
+
+        Real modal_values[14];  // 14 = slots filled by evaluate_low_order_modal_jets when data.order != 1 (5 when order == 1)
+        Real modal_gradients[14][3];
+        Real modal_hessians[14][9];
+        for (std::size_t q = 0; q < points.size(); ++q) {
+            evaluate_low_order_modal_jets<NeedValues, NeedGradients, NeedHessians>(
+                data, points[q], modal_values, modal_gradients, modal_hessians);  // buffers for unrequested ranks stay uninitialized and unread
+            if constexpr (NeedValues) {
+                apply_low_order_combination(
+                    data,
+                    1u,  // presumably the number of components per basis function; confirm against apply_low_order_combination
+                    [&](std::size_t modal_i, std::size_t) {
+                        return modal_values[modal_i];
+                    },
+                    [&](std::size_t basis_i, std::size_t, Real value) {
+                        values_out[basis_i * output_stride + q] = value;  // layout: [basis_i][q]
+                    });
+            }
+            if constexpr (NeedGradients) {
+                apply_low_order_combination(
+                    data,
+                    3u,
+                    [&](std::size_t modal_i, std::size_t component) {
+                        return modal_gradients[modal_i][component];
+                    },
+                    [&](std::size_t basis_i, std::size_t component, Real value) {
+                        gradients_out[basis_i * 3u * output_stride +  // layout: [basis_i][component][q]
+                                      component * output_stride + q] = value;
+                    });
+            }
+            if constexpr (NeedHessians) {
+                apply_low_order_combination(
+                    data,
+                    9u,
+                    [&](std::size_t modal_i, std::size_t component) {
+                        return modal_hessians[modal_i][component];
+                    },
+                    [&](std::size_t basis_i, std::size_t component, Real value) {
+                        hessians_out[basis_i * 9u * output_stride +  // layout: [basis_i][component][q]
+                                     component * output_stride + q] = value;
+                    });
+            }
+        }
+        return true;
+    }
+
+    template <bool NeedValues, bool NeedGradients, bool NeedHessians>  // Evaluates the requested ranks of every nodal basis function at every point into strided outputs.
+    static void evaluate_at_quadrature_points_strided_impl(
+        const OrderData& data,
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,  // outputs are indexed [basis_i][component][q] with this stride between rows
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT gradients_out,
+        Real* SVMP_RESTRICT hessians_out) {
+        const std::size_t n = data.modal_terms.size();  // used both as the modal-term count and the basis-function count
+        if (points.empty() || n == 0u) {
+            return;  // nothing to evaluate
+        }
+        if (try_evaluate_low_order_strided<NeedValues, NeedGradients, NeedHessians>(  // path 1: low-order fast path without apex points
+                data, points, output_stride, values_out, gradients_out, hessians_out)) {
+            return;
+        }
+
+        auto& scratch = evaluation_scratch();  // reusable work buffers; ownership/lifetime are defined by evaluation_scratch()
+        auto& modal_values = scratch.modal_values;
+        auto& modal_gradients = scratch.modal_gradients;
+        auto& modal_hessians = scratch.modal_hessians;
+        auto& modal_point = scratch.modal_point;
+        if constexpr (NeedValues) {
+            modal_values.resize(n);  // per-point sizing; the batched path below resizes again
+        }
+        if constexpr (NeedGradients) {
+            modal_gradients.resize(n);
+        }
+        if constexpr (NeedHessians) {
+            modal_hessians.resize(n);
+        }
+        const bool use_fast_modal_to_nodal = has_low_order_fast_modal_to_nodal(data);  // true here means the fast path above bailed out on an apex point
+
+        if (!use_fast_modal_to_nodal) {
+            bool has_apex_query = false;
+            for (const auto& xi : points) {
+                validate_top_plane_query(xi);
+                has_apex_query = has_apex_query || is_apex_point(xi);
+            }
+
+            if (!has_apex_query) {  // path 2: batch all points, then apply the dense transform per component
+                const std::size_t num_qpts = points.size();
+                if constexpr (NeedValues) {
+                    modal_values.resize(n * num_qpts);  // layout: [modal_j * num_qpts + q]
+                }
+                if constexpr (NeedGradients) {
+                    scratch.modal_gradient_components.resize(n * 3u * num_qpts);  // layout: [(modal_j * 3 + component) * num_qpts + q]
+                }
+                if constexpr (NeedHessians) {
+                    scratch.modal_hessian_components.resize(n * 9u * num_qpts);  // layout: [(modal_j * 9 + component) * num_qpts + q]
+                }
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const auto& xi = points[q];
+                    pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+                    for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                        Real modal_value = Real(0);
+                        Gradient modal_gradient{};
+                        Hessian modal_hessian{};
+                        pyramid_modal::evaluate_term(
+                            data.modal_terms[modal_j],
+                            modal_point,
+                            modal_value,
+                            NeedGradients ? &modal_gradient : nullptr,  // null pointer = rank not requested
+                            NeedHessians ? &modal_hessian : nullptr);
+                        if constexpr (NeedValues) {
+                            modal_values[modal_j * num_qpts + q] = modal_value;
+                        }
+                        if constexpr (NeedGradients) {
+                            for (std::size_t component = 0; component < 3u; ++component) {
+                                scratch.modal_gradient_components[
+                                    (modal_j * 3u + component) * num_qpts + q] =
+                                    modal_gradient[component];
+                            }
+                        }
+                        if constexpr (NeedHessians) {
+                            for (std::size_t component = 0; component < 9u; ++component) {
+                                scratch.modal_hessian_components[
+                                    (modal_j * 9u + component) * num_qpts + q] =
+                                    modal_hessian.data()[component];
+                            }
+                        }
+                    }
+                }
+
+                const Real* transform = data.modal_to_nodal.data();  // n x n; the per-point loop below indexes it as [basis_i * n + modal_j]
+                if constexpr (NeedValues) {
+                    math::dense_transform_batched_row_major(
+                        transform,
+                        n,
+                        n,
+                        modal_values.data(),
+                        num_qpts,
+                        values_out,
+                        output_stride,
+                        num_qpts);
+                }
+                if constexpr (NeedGradients) {
+                    for (std::size_t component = 0; component < 3u; ++component) {  // one transform per gradient component
+                        math::dense_transform_batched_row_major(
+                            transform,
+                            n,
+                            n,
+                            scratch.modal_gradient_components.data() + component * num_qpts,
+                            3u * num_qpts,
+                            gradients_out + component * output_stride,
+                            3u * output_stride,
+                            num_qpts);
+                    }
+                }
+                if constexpr (NeedHessians) {
+                    for (std::size_t component = 0; component < 9u; ++component) {  // one transform per Hessian component
+                        math::dense_transform_batched_row_major(
+                            transform,
+                            n,
+                            n,
+                            scratch.modal_hessian_components.data() + component * num_qpts,
+                            9u * num_qpts,
+                            hessians_out + component * output_stride,
+                            9u * output_stride,
+                            num_qpts);
+                    }
+                }
+                return;
+            }
+        }
+
+        for (std::size_t q = 0; q < points.size(); ++q) {  // path 3: point by point; reached only when some point is the apex
+            const auto& xi = points[q];
+            validate_top_plane_query(xi);
+
+            if (is_apex_point(xi)) {
+                write_apex_strided<NeedValues, NeedGradients, NeedHessians>(  // apex points take their data from write_apex_strided
+                    data, q, output_stride, values_out, gradients_out, hessians_out);
+                continue;
+            }
+
+            pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
+            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                Gradient* gradient_out = nullptr;  // stays null when the rank is not requested
+                Hessian* hessian_out = nullptr;
+                if constexpr (NeedGradients) {
+                    gradient_out = &modal_gradients[modal_j];
+                }
+                if constexpr (NeedHessians) {
+                    hessian_out = &modal_hessians[modal_j];
+                }
+                if constexpr (NeedValues) {
+                    pyramid_modal::evaluate_term(
+                        data.modal_terms[modal_j],
+                        modal_point,
+                        modal_values[modal_j],
+                        gradient_out,
+                        hessian_out);
+                } else {
+                    Real value = Real(0);  // evaluate_term takes a value argument even when values are not requested
+                    pyramid_modal::evaluate_term(
+                        data.modal_terms[modal_j],
+                        modal_point,
+                        value,
+                        gradient_out,
+                        hessian_out);
+                }
+            }
+
+            if (use_fast_modal_to_nodal) {  // low-order data: combine via apply_low_order_combination instead of the dense matrix
+                if constexpr (NeedValues) {
+                    apply_low_order_combination(
+                        data,
+                        1u,
+                        [&](std::size_t modal_i, std::size_t) {
+                            return modal_values[modal_i];
+                        },
+                        [&](std::size_t basis_i, std::size_t, Real value) {
+                            values_out[basis_i * output_stride + q] = value;
+                        });
+                }
+                if constexpr (NeedGradients) {
+                    apply_low_order_combination(
+                        data,
+                        3u,
+                        [&](std::size_t modal_i, std::size_t component) {
+                            return modal_gradients[modal_i][component];
+                        },
+                        [&](std::size_t basis_i, std::size_t component, Real value) {
+                            gradients_out[basis_i * 3u * output_stride +
+                                          component * output_stride + q] = value;
+                        });
+                }
+                if constexpr (NeedHessians) {
+                    apply_low_order_combination(
+                        data,
+                        9u,
+                        [&](std::size_t modal_i, std::size_t component) {
+                            return modal_hessians[modal_i].data()[component];
+                        },
+                        [&](std::size_t basis_i, std::size_t component, Real value) {
+                            hessians_out[basis_i * 9u * output_stride +
+                                         component * output_stride + q] = value;
+                        });
+                }
+                continue;
+            }
+
+            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {  // dense combination: one matrix row per basis function
+                const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
+                [[maybe_unused]] Real value = Real(0);  // accumulators; unused ones are discarded by if constexpr
+                [[maybe_unused]] std::array<Real, 3> gradient{};
+                [[maybe_unused]] std::array<Real, 9> hessian{};
+
+                for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
+                    const Real coeff = matrix_row[modal_j];
+                    if constexpr (NeedValues) {
+                        value += coeff * modal_values[modal_j];
+                    }
+                    if constexpr (NeedGradients) {
+                        const Real* modal_gradient = modal_gradients[modal_j].data();
+                        gradient[0] += coeff * modal_gradient[0];
+                        gradient[1] += coeff * modal_gradient[1];
+                        gradient[2] += coeff * modal_gradient[2];
+                    }
+                    if constexpr (NeedHessians) {
+                        const Real* modal_hessian = modal_hessians[modal_j].data();
+                        for (std::size_t component = 0; component < 9u; ++component) {
+                            hessian[component] += coeff * modal_hessian[component];
+                        }
+                    }
+                }
+
+                if constexpr (NeedValues) {
+                    values_out[basis_i * output_stride + q] = value;
+                }
+                if constexpr (NeedGradients) {
+                    Real* g = gradients_out + basis_i * 3u * output_stride;  // start of this basis function's 3 component rows
+                    g[0u * output_stride + q] = gradient[0];
+                    g[1u * output_stride + q] = gradient[1];
+                    g[2u * output_stride + q] = gradient[2];
+                }
+                if constexpr (NeedHessians) {
+                    Real* H = hessians_out + basis_i * 9u * output_stride;  // start of this basis function's 9 component rows
+                    for (std::size_t component = 0; component < 9u; ++component) {
+                        H[component * output_stride + q] = hessian[component];
+                    }
+                }
+            }
+        }
+    }
+
+    static Real apex_coord_tolerance() noexcept {  // Forwards to basis_scaled_tolerance(); by its name, the tolerance for apex coordinate tests (confirm at call sites).
+        return basis_scaled_tolerance();
+    }
+
+    // Coefficient pruning for symbolic apex series, not a reference-coordinate
+    // roundoff test. Keep this strict and separate from BasisTolerance.
+    static constexpr Real kSeriesTolerance = Real(1e-12);  // passed to add_term/add_scaled and to the empty/is_constant/constant_value checks in this section
+
+    // Returns the binomial coefficient C(n, k) as a Real; zero when k lies outside [0, n].
+    static Real binomial_coeff(int n, int k) {
+        if (k < 0 || n < k) {
+            return Real(0);
+        }
+        // C(n, k) == C(n, n - k), so walk the shorter of the two products.
+        const int terms = (n - k < k) ? (n - k) : k;
+        Real product = Real(1);
+        // In exact arithmetic product equals C(n - terms + step, step) after each pass.
+        for (int step = 1; step <= terms; ++step) {
+            product = product * static_cast<Real>(n - (terms - step));
+            product = product / static_cast<Real>(step);
+        }
+        return product;
+    }
+
+    // Appends coeff * C(z_power, k) * (-1)^k at power beta0 + k for k = 0..z_power,
+    // i.e. the binomial coefficients of (1 - t)^z_power shifted by beta0; pu and pv
+    // are forwarded to ApexSeries::add_term unchanged.
+    static void add_z_expansion(ApexSeries& series, int z_power, int beta0,
+                                int pu, int pv, Real coeff) {
+        Real sign = Real(1);  // (-1)^k, flipped after each term
+        for (int k = 0; k <= z_power; ++k) {
+            const Real scaled = coeff * binomial_coeff(z_power, k);
+            series.add_term(beta0 + k, pu, pv, scaled * sign, kSeriesTolerance);
+            sign = -sign;
+        }
+    }
+
+    // Builds the apex series for the value of one modal term: a single
+    // add_z_expansion with unit coefficient, z power term.pz, and starting power
+    // px + py - denom_power.
+    static ApexSeries modal_value_asymptotic(const ModalTerm& term) {
+        const auto leading_power = term.px + term.py - term.denom_power;
+        ApexSeries value_series;
+        add_z_expansion(value_series, term.pz, leading_power,
+                        term.px, term.py, Real(1));
+        return value_series;
+    }
+
+    // Builds the apex series of the gradient of one modal term, one series per
+    // component (0 = d/dx, 1 = d/dy, 2 = d/dz). Each contribution mirrors a term of
+    // the product rule: the differentiated exponent becomes the coefficient and
+    // the corresponding power is lowered by one.
+    static GradientSeries modal_gradient_asymptotic(const ModalTerm& term) {
+        // Starting power of the undifferentiated term: px + py - denom_power.
+        const auto base_power = term.px + term.py - term.denom_power;
+        GradientSeries result{};
+
+        // d/dx exists only when the term carries a positive x exponent.
+        if (term.px > 0) {
+            add_z_expansion(result[0], term.pz, base_power - 1,
+                            term.px - 1, term.py,
+                            static_cast<Real>(term.px));
+        }
+
+        // d/dy exists only when the term carries a positive y exponent.
+        if (term.py > 0) {
+            add_z_expansion(result[1], term.pz, base_power - 1,
+                            term.px, term.py - 1,
+                            static_cast<Real>(term.py));
+        }
+
+        // d/dz, first part: differentiate the z^pz factor.
+        if (term.pz > 0) {
+            add_z_expansion(result[2], term.pz - 1, base_power,
+                            term.px, term.py,
+                            static_cast<Real>(term.pz));
+        }
+
+        // d/dz, second part: differentiate the denominator factor, which raises
+        // the denominator power by one (hence base_power - 1).
+        if (term.denom_power > 0) {
+            add_z_expansion(result[2], term.pz, base_power - 1,
+                            term.px, term.py,
+                            static_cast<Real>(term.denom_power));
+        }
+
+        return result;
+    }
+
+    // Builds the apex series of the Hessian of one modal term as a 3x3 table
+    // indexed [row][column] with 0 = x, 1 = y, 2 = z. Exponents that are
+    // differentiated become coefficients, and every x/y derivative or
+    // denominator derivative lowers the starting power by one. Mixed entries
+    // are filled in the upper triangle and copied to the lower triangle.
+    static HessianSeries modal_hessian_asymptotic(const ModalTerm& term) {
+        // Starting power of the undifferentiated term: px + py - denom_power.
+        const auto base_power = term.px + term.py - term.denom_power;
+        const bool depends_on_x = term.px > 0;
+        const bool depends_on_y = term.py > 0;
+        const bool depends_on_z = term.pz > 0;
+        const bool has_denominator = term.denom_power > 0;
+        HessianSeries result{};
+
+        // --- d2/dx2 ---
+        if (term.px > 1) {
+            add_z_expansion(result[0][0],
+                            term.pz, base_power - 2,
+                            term.px - 2, term.py,
+                            static_cast<Real>(term.px * (term.px - 1)));
+        }
+
+        // --- d2/dy2 ---
+        if (term.py > 1) {
+            add_z_expansion(result[1][1],
+                            term.pz, base_power - 2,
+                            term.px, term.py - 2,
+                            static_cast<Real>(term.py * (term.py - 1)));
+        }
+
+        // --- d2/dxdy; mirrored inside the branch, otherwise both stay default ---
+        if (depends_on_x && depends_on_y) {
+            add_z_expansion(result[0][1],
+                            term.pz, base_power - 2,
+                            term.px - 1, term.py - 1,
+                            static_cast<Real>(term.px * term.py));
+            result[1][0] = result[0][1];
+        }
+
+        // --- d2/dxdz: z-factor part, then denominator part ---
+        if (depends_on_x && depends_on_z) {
+            add_z_expansion(result[0][2],
+                            term.pz - 1, base_power - 1,
+                            term.px - 1, term.py,
+                            static_cast<Real>(term.px * term.pz));
+        }
+        if (depends_on_x && has_denominator) {
+            add_z_expansion(result[0][2],
+                            term.pz, base_power - 2,
+                            term.px - 1, term.py,
+                            static_cast<Real>(term.px * term.denom_power));
+        }
+        result[2][0] = result[0][2];
+
+        // --- d2/dydz: z-factor part, then denominator part ---
+        if (depends_on_y && depends_on_z) {
+            add_z_expansion(result[1][2],
+                            term.pz - 1, base_power - 1,
+                            term.px, term.py - 1,
+                            static_cast<Real>(term.py * term.pz));
+        }
+        if (depends_on_y && has_denominator) {
+            add_z_expansion(result[1][2],
+                            term.pz, base_power - 2,
+                            term.px, term.py - 1,
+                            static_cast<Real>(term.py * term.denom_power));
+        }
+        result[2][1] = result[1][2];
+
+        // --- d2/dz2: three product-rule pieces ---
+        // (a) z factor differentiated twice.
+        if (term.pz > 1) {
+            add_z_expansion(result[2][2],
+                            term.pz - 2, base_power,
+                            term.px, term.py,
+                            static_cast<Real>(term.pz * (term.pz - 1)));
+        }
+        // (b) z factor and denominator each differentiated once (counted twice).
+        if (depends_on_z && has_denominator) {
+            add_z_expansion(result[2][2],
+                            term.pz - 1, base_power - 1,
+                            term.px, term.py,
+                            static_cast<Real>(2 * term.pz * term.denom_power));
+        }
+        // (c) denominator differentiated twice.
+        if (has_denominator) {
+            add_z_expansion(result[2][2],
+                            term.pz, base_power - 2,
+                            term.px, term.py,
+                            static_cast<Real>(term.denom_power * (term.denom_power + 1)));
+        }
+
+        return result;
+    }
+
+    // Classifies the apex limit from the first by_power entry that is non-empty (all-empty => Constant 0).
+    static ApexClassification classify_series(const ApexSeries& series) {
+        for (const auto& [power, polynomial] : series.by_power) {
+            if (polynomial.empty(kSeriesTolerance)) {
+                continue;
+            }
+            if (power < 0) {
+                return {ApexLimitKind::Singular, Real(0), power};
+            }
+            const bool vanishes = power > 0;  // positive leading power is reported as Constant 0
+            if (!vanishes && !polynomial.is_constant(kSeriesTolerance)) {
+                return {ApexLimitKind::DirectionDependent, Real(0), power};
+            }
+            const Real limit = vanishes ? Real(0) : polynomial.constant_value(kSeriesTolerance);
+            return {ApexLimitKind::Constant, limit, power};
+        }
+        return {ApexLimitKind::Constant, Real(0), 1};
+    }
+
+    // Merges one classification into the running status; a Singular status is never downgraded.
+    static void accumulate_rank_status(ApexRankStatus& status,
+                                       const ApexClassification& classification) {
+        const auto kind = classification.kind;
+        const bool already_singular = (status == ApexRankStatus::Singular);
+        if (kind == ApexLimitKind::Singular) {
+            status = ApexRankStatus::Singular;
+        } else if (kind == ApexLimitKind::DirectionDependent && !already_singular) {
+            status = ApexRankStatus::DirectionDependent;
+        }
+    }
+
+    // Formats the exception text for an apex rank (the caller-supplied `rank`
+    // label, e.g. "Hessian") that cannot be evaluated. Every message shares the
+    // same prefix; only the status-specific tail differs.
+    static std::string apex_status_message(const char* rank,
+                                           ApexRankStatus status) {
+        // Tail used when `status` matches none of the enumerators below.
+        const char* tail = " apex evaluation is not available";
+        if (status == ApexRankStatus::DirectionDependent) {
+            tail = " at the exact apex is not uniquely defined under admissible interior approaches";
+        } else if (status == ApexRankStatus::Singular) {
+            tail = " at the exact apex is singular for this basis family";
+        } else if (status == ApexRankStatus::Exact) {
+            tail = " apex evaluation unexpectedly reported non-exact status";
+        }
+        return std::string("Pyramid rational nodal ") + rank + tail;
+    }
+
+    static ApexData build_apex_data(const OrderData& data) {  // Builds per-basis apex values, gradients and Hessians.
+        const std::size_t n = data.modal_terms.size();
+        // Stage 1: asymptotic series for each modal term's value, gradient and Hessian.
+        std::vector<ApexSeries> modal_values(n);
+        std::vector<GradientSeries> modal_gradients(n);
+        std::vector<HessianSeries> modal_hessians(n);
+        for (std::size_t m = 0; m < n; ++m) {
+            modal_values[m] = modal_value_asymptotic(data.modal_terms[m]);
+            modal_gradients[m] = modal_gradient_asymptotic(data.modal_terms[m]);
+            modal_hessians[m] = modal_hessian_asymptotic(data.modal_terms[m]);
+        }
+        // Stage 2: nodal series i accumulates modal series m scaled by modal_to_nodal[i * n + m].
+        std::vector<ApexSeries> nodal_values(n);
+        std::vector<GradientSeries> nodal_gradients(n);
+        std::vector<HessianSeries> nodal_hessians(n);
+        for (std::size_t i = 0; i < n; ++i) {
+            for (std::size_t m = 0; m < n; ++m) {
+                const Real coeff = data.modal_to_nodal[i * n + m];
+                nodal_values[i].add_scaled(modal_values[m], coeff, kSeriesTolerance);
+                for (int d = 0; d < 3; ++d) {
+                    nodal_gradients[i][static_cast<std::size_t>(d)].add_scaled(
+                        modal_gradients[m][static_cast<std::size_t>(d)], coeff, kSeriesTolerance);
+                }
+                for (int r = 0; r < 3; ++r) {
+                    for (int c = 0; c < 3; ++c) {
+                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]
+                            .add_scaled(
+                                modal_hessians[m][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)],
+                                coeff,
+                                kSeriesTolerance);
+                    }
+                }
+            }
+        }
+        // Stage 3: classify every nodal series; the result tables start value-initialized with n entries.
+        ApexData apex;
+        apex.values.assign(n, Real(0));
+        apex.gradients.assign(n, Gradient{});
+        apex.hessians.assign(n, Hessian{});
+        // Per basis function: the value series must classify as Constant, otherwise construction throws.
+        for (std::size_t i = 0; i < n; ++i) {
+            const ApexClassification value_class = classify_series(nodal_values[i]);
+            if (value_class.kind != ApexLimitKind::Constant) {
+                throw BasisConstructionException(
+                    "Pyramid nodal value at apex is not uniquely defined for basis index " +
+                    std::to_string(i),
+                    __FILE__, __LINE__, __func__);
+            }
+            apex.values[i] = value_class.constant_value;
+            // Gradient components: fold each classification into gradient_status; store Constant limits only.
+            for (int d = 0; d < 3; ++d) {
+                const ApexClassification grad_class = classify_series(
+                    nodal_gradients[i][static_cast<std::size_t>(d)]);
+                accumulate_rank_status(apex.gradient_status, grad_class);
+                if (grad_class.kind == ApexLimitKind::Constant) {
+                    apex.gradients[i][static_cast<std::size_t>(d)] = grad_class.constant_value;
+                }
+            }
+            // Hessian entries: same treatment, tracked through hessian_status.
+            for (int r = 0; r < 3; ++r) {
+                for (int c = 0; c < 3; ++c) {
+                    const ApexClassification hess_class = classify_series(
+                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]);
+                    accumulate_rank_status(apex.hessian_status, hess_class);
+                    if (hess_class.kind == ApexLimitKind::Constant) {
+                        apex.hessians[i](static_cast<std::size_t>(r),
+                                         static_cast<std::size_t>(c)) = hess_class.constant_value;
+                    }
+                }
+            }
+        }
+        // Discard a derivative table entirely unless its accumulated status is Exact.
+        if (apex.gradient_status != ApexRankStatus::Exact) {
+            apex.gradients.clear();
+        }
+        if (apex.hessian_status != ApexRankStatus::Exact) {
+            apex.hessians.clear();
+        }
+
+        return apex;
+    }
+
+    static std::vector<math::Vector<Real, 3>> build_public_nodes(int order) {  // Node coordinates for `order`.
+        if (order == 0) {  // order 0: a single node at (0, 0, 0.25)
+            return {math::Vector<Real, 3>{Real(0), Real(0), Real(0.25)}};
+        }
+        // Capacity (p+1)(p+2)(2p+3)/6 = 1^2 + 2^2 + ... + (p+1)^2 with p = order.
+        std::vector<math::Vector<Real, 3>> nodes;
+        nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
+        // Vertices: base corners (-1,-1), (1,-1), (1,1), (-1,1) at z = 0, then the apex (0,0,1).
+        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(-1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(-1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(1), Real(0)});
+        nodes.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(1)});
+        // Base-edge interior nodes, edge by edge: y = -1, x = 1, y = 1, x = -1 (last two with m descending).
+        for (int m = 1; m < order; ++m) {
+            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(-1), Real(0)});
+        }
+        for (int m = 1; m < order; ++m) {
+            nodes.push_back(math::Vector<Real, 3>{Real(1), equispaced_pm_one_coord(m, order), Real(0)});
+        }
+        for (int m = order - 1; m >= 1; --m) {
+            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(1), Real(0)});
+        }
+        for (int m = order - 1; m >= 1; --m) {
+            nodes.push_back(math::Vector<Real, 3>{Real(-1), equispaced_pm_one_coord(m, order), Real(0)});
+        }
+        // Four corners (+/-scale, +/-scale) of the cross-section at each height z = level / order, scale = 1 - z.
+        for (int level = 1; level < order; ++level) {
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+            nodes.push_back(math::Vector<Real, 3>{-scale, -scale, z});
+            nodes.push_back(math::Vector<Real, 3>{scale, -scale, z});
+            nodes.push_back(math::Vector<Real, 3>{scale, scale, z});
+            nodes.push_back(math::Vector<Real, 3>{-scale, scale, z});
+        }
+        // Interior lattice of the base face (z = 0); j is the outer index, i the inner.
+        for (int j = 1; j < order; ++j) {
+            for (int i = 1; i < order; ++i) {
+                nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, order),
+                                                      equispaced_pm_one_coord(j, order),
+                                                      Real(0)});
+            }
+        }
+        // Levels 1 .. order - 2: perimeter nodes strictly between that level's four corners, side by side.
+        for (int level = 1; level < order - 1; ++level) {
+            const int n = order - level;
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+            // n subdivisions per side here; coordinates from equispaced_pm_one_coord are multiplied by scale.
+            for (int m = 1; m < n; ++m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{s, -scale, z});
+            }
+            for (int m = 1; m < n; ++m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{scale, s, z});
+            }
+            for (int m = n - 1; m >= 1; --m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{s, scale, z});
+            }
+            for (int m = n - 1; m >= 1; --m) {
+                const Real s = equispaced_pm_one_coord(m, n) * scale;
+                nodes.push_back(math::Vector<Real, 3>{-scale, s, z});
+            }
+        }
+        // Levels 1 .. order - 2: strict interior lattice of each level (j outer, i inner).
+        for (int level = 1; level < order - 1; ++level) {
+            const int n = order - level;
+            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+            const Real scale = Real(1) - z;
+            for (int j = 1; j < n; ++j) {
+                for (int i = 1; i < n; ++i) {
+                    nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, n) * scale,
+                                                          equispaced_pm_one_coord(j, n) * scale,
+                                                          z});
+                }
+            }
+        }
+
+        return nodes;
+    }
+
+    struct VectorValueSink {  // Scalar sink writing into a referenced std::vector<Real>.
+        std::vector<Real>& output;
+        void resize(std::size_t count) const { output.resize(count); }
+        void write(std::size_t index, Real scalar) const { output[index] = scalar; }
+    };
+
+    struct RawValueSink {  // Scalar sink over a raw Real buffer; resize() is a no-op.
+        Real* output;
+        void resize(std::size_t /*count*/) const {}
+        void write(std::size_t index, Real scalar) const { *(output + index) = scalar; }
+    };
+
+    struct VectorGradientSink {  // Gradient sink writing into a referenced std::vector<Gradient>.
+        std::vector<Gradient>& output;
+        void resize(std::size_t count) const { output.resize(count); }
+        void write(std::size_t index, const Gradient& grad) const { output[index] = grad; }
+    };
+
+    struct RawGradientSink {  // Gradient sink over a raw buffer: three consecutive Reals per entry.
+        Real* output;
+        void resize(std::size_t /*count*/) const {}
+        void write(std::size_t index, const Gradient& grad) const {
+            const std::size_t base = index * 3u;
+            output[base] = grad[0];
+            output[base + 1u] = grad[1];
+            output[base + 2u] = grad[2];
+        }
+    };
+
+    struct VectorHessianSink {  // Hessian sink writing into a referenced std::vector<Hessian>.
+        std::vector<Hessian>& output;
+        void resize(std::size_t count) const { output.resize(count); }
+        void write(std::size_t index, const Hessian& hess) const { output[index] = hess; }
+    };
+
+    struct RawHessianSink {  // Hessian sink over a raw buffer: entry i starts at offset 9 * i.
+        Real* output;
+        void resize(std::size_t /*count*/) const {}
+        void write(std::size_t index, const Hessian& hess) const {
+            store_hessian(hess, &output[index * 9u]);
+        }
+    };
+
+    template <typename Get, typename Set>  // get(modal index, component) -> Real; set(basis index, component, Real)
+    static void apply_order1_combination(std::size_t components,
+                                         const Get& get,
+                                         const Set& set) {
+        for (std::size_t comp = 0; comp < components; ++comp) {
+            const Real a0 = get(0u, comp);  // all five modal inputs are read before any output is written
+            const Real a1 = get(1u, comp);
+            const Real a2 = get(2u, comp);
+            const Real a3 = get(3u, comp);
+            const Real a4 = get(4u, comp);
+            set(0u, comp, Real(0.25) * (a0 - a1 - a2 + a3 - a4));
+            set(1u, comp, Real(0.25) * (a0 + a1 - a2 - a3 - a4));
+            set(2u, comp, Real(0.25) * (a0 + a1 + a2 + a3 - a4));
+            set(3u, comp, Real(0.25) * (a0 - a1 + a2 - a3 - a4));
+            set(4u, comp, a4);  // the last output is the fifth modal input unchanged
+        }
+    }
+
+    template <typename Get, typename Set>  // get(modal index, component) -> Real; set(basis index, component, Real)
+    static void apply_order2_combination(std::size_t components,
+                                         const Get& get,
+                                         const Set& set) {
+        for (std::size_t c = 0; c < components; ++c) {  // each component is combined independently
+            const Real m0 = get(0u, c);  // all 14 modal inputs are read before any output is written
+            const Real m1 = get(1u, c);
+            const Real m2 = get(2u, c);
+            const Real m3 = get(3u, c);
+            const Real m4 = get(4u, c);
+            const Real m5 = get(5u, c);
+            const Real m6 = get(6u, c);
+            const Real m7 = get(7u, c);
+            const Real m8 = get(8u, c);
+            const Real m9 = get(9u, c);
+            const Real m10 = get(10u, c);
+            const Real m11 = get(11u, c);
+            const Real m12 = get(12u, c);
+            const Real m13 = get(13u, c);
+            // NOTE(review): output indices presumably follow build_public_nodes for order 2 - confirm.
+            set(0u, c, Real(0.25) * (m4 - m5 - m7 + m8 - m9 + m10 + m11 - Real(2) * m12 + m13));
+            set(1u, c, Real(0.25) * (-m4 - m5 + m7 + m8 - m9 - m10 + m11 + Real(2) * m12 + m13));
+            set(2u, c, Real(0.25) * (m4 + m5 + m7 + m8 - m9 - m10 - m11 - Real(2) * m12 + m13));
+            set(3u, c, Real(0.25) * (-m4 + m5 - m7 + m8 - m9 + m10 - m11 + Real(2) * m12 + m13));
+            set(4u, c, -m9 + Real(2) * m13);
+            set(5u, c, Real(0.5) * (-m3 + m5 + m6 - m8 + m11));
+            set(6u, c, Real(0.5) * (m1 + m2 - m7 - m8 - m10));
+            set(7u, c, Real(0.5) * (m3 - m5 + m6 - m8 - m11));
+            set(8u, c, Real(0.5) * (-m1 + m2 + m7 - m8 + m10));
+            set(9u, c, m9 - m10 - m11 + m12 - m13);
+            set(10u, c, m9 + m10 - m11 - m12 - m13);
+            set(11u, c, m9 + m10 + m11 + m12 - m13);
+            set(12u, c, m9 - m10 + m11 - m12 - m13);
+            set(13u, c, m0 - m2 - m6 + m8 - Real(2) * m9 + m13);
+        }
+    }
+
+    template <typename Get, typename Set>
+    static void apply_low_order_combination(const OrderData& data,
+                                            std::size_t components,
+                                            const Get& get,
+                                            const Set& set) {
+        if (data.order != 1) {  // every order other than 1 is routed to the order-2 table
+            apply_order2_combination(components, get, set);
+            return;
+        }
+        apply_order1_combination(components, get, set);
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Real>& modal_values,
+                                            std::vector<Real>& nodal_values) {
+        nodal_values.resize(modal_values.size());  // one nodal slot per modal entry
+        const auto read_modal = [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t, Real value) {
+            nodal_values[basis_i] = value;
+        };
+        // Scalars carry one component; both callbacks ignore the component index.
+        apply_low_order_combination(data, 1u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Real>& modal_values,
+                                               Real* SVMP_RESTRICT nodal_values) {
+        const auto read_modal = [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t, Real value) {
+            nodal_values[basis_i] = value;
+        };
+        apply_low_order_combination(data, 1u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Gradient>& modal_gradients,
+                                            std::vector<Gradient>& nodal_gradients) {
+        // Gradient overload writing into a std::vector: the output is resized to the
+        // modal count, then each of the three components is combined on its own.
+        nodal_gradients.resize(modal_gradients.size());
+        const auto read_modal = [&](std::size_t modal_i, std::size_t component) {
+            return modal_gradients[modal_i][component];
+        };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_gradients[basis_i][component] = value;
+        };
+        // Three components per Gradient.
+        apply_low_order_combination(data, 3u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Gradient>& modal_gradients,
+                                               Real* SVMP_RESTRICT nodal_gradients) {
+        // Raw-buffer gradient overload: component k of basis function i is stored
+        // at nodal_gradients[i * 3 + k].
+        const auto read_modal = [&](std::size_t modal_i, std::size_t component) {
+            return modal_gradients[modal_i][component];
+        };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_gradients[basis_i * 3u + component] = value;
+        };
+        apply_low_order_combination(data, 3u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal(const OrderData& data,
+                                            const std::vector<Hessian>& modal_hessians,
+                                            std::vector<Hessian>& nodal_hessians) {
+        // Hessian overload writing into a std::vector: the output is resized to the
+        // modal count, then the nine entries reached through data() are combined one by one.
+        nodal_hessians.resize(modal_hessians.size());
+        const auto read_modal = [&](std::size_t modal_i, std::size_t component) {
+            return modal_hessians[modal_i].data()[component];
+        };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_hessians[basis_i].data()[component] = value;
+        };
+        // Nine components per Hessian.
+        apply_low_order_combination(data, 9u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
+                                               const std::vector<Hessian>& modal_hessians,
+                                               Real* SVMP_RESTRICT nodal_hessians) {
+        // Raw-buffer Hessian overload: component k of basis function i is stored
+        // at nodal_hessians[i * 9 + k].
+        const auto read_modal = [&](std::size_t modal_i, std::size_t component) {
+            return modal_hessians[modal_i].data()[component];
+        };
+        const auto write_nodal = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_hessians[basis_i * 9u + component] = value;
+        };
+        apply_low_order_combination(data, 9u, read_modal, write_nodal);
+    }
+
+    static void apply_sparse_basis_to_nodal_all(
+        const OrderData& data,
+        const std::vector<Real>& modal_values,
+        const std::vector<Gradient>& modal_gradients,
+        const std::vector<Hessian>& modal_hessians,
+        std::vector<Real>& nodal_values,
+        std::vector<Gradient>& nodal_gradients,
+        std::vector<Hessian>& nodal_hessians) {
+        // All three outputs are sized from the number of modal values.
+        const std::size_t count = modal_values.size();
+        nodal_values.resize(count);
+        nodal_gradients.resize(count);
+        nodal_hessians.resize(count);
+        // Values: one component per entry.
+        const auto read_value = [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; };
+        const auto write_value = [&](std::size_t basis_i, std::size_t, Real value) {
+            nodal_values[basis_i] = value;
+        };
+        apply_low_order_combination(data, 1u, read_value, write_value);
+        // Gradients: three components per entry.
+        const auto read_gradient = [&](std::size_t modal_i, std::size_t component) {
+            return modal_gradients[modal_i][component];
+        };
+        const auto write_gradient = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_gradients[basis_i][component] = value;
+        };
+        apply_low_order_combination(data, 3u, read_gradient, write_gradient);
+        // Hessians: nine components per entry, addressed through data().
+        const auto read_hessian = [&](std::size_t modal_i, std::size_t component) {
+            return modal_hessians[modal_i].data()[component];
+        };
+        const auto write_hessian = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_hessians[basis_i].data()[component] = value;
+        };
+        apply_low_order_combination(data, 9u, read_hessian, write_hessian);
+    }
+
+    static void apply_sparse_basis_to_nodal_all_to(
+        const OrderData& data,
+        const std::vector<Real>& modal_values,
+        const std::vector<Gradient>& modal_gradients,
+        const std::vector<Hessian>& modal_hessians,
+        Real* SVMP_RESTRICT nodal_values,
+        Real* SVMP_RESTRICT nodal_gradients,
+        Real* SVMP_RESTRICT nodal_hessians) {
+        // Raw-buffer variant: nothing is resized; the three buffers are filled in turn.
+        // Values: entry i at nodal_values[i].
+        const auto read_value = [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; };
+        const auto write_value = [&](std::size_t basis_i, std::size_t, Real value) {
+            nodal_values[basis_i] = value;
+        };
+        apply_low_order_combination(data, 1u, read_value, write_value);
+        // Gradients: component k of entry i at nodal_gradients[i * 3 + k].
+        const auto read_gradient = [&](std::size_t modal_i, std::size_t component) {
+            return modal_gradients[modal_i][component];
+        };
+        const auto write_gradient = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_gradients[basis_i * 3u + component] = value;
+        };
+        apply_low_order_combination(data, 3u, read_gradient, write_gradient);
+        // Hessians: component k of entry i at nodal_hessians[i * 9 + k].
+        const auto read_hessian = [&](std::size_t modal_i, std::size_t component) {
+            return modal_hessians[modal_i].data()[component];
+        };
+        const auto write_hessian = [&](std::size_t basis_i, std::size_t component, Real value) {
+            nodal_hessians[basis_i * 9u + component] = value;
+        };
+        apply_low_order_combination(data, 9u, read_hessian, write_hessian);
+    }
+
+    // Keep modal transform helpers free of forced-inline attributes unless
+    // compiler-versioned benchmarks and LLVM IR checks show a stable benefit.
+    template <typename Sink>
+    static void apply_modal_values_to_nodal(const OrderData& data,
+                                            const std::vector<Real>& modal_values,
+                                            const Sink& sink) {
+        const std::size_t count = modal_values.size();
+        sink.resize(count);
+        for (std::size_t node = 0; node < count; ++node) {
+            const Real* weights = data.modal_to_nodal.data() + node * count;  // row `node`, `count` entries
+            Real sum = Real(0);
+            for (std::size_t mode = 0; mode < count; ++mode) {
+                sum += weights[mode] * modal_values[mode];
+            }
+            sink.write(node, sum);
+        }
+    }
+
+    template <typename Sink>  // Sink provides resize(count) and write(index, Gradient)
+    static void apply_modal_gradients_to_nodal(const OrderData& data,
+                                               const std::vector<Gradient>& modal_gradients,
+                                               const Sink& sink) {
+        const std::size_t count = modal_gradients.size();
+        sink.resize(count);
+        for (std::size_t node = 0; node < count; ++node) {
+            const Real* weights = data.modal_to_nodal.data() + node * count;  // row `node`, `count` entries
+            Gradient sum{};
+            for (std::size_t mode = 0; mode < count; ++mode) {
+                const Real weight = weights[mode];
+                for (std::size_t axis = 0; axis < 3u; ++axis) {
+                    sum[axis] += weight * modal_gradients[mode][axis];
+                }
+            }
+            sink.write(node, sum);
+        }
+    }
+
+    template <typename Sink>  // Sink provides resize(count) and write(index, Hessian)
+    static void apply_modal_hessians_to_nodal(const OrderData& data,
+                                              const std::vector<Hessian>& modal_hessians,
+                                              const Sink& sink) {
+        const std::size_t count = modal_hessians.size();
+        sink.resize(count);
+        for (std::size_t node = 0; node < count; ++node) {
+            const Real* weights = data.modal_to_nodal.data() + node * count;  // row `node`, `count` entries
+            Hessian sum{};
+            for (std::size_t mode = 0; mode < count; ++mode) {
+                const Real weight = weights[mode];
+                for (std::size_t k = 0; k < 9u; ++k) {  // visits (0,0), (0,1), ..., (2,2)
+                    const std::size_t r = k / 3u;
+                    const std::size_t c = k % 3u;
+                    sum(r, c) += weight * modal_hessians[mode](r, c);
+                }
+            }
+            sink.write(node, sum);
+        }
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data, const std::vector<Real>& modal_values,
+                                     std::vector<Real>& nodal_values) {
+        const VectorValueSink sink{nodal_values};  // results land in the caller's vector
+        apply_modal_values_to_nodal(data, modal_values, sink);
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data, const std::vector<Gradient>& modal_gradients,
+                                     std::vector<Gradient>& nodal_gradients) {
+        const VectorGradientSink sink{nodal_gradients};  // results land in the caller's vector
+        apply_modal_gradients_to_nodal(data, modal_gradients, sink);
+    }
+
+    static void apply_modal_to_nodal(const OrderData& data, const std::vector<Hessian>& modal_hessians,
+                                     std::vector<Hessian>& nodal_hessians) {
+        const VectorHessianSink sink{nodal_hessians};  // results land in the caller's vector
+        apply_modal_hessians_to_nodal(data, modal_hessians, sink);
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data, const std::vector<Real>& modal_values,
+                                        Real* nodal_values) {
+        const RawValueSink sink{nodal_values};  // results land in the caller's raw buffer
+        apply_modal_values_to_nodal(data, modal_values, sink);
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data, const std::vector<Gradient>& modal_gradients,
+                                        Real* nodal_gradients) {
+        const RawGradientSink sink{nodal_gradients};  // results land in the caller's raw buffer
+        apply_modal_gradients_to_nodal(data, modal_gradients, sink);
+    }
+
+    static void apply_modal_to_nodal_to(const OrderData& data, const std::vector<Hessian>& modal_hessians,
+                                        Real* nodal_hessians) {
+        const RawHessianSink sink{nodal_hessians};  // results land in the caller's raw buffer
+        apply_modal_hessians_to_nodal(data, modal_hessians, sink);
+    }
+};
+
+namespace lagrange_pyramid {  // each entry point resolves PyramidLagrangeCache::get(order) and forwards
+const std::vector<math::Vector<Real, 3>>& nodes(int order) {
+    const auto& cached = PyramidLagrangeCache::get(order);
+    return cached.nodes;
+}
+
+void prewarm_scratch(int order, std::size_t max_qpts) {
+    PyramidLagrangeCache::prewarm_scratch(
+        PyramidLagrangeCache::get(order).modal_terms.size(), max_qpts);
+}
+
+void evaluate_values(int order,
+                     const math::Vector<Real, 3>& xi,
+                     std::vector<Real>& values) {
+    PyramidLagrangeCache::evaluate_values(
+        PyramidLagrangeCache::get(order), xi, values);
+}
+
+void evaluate_gradients(int order,
+                        const math::Vector<Real, 3>& xi,
+                        std::vector<Gradient>& gradients) {
+    PyramidLagrangeCache::evaluate_gradients(
+        PyramidLagrangeCache::get(order), xi, gradients);
+}
+
+void evaluate_hessians(int order,
+                       const math::Vector<Real, 3>& xi,
+                       std::vector<Hessian>& hessians) {
+    PyramidLagrangeCache::evaluate_hessians(
+        PyramidLagrangeCache::get(order), xi, hessians);
+}
+
+void evaluate_all(int order,
+                  const math::Vector<Real, 3>& xi,
+                  std::vector<Real>& values,
+                  std::vector<Gradient>& gradients,
+                  std::vector<Hessian>& hessians) {
+    PyramidLagrangeCache::evaluate_all(
+        PyramidLagrangeCache::get(order), xi, values, gradients, hessians);
+}
+
+void evaluate_values_to(int order,
+                        const math::Vector<Real, 3>& xi,
+                        Real* SVMP_RESTRICT values_out) {
+    PyramidLagrangeCache::evaluate_values_to(
+        PyramidLagrangeCache::get(order), xi, values_out);
+}
+
+void evaluate_gradients_to(int order,
+                           const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT gradients_out) {
+    PyramidLagrangeCache::evaluate_gradients_to(
+        PyramidLagrangeCache::get(order), xi, gradients_out);
+}
+
+void evaluate_hessians_to(int order,
+                          const math::Vector<Real, 3>& xi,
+                          Real* SVMP_RESTRICT hessians_out) {
+    PyramidLagrangeCache::evaluate_hessians_to(
+        PyramidLagrangeCache::get(order), xi, hessians_out);
+}
+
+void evaluate_all_to(int order,
+                     const math::Vector<Real, 3>& xi,
+                     Real* SVMP_RESTRICT values_out,
+                     Real* SVMP_RESTRICT gradients_out,
+                     Real* SVMP_RESTRICT hessians_out) {
+    PyramidLagrangeCache::evaluate_all_to(
+        PyramidLagrangeCache::get(order), xi, values_out, gradients_out, hessians_out);
+}
+
+void evaluate_at_quadrature_points_strided(
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    PyramidLagrangeCache::evaluate_at_quadrature_points_strided(
+        PyramidLagrangeCache::get(order), points, output_stride,
+        values_out, gradients_out, hessians_out);
+}
+
+} // namespace lagrange_pyramid
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
new file mode 100644
index 000000000..76859501c
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
@@ -0,0 +1,67 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
+#define SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
+
+// Private declarations for the rational pyramid Lagrange helper implemented in
+// LagrangeBasisPyramid.cpp. This header is intentionally small so the large
+// construction and apex-classification code stays out of LagrangeBasis.cpp.
+
+#include "BasisFunction.h"
+
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace lagrange_pyramid {
+// Node coordinates for `order`; the .cpp returns a reference to PyramidLagrangeCache::get(order).nodes.
+const std::vector<math::Vector<Real, 3>>& nodes(int order);
+// Forwards the modal-term count of `order` together with `max_qpts` to PyramidLagrangeCache::prewarm_scratch.
+void prewarm_scratch(int order, std::size_t max_qpts = 0);
+// Evaluation at a single point `xi`, with results returned through std::vector outputs.
+void evaluate_values(int order,
+                     const math::Vector<Real, 3>& xi,
+                     std::vector<Real>& values);
+void evaluate_gradients(int order,
+                        const math::Vector<Real, 3>& xi,
+                        std::vector<Gradient>& gradients);
+void evaluate_hessians(int order,
+                       const math::Vector<Real, 3>& xi,
+                       std::vector<Hessian>& hessians);
+void evaluate_all(int order,
+                  const math::Vector<Real, 3>& xi,
+                  std::vector<Real>& values,
+                  std::vector<Gradient>& gradients,
+                  std::vector<Hessian>& hessians);
+// Raw-buffer variants. NOTE(review): required buffer sizes and layout are set by LagrangeBasisPyramid.cpp - confirm.
+void evaluate_values_to(int order,
+                        const math::Vector<Real, 3>& xi,
+                        Real* SVMP_RESTRICT values_out);
+void evaluate_gradients_to(int order,
+                           const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT gradients_out);
+void evaluate_hessians_to(int order,
+                          const math::Vector<Real, 3>& xi,
+                          Real* SVMP_RESTRICT hessians_out);
+void evaluate_all_to(int order,
+                     const math::Vector<Real, 3>& xi,
+                     Real* SVMP_RESTRICT values_out,
+                     Real* SVMP_RESTRICT gradients_out,
+                     Real* SVMP_RESTRICT hessians_out);
+// Multi-point variant. NOTE(review): the meaning of `output_stride` is defined in LagrangeBasisPyramid.cpp - confirm.
+void evaluate_at_quadrature_points_strided(
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out);
+
+} // namespace lagrange_pyramid
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
new file mode 100644
index 000000000..36325576a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
@@ -0,0 +1,2457 @@
+#include "LagrangeBasisSimplex.h"
+
+#include <array>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+// Falling-factorial (equispaced barycentric) Lagrange factors for simplex nodes.
+//
+// For a fixed polynomial order p and barycentric coordinate lambda in [0, 1],
+// define
+//   phi_a(lambda) = product_{m=0}^{a-1} (p * lambda - m) / (a - m), a = 0..p
+// Then for a multi-index (i0, i1, ..., id) with sum i_k = p, the simplex
+// Lagrange basis function is product_k phi_{i_k}(lambda_k), nodal on the
+// barycentric lattice.
+//
+// Output buffers must each be sized to at least p+1 entries; the function
+// writes every output slot (no pre-zero required by the caller).
+//
+// The recurrence used is phi_a = s_a * phi_{a-1} with s_a = (p * lambda - (a - 1)) / a.
+// Derivatives are accumulated with respect to t = p * lambda and rescaled by p
+// (first) or p * p (second) when stored. NeedFirst additionally fills dphi[0..p];
+// NeedSecond additionally fills d2phi[0..p]. Pointers belonging to outputs that
+// are not requested are never dereferenced.
+template <bool NeedFirst, bool NeedSecond>
+void simplex_lagrange_factor_sequence_impl(int p,
+                                           Real lambda,
+                                           Real* phi,
+                                           Real* dphi,
+                                           Real* d2phi) {
+    static_assert(!NeedSecond || NeedFirst,
+                  "second derivative factors require first-derivative recurrence state");
+    // Degree-zero factor: constant one with vanishing derivatives.
+    phi[0] = Real(1);
+    if constexpr (NeedFirst) {
+        dphi[0] = Real(0);
+    }
+    if constexpr (NeedSecond) {
+        d2phi[0] = Real(0);
+    }
+    if (p == 0) {
+        return;
+    }
+
+    const Real scaled = static_cast<Real>(p) * lambda;  // t = p * lambda
+    const Real chain = static_cast<Real>(p);            // dt / dlambda
+    Real first_dt = Real(0);   // d(phi_{a-1})/dt carried between iterations
+    Real second_dt = Real(0);  // d2(phi_{a-1})/dt2 carried between iterations
+    for (int a = 1; a <= p; ++a) {
+        const std::size_t cur = static_cast<std::size_t>(a);
+        const Real recip = Real(1) / static_cast<Real>(a);
+        const Real step = (scaled - static_cast<Real>(a - 1)) * recip;
+        phi[cur] = step * phi[cur - 1];
+        if constexpr (NeedFirst) {
+            const Real first_dt_in = first_dt;
+            const Real first_dt_out = recip * phi[cur - 1] + step * first_dt_in;
+            dphi[cur] = chain * first_dt_out;
+            if constexpr (NeedSecond) {
+                const Real second_dt_out = Real(2) * recip * first_dt_in + step * second_dt;
+                d2phi[cur] = chain * chain * second_dt_out;
+                second_dt = second_dt_out;
+            }
+            first_dt = first_dt_out;
+        }
+    }
+}
+
+// Runtime entry point for the factor recurrence. The deepest non-null derivative
+// buffer selects the instantiation: d2phi => values + first + second derivatives,
+// else dphi => values + first derivatives, else values only.
+void simplex_lagrange_factor_sequence(int p, Real lambda,
+                                      Real* phi, Real* dphi, Real* d2phi) {
+    if (d2phi != nullptr) {
+        return simplex_lagrange_factor_sequence_impl<true, true>(p, lambda, phi, dphi, d2phi);
+    }
+    if (dphi != nullptr) {
+        return simplex_lagrange_factor_sequence_impl<true, false>(p, lambda, phi, dphi, nullptr);
+    }
+    simplex_lagrange_factor_sequence_impl<false, false>(p, lambda, phi, nullptr, nullptr);
+}
+
+constexpr int kFixedSimplexAxisOrder = 12;  // NOTE(review): users not in view; confirm meaning
+constexpr std::size_t kFixedSimplexAxisSize =
+    static_cast<std::size_t>(kFixedSimplexAxisOrder + 1);  // kFixedSimplexAxisOrder + 1 entries
+constexpr std::size_t kFixedSimplexBatchEntries = 512;  // NOTE(review): presumably a batch cap
+
+// Values-only form of the factor recurrence for a compile-time order; fills values[0..Order].
+template <int Order>
+inline void simplex_lagrange_factor_values_product(Real lambda, Real* SVMP_RESTRICT values) {
+    static_assert(Order >= 0, "simplex order must be non-negative");
+    const Real scaled = static_cast<Real>(Order) * lambda;
+    values[0] = Real(1);
+    for (int prev = 0; prev < Order; ++prev) {
+        const Real recip = Real(1) / static_cast<Real>(prev + 1);
+        values[prev + 1] = values[prev] * (scaled - static_cast<Real>(prev)) * recip;
+    }
+}
+
+// Triangle basis values at exactly four points (points[0..3] are read).
+// Node n is written to values_out[n * output_stride + q] for q = 0..3, using the
+// first three entries of simplex_exponents[n] as factor indices.
+template <int Order>
+void evaluate_triangle_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
+    Real fac0[4][Order + 1];
+    Real fac1[4][Order + 1];
+    Real fac2[4][Order + 1];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& ref = points[pt];
+        const Real b1 = ref[0];
+        const Real b2 = ref[1];
+        const Real b0 = Real(1) - b1 - b2;
+        simplex_lagrange_factor_values_product<Order>(b0, fac0[pt]);
+        simplex_lagrange_factor_values_product<Order>(b1, fac1[pt]);
+        simplex_lagrange_factor_values_product<Order>(b2, fac2[pt]);
+    }
+
+    std::size_t node = 0;
+    for (const auto& expo : simplex_exponents) {
+        const std::size_t a0 = static_cast<std::size_t>(expo[0]);
+        const std::size_t a1 = static_cast<std::size_t>(expo[1]);
+        const std::size_t a2 = static_cast<std::size_t>(expo[2]);
+        Real* SVMP_RESTRICT dst = values_out + node * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            dst[pt] = fac0[pt][a0] * fac1[pt][a1] * fac2[pt][a2];
+        }
+        ++node;
+    }
+}
+
+// Runtime-to-compile-time dispatch for the four-point triangle value kernel.
+// Orders 4..8 are forwarded to the matching instantiation and the function
+// returns true; for any other order nothing is written and the function
+// returns false.
+bool try_evaluate_triangle_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (order == 4) {
+        evaluate_triangle_simplex_values_q4<4>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 5) {
+        evaluate_triangle_simplex_values_q4<5>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 6) {
+        evaluate_triangle_simplex_values_q4<6>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 7) {
+        evaluate_triangle_simplex_values_q4<7>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 8) {
+        evaluate_triangle_simplex_values_q4<8>(
+            simplex_exponents, points, output_stride, values_out);
+    } else {
+        // No specialization exists for this order.
+        return false;
+    }
+    return true;
+}
+
+// Tetrahedron basis values at exactly four points (points[0..3] are read).
+// Node n is written to values_out[n * output_stride + q] for q = 0..3.
+template <int Order>
+void evaluate_tetrahedron_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
+
+    Real fac0[4][Order + 1];
+    Real fac1[4][Order + 1];
+    Real fac2[4][Order + 1];
+    Real fac3[4][Order + 1];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& ref = points[pt];
+        const Real b1 = ref[0];
+        const Real b2 = ref[1];
+        const Real b3 = ref[2];
+        const Real b0 = Real(1) - b1 - b2 - b3;
+        simplex_lagrange_factor_values_product<Order>(b0, fac0[pt]);
+        simplex_lagrange_factor_values_product<Order>(b1, fac1[pt]);
+        simplex_lagrange_factor_values_product<Order>(b2, fac2[pt]);
+        simplex_lagrange_factor_values_product<Order>(b3, fac3[pt]);
+    }
+
+    std::size_t node = 0;
+    for (const auto& expo : simplex_exponents) {
+        const std::size_t a0 = static_cast<std::size_t>(expo[0]);
+        const std::size_t a1 = static_cast<std::size_t>(expo[1]);
+        const std::size_t a2 = static_cast<std::size_t>(expo[2]);
+        const std::size_t a3 = static_cast<std::size_t>(expo[3]);
+        Real* SVMP_RESTRICT dst = values_out + node * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            dst[pt] = fac0[pt][a0] * fac1[pt][a1] * fac2[pt][a2] * fac3[pt][a3];
+        }
+        ++node;
+    }
+}
+
+// Runtime-to-compile-time dispatch for the four-point tetrahedron value kernel.
+// Orders 4..8 are forwarded to the matching instantiation and the function
+// returns true; for any other order nothing is written and the function
+// returns false.
+bool try_evaluate_tetrahedron_simplex_values_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out) {
+    if (order == 4) {
+        evaluate_tetrahedron_simplex_values_q4<4>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 5) {
+        evaluate_tetrahedron_simplex_values_q4<5>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 6) {
+        evaluate_tetrahedron_simplex_values_q4<6>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 7) {
+        evaluate_tetrahedron_simplex_values_q4<7>(
+            simplex_exponents, points, output_stride, values_out);
+    } else if (order == 8) {
+        evaluate_tetrahedron_simplex_values_q4<8>(
+            simplex_exponents, points, output_stride, values_out);
+    } else {
+        // No specialization exists for this order.
+        return false;
+    }
+    return true;
+}
+
+// Tetrahedron basis gradients at exactly four points (points[0..3] are read).
+// With lambda_0 = 1 - x - y - z, each reference derivative is the derivative
+// through lambda_k (k = 1, 2, 3) minus the derivative through lambda_0.
+// Layout per node n, relative to gradients_out + n * 3 * output_stride:
+//   [q] = d/dx, [output_stride + q] = d/dy, [2 * output_stride + q] = d/dz.
+// The four factor indices of node n are simplex_exponents[n][0..3].
+template <int Order>
+void evaluate_tetrahedron_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    static_assert(Order >= 3 && Order <= 8,
+                  "specialized tetrahedron gradient path covers orders 3..8");
+
+    // Factor values and first derivatives per point and barycentric coordinate.
+    Real fac0[4][Order + 1];
+    Real fac1[4][Order + 1];
+    Real fac2[4][Order + 1];
+    Real fac3[4][Order + 1];
+    Real dfac0[4][Order + 1];
+    Real dfac1[4][Order + 1];
+    Real dfac2[4][Order + 1];
+    Real dfac3[4][Order + 1];
+
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& ref = points[pt];
+        const Real b1 = ref[0];
+        const Real b2 = ref[1];
+        const Real b3 = ref[2];
+        const Real b0 = Real(1) - b1 - b2 - b3;
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b0, fac0[pt], dfac0[pt], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b1, fac1[pt], dfac1[pt], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b2, fac2[pt], dfac2[pt], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b3, fac3[pt], dfac3[pt], nullptr);
+    }
+
+    std::size_t node = 0;
+    for (const auto& expo : simplex_exponents) {
+        const std::size_t a0 = static_cast<std::size_t>(expo[0]);
+        const std::size_t a1 = static_cast<std::size_t>(expo[1]);
+        const std::size_t a2 = static_cast<std::size_t>(expo[2]);
+        const std::size_t a3 = static_cast<std::size_t>(expo[3]);
+        // Staging area: grad[c][q] is component c at point q.
+        Real grad[3][4];
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            const Real w0 = fac0[pt][a0];
+            const Real w1 = fac1[pt][a1];
+            const Real w2 = fac2[pt][a2];
+            const Real w3 = fac3[pt][a3];
+            const Real dw0 = dfac0[pt][a0];
+            const Real dw1 = dfac1[pt][a1];
+            const Real dw2 = dfac2[pt][a2];
+            const Real dw3 = dfac3[pt][a3];
+            const Real w23 = w2 * w3;
+            const Real w01 = w0 * w1;
+            const Real via0 = dw0 * w1 * w23;  // derivative through lambda_0
+            grad[0][pt] = w0 * dw1 * w23 - via0;
+            grad[1][pt] = w01 * dw2 * w3 - via0;
+            grad[2][pt] = w01 * w2 * dw3 - via0;
+        }
+
+        // Copy out one component row at a time (x, then y, then z).
+        Real* SVMP_RESTRICT base = gradients_out + node * 3u * output_stride;
+        for (std::size_t c = 0; c < 3u; ++c) {
+            for (std::size_t pt = 0; pt < 4u; ++pt) {
+                base[c * output_stride + pt] = grad[c][pt];
+            }
+        }
+        ++node;
+    }
+}
+
+// Triangle basis gradients at exactly four points (points[0..3] are read).
+// Per node n, row c of gradients_out + n * 3 * output_stride holds component c at
+// [c * output_stride + q]; rows 0/1 are d/dx, d/dy and row 2 is written as zero.
+template <int Order>
+void evaluate_triangle_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    static_assert((Order == 2) || (Order >= 4 && Order <= 8),
+                  "specialized simplex path covers order 2 and orders 4..8");
+    Real fac0[4][Order + 1];
+    Real fac1[4][Order + 1];
+    Real fac2[4][Order + 1];
+    Real dfac0[4][Order + 1];
+    Real dfac1[4][Order + 1];
+    Real dfac2[4][Order + 1];
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& ref = points[pt];
+        const Real b1 = ref[0];
+        const Real b2 = ref[1];
+        const Real b0 = Real(1) - b1 - b2;
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b0, fac0[pt], dfac0[pt], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b1, fac1[pt], dfac1[pt], nullptr);
+        simplex_lagrange_factor_sequence_impl<true, false>(
+            Order, b2, fac2[pt], dfac2[pt], nullptr);
+    }
+
+    std::size_t node = 0;
+    for (const auto& expo : simplex_exponents) {
+        const std::size_t a0 = static_cast<std::size_t>(expo[0]);
+        const std::size_t a1 = static_cast<std::size_t>(expo[1]);
+        const std::size_t a2 = static_cast<std::size_t>(expo[2]);
+        Real* SVMP_RESTRICT base = gradients_out + node * 3u * output_stride;
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            const Real w0 = fac0[pt][a0];
+            const Real w1 = fac1[pt][a1];
+            const Real w2 = fac2[pt][a2];
+            const Real dw0 = dfac0[pt][a0];
+            const Real dw1 = dfac1[pt][a1];
+            const Real dw2 = dfac2[pt][a2];
+            const Real via0 = dw0 * w1 * w2;  // derivative through lambda_0
+            base[0u * output_stride + pt] = w0 * dw1 * w2 - via0;
+            base[1u * output_stride + pt] = w0 * w1 * dw2 - via0;
+            base[2u * output_stride + pt] = Real(0);
+        }
+        ++node;
+    }
+}
+
+// Runtime-to-compile-time dispatch for the four-point triangle gradient kernel.
+// Order 2 and orders 4..8 are forwarded to the matching instantiation and the
+// function returns true; for any other order (including 3) nothing is written
+// and the function returns false. The accepted set mirrors the static_assert
+// of the kernel template.
+bool try_evaluate_triangle_simplex_gradients_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT gradients_out) {
+    if (order == 2) {
+        evaluate_triangle_simplex_gradients_q4<2>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else if (order == 4) {
+        evaluate_triangle_simplex_gradients_q4<4>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else if (order == 5) {
+        evaluate_triangle_simplex_gradients_q4<5>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else if (order == 6) {
+        evaluate_triangle_simplex_gradients_q4<6>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else if (order == 7) {
+        evaluate_triangle_simplex_gradients_q4<7>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else if (order == 8) {
+        evaluate_triangle_simplex_gradients_q4<8>(
+            simplex_exponents, points, output_stride, gradients_out);
+    } else {
+        // No specialization exists for this order.
+        return false;
+    }
+    return true;
+}
+
+// Triangle values / gradients / Hessians at exactly four points (points[0..3]).
+// hessians_out is always written; values_out and gradients_out are optional and
+// skipped when null. Per node n the output blocks start at
+//   values_out + n * output_stride, gradients_out + n * 3 * output_stride,
+//   hessians_out + n * 9 * output_stride,
+// and row r of a block holds its four point entries at [r * output_stride + q].
+// Hessian rows are the row-major 3x3 entries; every row involving the third
+// coordinate (2, 5, 6, 7, 8) and gradient row 2 are written as zero.
+template <int Order>
+void evaluate_triangle_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
+
+    Real fac0[4][Order + 1];
+    Real fac1[4][Order + 1];
+    Real fac2[4][Order + 1];
+    Real dfac0[4][Order + 1];
+    Real dfac1[4][Order + 1];
+    Real dfac2[4][Order + 1];
+    Real ddfac0[4][Order + 1];
+    Real ddfac1[4][Order + 1];
+    Real ddfac2[4][Order + 1];
+
+    for (std::size_t pt = 0; pt < 4u; ++pt) {
+        const auto& ref = points[pt];
+        const Real b1 = ref[0];
+        const Real b2 = ref[1];
+        const Real b0 = Real(1) - b1 - b2;
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, b0, fac0[pt], dfac0[pt], ddfac0[pt]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, b1, fac1[pt], dfac1[pt], ddfac1[pt]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, b2, fac2[pt], dfac2[pt], ddfac2[pt]);
+    }
+
+    std::size_t node = 0;
+    for (const auto& expo : simplex_exponents) {
+        const std::size_t a0 = static_cast<std::size_t>(expo[0]);
+        const std::size_t a1 = static_cast<std::size_t>(expo[1]);
+        const std::size_t a2 = static_cast<std::size_t>(expo[2]);
+        Real* SVMP_RESTRICT val_row =
+            values_out ? values_out + node * output_stride : nullptr;
+        Real* SVMP_RESTRICT grad_row =
+            gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* SVMP_RESTRICT hess = hessians_out + node * 9u * output_stride;
+
+        // Clear every Hessian row that involves the third coordinate.
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            hess[2u * output_stride + pt] = Real(0);
+            hess[5u * output_stride + pt] = Real(0);
+            hess[6u * output_stride + pt] = Real(0);
+            hess[7u * output_stride + pt] = Real(0);
+            hess[8u * output_stride + pt] = Real(0);
+        }
+
+        for (std::size_t pt = 0; pt < 4u; ++pt) {
+            const Real w0 = fac0[pt][a0];
+            const Real w1 = fac1[pt][a1];
+            const Real w2 = fac2[pt][a2];
+            if (val_row != nullptr) {
+                val_row[pt] = w0 * w1 * w2;
+            }
+
+            const Real dw0 = dfac0[pt][a0];
+            const Real dw1 = dfac1[pt][a1];
+            const Real dw2 = dfac2[pt][a2];
+            if (grad_row != nullptr) {
+                const Real via0 = dw0 * w1 * w2;
+                grad_row[0u * output_stride + pt] = w0 * dw1 * w2 - via0;
+                grad_row[1u * output_stride + pt] = w0 * w1 * dw2 - via0;
+                grad_row[2u * output_stride + pt] = Real(0);
+            }
+
+            // tKL: product carrying derivatives on factors K and L.
+            const Real ddw0 = ddfac0[pt][a0];
+            const Real ddw1 = ddfac1[pt][a1];
+            const Real ddw2 = ddfac2[pt][a2];
+            const Real t00 = ddw0 * w1 * w2;
+            const Real t11 = w0 * ddw1 * w2;
+            const Real t22 = w0 * w1 * ddw2;
+            const Real t01 = dw0 * dw1 * w2;
+            const Real t02 = dw0 * w1 * dw2;
+            const Real t12 = w0 * dw1 * dw2;
+            const Real mixed_xy = t00 - t01 - t02 + t12;
+            hess[0u * output_stride + pt] = t00 - Real(2) * t01 + t11;
+            hess[1u * output_stride + pt] = mixed_xy;
+            hess[3u * output_stride + pt] = mixed_xy;
+            hess[4u * output_stride + pt] = t00 - Real(2) * t02 + t22;
+        }
+        ++node;
+    }
+}
+
+// Runtime-to-compile-time dispatch for the four-point triangle kernel that
+// produces Hessians together with optional values and gradients.
+// Orders 2..8 are forwarded to the matching instantiation and the function
+// returns true; for any other order nothing is written and the function
+// returns false. The output pointers are passed through unchanged.
+// (Null handling is performed by the kernel, not here.)
+bool try_evaluate_triangle_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    if (order == 2) {
+        evaluate_triangle_simplex_hessian_outputs_q4<2>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 3) {
+        evaluate_triangle_simplex_hessian_outputs_q4<3>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 4) {
+        evaluate_triangle_simplex_hessian_outputs_q4<4>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 5) {
+        evaluate_triangle_simplex_hessian_outputs_q4<5>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 6) {
+        evaluate_triangle_simplex_hessian_outputs_q4<6>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 7) {
+        evaluate_triangle_simplex_hessian_outputs_q4<7>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else if (order == 8) {
+        evaluate_triangle_simplex_hessian_outputs_q4<8>(
+            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+    } else {
+        // No specialization exists for this order.
+        return false;
+    }
+    return true;
+}
+
+// Writes the 3x3 reference Hessian of one tetrahedron node at quadrature slot Q.
+// Inputs are the per-point factor tables (values, first, second derivatives) and
+// the node's four factor indices i0..i3. Entry (r, c) is stored row-major at
+// H[(3 * r + c) * output_stride + Q]; symmetric entries are written twice.
+// The formulas below correspond to lambda_0 = 1 - x - y - z: for k, l in
+// {1, 2, 3},
+//   d2/dx_k dx_l = T00 - T0k - T0l + Tkl,
+// where Tkl carries derivatives on factors k and l (Tkk: second derivative).
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_hessian_q4(
+    const Real (&phi0)[4][Order + 1], const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1], const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1], const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1], const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1], const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1], const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0, std::size_t i1, std::size_t i2, std::size_t i3,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT H) {
+    const Real w0 = phi0[Q][i0];
+    const Real w1 = phi1[Q][i1];
+    const Real w2 = phi2[Q][i2];
+    const Real w3 = phi3[Q][i3];
+    const Real dw0 = dphi0[Q][i0];
+    const Real dw1 = dphi1[Q][i1];
+    const Real dw2 = dphi2[Q][i2];
+    const Real dw3 = dphi3[Q][i3];
+    const Real ddw0 = d2phi0[Q][i0];
+    const Real ddw1 = d2phi1[Q][i1];
+    const Real ddw2 = d2phi2[Q][i2];
+    const Real ddw3 = d2phi3[Q][i3];
+    const Real t00 = ddw0 * w1 * w2 * w3;
+    const Real t11 = w0 * ddw1 * w2 * w3;
+    const Real t22 = w0 * w1 * ddw2 * w3;
+    const Real t33 = w0 * w1 * w2 * ddw3;
+    const Real t01 = dw0 * dw1 * w2 * w3;
+    const Real t02 = dw0 * w1 * dw2 * w3;
+    const Real t03 = dw0 * w1 * w2 * dw3;
+    const Real t12 = w0 * dw1 * dw2 * w3;
+    const Real t13 = w0 * dw1 * w2 * dw3;
+    const Real t23 = w0 * w1 * dw2 * dw3;
+    const Real mixed_xy = t00 - t01 - t02 + t12;
+    const Real mixed_xz = t00 - t01 - t03 + t13;
+    const Real mixed_yz = t00 - t02 - t03 + t23;
+    Real* const slot = H + Q;  // column Q of every Hessian row
+    slot[0u * output_stride] = t00 - Real(2) * t01 + t11;
+    slot[1u * output_stride] = mixed_xy;
+    slot[2u * output_stride] = mixed_xz;
+    slot[3u * output_stride] = mixed_xy;
+    slot[4u * output_stride] = t00 - Real(2) * t02 + t22;
+    slot[5u * output_stride] = mixed_yz;
+    slot[6u * output_stride] = mixed_xz;
+    slot[7u * output_stride] = mixed_yz;
+    slot[8u * output_stride] = t00 - Real(2) * t03 + t33;
+}
+
+// Stride-4 specialisation of the tetrahedron Hessian writer for quadrature slot Q:
+// entry (r, c) of the row-major 3x3 Hessian is stored at H[4 * (3 * r + c) + Q].
+// Inputs are the per-point factor tables (values, first and second derivatives)
+// and the node's factor indices i0..i3. Products shared between terms (pairs and
+// triples of factor values) are formed once and reused.
+// The formulas correspond to lambda_0 = 1 - x - y - z: for k, l in {1, 2, 3},
+//   d2/dx_k dx_l = T00 - T0k - T0l + Tkl,
+// where Tkl carries derivatives on factors k and l (Tkk: second derivative).
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_hessian_stride4_q(
+    const Real (&phi0)[4][Order + 1], const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1], const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1], const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1], const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1], const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1], const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0, std::size_t i1, std::size_t i2, std::size_t i3,
+    Real* SVMP_RESTRICT H) {
+    const Real w0 = phi0[Q][i0];
+    const Real w1 = phi1[Q][i1];
+    const Real w2 = phi2[Q][i2];
+    const Real w3 = phi3[Q][i3];
+    const Real dw0 = dphi0[Q][i0];
+    const Real dw1 = dphi1[Q][i1];
+    const Real dw2 = dphi2[Q][i2];
+    const Real dw3 = dphi3[Q][i3];
+    const Real ddw0 = d2phi0[Q][i0];
+    const Real ddw1 = d2phi1[Q][i1];
+    const Real ddw2 = d2phi2[Q][i2];
+    const Real ddw3 = d2phi3[Q][i3];
+    const Real w12 = w1 * w2;
+    const Real w13 = w1 * w3;
+    const Real w23 = w2 * w3;
+    const Real w123 = w1 * w23;
+    const Real w023 = w0 * w23;
+    const Real w013 = w0 * w13;
+    const Real w012 = w0 * w12;
+    const Real t00 = ddw0 * w123;
+    const Real t11 = ddw1 * w023;
+    const Real t22 = ddw2 * w013;
+    const Real t33 = ddw3 * w012;
+    const Real t01 = dw0 * dw1 * w23;
+    const Real t02 = dw0 * dw2 * w13;
+    const Real t03 = dw0 * dw3 * w12;
+    const Real t12 = dw1 * dw2 * w0 * w3;
+    const Real t13 = dw1 * dw3 * w0 * w2;
+    const Real t23 = dw2 * dw3 * w0 * w1;
+    const Real mixed_xy = t00 - t01 - t02 + t12;
+    const Real mixed_xz = t00 - t01 - t03 + t13;
+    const Real mixed_yz = t00 - t02 - t03 + t23;
+    Real* const slot = H + Q;  // column Q of every Hessian row
+    slot[0u] = t00 - Real(2) * t01 + t11;
+    slot[4u] = mixed_xy;
+    slot[8u] = mixed_xz;
+    slot[12u] = mixed_xy;
+    slot[16u] = t00 - Real(2) * t02 + t22;
+    slot[20u] = mixed_yz;
+    slot[24u] = mixed_xz;
+    slot[28u] = mixed_yz;
+    slot[32u] = t00 - Real(2) * t03 + t33;
+}
+
+// Stride-4 writer producing value, gradient and Hessian of one tetrahedron node
+// at quadrature slot Q in a single pass. All three output pointers are
+// dereferenced unconditionally. Layout relative to the node's block pointers:
+//   value_row[Q]; g[4 * c + Q] for component c; H[4 * (3 * r + c) + Q] for the
+//   row-major 3x3 Hessian entry (r, c).
+// The formulas correspond to lambda_0 = 1 - x - y - z: a first derivative is
+// the derivative through lambda_k minus the one through lambda_0, and
+//   d2/dx_k dx_l = T00 - T0k - T0l + Tkl for k, l in {1, 2, 3}.
+template <int Order, std::size_t Q>
+inline void write_tetrahedron_simplex_all_stride4_q(
+    const Real (&phi0)[4][Order + 1], const Real (&phi1)[4][Order + 1],
+    const Real (&phi2)[4][Order + 1], const Real (&phi3)[4][Order + 1],
+    const Real (&dphi0)[4][Order + 1], const Real (&dphi1)[4][Order + 1],
+    const Real (&dphi2)[4][Order + 1], const Real (&dphi3)[4][Order + 1],
+    const Real (&d2phi0)[4][Order + 1], const Real (&d2phi1)[4][Order + 1],
+    const Real (&d2phi2)[4][Order + 1], const Real (&d2phi3)[4][Order + 1],
+    std::size_t i0, std::size_t i1, std::size_t i2, std::size_t i3,
+    Real* SVMP_RESTRICT value_row,
+    Real* SVMP_RESTRICT g,
+    Real* SVMP_RESTRICT H) {
+    const Real w0 = phi0[Q][i0];
+    const Real w1 = phi1[Q][i1];
+    const Real w2 = phi2[Q][i2];
+    const Real w3 = phi3[Q][i3];
+    const Real dw0 = dphi0[Q][i0];
+    const Real dw1 = dphi1[Q][i1];
+    const Real dw2 = dphi2[Q][i2];
+    const Real dw3 = dphi3[Q][i3];
+    const Real ddw0 = d2phi0[Q][i0];
+    const Real ddw1 = d2phi1[Q][i1];
+    const Real ddw2 = d2phi2[Q][i2];
+    const Real ddw3 = d2phi3[Q][i3];
+    const Real w12 = w1 * w2;
+    const Real w13 = w1 * w3;
+    const Real w23 = w2 * w3;
+    const Real w123 = w1 * w23;
+    const Real w023 = w0 * w23;
+    const Real w013 = w0 * w13;
+    const Real w012 = w0 * w12;
+    const Real via0 = dw0 * w123;
+    const Real t00 = ddw0 * w123;
+    const Real t11 = ddw1 * w023;
+    const Real t22 = ddw2 * w013;
+    const Real t33 = ddw3 * w012;
+    const Real t01 = dw0 * dw1 * w23;
+    const Real t02 = dw0 * dw2 * w13;
+    const Real t03 = dw0 * dw3 * w12;
+    const Real t12 = dw1 * dw2 * w0 * w3;
+    const Real t13 = dw1 * dw3 * w0 * w2;
+    const Real t23 = dw2 * dw3 * w0 * w1;
+    const Real mixed_xy = t00 - t01 - t02 + t12;
+    const Real mixed_xz = t00 - t01 - t03 + t13;
+    const Real mixed_yz = t00 - t02 - t03 + t23;
+
+    // Stores: value first, then the three gradient rows, then nine Hessian rows.
+    value_row[Q] = w0 * w123;
+    g[Q] = dw1 * w023 - via0;
+    g[4u + Q] = dw2 * w013 - via0;
+    g[8u + Q] = dw3 * w012 - via0;
+    H[Q] = t00 - Real(2) * t01 + t11;
+    H[4u + Q] = mixed_xy;
+    H[8u + Q] = mixed_xz;
+    H[12u + Q] = mixed_xy;
+    H[16u + Q] = t00 - Real(2) * t02 + t22;
+    H[20u + Q] = mixed_yz;
+    H[24u + Q] = mixed_xz;
+    H[28u + Q] = mixed_yz;
+    H[32u + Q] = t00 - Real(2) * t03 + t33;
+}
+
+template <int Order>
+void evaluate_tetrahedron_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
+
+    Real phi0[4][Order + 1];
+    Real phi1[4][Order + 1];
+    Real phi2[4][Order + 1];
+    Real phi3[4][Order + 1];
+    Real dphi0[4][Order + 1];
+    Real dphi1[4][Order + 1];
+    Real dphi2[4][Order + 1];
+    Real dphi3[4][Order + 1];
+    Real d2phi0[4][Order + 1];
+    Real d2phi1[4][Order + 1];
+    Real d2phi2[4][Order + 1];
+    Real d2phi3[4][Order + 1];
+
+    for (std::size_t q = 0; q < 4u; ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
+        simplex_lagrange_factor_sequence_impl<true, true>(
+            Order, l3, phi3[q], dphi3[q], d2phi3[q]);
+    }
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (values_out == nullptr && gradients_out == nullptr) {
+        if (output_stride == 4u) {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT H = hessians_out + node * 36u;
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+                write_tetrahedron_simplex_hessian_stride4_q<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
+            }
+        } else {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                write_tetrahedron_simplex_hessian_q4<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+                write_tetrahedron_simplex_hessian_q4<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
+            }
+        }
+        return;
+    }
+
+    if (values_out != nullptr && gradients_out != nullptr) {
+        if (output_stride == 4u) {
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+                write_tetrahedron_simplex_all_stride4_q<Order, 0>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 1>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 2>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+                write_tetrahedron_simplex_all_stride4_q<Order, 3>(
+                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
+                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
+            }
+            return;
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+            Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+            Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
+            Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+
+            for (std::size_t q = 0; q < 4u; ++q) {
+                const Real v0 = phi0[q][i0];
+                const Real v1 = phi1[q][i1];
+                const Real v2 = phi2[q][i2];
+                const Real v3 = phi3[q][i3];
+                const Real D0 = dphi0[q][i0];
+                const Real D1 = dphi1[q][i1];
+                const Real D2 = dphi2[q][i2];
+                const Real D3 = dphi3[q][i3];
+                const Real DD0 = d2phi0[q][i0];
+                const Real DD1 = d2phi1[q][i1];
+                const Real DD2 = d2phi2[q][i2];
+                const Real DD3 = d2phi3[q][i3];
+                const Real v12 = v1 * v2;
+                const Real v13 = v1 * v3;
+                const Real v23 = v2 * v3;
+                const Real v123 = v1 * v23;
+                const Real v023 = v0 * v23;
+                const Real v013 = v0 * v13;
+                const Real v012 = v0 * v12;
+                const Real dl0 = D0 * v123;
+                const Real H00 = DD0 * v123;
+                const Real H11 = DD1 * v023;
+                const Real H22 = DD2 * v013;
+                const Real H33 = DD3 * v012;
+                const Real H01 = D0 * D1 * v23;
+                const Real H02 = D0 * D2 * v13;
+                const Real H03 = D0 * D3 * v12;
+                const Real H12 = D1 * D2 * v0 * v3;
+                const Real H13 = D1 * D3 * v0 * v2;
+                const Real H23 = D2 * D3 * v0 * v1;
+                const Real h01 = H00 - H01 - H02 + H12;
+                const Real h02 = H00 - H01 - H03 + H13;
+                const Real h12 = H00 - H02 - H03 + H23;
+
+                value_row[q] = v0 * v123;
+                g[0u * output_stride + q] = D1 * v023 - dl0;
+                g[1u * output_stride + q] = D2 * v013 - dl0;
+                g[2u * output_stride + q] = D3 * v012 - dl0;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = h02;
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = h12;
+                H[6u * output_stride + q] = h02;
+                H[7u * output_stride + q] = h12;
+                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+            }
+        }
+        return;
+    }
+
+    for (std::size_t node = 0; node < num_nodes; ++node) {
+        const auto& e = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
+        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
+
+        for (std::size_t q = 0; q < 4u; ++q) {
+            const Real v0 = phi0[q][i0];
+            const Real v1 = phi1[q][i1];
+            const Real v2 = phi2[q][i2];
+            const Real v3 = phi3[q][i3];
+            if (value_row != nullptr) {
+                value_row[q] = v0 * v1 * v2 * v3;
+            }
+
+            const Real D0 = dphi0[q][i0];
+            const Real D1 = dphi1[q][i1];
+            const Real D2 = dphi2[q][i2];
+            const Real D3 = dphi3[q][i3];
+            if (g != nullptr) {
+                const Real dl0 = D0 * v1 * v2 * v3;
+                g[0u * output_stride + q] = v0 * D1 * v2 * v3 - dl0;
+                g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
+                g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
+            }
+
+            const Real DD0 = d2phi0[q][i0];
+            const Real DD1 = d2phi1[q][i1];
+            const Real DD2 = d2phi2[q][i2];
+            const Real DD3 = d2phi3[q][i3];
+            const Real H00 = DD0 * v1 * v2 * v3;
+            const Real H11 = v0 * DD1 * v2 * v3;
+            const Real H22 = v0 * v1 * DD2 * v3;
+            const Real H33 = v0 * v1 * v2 * DD3;
+            const Real H01 = D0 * D1 * v2 * v3;
+            const Real H02 = D0 * v1 * D2 * v3;
+            const Real H03 = D0 * v1 * v2 * D3;
+            const Real H12 = v0 * D1 * D2 * v3;
+            const Real H13 = v0 * D1 * v2 * D3;
+            const Real H23 = v0 * v1 * D2 * D3;
+            const Real h01 = H00 - H01 - H02 + H12;
+            const Real h02 = H00 - H01 - H03 + H13;
+            const Real h12 = H00 - H02 - H03 + H23;
+            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+            H[1u * output_stride + q] = h01;
+            H[2u * output_stride + q] = h02;
+            H[3u * output_stride + q] = h01;
+            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+            H[5u * output_stride + q] = h12;
+            H[6u * output_stride + q] = h02;
+            H[7u * output_stride + q] = h12;
+            H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+        }
+    }
+}
+
+// Maps the runtime polynomial `order` onto the matching compile-time instantiation
+// of the four-point tetrahedron Hessian kernel. Returns true when `order` is in
+// [2, 8] and the kernel was invoked; any other order returns false and leaves the
+// output buffers untouched so the caller can use a different evaluation path.
+bool try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    // Reject unsupported orders up front; every order that reaches the switch has a case.
+    if (order < 2 || order > 8) {
+        return false;
+    }
+    switch (order) {
+    case 2: evaluate_tetrahedron_simplex_hessian_outputs_q4<2>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 3: evaluate_tetrahedron_simplex_hessian_outputs_q4<3>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 4: evaluate_tetrahedron_simplex_hessian_outputs_q4<4>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 5: evaluate_tetrahedron_simplex_hessian_outputs_q4<5>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 6: evaluate_tetrahedron_simplex_hessian_outputs_q4<6>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 7: evaluate_tetrahedron_simplex_hessian_outputs_q4<7>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    case 8: evaluate_tetrahedron_simplex_hessian_outputs_q4<8>(
+                simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
+            break;
+    }
+    return true;
+}
+
+// Scratch storage for the factor sequences of one simplex axis (phi, dphi, d2phi).
+// Sequences of up to kFixedSimplexAxisSize entries use the in-object arrays; longer
+// ones use the vectors, which are grown on demand and never shrunk by this struct.
+struct SimplexAxisScratch {
+    std::size_t size{0};  // entry count requested by the latest reserveFor()
+    std::array<Real, kFixedSimplexAxisSize> phi_fixed{};
+    std::array<Real, kFixedSimplexAxisSize> dphi_fixed{};
+    std::array<Real, kFixedSimplexAxisSize> d2phi_fixed{};
+    std::vector<Real> phi_dynamic;
+    std::vector<Real> dphi_dynamic;
+    std::vector<Real> d2phi_dynamic;
+
+    // True when the current size is served by the fixed arrays.
+    bool fits_fixed() const noexcept { return size <= kFixedSimplexAxisSize; }
+
+    // Records the requested length and, if it exceeds the fixed capacity, makes sure
+    // each dynamic buffer holds at least n entries. Existing contents are not cleared.
+    void reserveFor(std::size_t n) {
+        size = n;
+        if (fits_fixed()) {
+            return;
+        }
+        for (std::vector<Real>* buffer : {&phi_dynamic, &dphi_dynamic, &d2phi_dynamic}) {
+            if (buffer->size() < n) {
+                buffer->resize(n);
+            }
+        }
+    }
+
+    // Mutable views of whichever storage backs the current size.
+    Real* phi() noexcept { return fits_fixed() ? phi_fixed.data() : phi_dynamic.data(); }
+    Real* dphi() noexcept { return fits_fixed() ? dphi_fixed.data() : dphi_dynamic.data(); }
+    Real* d2phi() noexcept { return fits_fixed() ? d2phi_fixed.data() : d2phi_dynamic.data(); }
+
+    // Read-only views of whichever storage backs the current size.
+    const Real* phi() const noexcept {
+        return fits_fixed() ? phi_fixed.data() : phi_dynamic.data();
+    }
+    const Real* dphi() const noexcept {
+        return fits_fixed() ? dphi_fixed.data() : dphi_dynamic.data();
+    }
+    const Real* d2phi() const noexcept {
+        return fits_fixed() ? d2phi_fixed.data() : d2phi_dynamic.data();
+    }
+};
+
+SimplexAxisScratch& simplex_axis_scratch_slot(int slot) {
+    thread_local SimplexAxisScratch per_thread_slots[4];  // one independent set per thread
+    return per_thread_slots[slot];  // slot is not range-checked; valid values are 0..3
+}
+
+// Sink that stores per-node results in caller-owned std::vector containers.
+// A null container pointer means the corresponding quantity is not requested.
+struct SimplexVectorSink {
+    std::vector<Real>* values;
+    std::vector<Gradient>* gradients;
+    std::vector<Hessian>* hessians;
+
+    // Report which outputs the caller asked for.
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    // Resizes each requested container to n_nodes entries; others are left alone.
+    void prepare(std::size_t n_nodes) const {
+        if (wants_values()) values->resize(n_nodes);
+        if (wants_gradients()) gradients->resize(n_nodes);
+        if (wants_hessians()) hessians->resize(n_nodes);
+    }
+
+    // The write_* members do not test for null or range-check n: call them only for
+    // quantities that were requested, after prepare(), with n below the node count.
+    void write_value(std::size_t n, Real value) const {
+        (*values)[n] = value;
+    }
+
+    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
+        auto& target = (*gradients)[n];
+        target[0] = x;
+        target[1] = y;
+        target[2] = z;
+    }
+
+    // Expands the six independent second derivatives into a full symmetric matrix
+    // and stores it as entry n.
+    void write_hessian(std::size_t n, Real xx, Real yy, Real zz,
+                       Real xy, Real xz, Real yz) const {
+        Hessian full{};
+        full(0, 0) = xx; full(0, 1) = xy; full(0, 2) = xz;
+        full(1, 0) = xy; full(1, 1) = yy; full(1, 2) = yz;
+        full(2, 0) = xz; full(2, 1) = yz; full(2, 2) = zz;
+        (*hessians)[n] = full;
+    }
+};
+
+// Sink that writes per-node results into caller-provided flat Real arrays:
+// one Real per value, 3 consecutive Reals per gradient, and 9 consecutive Reals
+// per Hessian. A null pointer means the corresponding quantity is not requested.
+struct SimplexRawSink {
+    Real* values;
+    Real* gradients;
+    Real* hessians;
+
+    // Report which outputs the caller asked for.
+    bool wants_values() const noexcept { return values != nullptr; }
+    bool wants_gradients() const noexcept { return gradients != nullptr; }
+    bool wants_hessians() const noexcept { return hessians != nullptr; }
+
+    // Intentionally empty: the raw buffers are never resized by this sink.
+    void prepare(std::size_t) const {}
+
+    // The write_* members do not test for null or bounds; n indexes nodes.
+    void write_value(std::size_t n, Real value) const {
+        values[n] = value;
+    }
+
+    // Gradient n occupies gradients[3n .. 3n + 2].
+    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
+        Real* const target = gradients + n * 3u;
+        target[0] = x;
+        target[1] = y;
+        target[2] = z;
+    }
+
+    // Hessian n occupies hessians[9n .. 9n + 8], stored as the three rows
+    // (xx xy xz), (xy yy yz), (xz yz zz) of the symmetric matrix.
+    void write_hessian(std::size_t n, Real xx, Real yy, Real zz,
+                       Real xy, Real xz, Real yz) const {
+        Real* const row0 = hessians + n * 9u;
+        Real* const row1 = row0 + 3;
+        Real* const row2 = row0 + 6;
+        row0[0] = xx; row0[1] = xy; row0[2] = xz;
+        row1[0] = xy; row1[1] = yy; row1[2] = yz;
+        row2[0] = xz; row2[1] = yz; row2[2] = zz;
+    }
+};
+
+// Evaluates every triangle basis function at the single point xi and hands the
+// requested values / gradients / Hessians to `sink`, one entry per node. Each node's
+// function is the product of three per-axis factors picked by its first three exponents.
+template <typename Sink>
+void evaluate_triangle_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                          int order,
+                                          const math::Vector<Real, 3>& xi,
+                                          const Sink& sink) {
+    const Real lambda1 = xi[0];
+    const Real lambda2 = xi[1];
+    const Real lambda0 = Real(1) - lambda1 - lambda2;
+
+    // Each axis gets its own scratch slot sized for order + 1 entries.
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    SimplexAxisScratch& axis0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& axis1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& axis2 = simplex_axis_scratch_slot(2);
+    axis0.reserveFor(sequence_size);
+    axis1.reserveFor(sequence_size);
+    axis2.reserveFor(sequence_size);
+
+    const std::size_t node_count = simplex_exponents.size();
+    sink.prepare(node_count);
+    const bool want_values = sink.wants_values();
+    const bool want_gradients = sink.wants_gradients();
+    const bool want_hessians = sink.wants_hessians();
+    const bool want_first = want_gradients || want_hessians;  // Hessians reuse first derivatives
+
+    // Derivative outputs that nobody requested are passed as null.
+    simplex_lagrange_factor_sequence(order, lambda0, axis0.phi(),
+                                     want_first ? axis0.dphi() : nullptr,
+                                     want_hessians ? axis0.d2phi() : nullptr);
+    simplex_lagrange_factor_sequence(order, lambda1, axis1.phi(),
+                                     want_first ? axis1.dphi() : nullptr,
+                                     want_hessians ? axis1.d2phi() : nullptr);
+    simplex_lagrange_factor_sequence(order, lambda2, axis2.phi(),
+                                     want_first ? axis2.dphi() : nullptr,
+                                     want_hessians ? axis2.d2phi() : nullptr);
+
+    const Real* const phi0 = axis0.phi();
+    const Real* const phi1 = axis1.phi();
+    const Real* const phi2 = axis2.phi();
+    const Real* const dphi0 = axis0.dphi();
+    const Real* const dphi1 = axis1.dphi();
+    const Real* const dphi2 = axis2.dphi();
+    const Real* const d2phi0 = axis0.d2phi();
+    const Real* const d2phi1 = axis1.d2phi();
+    const Real* const d2phi2 = axis2.d2phi();
+
+    for (std::size_t node = 0; node < node_count; ++node) {
+        const std::array<int, 4>& exponent = simplex_exponents[node];
+        const std::size_t i0 = static_cast<std::size_t>(exponent[0]);
+        const std::size_t i1 = static_cast<std::size_t>(exponent[1]);
+        const std::size_t i2 = static_cast<std::size_t>(exponent[2]);
+        const Real f0 = phi0[i0];
+        const Real f1 = phi1[i1];
+        const Real f2 = phi2[i2];
+        if (want_values) {
+            sink.write_value(node, f0 * f1 * f2);
+        }
+        if (!want_first) {
+            continue;
+        }
+
+        const Real g0 = dphi0[i0];
+        const Real g1 = dphi1[i1];
+        const Real g2 = dphi2[i2];
+        if (want_gradients) {
+            // lambda0 = 1 - x - y, so its term is subtracted from both in-plane components.
+            const Real term0 = g0 * f1 * f2;
+            const Real term1 = f0 * g1 * f2;
+            const Real term2 = f0 * f1 * g2;
+            sink.write_gradient(node, term1 - term0, term2 - term0, Real(0));
+        }
+
+        if (want_hessians) {
+            const Real c0 = d2phi0[i0];
+            const Real c1 = d2phi1[i1];
+            const Real c2 = d2phi2[i2];
+            const Real h00 = c0 * f1 * f2;
+            const Real h11 = f0 * c1 * f2;
+            const Real h22 = f0 * f1 * c2;
+            const Real h01 = g0 * g1 * f2;
+            const Real h02 = g0 * f1 * g2;
+            const Real h12 = f0 * g1 * g2;
+            sink.write_hessian(node,
+                               h00 - Real(2) * h01 + h11, h00 - Real(2) * h02 + h22, Real(0),
+                               h00 - h01 - h02 + h12, Real(0), Real(0));
+        }
+    }
+}
+
+// Single-point triangle evaluation into std::vector outputs; null outputs are not produced.
+void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                     int order, const math::Vector<Real, 3>& xi,
+                                     std::vector<Real>* values,
+                                     std::vector<Gradient>* gradients,
+                                     std::vector<Hessian>* hessians) {
+    evaluate_triangle_simplex_basis_impl(
+        simplex_exponents, order, xi, SimplexVectorSink{values, gradients, hessians});
+}
+
+// Single-point triangle evaluation into raw Real buffers; null buffers are not written.
+void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
+                                        int order, const math::Vector<Real, 3>& xi,
+                                        Real* SVMP_RESTRICT values_out,
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out) {
+    evaluate_triangle_simplex_basis_impl(
+        simplex_exponents, order, xi, SimplexRawSink{values_out, gradients_out, hessians_out});
+}
+
+void evaluate_triangle_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {
+        return;
+    }
+
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_out != nullptr;
+    const bool need_hessians = hessians_out != nullptr;
+    if (num_qpts == 4u &&
+        values_out != nullptr &&
+        !need_gradients &&
+        !need_hessians &&
+        try_evaluate_triangle_simplex_values_q4(
+            simplex_exponents, order, points, output_stride, values_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        values_out == nullptr &&
+        need_gradients &&
+        !need_hessians &&
+        try_evaluate_triangle_simplex_gradients_q4(
+            simplex_exponents, order, points, output_stride, gradients_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        need_hessians &&
+        try_evaluate_triangle_simplex_hessian_outputs_q4(
+            simplex_exponents, order, points, output_stride,
+            values_out, gradients_out, hessians_out)) {
+        return;
+    }
+    const std::size_t batch_entries = sequence_size * num_qpts;
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    value_row[q] =
+                        phi0_batch[offset + i0] *
+                        phi1_batch[offset + i1] *
+                        phi2_batch[offset + i2];
+                }
+            }
+            return;
+        }
+
+        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* g = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;
+                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
+                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
+                    g[2u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        if (order >= 4 &&
+            values_out == nullptr &&
+            gradients_out == nullptr &&
+            hessians_out != nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset,
+                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset,
+                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset,
+                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* H = hessians_out + node * 9u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    const Real h01 = H00 - H01 - H02 + H12;
+
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = Real(0);
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = Real(0);
+                    H[6u * output_stride + q] = Real(0);
+                    H[7u * output_stride + q] = Real(0);
+                    H[8u * output_stride + q] = Real(0);
+                }
+            }
+            return;
+        }
+
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2;
+                }
+                if (!need_gradients && !need_hessians) {
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+
+                if (gradients_out != nullptr) {
+                    const Real dl0 = D0 * v1 * v2;
+                    const Real dl1 = v0 * D1 * v2;
+                    const Real dl2 = v0 * v1 * D2;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    g[2u * output_stride + q] = Real(0);
+                }
+
+                if (hessians_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    const Real h01 = H00 - H01 - H02 + H12;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = Real(0);
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = Real(0);
+                    H[6u * output_stride + q] = Real(0);
+                    H[7u * output_stride + q] = Real(0);
+                    H[8u * output_stride + q] = Real(0);
+                }
+            }
+        }
+        return;
+    }
+
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+            const Real value = v0 * v1 * v2;
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = value;
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+
+            if (gradients_out != nullptr) {
+                const Real dl0 = D0 * v1 * v2;
+                const Real dl1 = v0 * D1 * v2;
+                const Real dl2 = v0 * v1 * D2;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+                g[2u * output_stride + q] = Real(0);
+            }
+
+            if (hessians_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+
+                Real* H = hessians_out + node * 9u * output_stride;
+                const Real h01 = H00 - H01 - H02 + H12;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = Real(0);
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = Real(0);
+                H[6u * output_stride + q] = Real(0);
+                H[7u * output_stride + q] = Real(0);
+                H[8u * output_stride + q] = Real(0);
+            }
+        }
+    }
+}
+// Batched triangle-simplex basis evaluation at many points: writes values, 2-component gradients and 3-component Hessians to strided rows.
+void evaluate_triangle_simplex_basis_wedge_components_strided(  // NOTE(review): name suggests the in-plane part of a wedge basis; confirm with callers
+    const std::vector<std::array<int, 4>>& simplex_exponents,  // per node: e[0..2] index the three factor tables; e[3] is not read here
+    int order,  // each factor table holds order + 1 entries per point
+    const std::vector<math::Vector<Real, 3>>& points,  // only xi[0] and xi[1] are read
+    std::size_t output_stride,  // distance between consecutive output rows; entry q of a row belongs to points[q]
+    Real* SVMP_RESTRICT values_out,  // optional (may be null); row index = node
+    Real* SVMP_RESTRICT gradients_xy_out,  // optional (may be null); row index = 2 * node + component
+    Real* SVMP_RESTRICT hessians_xx_xy_yy_out) {  // optional (may be null); row index = 3 * node + component
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {  // empty batch or empty basis: nothing is written
+        return;
+    }
+    // NOTE(review): simplex_lagrange_factor_sequence is defined outside this view; presumably it fills (phi, dphi, d2phi) with each 1D factor and its 1st/2nd derivatives.
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);  // assumes order >= 0 — TODO confirm callers guarantee this
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_xy_out != nullptr;
+    const bool need_hessians = hessians_xx_xy_yy_out != nullptr;
+    const std::size_t batch_entries = sequence_size * num_qpts;  // table entries needed per factor for the whole batch
+    // Small batches: tabulate every point's factors in fixed-size std::array buffers, then loop node-outer / point-inner.
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr &&  // specialization: values + gradients, no Hessians
+            gradients_xy_out != nullptr &&
+            hessians_xx_xy_yy_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            // Pass 1: fill the l0, l1, l2 factor tables for every point (the last helper output is not requested).
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;  // third barycentric coordinate
+                const std::size_t offset = q * sequence_size;  // start of point q's slice in each table
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+            }
+            // Pass 2: combine the tabulated factors per node; each output row is contiguous in q.
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* value_row = values_out + node * output_stride;
+                Real* g = gradients_xy_out + node * 2u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;  // l0 term, subtracted from both gradient components
+                    value_row[q] = v0 * v1 * v2;
+                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;  // component 0: (l1 term) - (l0 term)
+                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;  // component 1: (l2 term) - (l0 term)
+                }
+            }
+            return;
+        }
+        // Specialization: values, gradients and Hessians are all requested.
+        if (values_out != nullptr &&
+            gradients_xy_out != nullptr &&
+            hessians_xx_xy_yy_out != nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+            // Pass 1: fill all three tables per factor by calling the <true, true> instantiation directly.
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l0 = Real(1) - l1 - l2;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l0, phi0_batch.data() + offset,
+                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l1, phi1_batch.data() + offset,
+                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
+                simplex_lagrange_factor_sequence_impl<true, true>(
+                    order, l2, phi2_batch.data() + offset,
+                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
+            }
+            // Pass 2: per-node products; the three Hessian rows follow the (xx, xy, yy) order of the output parameter name.
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
+                Real* SVMP_RESTRICT g = gradients_xy_out + node * 2u * output_stride;
+                Real* SVMP_RESTRICT H = hessians_xx_xy_yy_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real dl0 = D0 * v1 * v2;
+                    const Real dl1 = v0 * D1 * v2;
+                    const Real dl2 = v0 * v1 * D2;
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;  // Hab: product carrying one derivative in factor a and one in factor b
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+
+                    value_row[q] = v0 * v1 * v2;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;  // row 0 (xx)
+                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;  // row 1 (xy)
+                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;  // row 2 (yy)
+                }
+            }
+            return;
+        }
+        // General small-batch case: any other combination of outputs; null outputs are skipped.
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+        // Pass 1: the dphi / d2phi buffers are handed to the helper only when a requested output reads them.
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l0 = Real(1) - l1 - l2;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;  // Hessian mixed terms also read dphi
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+        }
+        // Pass 2: per-node combination with a null check on every output.
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_xy_out ? gradients_xy_out + node * 2u * output_stride : nullptr;
+            Real* H = hessians_xx_xy_yy_out ? hessians_xx_xy_yy_out + node * 3u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2;
+                }
+                if (!need_gradients && !need_hessians) {  // values only: the dphi buffers were not passed to the helper, so do not read them
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+                const Real dl0 = D0 * v1 * v2;
+                const Real dl1 = v0 * D1 * v2;
+                const Real dl2 = v0 * v1 * D2;
+
+                if (gradients_xy_out != nullptr) {
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                }
+
+                if (hessians_xx_xy_yy_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real H00 = DD0 * v1 * v2;
+                    const Real H11 = v0 * DD1 * v2;
+                    const Real H22 = v0 * v1 * DD2;
+                    const Real H01 = D0 * D1 * v2;
+                    const Real H02 = D0 * v1 * D2;
+                    const Real H12 = v0 * D1 * D2;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
+                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                }
+            }
+        }
+        return;
+    }
+    // Large batches: reuse three SimplexAxisScratch slots (one per barycentric coordinate) and loop point-outer / node-inner.
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l0 = Real(1) - l1 - l2;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);  // the scratch tables are refilled for every point
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = v0 * v1 * v2;  // same row layout as the small-batch paths
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+            const Real dl0 = D0 * v1 * v2;
+            const Real dl1 = v0 * D1 * v2;
+            const Real dl2 = v0 * v1 * D2;
+
+            if (gradients_xy_out != nullptr) {
+                Real* g = gradients_xy_out + node * 2u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+            }
+
+            if (hessians_xx_xy_yy_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+                const Real H00 = DD0 * v1 * v2;
+                const Real H11 = v0 * DD1 * v2;
+                const Real H22 = v0 * v1 * DD2;
+                const Real H01 = D0 * D1 * v2;
+                const Real H02 = D0 * v1 * D2;
+                const Real H12 = v0 * D1 * D2;
+                Real* H = hessians_xx_xy_yy_out + node * 3u * output_stride;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = H00 - H01 - H02 + H12;
+                H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
+            }
+        }
+    }
+}
+// Evaluates every tetrahedral basis function at the single point xi and hands values / gradients / Hessians to `sink`.
+template <typename Sink>  // Sink must offer const prepare(), wants_values/gradients/hessians(), write_value/gradient/hessian()
+void evaluate_tetrahedron_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,  // per node: e[k] indexes factor table k
+                                             int order,  // each factor table holds order + 1 entries
+                                             const math::Vector<Real, 3>& xi,  // xi[0..2] are used as l1, l2, l3
+                                             const Sink& sink) {  // decides which outputs are computed and receives them
+    const Real l1 = xi[0];
+    const Real l2 = xi[1];
+    const Real l3 = xi[2];
+    const Real l0 = Real(1) - l1 - l2 - l3;  // fourth barycentric coordinate
+    // One scratch slot per barycentric coordinate; reserveFor is asked for order + 1 entries on each.
+    const std::size_t n = static_cast<std::size_t>(order + 1);  // assumes order >= 0 — TODO confirm callers guarantee this
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
+    s0.reserveFor(n);
+    s1.reserveFor(n);
+    s2.reserveFor(n);
+    s3.reserveFor(n);
+
+    const std::size_t num_nodes = simplex_exponents.size();
+    sink.prepare(num_nodes);  // called once, before any write_* call
+    const bool need_values = sink.wants_values();
+    const bool need_gradients = sink.wants_gradients();
+    const bool need_hessians = sink.wants_hessians();
+    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;  // Hessian mixed terms read dphi as well
+    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+    Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
+    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+    Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
+    // NOTE(review): helper defined outside this view; presumably fills phi / dphi / d2phi with each 1D factor and its 1st / 2nd derivative.
+    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+    simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
+    const Real* phi0 = s0.phi();
+    const Real* phi1 = s1.phi();
+    const Real* phi2 = s2.phi();
+    const Real* phi3 = s3.phi();
+    const Real* dphi0 = s0.dphi();
+    const Real* dphi1 = s1.dphi();
+    const Real* dphi2 = s2.dphi();
+    const Real* dphi3 = s3.dphi();
+    const Real* d2phi0 = s0.d2phi();
+    const Real* d2phi1 = s1.d2phi();
+    const Real* d2phi2 = s2.d2phi();
+    const Real* d2phi3 = s3.d2phi();
+    // Combine the tabulated factors per node: the basis value is the product of one entry from each of the four tables.
+    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
+        const auto& e = simplex_exponents[n_idx];
+        const std::size_t i0 = static_cast<std::size_t>(e[0]);
+        const std::size_t i1 = static_cast<std::size_t>(e[1]);
+        const std::size_t i2 = static_cast<std::size_t>(e[2]);
+        const std::size_t i3 = static_cast<std::size_t>(e[3]);
+
+        const Real v0 = phi0[i0];
+        const Real v1 = phi1[i1];
+        const Real v2 = phi2[i2];
+        const Real v3 = phi3[i3];
+        if (need_values) {
+            sink.write_value(n_idx, v0 * v1 * v2 * v3);
+        }
+        if (!need_gradients && !need_hessians) {  // values only: the dphi buffers were not passed to the helper, so do not read them
+            continue;
+        }
+
+        const Real D0 = dphi0[i0];
+        const Real D1 = dphi1[i1];
+        const Real D2 = dphi2[i2];
+        const Real D3 = dphi3[i3];
+
+        if (need_gradients) {
+            const Real dl0 = D0 * v1 * v2 * v3;
+            const Real dl1 = v0 * D1 * v2 * v3;
+            const Real dl2 = v0 * v1 * D2 * v3;
+            const Real dl3 = v0 * v1 * v2 * D3;
+            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, dl3 - dl0);  // each component is (l_k term) - (l0 term), as l0 = 1 - l1 - l2 - l3
+        }
+
+        if (need_hessians) {
+            const Real DD0 = d2phi0[i0];
+            const Real DD1 = d2phi1[i1];
+            const Real DD2 = d2phi2[i2];
+            const Real DD3 = d2phi3[i3];
+            // Hab: product carrying one derivative in factor a and one in factor b (Haa uses the d2phi entry of factor a).
+            const Real H00 = DD0 * v1 * v2 * v3;
+            const Real H11 = v0 * DD1 * v2 * v3;
+            const Real H22 = v0 * v1 * DD2 * v3;
+            const Real H33 = v0 * v1 * v2 * DD3;
+
+            const Real H01 = D0 * D1 * v2 * v3;
+            const Real H02 = D0 * v1 * D2 * v3;
+            const Real H03 = D0 * v1 * v2 * D3;
+            const Real H12 = v0 * D1 * D2 * v3;
+            const Real H13 = v0 * D1 * v2 * D3;
+            const Real H23 = v0 * v1 * D2 * D3;
+            // Arguments: the three pure combinations for l1, l2, l3, then the mixed (l1,l2), (l1,l3), (l2,l3) combinations.
+            sink.write_hessian(n_idx,
+                               H00 - Real(2) * H01 + H11,
+                               H00 - Real(2) * H02 + H22,
+                               H00 - Real(2) * H03 + H33,
+                               H00 - H01 - H02 + H12,
+                               H00 - H01 - H03 + H13,
+                               H00 - H02 - H03 + H23);
+        }
+    }
+}
+
+// Single-point tetrahedron evaluation that reports results through a SimplexVectorSink built from the three vector pointers.
+void evaluate_tetrahedron_simplex_basis(
+    const std::vector<std::array<int, 4>>& simplex_exponents, int order,
+    const math::Vector<Real, 3>& xi, std::vector<Real>* values,
+    std::vector<Gradient>* gradients, std::vector<Hessian>* hessians) {
+    // The sink temporary lives until the call returns, so binding it to the const reference parameter is safe.
+    evaluate_tetrahedron_simplex_basis_impl(
+        simplex_exponents, order, xi, SimplexVectorSink{values, gradients, hessians});
+}
+
+// Single-point tetrahedron evaluation that reports results through a SimplexRawSink wrapping the three raw output pointers.
+void evaluate_tetrahedron_simplex_basis_to(
+    const std::vector<std::array<int, 4>>& simplex_exponents, int order,
+    const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out, Real* SVMP_RESTRICT hessians_out) {
+    // The sink temporary lives until the call returns, so binding it to the const reference parameter is safe.
+    evaluate_tetrahedron_simplex_basis_impl(
+        simplex_exponents, order, xi, SimplexRawSink{values_out, gradients_out, hessians_out});
+}
+
+void evaluate_tetrahedron_simplex_basis_strided(
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out) {
+    const std::size_t num_nodes = simplex_exponents.size();
+    if (points.empty() || num_nodes == 0u) {
+        return;
+    }
+
+    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
+    const std::size_t num_qpts = points.size();
+    const bool need_gradients = gradients_out != nullptr;
+    const bool need_hessians = hessians_out != nullptr;
+    if (num_qpts == 4u &&
+        values_out != nullptr &&
+        !need_gradients &&
+        !need_hessians &&
+        try_evaluate_tetrahedron_simplex_values_q4(
+            simplex_exponents, order, points, output_stride, values_out)) {
+        return;
+    }
+    if (num_qpts == 4u &&
+        values_out == nullptr &&
+        need_gradients &&
+        !need_hessians) {
+        switch (order) {
+        case 3:
+            evaluate_tetrahedron_simplex_gradients_q4<3>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 4:
+            evaluate_tetrahedron_simplex_gradients_q4<4>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 5:
+            evaluate_tetrahedron_simplex_gradients_q4<5>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 6:
+            evaluate_tetrahedron_simplex_gradients_q4<6>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 7:
+            evaluate_tetrahedron_simplex_gradients_q4<7>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        case 8:
+            evaluate_tetrahedron_simplex_gradients_q4<8>(
+                simplex_exponents, points, output_stride, gradients_out);
+            return;
+        default:
+            break;
+        }
+    }
+    if (num_qpts == 4u &&
+        need_hessians &&
+        try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
+            simplex_exponents, order, points, output_stride,
+            values_out, gradients_out, hessians_out)) {
+        return;
+    }
+    const std::size_t batch_entries = sequence_size * num_qpts;
+    if (batch_entries <= kFixedSimplexBatchEntries) {
+        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l3 = xi[2];
+                const Real l0 = Real(1) - l1 - l2 - l3;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l3, phi3_batch.data() + offset, nullptr, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* value_row = values_out + node * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    value_row[q] =
+                        phi0_batch[offset + i0] *
+                        phi1_batch[offset + i1] *
+                        phi2_batch[offset + i2] *
+                        phi3_batch[offset + i3];
+                }
+            }
+            return;
+        }
+
+        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
+            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+            std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const auto& xi = points[q];
+                const Real l1 = xi[0];
+                const Real l2 = xi[1];
+                const Real l3 = xi[2];
+                const Real l0 = Real(1) - l1 - l2 - l3;
+                const std::size_t offset = q * sequence_size;
+                simplex_lagrange_factor_sequence(
+                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
+                simplex_lagrange_factor_sequence(
+                    order, l3, phi3_batch.data() + offset, dphi3_batch.data() + offset, nullptr);
+            }
+
+            for (std::size_t node = 0; node < num_nodes; ++node) {
+                const auto& e = simplex_exponents[node];
+                const std::size_t i0 = static_cast<std::size_t>(e[0]);
+                const std::size_t i1 = static_cast<std::size_t>(e[1]);
+                const std::size_t i2 = static_cast<std::size_t>(e[2]);
+                const std::size_t i3 = static_cast<std::size_t>(e[3]);
+                Real* g = gradients_out + node * 3u * output_stride;
+
+                for (std::size_t q = 0; q < num_qpts; ++q) {
+                    const std::size_t offset = q * sequence_size;
+                    const Real v0 = phi0_batch[offset + i0];
+                    const Real v1 = phi1_batch[offset + i1];
+                    const Real v2 = phi2_batch[offset + i2];
+                    const Real v3 = phi3_batch[offset + i3];
+                    const Real D0 = dphi0_batch[offset + i0];
+                    const Real D1 = dphi1_batch[offset + i1];
+                    const Real D2 = dphi2_batch[offset + i2];
+                    const Real D3 = dphi3_batch[offset + i3];
+                    const Real v23 = v2 * v3;
+                    const Real dl0 = D0 * v1 * v23;
+                    g[0u * output_stride + q] = v0 * D1 * v23 - dl0;
+                    g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
+                    g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
+                }
+            }
+            return;
+        }
+
+        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
+        std::array<Real, kFixedSimplexBatchEntries> d2phi3_batch;
+
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            const auto& xi = points[q];
+            const Real l1 = xi[0];
+            const Real l2 = xi[1];
+            const Real l3 = xi[2];
+            const Real l0 = Real(1) - l1 - l2 - l3;
+            const std::size_t offset = q * sequence_size;
+            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
+            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
+            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
+            Real* d3_out = (need_gradients || need_hessians) ? dphi3_batch.data() + offset : nullptr;
+            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
+            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
+            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
+            Real* d23_out = need_hessians ? d2phi3_batch.data() + offset : nullptr;
+            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
+            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
+            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
+            simplex_lagrange_factor_sequence(order, l3, phi3_batch.data() + offset, d3_out, d23_out);
+        }
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
+            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
+            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
+
+            for (std::size_t q = 0; q < num_qpts; ++q) {
+                const std::size_t offset = q * sequence_size;
+                const Real v0 = phi0_batch[offset + i0];
+                const Real v1 = phi1_batch[offset + i1];
+                const Real v2 = phi2_batch[offset + i2];
+                const Real v3 = phi3_batch[offset + i3];
+                if (value_row != nullptr) {
+                    value_row[q] = v0 * v1 * v2 * v3;
+                }
+                if (!need_gradients && !need_hessians) {
+                    continue;
+                }
+
+                const Real D0 = dphi0_batch[offset + i0];
+                const Real D1 = dphi1_batch[offset + i1];
+                const Real D2 = dphi2_batch[offset + i2];
+                const Real D3 = dphi3_batch[offset + i3];
+
+                if (gradients_out != nullptr) {
+                    const Real dl0 = D0 * v1 * v2 * v3;
+                    const Real dl1 = v0 * D1 * v2 * v3;
+                    const Real dl2 = v0 * v1 * D2 * v3;
+                    const Real dl3 = v0 * v1 * v2 * D3;
+                    g[0u * output_stride + q] = dl1 - dl0;
+                    g[1u * output_stride + q] = dl2 - dl0;
+                    g[2u * output_stride + q] = dl3 - dl0;
+                }
+
+                if (hessians_out != nullptr) {
+                    const Real DD0 = d2phi0_batch[offset + i0];
+                    const Real DD1 = d2phi1_batch[offset + i1];
+                    const Real DD2 = d2phi2_batch[offset + i2];
+                    const Real DD3 = d2phi3_batch[offset + i3];
+                    const Real H00 = DD0 * v1 * v2 * v3;
+                    const Real H11 = v0 * DD1 * v2 * v3;
+                    const Real H22 = v0 * v1 * DD2 * v3;
+                    const Real H33 = v0 * v1 * v2 * DD3;
+                    const Real H01 = D0 * D1 * v2 * v3;
+                    const Real H02 = D0 * v1 * D2 * v3;
+                    const Real H03 = D0 * v1 * v2 * D3;
+                    const Real H12 = v0 * D1 * D2 * v3;
+                    const Real H13 = v0 * D1 * v2 * D3;
+                    const Real H23 = v0 * v1 * D2 * D3;
+                    const Real h01 = H00 - H01 - H02 + H12;
+                    const Real h02 = H00 - H01 - H03 + H13;
+                    const Real h12 = H00 - H02 - H03 + H23;
+                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                    H[1u * output_stride + q] = h01;
+                    H[2u * output_stride + q] = h02;
+                    H[3u * output_stride + q] = h01;
+                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                    H[5u * output_stride + q] = h12;
+                    H[6u * output_stride + q] = h02;
+                    H[7u * output_stride + q] = h12;
+                    H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+                }
+            }
+        }
+        return;
+    }
+
+    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
+    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
+    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
+    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
+    s0.reserveFor(sequence_size);
+    s1.reserveFor(sequence_size);
+    s2.reserveFor(sequence_size);
+    s3.reserveFor(sequence_size);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        const auto& xi = points[q];
+        const Real l1 = xi[0];
+        const Real l2 = xi[1];
+        const Real l3 = xi[2];
+        const Real l0 = Real(1) - l1 - l2 - l3;
+
+        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
+        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
+        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
+        Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
+        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
+        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
+        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
+        Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
+
+        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
+        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
+        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
+        simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
+        const Real* phi0 = s0.phi();
+        const Real* phi1 = s1.phi();
+        const Real* phi2 = s2.phi();
+        const Real* phi3 = s3.phi();
+        const Real* dphi0 = s0.dphi();
+        const Real* dphi1 = s1.dphi();
+        const Real* dphi2 = s2.dphi();
+        const Real* dphi3 = s3.dphi();
+        const Real* d2phi0 = s0.d2phi();
+        const Real* d2phi1 = s1.d2phi();
+        const Real* d2phi2 = s2.d2phi();
+        const Real* d2phi3 = s3.d2phi();
+
+        for (std::size_t node = 0; node < num_nodes; ++node) {
+            const auto& e = simplex_exponents[node];
+            const std::size_t i0 = static_cast<std::size_t>(e[0]);
+            const std::size_t i1 = static_cast<std::size_t>(e[1]);
+            const std::size_t i2 = static_cast<std::size_t>(e[2]);
+            const std::size_t i3 = static_cast<std::size_t>(e[3]);
+
+            const Real v0 = phi0[i0];
+            const Real v1 = phi1[i1];
+            const Real v2 = phi2[i2];
+            const Real v3 = phi3[i3];
+            if (values_out != nullptr) {
+                values_out[node * output_stride + q] = v0 * v1 * v2 * v3;
+            }
+            if (!need_gradients && !need_hessians) {
+                continue;
+            }
+
+            const Real D0 = dphi0[i0];
+            const Real D1 = dphi1[i1];
+            const Real D2 = dphi2[i2];
+            const Real D3 = dphi3[i3];
+
+            if (gradients_out != nullptr) {
+                const Real dl0 = D0 * v1 * v2 * v3;
+                const Real dl1 = v0 * D1 * v2 * v3;
+                const Real dl2 = v0 * v1 * D2 * v3;
+                const Real dl3 = v0 * v1 * v2 * D3;
+                Real* g = gradients_out + node * 3u * output_stride;
+                g[0u * output_stride + q] = dl1 - dl0;
+                g[1u * output_stride + q] = dl2 - dl0;
+                g[2u * output_stride + q] = dl3 - dl0;
+            }
+
+            if (hessians_out != nullptr) {
+                const Real DD0 = d2phi0[i0];
+                const Real DD1 = d2phi1[i1];
+                const Real DD2 = d2phi2[i2];
+                const Real DD3 = d2phi3[i3];
+
+                const Real H00 = DD0 * v1 * v2 * v3;
+                const Real H11 = v0 * DD1 * v2 * v3;
+                const Real H22 = v0 * v1 * DD2 * v3;
+                const Real H33 = v0 * v1 * v2 * DD3;
+
+                const Real H01 = D0 * D1 * v2 * v3;
+                const Real H02 = D0 * v1 * D2 * v3;
+                const Real H03 = D0 * v1 * v2 * D3;
+                const Real H12 = v0 * D1 * D2 * v3;
+                const Real H13 = v0 * D1 * v2 * D3;
+                const Real H23 = v0 * v1 * D2 * D3;
+
+                const Real h01 = H00 - H01 - H02 + H12;
+                const Real h02 = H00 - H01 - H03 + H13;
+                const Real h12 = H00 - H02 - H03 + H23;
+
+                Real* H = hessians_out + node * 9u * output_stride;
+                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
+                H[1u * output_stride + q] = h01;
+                H[2u * output_stride + q] = h02;
+                H[3u * output_stride + q] = h01;
+                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
+                H[5u * output_stride + q] = h12;
+                H[6u * output_stride + q] = h02;
+                H[7u * output_stride + q] = h12;
+                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
+            }
+        }
+    }
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
new file mode 100644
index 000000000..19cf725bd
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
@@ -0,0 +1,78 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
+#define SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
+
+// Private declarations for simplex Lagrange evaluation helpers implemented in
+// LagrangeBasisSimplex.cpp.
+
+#include "BasisFunction.h"
+
+#include <array>
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,  // triangle basis at a single point, std::vector outputs
+                                     int order,
+                                     const math::Vector<Real, 3>& xi,  // the single evaluation point
+                                     std::vector<Real>* values,  // NOTE(review): presumably a nullptr output is skipped -- confirm in LagrangeBasisSimplex.cpp
+                                     std::vector<Gradient>* gradients,
+                                     std::vector<Hessian>* hessians);
+
+void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,  // single-point triangle evaluation into raw buffers
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,  // the single evaluation point
+                                        Real* SVMP_RESTRICT values_out,  // NOTE(review): required buffer sizes and layout are not visible here -- confirm in the .cpp
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_triangle_simplex_basis_strided(  // triangle evaluation over a batch of points
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,  // evaluation points, one per batch entry
+    std::size_t output_stride,  // NOTE(review): presumably the same strided layout as the tetrahedron variant -- confirm
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_out,
+    Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_triangle_simplex_basis_wedge_components_strided(  // batched triangle evaluation with wedge-oriented outputs
+    const std::vector<std::array<int, 4>>& simplex_exponents,
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,  // evaluation points, one per batch entry
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT gradients_xy_out,  // NOTE(review): name suggests only the (x, y) gradient components are written -- confirm
+    Real* SVMP_RESTRICT hessians_xx_xy_yy_out);  // NOTE(review): name suggests only the (xx, xy, yy) Hessian components are written -- confirm
+
+void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,  // tetrahedron basis at a single point, std::vector outputs
+                                        int order,
+                                        const math::Vector<Real, 3>& xi,  // the single evaluation point
+                                        std::vector<Real>* values,  // NOTE(review): presumably a nullptr output is skipped -- confirm in LagrangeBasisSimplex.cpp
+                                        std::vector<Gradient>* gradients,
+                                        std::vector<Hessian>* hessians);
+
+void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,  // single-point tetrahedron evaluation into raw buffers
+                                           int order,
+                                           const math::Vector<Real, 3>& xi,  // the single evaluation point
+                                           Real* SVMP_RESTRICT values_out,  // NOTE(review): required buffer sizes and layout are not visible here -- confirm in the .cpp
+                                           Real* SVMP_RESTRICT gradients_out,
+                                           Real* SVMP_RESTRICT hessians_out);
+
+void evaluate_tetrahedron_simplex_basis_strided(  // tetrahedron evaluation over a batch of points; NOTE(review): layout notes below are read from the .cpp definition that appears to be this function -- confirm
+    const std::vector<std::array<int, 4>>& simplex_exponents,  // per node: one index into each of the four barycentric factor sequences
+    int order,
+    const std::vector<math::Vector<Real, 3>>& points,  // xi[0..2] are used as l1..l3, with l0 = 1 - l1 - l2 - l3
+    std::size_t output_stride,  // element distance between consecutive component rows of every output
+    Real* SVMP_RESTRICT values_out,  // if non-null: node n at point q goes to [n * output_stride + q]
+    Real* SVMP_RESTRICT gradients_out,  // if non-null: component c goes to [(3 * n + c) * output_stride + q]
+    Real* SVMP_RESTRICT hessians_out);  // if non-null: entry k of the symmetric 3x3 goes to [(9 * n + k) * output_stride + q]
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
new file mode 100644
index 000000000..e622de1c6
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
@@ -0,0 +1,25 @@
+#ifndef SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
+#define SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
+
+// Private helper for LagrangeBasis internals.
+// This header is only intended to be included after the FE basis scalar types
+// are already available.
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+
+inline constexpr Real equispaced_pm_one_coord(int i, int order) {
+    // Equispaced coordinate on [-1, 1]: i == 0 gives -1 and i == order gives +1.
+    // A non-positive order has no spacing to divide by, so it maps to 0.
+    return (order <= 0) ? Real(0)
+                        : Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
new file mode 100644
index 000000000..20f743916
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -0,0 +1,818 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "NodeOrderingConventions.h"
+#include "Basis/BasisExceptions.h"
+#include "Basis/BasisTraits.h"
+
+#include <array>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+namespace {
+
+using Point = math::Vector<Real, 3>;  // point type returned by the node generators in this file
+using RawPoint = std::array<Real, 3>;  // three plain coordinates per node, used by the constexpr tables
+
+template<std::size_t N>
+using NodeTable = std::array<RawPoint, N>;  // fixed-size table holding N node coordinates
+
+struct NodeTableView {  // non-owning (pointer, count) view of a node table
+    const RawPoint* data = nullptr;  // first node; nullptr in a default-constructed (empty) view
+    std::size_t size = 0;            // number of nodes starting at data
+};
+
+inline constexpr NodeTable<2> kLine2Nodes = {{  // 2-node line on [-1, 1]: the two endpoints
+    {Real(-1), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<3> kLine3Nodes = {{  // 3-node line on [-1, 1]: endpoints first, midpoint last
+    {Real(-1), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},  // midpoint
+}};
+
+inline constexpr NodeTable<3> kTriangle3Nodes = {{  // vertices of the triangle (0,0), (1,0), (0,1) in the z = 0 plane
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<6> kTriangle6Nodes = {{  // 3 vertices, then 3 edge midpoints
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0.5), Real(0), Real(0)},  // midpoint of edge 0-1
+    {Real(0.5), Real(0.5), Real(0)},  // midpoint of edge 1-2
+    {Real(0), Real(0.5), Real(0)},  // midpoint of edge 2-0
+}};
+
+inline constexpr NodeTable<4> kQuad4Nodes = {{  // corners of [-1, 1]^2, counter-clockwise from (-1, -1)
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<9> kQuad9Nodes = {{  // 4 corners, 4 edge midpoints, 1 center
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(-1), Real(0)},  // edge midpoints: bottom, right, top, left
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},  // center
+}};
+
+inline constexpr NodeTable<8> kQuad8Nodes = {{  // 4 corners, then 4 edge midpoints (no center node)
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(-1), Real(0)},  // edge midpoints: bottom, right, top, left
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+}};
+
+inline constexpr NodeTable<4> kTetra4Nodes = {{  // origin followed by the unit points on the x, y and z axes
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+}};
+
+inline constexpr NodeTable<10> kTetra10Nodes = {{  // 4 vertices, then 6 edge midpoints
+    {Real(0), Real(0), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},
+    {Real(0.5), Real(0), Real(0)},  // midpoint of edge 0-1
+    {Real(0.5), Real(0.5), Real(0)},  // midpoint of edge 1-2
+    {Real(0), Real(0.5), Real(0)},  // midpoint of edge 2-0
+    {Real(0), Real(0), Real(0.5)},  // midpoint of edge 0-3
+    {Real(0.5), Real(0), Real(0.5)},  // midpoint of edge 1-3
+    {Real(0), Real(0.5), Real(0.5)},  // midpoint of edge 2-3
+}};
+
+inline constexpr NodeTable<8> kHex8Nodes = {{  // corners of [-1, 1]^3: the z = -1 face, then the z = +1 face
+    {Real(-1), Real(-1), Real(-1)},  // z = -1 corners, counter-clockwise seen from +z
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},  // z = +1 corners, same in-plane order
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+}};
+
+inline constexpr NodeTable<27> kHex27Nodes = {{  // 8 corners, 12 edge midpoints, 6 face centers, 1 body center
+    {Real(-1), Real(-1), Real(-1)},  // 0..7: corners (z = -1 face, then z = +1 face)
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+    {Real(0), Real(-1), Real(-1)},  // 8..11: edge midpoints on the z = -1 face
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(-1), Real(0), Real(-1)},
+    {Real(0), Real(-1), Real(1)},  // 12..15: edge midpoints on the z = +1 face
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(-1), Real(0), Real(1)},
+    {Real(-1), Real(-1), Real(0)},  // 16..19: midpoints of the vertical edges
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(-1)},  // 20..25: face centers (z = -1, z = +1, y = -1, x = +1, y = +1, x = -1)
+    {Real(0), Real(0), Real(1)},
+    {Real(0), Real(-1), Real(0)},
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(0), Real(0), Real(0)},  // 26: body center
+}};
+
+inline constexpr NodeTable<20> kHex20Nodes = {{  // 8 corners, then 12 edge midpoints
+    {Real(-1), Real(-1), Real(-1)},  // 0..7: corners (z = -1 face, then z = +1 face)
+    {Real(1), Real(-1), Real(-1)},
+    {Real(1), Real(1), Real(-1)},
+    {Real(-1), Real(1), Real(-1)},
+    {Real(-1), Real(-1), Real(1)},
+    {Real(1), Real(-1), Real(1)},
+    {Real(1), Real(1), Real(1)},
+    {Real(-1), Real(1), Real(1)},
+    {Real(0), Real(-1), Real(-1)},  // 8..11: edge midpoints on the z = -1 face
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(-1), Real(0), Real(-1)},
+    {Real(0), Real(-1), Real(1)},  // 12..15: edge midpoints on the z = +1 face
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(-1), Real(0), Real(1)},
+    {Real(-1), Real(-1), Real(0)},  // 16..19: midpoints of the vertical edges
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+}};
+
+// Mesh uses conventional Hex20 ordering: corners first, then edge midpoints in
+// {bottom, top, vertical} groups. The quadratic Hex20 serendipity polynomial
+// table uses an axis-grouped edge order. This maps public mesh/reference index
+// to the internal polynomial-table index.
+constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {  // entry [mesh index] holds the polynomial-table index
+    0, 1, 2, 3, 4, 5, 6, 7,  // mesh 0..7: corners keep their index
+    8, 13, 10, 12,  // mesh 8..11: bottom edge midpoints
+    9, 15, 11, 14,  // mesh 12..15: top edge midpoints
+    16, 17, 19, 18  // mesh 16..19: vertical edge midpoints
+};
+
+inline constexpr NodeTable<6> kWedge6Nodes = {{  // triangle vertices at z = -1, then the same triangle at z = +1
+    {Real(0), Real(0), Real(-1)},
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+}};
+
+inline constexpr NodeTable<18> kWedge18Nodes = {{  // 6 corners, 9 edge midpoints, 3 quad-face centers
+    {Real(0), Real(0), Real(-1)},  // 0..5: corners (z = -1 triangle, then z = +1 triangle)
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(0.5), Real(0), Real(-1)},  // 6..8: edge midpoints of the z = -1 triangle
+    {Real(0.5), Real(0.5), Real(-1)},
+    {Real(0), Real(0.5), Real(-1)},
+    {Real(0.5), Real(0), Real(1)},  // 9..11: edge midpoints of the z = +1 triangle
+    {Real(0.5), Real(0.5), Real(1)},
+    {Real(0), Real(0.5), Real(1)},
+    {Real(0), Real(0), Real(0)},  // 12..14: midpoints of the vertical edges
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(0.5), Real(0), Real(0)},  // 15..17: centers of the three quadrilateral faces
+    {Real(0.5), Real(0.5), Real(0)},
+    {Real(0), Real(0.5), Real(0)},
+}};
+
+inline constexpr NodeTable<15> kWedge15Nodes = {{  // 6 corners, then 9 edge midpoints (no face-center nodes)
+    {Real(0), Real(0), Real(-1)},  // 0..5: corners (z = -1 triangle, then z = +1 triangle)
+    {Real(1), Real(0), Real(-1)},
+    {Real(0), Real(1), Real(-1)},
+    {Real(0), Real(0), Real(1)},
+    {Real(1), Real(0), Real(1)},
+    {Real(0), Real(1), Real(1)},
+    {Real(0.5), Real(0), Real(-1)},  // 6..8: edge midpoints of the z = -1 triangle
+    {Real(0.5), Real(0.5), Real(-1)},
+    {Real(0), Real(0.5), Real(-1)},
+    {Real(0.5), Real(0), Real(1)},  // 9..11: edge midpoints of the z = +1 triangle
+    {Real(0.5), Real(0.5), Real(1)},
+    {Real(0), Real(0.5), Real(1)},
+    {Real(0), Real(0), Real(0)},  // 12..14: midpoints of the vertical edges
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+}};
+
+inline constexpr NodeTable<5> kPyramid5Nodes = {{  // 4 base corners at z = 0, then the apex
+    {Real(-1), Real(-1), Real(0)},
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},  // apex
+}};
+
+inline constexpr NodeTable<14> kPyramid14Nodes = {{  // 5 vertices, 8 edge midpoints, 1 base-face center
+    {Real(-1), Real(-1), Real(0)},  // 0..3: base corners
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},  // 4: apex
+    {Real(0), Real(-1), Real(0)},  // 5..8: midpoints of the base edges
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(-0.5), Real(-0.5), Real(0.5)},  // 9..12: midpoints of the edges from base corners 0..3 to the apex
+    {Real(0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(0.5), Real(0.5)},
+    {Real(-0.5), Real(0.5), Real(0.5)},
+    {Real(0), Real(0), Real(0)},  // 13: center of the base face
+}};
+
+inline constexpr NodeTable<13> kPyramid13Nodes = {{  // 5 vertices, then 8 edge midpoints (no base-face center)
+    {Real(-1), Real(-1), Real(0)},  // 0..3: base corners
+    {Real(1), Real(-1), Real(0)},
+    {Real(1), Real(1), Real(0)},
+    {Real(-1), Real(1), Real(0)},
+    {Real(0), Real(0), Real(1)},  // 4: apex
+    {Real(0), Real(-1), Real(0)},  // 5..8: midpoints of the base edges
+    {Real(1), Real(0), Real(0)},
+    {Real(0), Real(1), Real(0)},
+    {Real(-1), Real(0), Real(0)},
+    {Real(-0.5), Real(-0.5), Real(0.5)},  // 9..12: midpoints of the edges from base corners 0..3 to the apex
+    {Real(0.5), Real(-0.5), Real(0.5)},
+    {Real(0.5), Real(0.5), Real(0.5)},
+    {Real(-0.5), Real(0.5), Real(0.5)},
+}};
+
+template<std::size_t Count>
+constexpr NodeTableView view(const NodeTable<Count>& table) noexcept {
+    return {table.data(), Count};  // non-owning: the table must outlive the returned view
+}
+
+Point to_point(const RawPoint& raw) {
+    return Point{std::get<0>(raw), std::get<1>(raw), std::get<2>(raw)};  // copy the three coordinates in order
+}
+
+constexpr NodeTableView fixed_node_table(ElementType elem_type) noexcept {
+    // A fixed table exists only for the element types tested below; any other
+    // type falls through to an empty view (data == nullptr, size == 0).
+    if (elem_type == ElementType::Line2)     { return view(kLine2Nodes); }
+    if (elem_type == ElementType::Line3)     { return view(kLine3Nodes); }
+    if (elem_type == ElementType::Triangle3) { return view(kTriangle3Nodes); }
+    if (elem_type == ElementType::Triangle6) { return view(kTriangle6Nodes); }
+    if (elem_type == ElementType::Quad4)     { return view(kQuad4Nodes); }
+    if (elem_type == ElementType::Quad8)     { return view(kQuad8Nodes); }
+    if (elem_type == ElementType::Quad9)     { return view(kQuad9Nodes); }
+    if (elem_type == ElementType::Tetra4)    { return view(kTetra4Nodes); }
+    if (elem_type == ElementType::Tetra10)   { return view(kTetra10Nodes); }
+    if (elem_type == ElementType::Hex8)      { return view(kHex8Nodes); }
+    if (elem_type == ElementType::Hex20)     { return view(kHex20Nodes); }
+    if (elem_type == ElementType::Hex27)     { return view(kHex27Nodes); }
+    if (elem_type == ElementType::Wedge6)    { return view(kWedge6Nodes); }
+    if (elem_type == ElementType::Wedge15)   { return view(kWedge15Nodes); }
+    if (elem_type == ElementType::Wedge18)   { return view(kWedge18Nodes); }
+    if (elem_type == ElementType::Pyramid5)  { return view(kPyramid5Nodes); }
+    if (elem_type == ElementType::Pyramid13) { return view(kPyramid13Nodes); }
+    if (elem_type == ElementType::Pyramid14) { return view(kPyramid14Nodes); }
+    return {};
+}
+
+constexpr NodeTableView fixed_complete_lagrange_table(ElementType canonical_type,
+                                                      int order) noexcept {
+    // Only the order-1 and order-2 complete Lagrange families have fixed
+    // tables; any other order or element family returns an empty view.
+    if (order != 1 && order != 2) {
+        return {};
+    }
+
+    const bool is_linear = (order == 1);
+    switch (canonical_type) {
+        case ElementType::Line2:
+            return is_linear ? view(kLine2Nodes) : view(kLine3Nodes);
+        case ElementType::Triangle3:
+            return is_linear ? view(kTriangle3Nodes) : view(kTriangle6Nodes);
+        case ElementType::Quad4:
+            return is_linear ? view(kQuad4Nodes) : view(kQuad9Nodes);
+        case ElementType::Tetra4:
+            return is_linear ? view(kTetra4Nodes) : view(kTetra10Nodes);
+        case ElementType::Hex8:
+            return is_linear ? view(kHex8Nodes) : view(kHex27Nodes);
+        case ElementType::Wedge6:
+            return is_linear ? view(kWedge6Nodes) : view(kWedge18Nodes);
+        case ElementType::Pyramid5:
+            return is_linear ? view(kPyramid5Nodes) : view(kPyramid14Nodes);
+        default:
+            return {};
+    }
+}
+
+Real line_coord_pm_one(int i, int order) {
+    // Equispaced coordinate on [-1, 1]: i == 0 gives -1 and i == order gives +1.
+    // A non-positive order has no spacing to divide by, so it maps to 0.
+    return (order <= 0) ? Real(0)
+                        : Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+Real line_coord_zero_one(int i, int order) {
+    // Equispaced coordinate on [0, 1]: i == 0 gives 0 and i == order gives 1.
+    // A non-positive order has no spacing to divide by, so it maps to 0.
+    return (order <= 0) ? Real(0)
+                        : static_cast<Real>(i) / static_cast<Real>(order);
+}
+
+void append_triangle_face_interior(std::vector<Point>& nodes,
+                                   const Point& v0,
+                                   const Point& v1,
+                                   const Point& v2,
+                                   int order) {
+    const Real denom = static_cast<Real>(order);
+    for (int k2 = 1; k2 < order - 1; ++k2) {  // v2 weight is the slowest-varying index
+        const Real w2 = static_cast<Real>(k2) / denom;
+        for (int k1 = 1; k1 < order - k2; ++k1) {  // keeps all three weights strictly positive
+            const Real w0 = static_cast<Real>(order - k1 - k2) / denom;
+            const Real w1 = static_cast<Real>(k1) / denom;
+            nodes.push_back(v0 * w0 + v1 * w1 + v2 * w2);
+        }
+    }
+}
+
+std::vector<Point> generate_line_nodes(int order) {
+    std::vector<Point> line_nodes;
+    if (order == 0) {
+        line_nodes.push_back(Point{Real(0), Real(0), Real(0)});  // order 0: single node at the midpoint
+        return line_nodes;
+    }
+    line_nodes.reserve(static_cast<std::size_t>(order + 1));
+    line_nodes.push_back(Point{Real(-1), Real(0), Real(0)});  // both endpoints come first
+    line_nodes.push_back(Point{Real(1), Real(0), Real(0)});
+    for (int k = 1; k < order; ++k) {  // then the interior nodes, by increasing coordinate
+        line_nodes.push_back(Point{line_coord_pm_one(k, order), Real(0), Real(0)});
+    }
+    return line_nodes;
+}
+
+std::vector<Point> generate_triangle_nodes(int order) {
+    if (order == 0) {
+        // Order 0 has a single node, placed at the centroid.
+        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
+    }
+
+    const Point vertex0{Real(0), Real(0), Real(0)};
+    const Point vertex1{Real(1), Real(0), Real(0)};
+    const Point vertex2{Real(0), Real(1), Real(0)};
+
+    std::vector<Point> tri_nodes;
+    tri_nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
+    tri_nodes.push_back(vertex0);
+    tri_nodes.push_back(vertex1);
+    tri_nodes.push_back(vertex2);
+
+    // Edge-interior nodes, one edge at a time: 0->1, then 1->2, then 2->0.
+    for (int step = 1; step < order; ++step) {
+        tri_nodes.push_back(Point{line_coord_zero_one(step, order), Real(0), Real(0)});
+    }
+    for (int step = 1; step < order; ++step) {
+        tri_nodes.push_back(Point{line_coord_zero_one(order - step, order),
+                                  line_coord_zero_one(step, order), Real(0)});
+    }
+    for (int step = 1; step < order; ++step) {
+        tri_nodes.push_back(Point{Real(0), line_coord_zero_one(order - step, order), Real(0)});
+    }
+
+    // Face-interior nodes are appended last.
+    append_triangle_face_interior(tri_nodes, vertex0, vertex1, vertex2, order);
+    return tri_nodes;
+}
+
+std::vector<Point> generate_quad_nodes(int order) {
+    // Layout: 4 corners, then the interior nodes of edges 0-1, 1-2, 2-3 and
+    // 3-0, then the interior grid. Order 0 is a single node at the origin.
+    const Real zero = Real(0);
+    if (order == 0) {
+        return {Point{zero, zero, zero}};
+    }
+    const Real lo = Real(-1);
+    const Real hi = Real(1);
+    const auto coord = [order](int idx) { return line_coord_pm_one(idx, order); };
+
+    std::vector<Point> pts;
+    pts.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
+    pts.push_back(Point{lo, lo, zero});
+    pts.push_back(Point{hi, lo, zero});
+    pts.push_back(Point{hi, hi, zero});
+    pts.push_back(Point{lo, hi, zero});
+
+    // Edges 0-1 and 1-2 use increasing indices; edges 2-3 and 3-0 decreasing.
+    for (int k = 1; k < order; ++k) pts.push_back(Point{coord(k), lo, zero});
+    for (int k = 1; k < order; ++k) pts.push_back(Point{hi, coord(k), zero});
+    for (int k = order - 1; k >= 1; --k) pts.push_back(Point{coord(k), hi, zero});
+    for (int k = order - 1; k >= 1; --k) pts.push_back(Point{lo, coord(k), zero});
+
+    // Interior nodes: eta index outer, xi index inner.
+    for (int row = 1; row < order; ++row) {
+        const Real eta = coord(row);
+        for (int col = 1; col < order; ++col) {
+            pts.push_back(Point{coord(col), eta, zero});
+        }
+    }
+
+    return pts;
+}
+
+std::vector<Point> generate_tetra_nodes(int order) {
+    // Layout: 4 vertices, 6 edge interiors, 4 face interiors, then the volume interior.
+    if (order == 0) {
+        return {Point{Real(0.25), Real(0.25), Real(0.25)}};  // order 0: one node at (1/4, 1/4, 1/4)
+    }
+    const Real zero = Real(0);
+    const Real one = Real(1);
+    const Real denom = static_cast<Real>(order);
+    const Point corner[4] = {
+        Point{zero, zero, zero},
+        Point{one, zero, zero},
+        Point{zero, one, zero},
+        Point{zero, zero, one},
+    };
+    // Edge and face connectivity in emission order (indices into corner[]).
+    const int edge_ends[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
+    const int face_corners[4][3] = {{0, 1, 2}, {0, 1, 3}, {1, 2, 3}, {0, 2, 3}};
+
+    std::vector<Point> pts;
+    pts.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
+    for (int v = 0; v < 4; ++v) {
+        pts.push_back(corner[v]);
+    }
+
+    // Edge interiors: linear blend from the first to the second end point.
+    for (int e = 0; e < 6; ++e) {
+        const Point& from = corner[edge_ends[e][0]];
+        const Point& to = corner[edge_ends[e][1]];
+        for (int step = 1; step < order; ++step) {
+            const Real t = static_cast<Real>(step) / denom;
+            pts.push_back(from * (one - t) + to * t);
+        }
+    }
+
+    // Face interiors are delegated to the shared triangle helper.
+    for (int f = 0; f < 4; ++f) {
+        append_triangle_face_interior(pts,
+                                      corner[face_corners[f][0]],
+                                      corner[face_corners[f][1]],
+                                      corner[face_corners[f][2]],
+                                      order);
+    }
+
+    // Volume interior: lattice points (j, k, l) / order with j, k, l >= 1 and
+    // j + k + l <= order - 1; zeta index outermost, xi index innermost.
+    for (int l = 1; l <= order - 3; ++l) {
+        const Real z = static_cast<Real>(l) / denom;
+        for (int k = 1; k <= order - l - 2; ++k) {
+            const Real y = static_cast<Real>(k) / denom;
+            for (int j = 1; j <= order - l - k - 1; ++j) {
+                pts.push_back(Point{static_cast<Real>(j) / denom, y, z});
+            }
+        }
+    }
+    return pts;
+}
+
+std::vector<Point> generate_hex_nodes(int order) {  // complete Lagrange node set on the [-1, 1]^3 reference hex
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0)}};  // order 0: one node at the origin
+    }
+    // Emission order: 8 corners, 12 edge interiors, 6 face interiors, then the volume interior.
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
+
+    const Point verts[] = {
+        Point{Real(-1), Real(-1), Real(-1)},
+        Point{Real(1), Real(-1), Real(-1)},
+        Point{Real(1), Real(1), Real(-1)},
+        Point{Real(-1), Real(1), Real(-1)},
+        Point{Real(-1), Real(-1), Real(1)},
+        Point{Real(1), Real(-1), Real(1)},
+        Point{Real(1), Real(1), Real(1)},
+        Point{Real(-1), Real(1), Real(1)},
+    };
+    for (const auto& v : verts) {
+        nodes.push_back(v);
+    }
+
+    const int edges[12][2] = {  // corner-index pairs: bottom ring, top ring, then vertical edges
+        {0, 1}, {1, 2}, {2, 3}, {3, 0},
+        {4, 5}, {5, 6}, {6, 7}, {7, 4},
+        {0, 4}, {1, 5}, {2, 6}, {3, 7},
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);  // blend from edge[0] toward edge[1]
+        }
+    }
+    // Face interiors, one face at a time: zeta = -1, zeta = +1, eta = -1, xi = +1, eta = +1, xi = -1.
+    for (int j = 1; j < order; ++j) {  // face zeta = -1
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(-1)});
+        }
+    }
+    for (int j = 1; j < order; ++j) {  // face zeta = +1
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(1)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {  // face eta = -1
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), Real(-1), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {  // face xi = +1
+        for (int j = 1; j < order; ++j) {
+            nodes.push_back(Point{Real(1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {  // face eta = +1
+        for (int i = order - 1; i >= 1; --i) {  // xi index runs in reverse on this face
+            nodes.push_back(Point{line_coord_pm_one(i, order), Real(1), line_coord_pm_one(k, order)});
+        }
+    }
+    for (int k = 1; k < order; ++k) {  // face xi = -1
+        for (int j = order - 1; j >= 1; --j) {  // eta index runs in reverse on this face
+            nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
+        }
+    }
+    // Volume interior: zeta index outermost, xi index innermost.
+    for (int k = 1; k < order; ++k) {
+        for (int j = 1; j < order; ++j) {
+            for (int i = 1; i < order; ++i) {
+                nodes.push_back(Point{line_coord_pm_one(i, order),
+                                      line_coord_pm_one(j, order),
+                                      line_coord_pm_one(k, order)});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_wedge_nodes(int order) {  // complete Lagrange node set on the reference wedge (unit triangle x [-1, 1])
+    if (order == 0) {
+        return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};  // order 0: one node at (1/3, 1/3, 0)
+    }
+    // Emission order: 6 corners, 9 edge interiors, 2 triangle-face interiors, quad-face interiors, volume interior.
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
+
+    const Point verts[] = {
+        Point{Real(0), Real(0), Real(-1)},
+        Point{Real(1), Real(0), Real(-1)},
+        Point{Real(0), Real(1), Real(-1)},
+        Point{Real(0), Real(0), Real(1)},
+        Point{Real(1), Real(0), Real(1)},
+        Point{Real(0), Real(1), Real(1)},
+    };
+    for (const auto& v : verts) {
+        nodes.push_back(v);
+    }
+
+    const int edges[9][2] = {  // corner-index pairs: bottom triangle, top triangle, then vertical edges
+        {0, 1}, {1, 2}, {2, 0},
+        {3, 4}, {4, 5}, {5, 3},
+        {0, 3}, {1, 4}, {2, 5},
+    };
+    for (const auto& edge : edges) {
+        for (int m = 1; m < order; ++m) {
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(verts[edge[0]] * (Real(1) - t) + verts[edge[1]] * t);  // blend from edge[0] toward edge[1]
+        }
+    }
+    // Triangle-face interiors: bottom (verts 0-1-2) first, then top (verts 3-4-5).
+    append_triangle_face_interior(
+        nodes, verts[0], verts[1], verts[2], order);
+    append_triangle_face_interior(
+        nodes, verts[3], verts[4], verts[5], order);
+    // Quad-face interiors. NOTE: emitted level by level in zeta, visiting the three faces in turn at each level, so for order > 2 one face's nodes are not contiguous.
+    for (int r = 1; r < order; ++r) {
+        const Real z = line_coord_pm_one(r, order);
+        for (int m = 1; m < order; ++m) {  // face y = 0
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{t, Real(0), z});
+        }
+        for (int m = 1; m < order; ++m) {  // face x + y = 1
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{Real(1) - t, t, z});
+        }
+        for (int m = 1; m < order; ++m) {  // face x = 0
+            const Real t = static_cast<Real>(m) / static_cast<Real>(order);
+            nodes.push_back(Point{Real(0), Real(1) - t, z});
+        }
+    }
+    // Volume interior: at each interior zeta level, the points (b, c) / order with b, c >= 1 and b + c <= order - 1 (y index outer).
+    for (int r = 1; r < order; ++r) {
+        const Real z = line_coord_pm_one(r, order);
+        for (int c = 1; c <= order - 2; ++c) {
+            for (int b = 1; b <= order - c - 1; ++b) {
+                const Real x = static_cast<Real>(b) / static_cast<Real>(order);
+                const Real y = static_cast<Real>(c) / static_cast<Real>(order);
+                nodes.push_back(Point{x, y, z});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+std::vector<Point> generate_pyramid_nodes(int order) {  // complete Lagrange node set on the reference pyramid (base at z = 0, apex at z = 1)
+    if (order == 0) {
+        return {Point{Real(0), Real(0), Real(0.25)}};  // order 0: one node on the axis at z = 0.25
+    }
+    // Emission order: 5 corners, base-edge interiors, apex-edge interiors, base interior, triangular-face interiors, volume interior.
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));  // = 1^2 + 2^2 + ... + (order + 1)^2
+
+    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
+    nodes.push_back(Point{Real(1), Real(1), Real(0)});
+    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
+    nodes.push_back(Point{Real(0), Real(0), Real(1)});
+
+    for (int m = 1; m < order; ++m) {  // base edge 0-1 (y = -1)
+        nodes.push_back(Point{line_coord_pm_one(m, order), Real(-1), Real(0)});
+    }
+    for (int m = 1; m < order; ++m) {  // base edge 1-2 (x = +1)
+        nodes.push_back(Point{Real(1), line_coord_pm_one(m, order), Real(0)});
+    }
+    for (int m = order - 1; m >= 1; --m) {  // base edge 2-3 (y = +1), index descending
+        nodes.push_back(Point{line_coord_pm_one(m, order), Real(1), Real(0)});
+    }
+    for (int m = order - 1; m >= 1; --m) {  // base edge 3-0 (x = -1), index descending
+        nodes.push_back(Point{Real(-1), line_coord_pm_one(m, order), Real(0)});
+    }
+    // Apex-edge interiors, emitted level by level: at height z the edges 0-4, 1-4, 2-4, 3-4 sit at (+-scale, +-scale, z).
+    for (int level = 1; level < order; ++level) {
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;  // half-width of the square cross-section at height z
+        nodes.push_back(Point{-scale, -scale, z});
+        nodes.push_back(Point{scale, -scale, z});
+        nodes.push_back(Point{scale, scale, z});
+        nodes.push_back(Point{-scale, scale, z});
+    }
+    // Base-face interior (z = 0): y index outer, x index inner.
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
+        }
+    }
+    // Triangular-face interiors: for levels 1..order-2, the open sides of the square cross-section at that height.
+    for (int level = 1; level < order - 1; ++level) {
+        const int n = order - level;  // number of intervals along each side of this cross-section
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;
+
+        for (int m = 1; m < n; ++m) {  // side y = -scale
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{s, -scale, z});
+        }
+        for (int m = 1; m < n; ++m) {  // side x = +scale
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{scale, s, z});
+        }
+        for (int m = n - 1; m >= 1; --m) {  // side y = +scale, index descending
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{s, scale, z});
+        }
+        for (int m = n - 1; m >= 1; --m) {  // side x = -scale, index descending
+            const Real s = line_coord_pm_one(m, n) * scale;
+            nodes.push_back(Point{-scale, s, z});
+        }
+    }
+    // Volume interior: the interior grid of each of those cross-sections, lowest level first.
+    for (int level = 1; level < order - 1; ++level) {
+        const int n = order - level;
+        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
+        const Real scale = Real(1) - z;
+        for (int j = 1; j < n; ++j) {
+            for (int i = 1; i < n; ++i) {
+                nodes.push_back(Point{line_coord_pm_one(i, n) * scale,
+                                      line_coord_pm_one(j, n) * scale,
+                                      z});
+            }
+        }
+    }
+
+    return nodes;
+}
+
+} // namespace
+
+math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+                                                     std::size_t local_node) {
+    // Look the node up in the fixed table registered for this element type.
+    const auto table = fixed_node_table(elem_type);
+    const bool in_range = (table.data != nullptr) && (local_node < table.size);
+    if (in_range) return to_point(table.data[local_node]);
+    // A missing table and an out-of-range index share one error.
+    throw BasisNodeOrderingException("Invalid element type or node index in ReferenceNodeLayout::get_node_coords",
+                                     __FILE__, __LINE__, __func__);
+}
+
+std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
+    // The node count is the size of the fixed table for this element type.
+    const auto table = fixed_node_table(elem_type);
+    const bool has_table = (table.data != nullptr);
+    if (has_table) return table.size;
+    // No table registered for this element type.
+    throw BasisNodeOrderingException("Unknown element type in ReferenceNodeLayout::num_nodes",
+                                     __FILE__, __LINE__, __func__);
+}
+
+std::vector<math::Vector<Real, 3>>
+ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+    if (order < 0) {
+        throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords requires non-negative order",
+                                         __FILE__, __LINE__, __func__);
+    }
+
+    // A stored table, when one exists for this topology and order, takes precedence.
+    const ElementType topology = canonical_lagrange_type(canonical_type);
+    const auto stored = fixed_complete_lagrange_table(topology, order);
+    if (stored.data != nullptr) {
+        std::vector<Point> coords;
+        coords.reserve(stored.size);
+        for (std::size_t idx = 0; idx != stored.size; ++idx) {
+            coords.push_back(to_point(stored.data[idx]));
+        }
+        return coords;
+    }
+
+    // No stored table: build the node set for the requested order.
+    switch (topology) {
+        // Point (always one node at the origin, whatever the order) and line
+        case ElementType::Point1: return {Point{Real(0), Real(0), Real(0)}};
+        case ElementType::Line2: return generate_line_nodes(order);
+        // Surface elements
+        case ElementType::Triangle3: return generate_triangle_nodes(order);
+        case ElementType::Quad4: return generate_quad_nodes(order);
+        // Volume elements
+        case ElementType::Tetra4: return generate_tetra_nodes(order);
+        case ElementType::Hex8: return generate_hex_nodes(order);
+        case ElementType::Wedge6: return generate_wedge_nodes(order);
+        case ElementType::Pyramid5: return generate_pyramid_nodes(order);
+
+        // Serendipity types are rejected with their own message rather than
+        // the generic unsupported-topology one.
+        case ElementType::Quad8:
+        case ElementType::Hex20:
+        case ElementType::Wedge15:
+        case ElementType::Pyramid13:
+            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords does not support serendipity topologies",
+                                             __FILE__, __LINE__, __func__);
+        default:
+            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords: unsupported topology",
+                                             __FILE__, __LINE__, __func__);
+    }
+}
+
+std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
+    // Hex20 is the only element type with a registered permutation; every
+    // other type gets an empty span.
+    if (elem_type != ElementType::Hex20) {
+        return {};
+    }
+    return std::span<const std::size_t>(kHex20MeshToBasisOrder.data(), kHex20MeshToBasisOrder.size());
+}
+
+bool ReferenceNodeLayout::is_simplex(ElementType elem_type) {  // forwards to the namespace-level predicate
+    return svmp::FE::basis::is_simplex(elem_type);  // fully qualified: an unqualified call here would name this member
+}
+
+bool ReferenceNodeLayout::is_tensor_product(ElementType elem_type) {  // forwards to the namespace-level predicate
+    return svmp::FE::basis::is_tensor_product(elem_type);  // fully qualified: an unqualified call here would name this member
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
new file mode 100644
index 000000000..52af4d932
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -0,0 +1,538 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
+#define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
+
+#include "Types.h"
+#include "Math/Vector.h"
+#include <cstddef>
+#include <vector>
+
+/**
+ * @file NodeOrderingConventions.h
+ * @brief Documentation of node ordering conventions for all element types
+ *
+ * This file provides comprehensive documentation of the node ordering
+ * conventions used throughout the FE library. These orderings are consistent
+ * with VTK conventions and must be matched exactly when interfacing with
+ * the Mesh library.
+ *
+ * IMPORTANT: The FE library (Basis, Quadrature, Geometry) uses "node" to refer
+ * to degree-of-freedom locations on reference elements. The Mesh library uses
+ * "vertex" for geometry vertices and "cell" for mesh elements. When interfacing
+ * between the two, ensure consistent ordering.
+ *
+ * Reference Element Conventions:
+ * - Line:       xi in [-1, 1]
+ * - Quad:       (xi, eta) in [-1, 1] x [-1, 1]
+ * - Hex:        (xi, eta, zeta) in [-1, 1]^3
+ * - Triangle:   (xi, eta) in simplex with vertices (0,0), (1,0), (0,1)
+ * - Tetrahedron: (xi, eta, zeta) in simplex with vertices
+ *                (0,0,0), (1,0,0), (0,1,0), (0,0,1)
+ * - Wedge:      Triangle base x line height, zeta in [-1, 1]
+ * - Pyramid:    Quad base at z=0, apex at (0, 0, 1)
+ *
+ *
+ * =============================================================================
+ * 1D ELEMENTS
+ * =============================================================================
+ *
+ * Line2 (Linear Line)
+ * -------------------
+ *   0---------1
+ *   |         |
+ *  xi=-1     xi=+1
+ *
+ * Node 0: xi = -1
+ * Node 1: xi = +1
+ *
+ *
+ * Line3 (Quadratic Line)
+ * ----------------------
+ *   0----2----1
+ *   |    |    |
+ *  xi=-1 0   xi=+1
+ *
+ * Node 0: xi = -1
+ * Node 1: xi = +1
+ * Node 2: xi =  0 (mid-edge)
+ *
+ *
+ * =============================================================================
+ * 2D QUADRILATERAL ELEMENTS
+ * =============================================================================
+ *
+ * Quad4 (Bilinear Quadrilateral)
+ * ------------------------------
+ *
+ *   3-----------2
+ *   |           |
+ *   |           |
+ *   |           |
+ *   0-----------1
+ *
+ * Node 0: (xi, eta) = (-1, -1)
+ * Node 1: (xi, eta) = (+1, -1)
+ * Node 2: (xi, eta) = (+1, +1)
+ * Node 3: (xi, eta) = (-1, +1)
+ *
+ *
+ * Quad8 (Serendipity Quadrilateral)
+ * ---------------------------------
+ *
+ *   3-----6-----2
+ *   |           |
+ *   7           5
+ *   |           |
+ *   0-----4-----1
+ *
+ * Corners (same as Quad4):
+ *   Node 0: (-1, -1)
+ *   Node 1: (+1, -1)
+ *   Node 2: (+1, +1)
+ *   Node 3: (-1, +1)
+ *
+ * Mid-edge nodes:
+ *   Node 4: ( 0, -1)  (edge 0-1)
+ *   Node 5: (+1,  0)  (edge 1-2)
+ *   Node 6: ( 0, +1)  (edge 2-3)
+ *   Node 7: (-1,  0)  (edge 3-0)
+ *
+ *
+ * Quad9 (Biquadratic Quadrilateral)
+ * ---------------------------------
+ *
+ *   3-----6-----2
+ *   |           |
+ *   7     8     5
+ *   |           |
+ *   0-----4-----1
+ *
+ * Same as Quad8 plus:
+ *   Node 8: (0, 0)  (center)
+ *
+ *
+ * =============================================================================
+ * 3D HEXAHEDRAL ELEMENTS
+ * =============================================================================
+ *
+ * Hex8 (Trilinear Hexahedron)
+ * ---------------------------
+ *
+ *        7-----------6
+ *       /|          /|
+ *      / |         / |
+ *     4-----------5  |
+ *     |  |        |  |
+ *     |  3--------|--2
+ *     | /         | /
+ *     |/          |/
+ *     0-----------1
+ *
+ * Bottom face (zeta = -1):
+ *   Node 0: (xi, eta, zeta) = (-1, -1, -1)
+ *   Node 1: (xi, eta, zeta) = (+1, -1, -1)
+ *   Node 2: (xi, eta, zeta) = (+1, +1, -1)
+ *   Node 3: (xi, eta, zeta) = (-1, +1, -1)
+ *
+ * Top face (zeta = +1):
+ *   Node 4: (xi, eta, zeta) = (-1, -1, +1)
+ *   Node 5: (xi, eta, zeta) = (+1, -1, +1)
+ *   Node 6: (xi, eta, zeta) = (+1, +1, +1)
+ *   Node 7: (xi, eta, zeta) = (-1, +1, +1)
+ *
+ *
+ * Hex20 (Serendipity Hexahedron)
+ * ------------------------------
+ *
+ *        7-----14-----6
+ *       /|           /|
+ *     15 |         13 |
+ *     /  19        /  18
+ *    4-----12-----5   |
+ *    |   |        |   |
+ *    |   3-----10-|---2
+ *   16  /        17  /
+ *    | 11         | 9
+ *    |/           |/
+ *    0------8-----1
+ *
+ * Corners (same as Hex8): Nodes 0-7
+ *
+ * Mid-edge nodes on bottom face (zeta = -1):
+ *   Node 8:  ( 0, -1, -1)  (edge 0-1)
+ *   Node 9:  (+1,  0, -1)  (edge 1-2)
+ *   Node 10: ( 0, +1, -1)  (edge 2-3)
+ *   Node 11: (-1,  0, -1)  (edge 3-0)
+ *
+ * Mid-edge nodes on top face (zeta = +1):
+ *   Node 12: ( 0, -1, +1)  (edge 4-5)
+ *   Node 13: (+1,  0, +1)  (edge 5-6)
+ *   Node 14: ( 0, +1, +1)  (edge 6-7)
+ *   Node 15: (-1,  0, +1)  (edge 7-4)
+ *
+ * Mid-edge nodes on vertical edges:
+ *   Node 16: (-1, -1,  0)  (edge 0-4)
+ *   Node 17: (+1, -1,  0)  (edge 1-5)
+ *   Node 18: (+1, +1,  0)  (edge 2-6)
+ *   Node 19: (-1, +1,  0)  (edge 3-7)
+ *
+ *
+ * Hex27 (Triquadratic Hexahedron)
+ * -------------------------------
+ * Same as Hex20 plus face-center and body-center nodes:
+ *
+ * Face centers:
+ *   Node 20: ( 0,  0, -1)  (bottom face)
+ *   Node 21: ( 0,  0, +1)  (top face)
+ *   Node 22: ( 0, -1,  0)  (front face)
+ *   Node 23: (+1,  0,  0)  (right face)
+ *   Node 24: ( 0, +1,  0)  (back face)
+ *   Node 25: (-1,  0,  0)  (left face)
+ *
+ * Body center:
+ *   Node 26: (0, 0, 0)
+ *
+ *
+ * =============================================================================
+ * 2D TRIANGULAR ELEMENTS
+ * =============================================================================
+ *
+ * Triangle3 (Linear Triangle)
+ * ---------------------------
+ *
+ *   2
+ *   |\
+ *   | \
+ *   |  \
+ *   |   \
+ *   0----1
+ *
+ * Reference: (xi, eta) simplex with vertices at:
+ *   Node 0: (xi, eta) = (0, 0)
+ *   Node 1: (xi, eta) = (1, 0)
+ *   Node 2: (xi, eta) = (0, 1)
+ *
+ *
+ * Triangle6 (Quadratic Triangle)
+ * ------------------------------
+ *
+ *   2
+ *   |\
+ *   | \
+ *   5  4
+ *   |   \
+ *   0--3--1
+ *
+ * Corners: Nodes 0-2 (same as Triangle3)
+ *
+ * Mid-edge nodes:
+ *   Node 3: (0.5,   0)  (edge 0-1)
+ *   Node 4: (0.5, 0.5)  (edge 1-2)
+ *   Node 5: (  0, 0.5)  (edge 2-0)
+ *
+ *
+ * =============================================================================
+ * 3D TETRAHEDRAL ELEMENTS
+ * =============================================================================
+ *
+ * Tetra4 (Linear Tetrahedron)
+ * ---------------------------
+ *
+ *             3
+ *            /|\
+ *           / | \
+ *          /  |  \
+ *         /   |   \
+ *        /    |    \
+ *       0-----|-----2
+ *        \    |    /
+ *         \   |   /
+ *          \  |  /
+ *           \ | /
+ *            \|/
+ *             1
+ *
+ * Reference: (xi, eta, zeta) simplex with vertices at:
+ *   Node 0: (0, 0, 0)
+ *   Node 1: (1, 0, 0)
+ *   Node 2: (0, 1, 0)
+ *   Node 3: (0, 0, 1)
+ *
+ *
+ * Tetra10 (Quadratic Tetrahedron)
+ * -------------------------------
+ * Corners: Nodes 0-3 (same as Tetra4)
+ *
+ * Mid-edge nodes:
+ *   Node 4: (0.5,   0,   0)  (edge 0-1)
+ *   Node 5: (0.5, 0.5,   0)  (edge 1-2)
+ *   Node 6: (  0, 0.5,   0)  (edge 2-0)
+ *   Node 7: (  0,   0, 0.5)  (edge 0-3)
+ *   Node 8: (0.5,   0, 0.5)  (edge 1-3)
+ *   Node 9: (  0, 0.5, 0.5)  (edge 2-3)
+ *
+ *
+ * =============================================================================
+ * 3D WEDGE (PRISM) ELEMENTS
+ * =============================================================================
+ *
+ * Wedge6 (Linear Wedge)
+ * ---------------------
+ *
+ *         5
+ *        /|\
+ *       / | \
+ *      /  |  \
+ *     3---|---4
+ *     |   2   |
+ *     |  / \  |
+ *     | /   \ |
+ *     |/     \|
+ *     0-------1
+ *
+ * Reference: Triangle base at zeta = -1, top at zeta = +1
+ *
+ * Bottom face (zeta = -1):
+ *   Node 0: (0, 0, -1)
+ *   Node 1: (1, 0, -1)
+ *   Node 2: (0, 1, -1)
+ *
+ * Top face (zeta = +1):
+ *   Node 3: (0, 0, +1)
+ *   Node 4: (1, 0, +1)
+ *   Node 5: (0, 1, +1)
+ *
+ *
+ * Wedge15 (Quadratic Wedge)
+ * -------------------------
+ * Corners: Nodes 0-5 (same as Wedge6)
+ *
+ * Mid-edge nodes on bottom face:
+ *   Node 6:  (0.5,   0, -1)  (edge 0-1)
+ *   Node 7:  (0.5, 0.5, -1)  (edge 1-2)
+ *   Node 8:  (  0, 0.5, -1)  (edge 2-0)
+ *
+ * Mid-edge nodes on top face:
+ *   Node 9:  (0.5,   0, +1)  (edge 3-4)
+ *   Node 10: (0.5, 0.5, +1)  (edge 4-5)
+ *   Node 11: (  0, 0.5, +1)  (edge 5-3)
+ *
+ * Mid-edge nodes on vertical edges:
+ *   Node 12: (0, 0, 0)  (edge 0-3)
+ *   Node 13: (1, 0, 0)  (edge 1-4)
+ *   Node 14: (0, 1, 0)  (edge 2-5)
+ *
+ *
+ * Wedge18 (Complete Quadratic Wedge)
+ * ----------------------------------
+ * Corners and mid-edges: Nodes 0-14 (same as Wedge15)
+ *
+ * Face-center nodes on quadrilateral faces:
+ *   Node 15: (0.5, 0.0, 0.0)  (face with vertices 0-1-4-3, y = 0)
+ *   Node 16: (0.5, 0.5, 0.0)  (face with vertices 1-2-5-4, x + y = 1)
+ *   Node 17: (0.0, 0.5, 0.0)  (face with vertices 2-0-3-5, x = 0)
+ *
+ *
+ * =============================================================================
+ * 3D PYRAMID ELEMENTS
+ * =============================================================================
+ *
+ * Pyramid5 (Linear Pyramid)
+ * -------------------------
+ *
+ *           4
+ *          /|\
+ *         / | \
+ *        /  |  \
+ *       /   |   \
+ *      3----|----2
+ *      |    |    |
+ *      |    +    |   (apex projects to center of base)
+ *      |         |
+ *      0---------1
+ *
+ * Reference: Quad base in xi-eta plane at zeta = 0, apex at zeta = 1
+ *
+ * Base (zeta = 0):
+ *   Node 0: (-1, -1, 0)
+ *   Node 1: (+1, -1, 0)
+ *   Node 2: (+1, +1, 0)
+ *   Node 3: (-1, +1, 0)
+ *
+ * Apex:
+ *   Node 4: (0, 0, 1)
+ *
+ *
+ * Pyramid13 (Quadratic Pyramid)
+ * -----------------------------
+ * Corners: Nodes 0-4 (same as Pyramid5)
+ *
+ * Mid-edge nodes on base:
+ *   Node 5: ( 0, -1, 0)  (edge 0-1)
+ *   Node 6: (+1,  0, 0)  (edge 1-2)
+ *   Node 7: ( 0, +1, 0)  (edge 2-3)
+ *   Node 8: (-1,  0, 0)  (edge 3-0)
+ *
+ * Mid-edge nodes to apex:
+ *   Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
+ *   Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
+ *   Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
+ *   Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
+ *
+ *
+ * Pyramid14 (Quadratic Rational Pyramid)
+ * --------------------------------------
+ *
+ * This retained low-order compatibility layout matches the generated
+ * complete-family quadratic Lagrange ordering for the reference pyramid with
+ * base (-1,-1,0)..(1,1,0) and apex at (0,0,1). Nodes 0-12 coincide with the
+ * Pyramid13 layout; node 13 is the base center.
+ *
+ *   Base corners (same as Pyramid5):
+ *     Node 0: (-1, -1, 0)
+ *     Node 1: (+1, -1, 0)
+ *     Node 2: (+1, +1, 0)
+ *     Node 3: (-1, +1, 0)
+ *
+ *   Apex:
+ *     Node 4: (0, 0, 1)
+ *
+ *   Base mid-edges (same as Pyramid13):
+ *     Node 5:  ( 0, -1, 0)   (edge 0-1)
+ *     Node 6:  (+1,  0, 0)   (edge 1-2)
+ *     Node 7:  ( 0, +1, 0)   (edge 2-3)
+ *     Node 8:  (-1,  0, 0)   (edge 3-0)
+ *
+ *   Mid-edges to apex (same as Pyramid13):
+ *     Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
+ *     Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
+ *     Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
+ *     Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
+ *
+ *   Base center:
+ *     Node 13: (0, 0, 0)
+ *
+ *
+ * =============================================================================
+ * NOTES ON VTK COMPATIBILITY
+ * =============================================================================
+ *
+ * The node orderings above are consistent with VTK cell types:
+ *
+ *   VTK_LINE           (3)  -> Line2
+ *   VTK_QUADRATIC_EDGE (21) -> Line3
+ *   VTK_TRIANGLE       (5)  -> Triangle3
+ *   VTK_QUADRATIC_TRIANGLE (22) -> Triangle6
+ *   VTK_QUAD           (9)  -> Quad4
+ *   VTK_QUADRATIC_QUAD (23) -> Quad8
+ *   VTK_BIQUADRATIC_QUAD (28) -> Quad9
+ *   VTK_TETRA          (10) -> Tetra4
+ *   VTK_QUADRATIC_TETRA (24) -> Tetra10
+ *   VTK_HEXAHEDRON     (12) -> Hex8
+ *   VTK_QUADRATIC_HEXAHEDRON (25) -> Hex20
+ *   VTK_TRIQUADRATIC_HEXAHEDRON (29) -> Hex27  (NOTE(review): confirm the face-center order of nodes 20-25 against VTK)
+ *   VTK_WEDGE          (13) -> Wedge6
+ *   VTK_QUADRATIC_WEDGE (26) -> Wedge15
+ *   VTK_BIQUADRATIC_QUADRATIC_WEDGE (32) -> Wedge18
+ *   VTK_PYRAMID        (14) -> Pyramid5
+ *   VTK_QUADRATIC_PYRAMID (27) -> Pyramid13
+ *
+ *
+ * =============================================================================
+ * BARYCENTRIC COORDINATES
+ * =============================================================================
+ *
+ * For simplex elements, barycentric coordinates (lambda_0, ..., lambda_n)
+ * satisfy sum(lambda_i) = 1.
+ *
+ * Triangle:
+ *   lambda_0 = 1 - xi - eta
+ *   lambda_1 = xi
+ *   lambda_2 = eta
+ *
+ * Tetrahedron:
+ *   lambda_0 = 1 - xi - eta - zeta
+ *   lambda_1 = xi
+ *   lambda_2 = eta
+ *   lambda_3 = zeta
+ *
+ */
+
+#include <span>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief Basis-side reference node coordinate queries
+ *
+ * This is intentionally named differently from `svmp::NodeOrdering` in Mesh,
+ * which handles mesh-format permutations rather than reference basis layouts.
+ */
+class ReferenceNodeLayout {
+public:
+    /**
+     * @brief Get reference coordinates for a node
+     * @param elem_type Element type
+     * @param local_node Local node index (0-based)
+     *
+     * Complete-family low-order Lagrange aliases (`Line2/3`, `Triangle3/6`,
+     * `Quad4/9`, `Tetra4/10`, `Hex8/27`, `Wedge6/18`, `Pyramid5/14`) are
+     * served by the generated arbitrary-order Lagrange ordering path. Explicit
+     * hard-coded tables remain only for serendipity-only enums such as
+     * `Quad8`, `Hex20`, `Wedge15`, and `Pyramid13`.
+     * @return Reference coordinates (xi, eta, zeta)
+     * @throws BasisNodeOrderingException if elem_type has no node table or local_node is out of range
+     */
+    static math::Vector<Real, 3> get_node_coords(ElementType elem_type, std::size_t local_node);
+
+    /**
+     * @brief Get number of nodes for an element type
+     *
+     * Low-order complete-family Lagrange aliases use the same generated ordering path as `get_node_coords`.
+     * @throws BasisNodeOrderingException if elem_type has no node table
+     */
+    static std::size_t num_nodes(ElementType elem_type);
+
+    /**
+     * @brief Generate complete-family Lagrange node coordinates for a canonical topology and order
+     *
+     * This covers arbitrary-order complete nodal Lagrange spaces on the
+     * canonical topologies `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`,
+     * `Wedge6`, and `Pyramid5`. Serendipity variants are intentionally excluded.
+     * @throws BasisNodeOrderingException if order is negative or the topology is unsupported
+     */
+    static std::vector<math::Vector<Real, 3>>
+    get_lagrange_node_coords(ElementType canonical_type, int order);
+
+    /**
+     * @brief Optional mapping from mesh/reference node order to internal basis order
+     *
+     * Returns an empty span when the public node order is already the basis table
+     * order or no special mapping is registered; only `Hex20` has one at present.
+     */
+    static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
+
+    /**
+     * @brief Check if element is a simplex (triangle, tetrahedron)
+     */
+    static bool is_simplex(ElementType elem_type);
+
+    /**
+     * @brief Check if element uses tensor-product topology
+     */
+    static bool is_tensor_product(ElementType elem_type);
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
diff --git a/Code/Source/solver/FE/Basis/PyramidModalBasis.h b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
new file mode 100644
index 000000000..1ecdae282
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
@@ -0,0 +1,265 @@
+#ifndef SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
+#define SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
+
+// Shared rational/modal pyramid helpers for scalar complete-family and spectral
+// pyramid bases. The degenerate z=1 top plane is evaluated by its apex limit;
+// callers that reject non-apex top-plane queries must validate before calling.
+
+#include "BasisFunction.h"
+#include "BasisTolerance.h"
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace pyramid_modal {
+
+struct Term {  // One rational monomial: x^px * y^py * z^pz / (1 - z)^denom_power.
+    int px{0};           // exponent of x in the numerator
+    int py{0};           // exponent of y in the numerator
+    int pz{0};           // exponent of z in the numerator
+    int denom_power{0};  // exponent of t = 1 - z in the denominator (build_terms uses min(px, py))
+};
+
+struct EvaluationPoint {  // Per-point scratch data filled by prepare_evaluation_point.
+    Real x{Real(0)};  // xi[0]
+    Real y{Real(0)};  // xi[1]
+    Real z{Real(0)};  // xi[2]
+    Real t{Real(1)};  // 1 - z, the base of the rational denominator
+    bool top_plane{false};  // result of on_degenerate_top_plane(xi) for this point
+    std::vector<Real> x_powers;  // x_powers[p] == x^p
+    std::vector<Real> y_powers;  // y_powers[p] == y^p
+    std::vector<Real> z_powers;  // z_powers[p] == z^p
+    std::vector<Real> t_powers;  // t_powers[p] == t^p; only the single entry 1 when top_plane is set
+};
+
+// Enumerate the terms x^px y^py z^pz / (1 - z)^min(px, py) with pz <= order and px, py <= order - pz.
+inline std::vector<Term> build_terms(int order) {
+    std::vector<Term> terms;
+    if (order < 0) { return terms; }  // no terms; also keeps the count below from wrapping around
+    const auto m = static_cast<std::size_t>(order);
+    terms.reserve((m + 1u) * (m + 2u) * (2u * m + 3u) / 6u);  // sum of (k + 1)^2 for k = 0..order
+    for (int pz = 0; pz <= order; ++pz) {
+        const int n = order - pz;  // largest x and y exponent allowed in this z-layer
+        for (int py = 0; py <= n; ++py) {
+            for (int px = 0; px <= n; ++px) { terms.push_back(Term{px, py, pz, std::min(px, py)}); }
+        }
+    }
+    return terms;
+}
+
+inline bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi,  // reference point (x, y, z)
+                                    Real tolerance = detail::basis_scaled_tolerance()) {
+    return std::abs(Real(1) - xi[2]) <= tolerance;  // true when z is within tolerance of 1
+}
+
+inline void fill_powers(Real base, int max_power, std::vector<Real>& powers) {  // powers[k] = base^k
+    const auto count = static_cast<std::size_t>(max_power + 1);
+    powers.assign(count, Real(1));  // entry 0 stays 1; later entries are overwritten below
+    for (std::size_t k = 1; k < count; ++k) {
+        powers[k] = powers[k - 1] * base;  // running product, one multiplication per entry
+    }
+}
+
+// Cache xi's coordinates and the power tables evaluate_term reads for exponents up to the bounds.
+inline void prepare_evaluation_point(const math::Vector<Real, 3>& xi,
+                                     int max_px, int max_py, int max_pz,
+                                     int max_denom_power,
+                                     EvaluationPoint& point) {
+    point.x = xi[0];
+    point.y = xi[1];
+    point.z = xi[2];
+    point.t = Real(1) - point.z;
+    point.top_plane = on_degenerate_top_plane(xi);
+    // Numerator tables are filled on every path; negative bounds are treated as zero.
+    fill_powers(point.x, std::max(max_px, 0), point.x_powers);
+    fill_powers(point.y, std::max(max_py, 0), point.y_powers);
+    fill_powers(point.z, std::max(max_pz, 0), point.z_powers);
+    if (!point.top_plane) [[likely]] {
+        // Two powers beyond max_denom_power serve the first and second z-derivatives of 1 / t^d.
+        fill_powers(point.t, std::max(max_denom_power + 2, 0), point.t_powers);
+    } else {
+        point.t_powers.assign(1u, Real(1));  // placeholder table: only t^0 = 1 is stored
+    }
+}
+
+// Overload that derives the table bounds from a term list: each bound is the largest
+// exponent of its kind among `terms` (zero for an empty list), after which the explicit
+// overload prepares `point` for xi.
+inline void prepare_evaluation_point(const std::vector<Term>& terms,
+                                     const math::Vector<Real, 3>& xi,
+                                     EvaluationPoint& point) {
+    Term bound{};  // running component-wise maximum; every field starts at 0
+    for (const Term& candidate : terms) {
+        bound.px = std::max(bound.px, candidate.px);
+        bound.py = std::max(bound.py, candidate.py);
+        bound.pz = std::max(bound.pz, candidate.pz);
+        bound.denom_power = std::max(bound.denom_power, candidate.denom_power);
+    }
+    prepare_evaluation_point(xi, bound.px, bound.py, bound.pz, bound.denom_power, point);
+}
+
+inline void evaluate_term(const Term& term,               // rational monomial to evaluate
+                          const EvaluationPoint& point,   // prepared tables for the query point
+                          Real& value,                    // out: value of the term
+                          Gradient* gradient = nullptr,   // out, optional: first derivatives
+                          Hessian* hessian = nullptr) {   // out, optional: second derivatives
+    const auto pow_x = [&](int p) -> Real {  // table lookups; no bounds check on p
+        return point.x_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_y = [&](int p) -> Real {
+        return point.y_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_z = [&](int p) -> Real {
+        return point.z_powers[static_cast<std::size_t>(p)];
+    };
+    const auto pow_t = [&](int p) -> Real {
+        return point.t_powers[static_cast<std::size_t>(p)];
+    };
+    // Degenerate z = 1 plane: only pure-z terms (px == py == 0) are non-zero; t is never used.
+    if (point.top_plane) [[unlikely]] {
+        if (term.px == 0 && term.py == 0) {
+            value = pow_z(term.pz);
+        } else {
+            value = Real(0);
+        }
+        if (gradient != nullptr) {
+            *gradient = Gradient{};  // x and y derivatives are reported as zero here
+            if (term.px == 0 && term.py == 0 && term.pz > 0) {
+                (*gradient)[2] = static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+            }
+        }
+        if (hessian != nullptr) {
+            *hessian = Hessian{};  // only the zz entry can be non-zero on this path
+            if (term.px == 0 && term.py == 0 && term.pz > 1) {
+                (*hessian)(2, 2) =
+                    static_cast<Real>(term.pz * (term.pz - 1)) *
+                    pow_z(term.pz - 2);
+            }
+        }
+        return;
+    }
+    // Regular point: value = x^px * y^py * z^pz / t^denom_power with t = 1 - z.
+    const Real base = pow_x(term.px) * pow_y(term.py) * pow_z(term.pz);
+    const Real denom = pow_t(term.denom_power);
+    value = base / denom;
+    // Gradient: x and y act on the numerator only; z also acts on 1 / t^d (derivative d / t^(d+1)).
+    if (gradient != nullptr) {
+        *gradient = Gradient{};
+        if (term.px > 0) {
+            (*gradient)[0] =
+                static_cast<Real>(term.px) * pow_x(term.px - 1) *
+                pow_y(term.py) * pow_z(term.pz) / denom;
+        }
+        if (term.py > 0) {
+            (*gradient)[1] =
+                static_cast<Real>(term.py) * pow_x(term.px) *
+                pow_y(term.py - 1) * pow_z(term.pz) / denom;
+        }
+
+        Real gz = Real(0);
+        if (term.pz > 0) {  // numerator contribution
+            gz += static_cast<Real>(term.pz) * pow_x(term.px) *
+                  pow_y(term.py) * pow_z(term.pz - 1) / denom;
+        }
+        if (term.denom_power > 0) {  // denominator contribution
+            gz += static_cast<Real>(term.denom_power) * base / pow_t(term.denom_power + 1);
+        }
+        (*gradient)[2] = gz;
+    }
+
+    if (hessian == nullptr) {
+        return;
+    }
+    // Hessian: in-plane (x, y) entries first, then the mixed z entries, then the zz entry.
+    *hessian = Hessian{};
+    if (term.px > 1) {
+        (*hessian)(0, 0) =
+            static_cast<Real>(term.px * (term.px - 1)) *
+            pow_x(term.px - 2) * pow_y(term.py) * pow_z(term.pz) / denom;
+    }
+    if (term.py > 1) {
+        (*hessian)(1, 1) =
+            static_cast<Real>(term.py * (term.py - 1)) *
+            pow_x(term.px) * pow_y(term.py - 2) * pow_z(term.pz) / denom;
+    }
+    if (term.px > 0 && term.py > 0) {
+        const Real hxy =
+            static_cast<Real>(term.px * term.py) *
+            pow_x(term.px - 1) * pow_y(term.py - 1) * pow_z(term.pz) / denom;
+        (*hessian)(0, 1) = hxy;
+        (*hessian)(1, 0) = hxy;
+    }
+    // d2/dxdz: numerator-z part (zero when pz == 0) plus the part coming from 1 / t^d.
+    if (term.px > 0) {
+        Real hxz =
+            static_cast<Real>(term.px) * pow_x(term.px - 1) *
+            pow_y(term.py) / denom;
+        if (term.pz > 0) {
+            hxz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+        } else {
+            hxz = Real(0);
+        }
+        if (term.denom_power > 0) {
+            hxz += static_cast<Real>(term.px * term.denom_power) *
+                   pow_x(term.px - 1) * pow_y(term.py) *
+                   pow_z(term.pz) / pow_t(term.denom_power + 1);
+        }
+        (*hessian)(0, 2) = hxz;
+        (*hessian)(2, 0) = hxz;
+    }
+    // d2/dydz: same structure as d2/dxdz with the roles of x and y exchanged.
+    if (term.py > 0) {
+        Real hyz =
+            static_cast<Real>(term.py) * pow_x(term.px) *
+            pow_y(term.py - 1) / denom;
+        if (term.pz > 0) {
+            hyz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
+        } else {
+            hyz = Real(0);
+        }
+        if (term.denom_power > 0) {
+            hyz += static_cast<Real>(term.py * term.denom_power) *
+                   pow_x(term.px) * pow_y(term.py - 1) *
+                   pow_z(term.pz) / pow_t(term.denom_power + 1);
+        }
+        (*hessian)(1, 2) = hyz;
+        (*hessian)(2, 1) = hyz;
+    }
+    // d2/dz2 of z^pz / t^d: pz(pz-1) z^(pz-2)/t^d + 2 pz d z^(pz-1)/t^(d+1) + d(d+1) z^pz/t^(d+2).
+    Real hzz = Real(0);
+    if (term.pz > 1) {
+        hzz += static_cast<Real>(term.pz * (term.pz - 1)) *
+               pow_x(term.px) * pow_y(term.py) * pow_z(term.pz - 2) / denom;
+    }
+    if (term.pz > 0 && term.denom_power > 0) {
+        hzz += static_cast<Real>(2 * term.pz * term.denom_power) *
+               pow_x(term.px) * pow_y(term.py) *
+               pow_z(term.pz - 1) / pow_t(term.denom_power + 1);
+    }
+    if (term.denom_power > 0) {
+        hzz += static_cast<Real>(term.denom_power * (term.denom_power + 1)) *
+               base / pow_t(term.denom_power + 2);
+    }
+    (*hessian)(2, 2) = hzz;
+}
+
+// Convenience overload: build a one-off EvaluationPoint sized for `term` and evaluate it at xi.
+inline void evaluate_term(const Term& term,
+                          const math::Vector<Real, 3>& xi,
+                          Real& value,
+                          Gradient* gradient = nullptr,
+                          Hessian* hessian = nullptr) {
+    EvaluationPoint scratch;
+    prepare_evaluation_point(xi, term.px, term.py, term.pz, term.denom_power, scratch);
+    evaluate_term(term, scratch, value, gradient, hessian);
+}
+
+} // namespace pyramid_modal
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
new file mode 100644
index 000000000..309fd18be
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -0,0 +1,882 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "SerendipityBasis.h"
+#include "LagrangeBasis.h"
+#include "NodeOrderingConventions.h"
+#include "Math/DenseLinearAlgebra.h"
+#include "Math/IntegerMath.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <span>
+#include <string>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+using math::pow_int;
+
+namespace {
+using Vec3 = math::Vector<Real, 3>;
+
+int quad_serendipity_superlinear_degree(int ax, int ay) {  // only exponents above 1 are counted
+    return (ax < 2 ? 0 : ax) + (ay < 2 ? 0 : ay);
+}
+
+// Collect, ay-major, every (ax, ay) in [0, order]^2 whose superlinear degree is at most `order`.
+std::vector<std::array<int, 2>> quad_serendipity_exponents(int order) {
+    std::vector<std::array<int, 2>> kept;
+    for (int ay = 0; ay <= order; ++ay) {
+        for (int ax = 0; ax <= order; ++ax) {
+            if (quad_serendipity_superlinear_degree(ax, ay) > order) { continue; }
+            kept.push_back({ax, ay});
+        }
+    }
+    return kept;
+}
+
+std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
+    std::vector<Vec3> nodes;  // filled as: corners, then edge nodes, then chosen interior nodes
+    if (order <= 0) {
+        return nodes;  // no nodes are generated for a non-positive order
+    }
+
+    const Real inv_order = Real(1) / Real(order);
+    // Corners of [-1, 1]^2, counter-clockwise starting at (-1, -1); the third coordinate is always 0.
+    nodes.push_back(Vec3{Real(-1), Real(-1), Real(0)});
+    nodes.push_back(Vec3{Real(1),  Real(-1), Real(0)});
+    nodes.push_back(Vec3{Real(1),  Real(1),  Real(0)});
+    nodes.push_back(Vec3{Real(-1), Real(1),  Real(0)});
+    // order - 1 equally spaced nodes per edge, walking bottom, right, top, left in the same direction.
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(-1) + Real(2 * i) * inv_order, Real(-1), Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(1), Real(-1) + Real(2 * i) * inv_order, Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(1) - Real(2 * i) * inv_order, Real(1), Real(0)});
+    }
+    for (int i = 1; i < order; ++i) {
+        nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
+    }
+
+    if (nodes.size() > total_size) {  // the boundary alone already has more nodes than requested
+        throw BasisConstructionException(
+            "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size",
+            __FILE__, __LINE__, __func__);
+    }
+    // Whatever the boundary does not supply has to come from the interior lattice.
+    const std::size_t interior_count = total_size - nodes.size();
+    if (interior_count == 0u) {
+        return nodes;
+    }
+
+    std::vector<Vec3> interior_candidates;  // the (order - 1)^2 strictly interior lattice points
+    interior_candidates.reserve(static_cast<std::size_t>((order - 1) * (order - 1)));
+    for (int j = 1; j < order; ++j) {
+        for (int i = 1; i < order; ++i) {
+            interior_candidates.push_back(
+                Vec3{Real(-1) + Real(2 * i) * inv_order,
+                     Real(-1) + Real(2 * j) * inv_order,
+                     Real(0)});
+        }
+    }
+    // Rank candidates: smallest max-norm first, then smallest 1-norm, then smaller y, then smaller x.
+    std::sort(interior_candidates.begin(), interior_candidates.end(),
+              [](const Vec3& a, const Vec3& b) {
+                  const Real a_linf = std::max(std::abs(a[0]), std::abs(a[1]));
+                  const Real b_linf = std::max(std::abs(b[0]), std::abs(b[1]));
+                  if (a_linf != b_linf) {
+                      return a_linf < b_linf;
+                  }
+
+                  const Real a_l1 = std::abs(a[0]) + std::abs(a[1]);
+                  const Real b_l1 = std::abs(b[0]) + std::abs(b[1]);
+                  if (a_l1 != b_l1) {
+                      return a_l1 < b_l1;
+                  }
+
+                  if (a[1] != b[1]) {
+                      return a[1] < b[1];
+                  }
+                  return a[0] < b[0];
+              });
+
+    if (interior_count > interior_candidates.size()) {  // lattice too small for the request
+        throw BasisConstructionException(
+            "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order",
+            __FILE__, __LINE__, __func__);
+    }
+    // Append only the first interior_count ranked candidates.
+    nodes.insert(nodes.end(),
+                 interior_candidates.begin(),
+                 interior_candidates.begin() + static_cast<std::ptrdiff_t>(interior_count));
+    return nodes;
+}
+
+std::vector<Real> invert_dense_matrix(std::vector<Real> matrix, int n, const char* label) {
+    // Thin adapter over math::invert_dense_matrix: widens n and builds the descriptive string.
+    std::string context = std::string("SerendipityBasis interpolation matrix for ") + label;
+    const auto dimension = static_cast<std::size_t>(n);
+    return math::invert_dense_matrix(std::move(matrix), dimension, std::move(context));
+}
+
+std::vector<Real> quad_serendipity_inverse_vandermonde(  // inverse of the monomial-at-node matrix
+    std::span<const Vec3> nodes,                    // interpolation nodes; only x and y are read
+    std::span<const std::array<int, 2>> exponents,  // (ax, ay) per monomial; same count as nodes
+    int order) {                                    // used only to build the label string
+    const int n = static_cast<int>(nodes.size());
+    if (n == 0 || exponents.size() != nodes.size()) {  // the matrix must be square and non-empty
+        throw BasisConstructionException(
+            "SerendipityBasis: invalid quadrilateral serendipity interpolation setup",
+            __FILE__, __LINE__, __func__);
+    }
+    // Row-major n x n matrix with entry (row, col) = x_row^ax_col * y_row^ay_col.
+    std::vector<Real> vandermonde(static_cast<std::size_t>(n * n), Real(0));
+    auto idx = [n](int row, int col) -> std::size_t {
+        return static_cast<std::size_t>(row * n + col);
+    };
+
+    for (int row = 0; row < n; ++row) {
+        const Real x = nodes[static_cast<std::size_t>(row)][0];
+        const Real y = nodes[static_cast<std::size_t>(row)][1];
+        for (int col = 0; col < n; ++col) {
+            const auto [ax, ay] = exponents[static_cast<std::size_t>(col)];
+            vandermonde[idx(row, col)] = pow_int(x, ax) * pow_int(y, ay);
+        }
+    }
+    // Hand the matrix to invert_dense_matrix together with a label naming the order.
+    const std::string label = "Quad order " + std::to_string(order);
+    return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
+}
+constexpr std::array<Real, 13> kPyramid13CenterRedistribution = {  // 13 weights that sum to 1
+    Real(-0.25), Real(-0.25), Real(-0.25), Real(-0.25),  // NOTE(review): presumably base corners - confirm
+    Real(0),
+    Real(0.5), Real(0.5), Real(0.5), Real(0.5),
+    Real(0), Real(0), Real(0), Real(0)
+};  // NOTE(review): not referenced in this part of the file; confirm the intended use at its call site
+
+constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{  // row j: (a, b, c) of r^a s^b t^c
+    {{0, 0, 0}},
+    {{0, 0, 1}},
+    {{0, 0, 2}},
+    {{0, 1, 0}},
+    {{0, 1, 1}},
+    {{0, 1, 2}},
+    {{0, 2, 0}},
+    {{0, 2, 1}},
+    {{1, 0, 0}},
+    {{1, 0, 1}},
+    {{1, 0, 2}},
+    {{1, 1, 0}},
+    {{1, 1, 1}},
+    {{2, 0, 0}},
+    {{2, 0, 1}}
+}};  // no exponent exceeds 2, which is what lets the evaluator index 3-entry power tables
+
+constexpr std::array<std::array<Real, 15>, 15> kWedge15Coefficients = {{  // [j][i]: weight of monomial j in function i
+    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}},
+    {{-0.5, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+    {{0.5, -0, -0, 0.5, -0, -0, -0, -0, -0, -0, -0, -0, -1, -0, -0}},
+    {{-1, 0, -1, -1, 0, -1, 0, 0, 2, 0, 0, 2, -1, 0, 1}},
+    {{1.5, 0, 0.5, -1.5, 0, -0.5, 0, 0, -2, 0, 0, 2, 0, 0, 0}},
+    {{-0.5, -0, 0.5, -0.5, -0, 0.5, -0, -0, -0, -0, -0, -0, 1, -0, -1}},
+    {{1, 0, 1, 1, 0, 1, 0, 0, -2, 0, 0, -2, 0, 0, 0}},
+    {{-1, 0, -1, 1, 0, 1, 0, 0, 2, 0, 0, -2, 0, 0, 0}},
+    {{-1, -1, 0, -1, -1, 0, 2, 0, 0, 2, 0, 0, -1, 1, 0}},
+    {{1.5, 0.5, 0, -1.5, -0.5, 0, -2, 0, 0, 2, 0, 0, 0, 0, 0}},
+    {{-0.5, 0.5, -0, -0.5, 0.5, -0, -0, -0, -0, -0, -0, -0, 1, -1, -0}},
+    {{2, 0, -0, 2, 0, -0, -2, 2, -2, -2, 2, -2, -0, -0, -0}},
+    {{-2, 0, 0, 2, 0, 0, 2, -2, 2, -2, 2, -2, 0, 0, 0}},
+    {{1, 1, -0, 1, 1, -0, -2, -0, -0, -2, -0, -0, -0, -0, -0}},
+    {{-1, -1, -0, 1, 1, -0, 2, -0, -0, -2, -0, -0, -0, -0, -0}}
+}};  // rows follow the monomial order of kWedge15MonomialExponents
+
+static const int hex20_monomial_exponents[20][3] = {  // row j: exponents (a, b, c) of r^a s^b t^c
+    {0, 0, 0}, {0, 0, 1}, {0, 0, 2}, {0, 1, 0}, {0, 1, 1},
+    {0, 1, 2}, {0, 2, 0}, {0, 2, 1}, {1, 0, 0}, {1, 0, 1},
+    {1, 0, 2}, {1, 1, 0}, {1, 1, 1}, {1, 1, 2}, {1, 2, 0},
+    {1, 2, 1}, {2, 0, 0}, {2, 0, 1}, {2, 1, 0}, {2, 1, 1}
+};  // no exponent exceeds 2, matching the 3-entry tables returned by quadratic_powers
+
+static const Real hex20_coeffs[20][20] = {  // [j][i]: weight of monomial j in shape function i
+    {-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25},
+    {0.125, 0.125, 0.125, 0.125, -0.125, -0.125, -0.125, -0.125, -0.25, 0.25, -0.25, 0.25, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25},
+    {0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, -0.25, -0.25, 0.25, 0.25, 0, 0, 0, 0, -0.25, -0.25, 0.25, 0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0},
+    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0, 0, 0, 0, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0},
+    {0.125, -0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0, 0, 0, 0, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25, 0, 0, 0, 0},
+    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25},
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, -0.25, -0.25, 0.25},
+    {-0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0.125, -0.125, 0, 0, 0, 0, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25},
+    {-0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0},
+    {0.125, -0.125, -0.125, 0.125, -0.125, 0.125, 0.125, -0.125, 0, 0, 0, 0, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0},
+    {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, -0.25, -0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, -0.125, -0.125, 0.125, 0.125, 0.125, 0.125, 0.25, -0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {-0.125, -0.125, 0.125, 0.125, -0.125, -0.125, 0.125, 0.125, 0.25, 0.25, -0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0.125, 0.125, -0.125, -0.125, -0.125, -0.125, 0.125, 0.125, -0.25, 0.25, 0.25, -0.25, 0, 0, 0, 0, 0, 0, 0, 0}
+};  // rows follow the monomial order of hex20_monomial_exponents
+
+inline std::array<Real, 3> quadratic_powers(Real x) {  // {x^0, x^1, x^2}, indexed by exponent
+    return std::array<Real, 3>{Real(1), x, x * x};
+}
+
+// Evaluate the 20 Hex20 shape functions at (r, s, t), ordered like the columns of hex20_coeffs.
+// internal_vals must point to at least 20 writable entries.
+void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
+    const std::array<Real, 3> pr = quadratic_powers(r);
+    const std::array<Real, 3> ps = quadratic_powers(s);
+    const std::array<Real, 3> pt = quadratic_powers(t);
+    std::array<Real, 20> monomial;
+    for (std::size_t m = 0; m < 20u; ++m) {
+        const int* e = hex20_monomial_exponents[m];  // exponents (a, b, c) of r^a s^b t^c
+        monomial[m] = pr[static_cast<std::size_t>(e[0])] *
+                      ps[static_cast<std::size_t>(e[1])] *
+                      pt[static_cast<std::size_t>(e[2])];
+    }
+    for (std::size_t node = 0; node < 20u; ++node) {
+        Real sum = Real(0);
+        for (std::size_t m = 0; m < 20u; ++m) {
+            sum += hex20_coeffs[m][node] * monomial[m];  // column `node` of the coefficient table
+        }
+        internal_vals[node] = sum;
+    }
+}
+
+void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads) {  // writes 20 gradients
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    Real dphi_dr[20], dphi_ds[20], dphi_dt[20];  // per-monomial partial derivatives
+    for (int j = 0; j < 20; ++j) {
+        const int a = hex20_monomial_exponents[j][0];
+        const int b = hex20_monomial_exponents[j][1];
+        const int c = hex20_monomial_exponents[j][2];
+        // Differentiate r^a s^b t^c one variable at a time; a zero exponent gives a zero derivative.
+        dphi_dr[j] = (a > 0) ? Real(a) * rp[static_cast<std::size_t>(a - 1)] *
+                                    sp[static_cast<std::size_t>(b)] *
+                                    tp[static_cast<std::size_t>(c)]
+                              : Real(0);
+        dphi_ds[j] = (b > 0) ? rp[static_cast<std::size_t>(a)] *
+                                    Real(b) * sp[static_cast<std::size_t>(b - 1)] *
+                                    tp[static_cast<std::size_t>(c)]
+                              : Real(0);
+        dphi_dt[j] = (c > 0) ? rp[static_cast<std::size_t>(a)] *
+                                    sp[static_cast<std::size_t>(b)] *
+                                    Real(c) * tp[static_cast<std::size_t>(c - 1)]
+                              : Real(0);
+    }
+    // Combine the monomial derivatives with column i of hex20_coeffs to get gradient i.
+    for (int i = 0; i < 20; ++i) {
+        Real gr = Real(0), gs = Real(0), gt = Real(0);
+        for (int j = 0; j < 20; ++j) {
+            gr += hex20_coeffs[j][i] * dphi_dr[j];
+            gs += hex20_coeffs[j][i] * dphi_ds[j];
+            gt += hex20_coeffs[j][i] * dphi_dt[j];
+        }
+        internal_grads[i][0] = gr;
+        internal_grads[i][1] = gs;
+        internal_grads[i][2] = gt;
+    }
+}
+
+void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians) {  // writes 20 Hessians
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    Real d2phi_drr[20], d2phi_dss[20], d2phi_dtt[20];  // pure second derivatives per monomial
+    Real d2phi_drs[20], d2phi_drt[20], d2phi_dst[20];  // mixed second derivatives per monomial
+    for (int j = 0; j < 20; ++j) {
+        const int a = hex20_monomial_exponents[j][0];
+        const int b = hex20_monomial_exponents[j][1];
+        const int c = hex20_monomial_exponents[j][2];
+        // Second derivatives of r^a s^b t^c; each is zero unless the differentiated exponents allow it.
+        d2phi_drr[j] = (a > 1) ? Real(a * (a - 1)) *
+                                      rp[static_cast<std::size_t>(a - 2)] *
+                                      sp[static_cast<std::size_t>(b)] *
+                                      tp[static_cast<std::size_t>(c)]
+                                : Real(0);
+        d2phi_dss[j] = (b > 1) ? rp[static_cast<std::size_t>(a)] *
+                                      Real(b * (b - 1)) *
+                                      sp[static_cast<std::size_t>(b - 2)] *
+                                      tp[static_cast<std::size_t>(c)]
+                                : Real(0);
+        d2phi_dtt[j] = (c > 1) ? rp[static_cast<std::size_t>(a)] *
+                                      sp[static_cast<std::size_t>(b)] *
+                                      Real(c * (c - 1)) *
+                                      tp[static_cast<std::size_t>(c - 2)]
+                                : Real(0);
+        d2phi_drs[j] = (a > 0 && b > 0) ? Real(a * b) *
+                                              rp[static_cast<std::size_t>(a - 1)] *
+                                              sp[static_cast<std::size_t>(b - 1)] *
+                                              tp[static_cast<std::size_t>(c)]
+                                        : Real(0);
+        d2phi_drt[j] = (a > 0 && c > 0) ? Real(a * c) *
+                                              rp[static_cast<std::size_t>(a - 1)] *
+                                              sp[static_cast<std::size_t>(b)] *
+                                              tp[static_cast<std::size_t>(c - 1)]
+                                        : Real(0);
+        d2phi_dst[j] = (b > 0 && c > 0) ? rp[static_cast<std::size_t>(a)] *
+                                              Real(b * c) *
+                                              sp[static_cast<std::size_t>(b - 1)] *
+                                              tp[static_cast<std::size_t>(c - 1)]
+                                        : Real(0);
+    }
+    // Accumulate the upper triangle per function, then mirror it so the stored Hessian is symmetric.
+    for (int i = 0; i < 20; ++i) {
+        Hessian H{};
+        for (int j = 0; j < 20; ++j) {
+            H(0, 0) += hex20_coeffs[j][i] * d2phi_drr[j];
+            H(1, 1) += hex20_coeffs[j][i] * d2phi_dss[j];
+            H(2, 2) += hex20_coeffs[j][i] * d2phi_dtt[j];
+            H(0, 1) += hex20_coeffs[j][i] * d2phi_drs[j];
+            H(0, 2) += hex20_coeffs[j][i] * d2phi_drt[j];
+            H(1, 2) += hex20_coeffs[j][i] * d2phi_dst[j];
+        }
+        H(1, 0) = H(0, 1);
+        H(2, 0) = H(0, 2);
+        H(2, 1) = H(1, 2);
+        internal_hessians[i] = H;
+    }
+}
+
+void eval_wedge15_polynomial(Real r,  // reference coordinates (r, s, t)
+                             Real s,
+                             Real t,
+                             Real* values,         // optional out: 15 values, skipped when null
+                             Gradient* gradients,  // optional out: 15 gradients, skipped when null
+                             Hessian* hessians) {  // optional out: 15 Hessians, skipped when null
+    Real phi[15]{};  // monomial values; all scratch arrays are zero-initialised
+    Real dr[15]{};
+    Real ds[15]{};
+    Real dt[15]{};
+    Real drr[15]{};
+    Real dss[15]{};
+    Real dtt[15]{};
+    Real drs[15]{};
+    Real drt[15]{};
+    Real dst[15]{};
+    // Powers 0..2 of each coordinate; no exponent in kWedge15MonomialExponents exceeds 2.
+    const auto rp = quadratic_powers(r);
+    const auto sp = quadratic_powers(s);
+    const auto tp = quadratic_powers(t);
+    // Pass 1: evaluate every monomial r^a s^b t^c and whichever derivatives were requested.
+    for (int j = 0; j < 15; ++j) {
+        const auto& exponent = kWedge15MonomialExponents[static_cast<std::size_t>(j)];
+        const int a = exponent[0];
+        const int b = exponent[1];
+        const int c = exponent[2];
+        const auto ar = static_cast<std::size_t>(a);
+        const auto bs = static_cast<std::size_t>(b);
+        const auto ct = static_cast<std::size_t>(c);
+
+        const Real ra = rp[ar];
+        const Real sb = sp[bs];
+        const Real tc = tp[ct];
+
+        if (values) {
+            phi[j] = ra * sb * tc;
+        }
+        if (gradients) {  // the exponent guards keep the unsigned index subtractions from wrapping
+            dr[j] = (a > 0) ? Real(a) * rp[ar - 1u] * sb * tc : Real(0);
+            ds[j] = (b > 0) ? ra * Real(b) * sp[bs - 1u] * tc : Real(0);
+            dt[j] = (c > 0) ? ra * sb * Real(c) * tp[ct - 1u] : Real(0);
+        }
+        if (hessians) {
+            drr[j] = (a > 1) ? Real(a * (a - 1)) * rp[ar - 2u] * sb * tc : Real(0);
+            dss[j] = (b > 1) ? ra * Real(b * (b - 1)) * sp[bs - 2u] * tc : Real(0);
+            dtt[j] = (c > 1) ? ra * sb * Real(c * (c - 1)) * tp[ct - 2u] : Real(0);
+            drs[j] = (a > 0 && b > 0) ? Real(a * b) * rp[ar - 1u] * sp[bs - 1u] * tc : Real(0);
+            drt[j] = (a > 0 && c > 0) ? Real(a * c) * rp[ar - 1u] * sb * tp[ct - 1u] : Real(0);
+            dst[j] = (b > 0 && c > 0) ? ra * Real(b * c) * sp[bs - 1u] * tp[ct - 1u] : Real(0);
+        }
+    }
+    // Pass 2: function i is the sum of the monomials weighted by column i of kWedge15Coefficients.
+    for (int i = 0; i < 15; ++i) {
+        Real value = Real(0);
+        Real gr = Real(0);
+        Real gs = Real(0);
+        Real gt = Real(0);
+        Hessian H{};
+        for (int j = 0; j < 15; ++j) {
+            const Real coefficient =
+                kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
+            if (values) {
+                value += coefficient * phi[j];
+            }
+            if (gradients) {
+                gr += coefficient * dr[j];
+                gs += coefficient * ds[j];
+                gt += coefficient * dt[j];
+            }
+            if (hessians) {  // only the upper triangle is accumulated here
+                H(0, 0) += coefficient * drr[j];
+                H(1, 1) += coefficient * dss[j];
+                H(2, 2) += coefficient * dtt[j];
+                H(0, 1) += coefficient * drs[j];
+                H(0, 2) += coefficient * drt[j];
+                H(1, 2) += coefficient * dst[j];
+            }
+        }
+
+        const std::size_t index = static_cast<std::size_t>(i);
+        if (values) {
+            values[index] = value;
+        }
+        if (gradients) {
+            gradients[index][0] = gr;
+            gradients[index][1] = gs;
+            gradients[index][2] = gt;
+        }
+        if (hessians) {
+            H(1, 0) = H(0, 1);  // mirror the upper triangle into the lower one
+            H(2, 0) = H(0, 2);
+            H(2, 1) = H(1, 2);
+            hessians[index] = H;
+        }
+    }
+}
+
+} // namespace
+
+SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mode)  // validates type/order, sizes the basis
+    : element_type_(type), dimension_(0), order_(order), size_(0), geometry_mode_(geometry_mode) {
+    if (type == ElementType::Quad4 || type == ElementType::Quad8) {  // quadrilaterals: any order >= 1
+        dimension_ = 2;
+        if (order_ < 1) {
+            order_ = 1;  // orders below 1 are silently raised to 1
+        }
+        if (type == ElementType::Quad8 && order_ != 2) {  // checked after the clamp above
+            throw BasisConfigurationException(
+                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity",
+                __FILE__, __LINE__, __func__);
+        }
+        quad_monomial_exponents_ = quad_serendipity_exponents(order_);
+        size_ = quad_monomial_exponents_.size();  // one basis function per retained monomial
+        nodes_ = quad_serendipity_nodes(order_, size_);
+        if (nodes_.size() != size_) {
+            throw BasisConstructionException(
+                "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes",
+                __FILE__, __LINE__, __func__);
+        }
+        quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
+    } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {  // hexahedra: order 1 or 2 only
+        dimension_ = 3;
+        if (order_ < 1) order_ = 1;  // orders below 1 are silently raised to 1
+        if (order_ == 1) {
+            size_ = 8;  // size follows the order, whichever of the two hex enums was passed
+        } else if (order_ == 2) {
+            size_ = 20;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on hexahedra",
+                __FILE__, __LINE__, __func__);
+        }
+    } else if (type == ElementType::Wedge15) {  // wedge: quadratic only
+        dimension_ = 3;
+        if (order_ < 2) {
+            order_ = 2;  // orders below 2 are silently raised to 2
+        }
+        if (order_ == 2) {
+            size_ = 15;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on wedge15",
+                __FILE__, __LINE__, __func__);
+        }
+    } else if (type == ElementType::Pyramid13) {  // pyramid: quadratic only
+        dimension_ = 3;
+        if (order_ < 2) {
+            order_ = 2;  // orders below 2 are silently raised to 2
+        }
+        if (order_ == 2) {
+            size_ = 13;
+        } else {
+            throw BasisConfigurationException(
+                "SerendipityBasis supports up to quadratic on pyramid13",
+                __FILE__, __LINE__, __func__);
+        }
+    } else {  // any other element type is rejected
+        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, Wedge15, and Pyramid13 elements",
+                                                 __FILE__, __LINE__, __func__);
+    }
+    // If no nodes were generated above, take size_ of them from the reference layout tables.
+    if (nodes_.empty()) {
+        nodes_.reserve(size_);
+        for (std::size_t i = 0; i < size_; ++i) {
+            nodes_.push_back(ReferenceNodeLayout::get_node_coords(element_type_, i));
+        }
+    }
+}
+
+bool SerendipityBasis::cache_identity_words(std::vector<std::uint64_t>& words) const {
+    const std::uint64_t identity[] = {
+        0x736572656e646970ULL,  // fixed tag: the ASCII bytes of "serendip"
+        static_cast<std::uint64_t>(basis_type()), static_cast<std::uint64_t>(element_type_),
+        static_cast<std::uint64_t>(dimension_), static_cast<std::uint64_t>(order_),
+        static_cast<std::uint64_t>(size_), geometry_mode_ ? 1u : 0u
+    };
+    words.insert(words.end(), std::begin(identity), std::end(identity));  // appended in listed order
+    return true;  // this implementation always reports success
+}
+
+// Evaluate every basis function at the point xi.
+// `values` is resized to size_ and zero-filled before dispatch, so entries a
+// branch does not write stay zero. Throws BasisEvaluationException if the 2D
+// tables are not initialized or if no branch matches the configuration.
+void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
+                                       std::vector<Real>& values) const {
+    values.assign(size_, Real(0));
+    const Real x = xi[0];
+    const Real y = xi[1];
+    const Real z = xi[2];
+
+    if (dimension_ == 2) {
+        if (quad_monomial_exponents_.size() != size_ ||
+            quad_inv_vandermonde_.size() != size_ * size_) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> monomials(size_, Real(0));
+        for (std::size_t j = 0; j < size_; ++j) {
+            const auto [ax, ay] = quad_monomial_exponents_[j];
+            monomials[j] = pow_int(x, ax) * pow_int(y, ay);
+        }
+
+        // values[i] = sum_j monomials[j] * table[j * size_ + i]  (table indexed [monomial, basis]).
+        for (std::size_t i = 0; i < size_; ++i) {
+            Real value = Real(0);
+            for (std::size_t j = 0; j < size_; ++j) {
+                value += monomials[j] * quad_inv_vandermonde_[j * size_ + i];
+            }
+            values[i] = value;
+        }
+        return;
+    }
+
+    // Trilinear Hex8 corner functions, shared by the linear-hex path and the
+    // Hex20 geometry-mode path.
+    const auto fill_trilinear_corners = [&values, x, y, z]() {
+        values[0] = Real(0.125) * (Real(1) - x) * (Real(1) - y) * (Real(1) - z);
+        values[1] = Real(0.125) * (Real(1) + x) * (Real(1) - y) * (Real(1) - z);
+        values[2] = Real(0.125) * (Real(1) + x) * (Real(1) + y) * (Real(1) - z);
+        values[3] = Real(0.125) * (Real(1) - x) * (Real(1) + y) * (Real(1) - z);
+        values[4] = Real(0.125) * (Real(1) - x) * (Real(1) - y) * (Real(1) + z);
+        values[5] = Real(0.125) * (Real(1) + x) * (Real(1) - y) * (Real(1) + z);
+        values[6] = Real(0.125) * (Real(1) + x) * (Real(1) + y) * (Real(1) + z);
+        values[7] = Real(0.125) * (Real(1) - x) * (Real(1) + y) * (Real(1) + z);
+    };
+
+    if (dimension_ == 3 && order_ == 1) {
+        fill_trilinear_corners();
+        return;
+    }
+
+    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
+        // Corners use the Hex8 functions; the remaining edge-node entries keep
+        // the zero written by values.assign() above.
+        fill_trilinear_corners();
+        return;
+    }
+
+    if (element_type_ == ElementType::Hex20) {
+        Real internal_vals[20];
+        eval_hex20_internal(x, y, z, internal_vals);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t i = 0; i < 20; ++i) {
+            values[i] = internal_vals[mesh_to_basis[i]];
+        }
+        return;
+    }
+
+    if (element_type_ == ElementType::Wedge15) {
+        eval_wedge15_polynomial(x, y, z, values.data(), nullptr, nullptr);
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
+        std::array<Real, 14> parent_values{};
+        parent.evaluate_values_to(xi, parent_values.data());
+        // Fold the parent's last function (index 13) into the 13 retained nodes.
+        for (std::size_t i = 0; i < 13; ++i) {
+            values[i] = parent_values[i] + kPyramid13CenterRedistribution[i] * parent_values[13];
+        }
+        return;
+    }
+
+    // Match evaluate_gradients/evaluate_hessians: fail loudly instead of
+    // returning the zero-filled vector for a configuration no branch handles.
+    throw BasisEvaluationException("SerendipityBasis::evaluate_values: unsupported serendipity configuration",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,  // Fills gradients[i] with the derivative of basis function i with respect to xi.
+                                          std::vector<Gradient>& gradients) const {  // Throws BasisEvaluationException for uninitialized 2D tables or an unhandled configuration.
+    gradients.assign(size_, Gradient{});  // resize to size_; components a branch does not write keep Gradient{}
+
+    const Real x = xi[0];
+    const Real y = xi[1];
+    const Real z = xi[2];
+
+    if (dimension_ == 2) {
+        if (quad_monomial_exponents_.size() != size_ ||
+            quad_inv_vandermonde_.size() != size_ * size_) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for gradient evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+
+        std::vector<Real> dmon_dx(size_, Real(0));  // d/dx of each monomial x^ax * y^ay
+        std::vector<Real> dmon_dy(size_, Real(0));  // d/dy of each monomial x^ax * y^ay
+        for (std::size_t j = 0; j < size_; ++j) {
+            const auto [ax, ay] = quad_monomial_exponents_[j];
+            dmon_dx[j] = (ax > 0) ? Real(ax) * pow_int(x, ax - 1) * pow_int(y, ay) : Real(0);  // guard avoids a negative exponent
+            dmon_dy[j] = (ay > 0) ? pow_int(x, ax) * Real(ay) * pow_int(y, ay - 1) : Real(0);
+        }
+
+        for (std::size_t i = 0; i < size_; ++i) {
+            Real gx = Real(0);
+            Real gy = Real(0);
+            for (std::size_t j = 0; j < size_; ++j) {
+                const Real coeff = quad_inv_vandermonde_[j * size_ + i];  // table entry [monomial j, basis i]
+                gx += dmon_dx[j] * coeff;
+                gy += dmon_dy[j] * coeff;
+            }
+            gradients[i][0] = gx;
+            gradients[i][1] = gy;  // component 2 is left as set by assign() above
+        }
+        return;
+    }
+
+    // 3D linear hex (Hex8)
+    if (dimension_ == 3 && order_ == 1) {
+        const Real r = x, s = y, t = z;
+        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
+
+        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        return;
+    }
+
+    // Hex20 geometry mode: use Hex8 gradients. NOTE(review): the Quad8 test below looks unreachable (the 2D branch returns earlier) - confirm.
+    if (dimension_ == 3 && order_ == 2 && geometry_mode_ &&
+        (element_type_ == ElementType::Hex20 || element_type_ == ElementType::Quad8)) {
+        const Real r = x, s = y, t = z;
+        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
+        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
+        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
+        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
+        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
+
+        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
+
+        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
+        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
+
+        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
+        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
+
+        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
+        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
+        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        // Edge-node gradients remain zero
+        return;
+    }
+
+    // Hex20 analytical gradients using monomial differentiation
+    if (element_type_ == ElementType::Hex20 && order_ == 2) {
+        const Real r = x, s = y, t = z;
+        Gradient internal_grads[20];
+        eval_hex20_grad_internal(r, s, t, internal_grads);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t i = 0; i < 20; ++i) {
+            gradients[i] = internal_grads[mesh_to_basis[i]];  // output slot i takes the internal entry selected by the ordering table
+        }
+        return;
+    }
+
+    // Wedge15 analytical gradients using monomial differentiation
+    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
+        eval_wedge15_polynomial(x, y, z, nullptr, gradients.data(), nullptr);  // only the gradient output pointer is supplied
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
+        std::array<Real, 14u * 3u> parent_gradients{};  // 14 parent functions x 3 components, flat
+        // Pyramid13 inherits the complete-family pyramid apex contract from the
+        // parent basis rather than introducing a separate regularized path.
+        parent.evaluate_gradients_to(xi, parent_gradients.data());
+        const auto parent_gradient = [&](std::size_t node, std::size_t component) {
+            return parent_gradients[node * 3u + component];
+        };
+        for (std::size_t i = 0; i < 13; ++i) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                gradients[i][c] =
+                    parent_gradient(i, c) +
+                    kPyramid13CenterRedistribution[i] * parent_gradient(13u, c);  // add this node's share of parent function 13
+            }
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
+                                   __FILE__, __LINE__, __func__);
+}
+
+void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                         std::vector<Hessian>& hessians) const {
+    hessians.assign(size_, Hessian{});
+    const Real u = xi[0];
+    const Real v = xi[1];
+    const Real w = xi[2];
+    // Branches are tried in order; each returns as soon as it has filled `hessians`.
+    if (dimension_ == 2) {
+        if (!(quad_monomial_exponents_.size() == size_ &&
+              quad_inv_vandermonde_.size() == size_ * size_)) {
+            throw BasisEvaluationException(
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for Hessian evaluation",
+                __FILE__, __LINE__, __func__);
+        }
+        // Second partials of each monomial u^p * v^q, evaluated at (u, v).
+        std::vector<Real> mono_uu(size_, Real(0));
+        std::vector<Real> mono_uv(size_, Real(0));
+        std::vector<Real> mono_vv(size_, Real(0));
+        for (std::size_t m = 0; m < size_; ++m) {
+            const int p = quad_monomial_exponents_[m][0], q = quad_monomial_exponents_[m][1];
+            mono_uu[m] = (p >= 2) ? Real(p * (p - 1)) * pow_int(u, p - 2) * pow_int(v, q) : Real(0);
+            mono_uv[m] = (p >= 1 && q >= 1) ? Real(p * q) * pow_int(u, p - 1) * pow_int(v, q - 1) : Real(0);
+            mono_vv[m] = (q >= 2) ? Real(q * (q - 1)) * pow_int(u, p) * pow_int(v, q - 2) : Real(0);
+        }
+        // Contract with column `b` of the [monomial, basis] coefficient table.
+        for (std::size_t b = 0; b < size_; ++b) {
+            for (std::size_t m = 0; m < size_; ++m) {
+                const Real weight = quad_inv_vandermonde_[m * size_ + b];
+                hessians[b](0, 0) += mono_uu[m] * weight;
+                hessians[b](0, 1) += mono_uv[m] * weight;
+                hessians[b](1, 1) += mono_vv[m] * weight;
+            }
+            hessians[b](1, 0) = hessians[b](0, 1);
+        }
+        return;
+    }
+
+    if (order_ == 1 && element_type_ == ElementType::Hex8) {
+        static const LagrangeBasis linear_hex(ElementType::Hex8, 1);
+        linear_hex.evaluate_hessians(xi, hessians);
+        return;
+    }
+
+    if (element_type_ == ElementType::Hex20 && geometry_mode_) {
+        static const LagrangeBasis corner_basis(ElementType::Hex8, 1);
+        std::array<Real, 8u * 9u> flat{};
+        corner_basis.evaluate_hessians_to(xi, flat.data());
+        // Unpack eight row-major 3x3 blocks; other entries keep the Hessian{} from assign().
+        for (std::size_t node = 0; node < 8; ++node) {
+            const Real* block = flat.data() + node * 9u;
+            for (std::size_t k = 0; k < 9u; ++k) {
+                hessians[node](k / 3u, k % 3u) = block[k];
+            }
+        }
+        return;
+    }
+
+    if (order_ == 2 && element_type_ == ElementType::Hex20) {
+        Hessian basis_order[20];
+        eval_hex20_hess_internal(u, v, w, basis_order);
+        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
+        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
+                         "Hex20 mesh-to-basis ordering is not registered");
+        for (std::size_t node = 0; node < 20; ++node) {
+            hessians[node] = basis_order[mesh_to_basis[node]];
+        }
+        return;
+    }
+
+    if (order_ == 2 && element_type_ == ElementType::Wedge15) {
+        eval_wedge15_polynomial(u, v, w, nullptr, nullptr, hessians.data());
+        return;
+    }
+
+    if (element_type_ == ElementType::Pyramid13) {
+        static const LagrangeBasis full_pyramid(ElementType::Pyramid14, 2);
+        std::array<Real, 14u * 9u> flat{};
+        // The apex contract is inherited from the complete-family parent basis;
+        // no separate regularized path is introduced here.
+        full_pyramid.evaluate_hessians_to(xi, flat.data());
+        const Hessian center = load_hessian(flat.data() + 13u * 9u);
+        for (std::size_t node = 0; node < 13; ++node) {
+            hessians[node] = load_hessian(flat.data() + node * 9u);
+            add_scaled_hessian(hessians[node], center, kPyramid13CenterRedistribution[node]);
+        }
+        return;
+    }
+
+    throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
+                                   __FILE__, __LINE__, __func__);
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
new file mode 100644
index 000000000..98c01415a
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -0,0 +1,70 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_SERENDIPITYBASIS_H
+#define SVMP_FE_BASIS_SERENDIPITYBASIS_H
+
+/**
+ * @file SerendipityBasis.h
+ * @brief Reduced-degree-of-freedom serendipity bases
+ *
+ * `Pyramid13` inherits its apex contract from the complete-family rational
+ * pyramid basis: values remain exact at the apex, while exact-apex gradient
+ * and Hessian queries throw because the inherited nodal derivative limit is
+ * not unique.
+ */
+
+#include "BasisFunction.h"
+
+#include <array>
+#include <cstdint>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+class SerendipityBasis : public BasisFunction {  // Scalar serendipity basis with value, gradient and Hessian evaluation.
+public:
+    SerendipityBasis(ElementType type, int order, bool geometry_mode = false);  // throws for element types outside Quad4/Quad8, Hex8/Hex20, Wedge15, Pyramid13
+
+    BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }  // stored order; the constructor raises it to 2 for Wedge15/Pyramid13
+    std::size_t size() const noexcept override { return size_; }  // number of basis functions
+    const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }  // node coordinates, one per basis function
+    bool cache_identity_words(std::vector<std::uint64_t>& words) const override;  // appends tag, basis/element type, dimension, order, size, geometry flag; returns true
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override;  // values is resized to size()
+
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const override;  // gradients is resized to size()
+
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const override;  // hessians is resized to size()
+
+private:
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_;
+    std::vector<math::Vector<Real, 3>> nodes_;  // filled from ReferenceNodeLayout::get_node_coords when the constructor finds it empty
+    std::vector<std::array<int, 2>> quad_monomial_exponents_;  // per monomial {x exponent, y exponent}; read only by the 2D evaluation paths
+    // Row-major inverse Vandermonde, indexed as [monomial, basis].
+    std::vector<Real> quad_inv_vandermonde_;  // 2D paths require size_ * size_ entries and throw otherwise
+
+    // When true, this basis is used purely for geometry mapping and may use
+    // reduced polynomial order (e.g., Hex20 geometry as Hex8).
+    bool geometry_mode_;
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_SERENDIPITYBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasis.h b/Code/Source/solver/FE/Basis/VectorBasis.h
new file mode 100644
index 000000000..d442c2160
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasis.h
@@ -0,0 +1,255 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASIS_H
+#define SVMP_FE_BASIS_VECTORBASIS_H
+
+/**
+ * @file VectorBasis.h
+ * @brief Vector-valued bases for H(div) and H(curl) conforming spaces
+ */
+
+#include "BasisFunction.h"
+#include "VectorBasisModalPolynomial.h"
+#include <array>
+#include <cstddef>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/**
+ * @brief DOF entity type for vector-valued basis functions
+ */
+enum class DofEntity {
+    Vertex,   ///< DOF associated with a vertex
+    Edge,     ///< DOF associated with an edge (tangential moments for H(curl))
+    Face,     ///< DOF associated with a face (normal moments for H(div), tangential moments for H(curl))
+    Interior  ///< DOF associated with the element interior (also the default in DofAssociation)
+};
+
+/**
+ * @brief DOF association metadata for a single DOF
+ */
+struct DofAssociation {
+    DofEntity entity_type{DofEntity::Interior};  ///< Kind of entity the DOF belongs to (defaults to Interior)
+    int entity_id{-1};      ///< Local index of the entity (edge/face/vertex); -1 until assigned
+    int moment_index{0};    ///< Index within the entity's moment space (defaults to 0)
+};
+
+struct SparseModalCoefficientMatrix {  // NOTE(review): field names suggest a compressed-row sparse layout - confirm against the code that fills it
+    std::size_t rows{0};
+    std::size_t cols{0};
+    std::vector<std::size_t> row_offsets;  // presumably per-row start offsets into dofs/coefficients - TODO confirm
+    std::vector<std::size_t> dofs;  // presumably the target DOF index of each stored entry - TODO confirm
+    std::vector<Real> coefficients;  // presumably the stored value of each entry, parallel to dofs - TODO confirm
+};
+
+class VectorBasisFunction : public BasisFunction {
+public:
+    bool is_vector_valued() const noexcept override { return true; }
+    bool supports_vector_jacobians() const noexcept override { return true; }
+    void evaluate_values(const math::Vector<Real, 3>&,
+                         std::vector<Real>&) const override {
+        throw BasisEvaluationException("Vector basis uses evaluate_vector_values",
+                                       __FILE__, __LINE__, __func__);
+    }
+
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /**
+     * @brief Describe the geometric entity behind every basis function
+     *
+     * The returned vector has size() entries; entry i names the entity
+     * (vertex/edge/face/interior) that DOF i belongs to. Orientation-aware
+     * assembly of H(div) and H(curl) spaces relies on this metadata.
+     * The base-class version below is only a fallback.
+     */
+    virtual std::vector<DofAssociation> dof_associations() const {
+        // Fallback when a subclass does not override: every DOF is reported
+        // as an interior DOF of entity 0, numbered in basis order.
+        std::vector<DofAssociation> all_interior(size());
+        int moment = 0;
+        for (DofAssociation& entry : all_interior) {
+            entry = DofAssociation{DofEntity::Interior, 0, moment++};
+        }
+        return all_interior;
+    }
+};
+
+/**
+ * @brief Raviart-Thomas H(div) basis on supported element families
+ */
+class RaviartThomasBasis : public VectorBasisFunction {
+public:
+    RaviartThomasBasis(ElementType type, int order = 0);  // order defaults to 0
+
+    BasisType basis_type() const noexcept override { return BasisType::RaviartThomas; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& divergence) const override;
+    bool supports_divergence() const noexcept override { return true; }  // pairs with evaluate_divergence above
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (face/edge DOFs for 2D, face DOFs for 3D H(div))
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+    using SeedJacobianEvaluator = void (*)(  // plain function pointer: (evaluation point, output Jacobians)
+        const math::Vector<Real, 3>&,
+        std::vector<VectorJacobian>&);
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+
+    bool nodal_generated_{false};
+    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid RT(k=1,2) transformed from direct seed functions
+    std::vector<int> transformed_seed_indices_;
+    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
+    std::vector<ModalPolynomial> monomials_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};  // presumably the highest exponent needed per axis - TODO confirm in the implementation
+    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};  // presumably the same limits for the transformed path - TODO confirm
+    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};  // null until a seed evaluator is installed
+    // Sparse coefficients for nodal basis in modal monomial basis:
+    //   phi_j = sum_p c(p,j) * modal_p.
+    // Rows index modal functions; entries target nodal DOFs.
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
+};
+
+/**
+ * @brief First-kind Nedelec H(curl) basis on supported element families
+ */
+class NedelecBasis : public VectorBasisFunction {
+public:
+    NedelecBasis(ElementType type, int order = 0);  // order defaults to 0
+
+    BasisType basis_type() const noexcept override { return BasisType::Nedelec; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_curl(const math::Vector<Real, 3>& xi,
+                       std::vector<math::Vector<Real, 3>>& curl) const override;
+    bool supports_curl() const noexcept override { return true; }  // pairs with evaluate_curl above
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (edge DOFs for H(curl), face DOFs for 3D interior)
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+    using SeedJacobianEvaluator = void (*)(  // plain function pointer: (evaluation point, output Jacobians)
+        const math::Vector<Real, 3>&,
+        std::vector<VectorJacobian>&);
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+
+    bool nodal_generated_{false};
+    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid ND(k=1,2) transformed from direct seed/candidate functions
+    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
+    std::vector<ModalPolynomial> monomials_;
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};  // presumably the highest exponent needed per axis - TODO confirm in the implementation
+    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};  // presumably the same limits for the transformed path - TODO confirm
+    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};  // null until a seed evaluator is installed
+};
+
+/**
+ * @brief Brezzi-Douglas-Marini basis (simple linear variant)
+ */
+class BDMBasis : public VectorBasisFunction {
+public:
+    BDMBasis(ElementType type, int order = 1);  // order defaults to 1 (RaviartThomasBasis and NedelecBasis default to 0)
+
+    BasisType basis_type() const noexcept override { return BasisType::BDM; }
+    ElementType element_type() const noexcept override { return element_type_; }
+    int dimension() const noexcept override { return dimension_; }
+    int order() const noexcept override { return order_; }
+    std::size_t size() const noexcept override { return size_; }
+    bool cache_identity_is_structural() const noexcept override { return true; }
+
+    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
+                                std::vector<math::Vector<Real, 3>>& values) const override;
+    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
+                                   std::vector<VectorJacobian>& jacobians) const override;
+    void evaluate_divergence(const math::Vector<Real, 3>& xi,
+                             std::vector<Real>& divergence) const override;
+    bool supports_divergence() const noexcept override { return true; }  // pairs with evaluate_divergence above
+    void evaluate_vector_at_quadrature_points_strided(
+        const std::vector<math::Vector<Real, 3>>& points,
+        std::size_t output_stride,
+        Real* SVMP_RESTRICT values_out,
+        Real* SVMP_RESTRICT jacobians_out,
+        Real* SVMP_RESTRICT curls_out,
+        Real* SVMP_RESTRICT divergence_out) const override;
+
+    /// Get DOF associations (face/edge DOFs for H(div))
+    std::vector<DofAssociation> dof_associations() const override;
+
+private:
+    using ModalTerm = VectorBasisModalTerm;
+    using ModalPolynomial = VectorBasisModalPolynomial;
+
+    ElementType element_type_;
+    int dimension_;
+    int order_;
+    std::size_t size_{0};
+    bool nodal_generated_{false};
+    std::vector<ModalPolynomial> monomials_;
+    SparseModalCoefficientMatrix modal_sparse_coeffs_;
+    std::array<int, 3> modal_power_limits_{{0, 0, 0}};  // presumably the highest exponent needed per axis - TODO confirm in the implementation
+};
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
new file mode 100644
index 000000000..7ec848633
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
@@ -0,0 +1,593 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "VectorBasisEvaluationHelpers.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace vector_common {
+
+// Per-thread scratch accessor. Thread-local by design: production assembly
+// keeps a persistent worker-thread team, so each worker's buffers stay warm.
+VectorBasisScratch& vector_basis_scratch() {
+    thread_local VectorBasisScratch per_thread_buffers;
+    return per_thread_buffers;
+}
+
+void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts) {
+    vector_basis_scratch().prewarm(max_size, max_qpts);  // Affects the calling thread's scratch only.
+}
+
+// out[p] = x^p for p = 0..max_p, built by repeated multiplication.
+// A negative max_p trips BASIS_CHECK_CONSTRUCTION.
+void fill_powers(Real x, int max_p, std::vector<Real>& out) {
+    BASIS_CHECK_CONSTRUCTION(max_p >= 0, "powers: negative max_p");
+    const auto count = static_cast<std::size_t>(max_p + 1);
+    out.assign(count, Real(1));
+    for (std::size_t p = 1; p < count; ++p) out[p] = out[p - 1] * x;
+}
+
+// Fills scratch.px/py/pz with the powers of xi[0..2] up to limits[0..2].
+void fill_power_tables(const Vec3& xi, const std::array<int, 3>& limits,
+                       VectorBasisScratch& scratch) {
+    std::vector<Real>* const tables[3] = {&scratch.px, &scratch.py, &scratch.pz};
+    for (std::size_t axis = 0; axis < 3u; ++axis)
+        fill_powers(xi[axis], limits[axis], *tables[axis]);
+}
+
+namespace {
+
+constexpr Real kSparseCoefficientRelativeTolerance =
+    Real(256) * std::numeric_limits<Real>::epsilon();
+
+// Builds a power-major table for one coordinate axis:
+//   out[p * num_qpts + q] == points[q][axis]^p,  p = 0..max_power.
+// Row 0 is all ones, row 1 is the coordinate, later rows multiply it in again.
+void fill_batched_axis_powers(const std::vector<Vec3>& points,
+                              std::size_t axis,
+                              int max_power,
+                              std::vector<Real>& out) {
+    BASIS_CHECK_CONSTRUCTION(max_power >= 0, "batched powers: negative max_p");
+    const std::size_t num_qpts = points.size();
+    const auto num_rows = static_cast<std::size_t>(max_power + 1);
+    out.assign(num_rows * num_qpts, Real(1));
+    if (num_rows == 1u || num_qpts == 0) {
+        return;  // Nothing beyond the row of ones (or an empty table).
+    }
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        out[num_qpts + q] = points[q][axis];
+    }
+    for (std::size_t row = 2; row < num_rows; ++row) {
+        const std::size_t base = row * num_qpts;
+        for (std::size_t q = 0; q < num_qpts; ++q) {
+            out[base + q] = out[base - num_qpts + q] * points[q][axis];
+        }
+    }
+}
+
+} // namespace
+
+void fill_batched_power_tables(const std::vector<Vec3>& points, const std::array<int, 3>& limits,
+                               VectorBasisScratch& scratch) {
+    std::vector<Real>* const tables[3] = {  // One power-major table per axis.
+        &scratch.batched_px, &scratch.batched_py, &scratch.batched_pz};
+    for (std::size_t axis = 0; axis < 3u; ++axis)
+        fill_batched_axis_powers(points, axis, limits[axis], *tables[axis]);
+}
+
+// Each strided output row must hold at least num_qpts entries; a shorter
+// stride throws BasisConfigurationException, prefixed with family_name.
+void validate_vector_strided_outputs(std::size_t num_qpts, std::size_t output_stride,
+                                     const char* family_name) {
+    if (num_qpts <= output_stride) return;
+    const std::string message = std::string(family_name) +
+        " strided vector evaluation requires output_stride >= points.size()";
+    throw BasisConfigurationException(message, __FILE__, __LINE__,
+                                      __func__);
+}
+
+// Zeroes the first num_qpts entries of each of the `rows` strided rows; the
+// tail of every row (from num_qpts up to output_stride) is left untouched.
+void zero_active_strided_rows(Real* output, std::size_t rows,
+                              std::size_t output_stride, std::size_t num_qpts) {
+    for (std::size_t row = 0, offset = 0; row < rows; ++row, offset += output_stride) {
+        std::fill(output + offset, output + offset + num_qpts, Real(0));
+    }
+}
+
+// Compresses a dense row-major rows x cols coefficient table into compressed
+// row storage (row_offsets / dofs / coefficients). An entry is kept only when
+// its magnitude exceeds kSparseCoefficientRelativeTolerance times the largest
+// magnitude found anywhere in the table.
+SparseModalCoefficientMatrix build_sparse_modal_coefficients(
+    const std::vector<Real>& dense_coefficients,
+    std::size_t rows,
+    std::size_t cols) {
+    BASIS_CHECK_CONSTRUCTION(dense_coefficients.size() == rows * cols,
+                 "build_sparse_modal_coefficients: dense coefficient size mismatch");
+    // Global scale that the relative pruning cutoff is measured against.
+    Real largest_magnitude = Real(0);
+    for (std::size_t i = 0; i < dense_coefficients.size(); ++i) {
+        largest_magnitude = std::max(largest_magnitude, std::abs(dense_coefficients[i]));
+    }
+    const Real cutoff = kSparseCoefficientRelativeTolerance * largest_magnitude;
+    SparseModalCoefficientMatrix result;
+    result.rows = rows;
+    result.cols = cols;
+    result.row_offsets.reserve(rows + 1u);
+    result.row_offsets.push_back(0u);
+    std::size_t flat = 0;  // Position of (row, col) in the dense table.
+    for (std::size_t row = 0; row < rows; ++row) {
+        for (std::size_t col = 0; col < cols; ++col, ++flat) {
+            const Real value = dense_coefficients[flat];
+            if (!(std::abs(value) > cutoff)) continue;  // Pruned (also drops NaN).
+            result.dofs.push_back(col);
+            result.coefficients.push_back(value);
+        }
+        result.row_offsets.push_back(result.dofs.size());
+    }
+    return result;
+}
+
+Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept {  // Reads J(i, j) as dF_i/dx_j.
+    return Vec3{J(2u, 1u) - J(1u, 2u),   // x: dF_z/dy - dF_y/dz
+                J(0u, 2u) - J(2u, 0u),   // y: dF_x/dz - dF_z/dx
+                J(1u, 0u) - J(0u, 1u)};  // z: dF_y/dx - dF_x/dy
+}
+
+Real divergence_from_jacobian(const VectorJacobian& J) noexcept {
+    return J(0u, 0u) + J(1u, 1u) + J(2u, 2u);  // Trace of J.
+}
+
+// Scatters one quadrature point's values into the strided output:
+//   values_out[(dof * 3 + component) * output_stride + q]. Null output: no-op.
+void write_vector_values_strided(const std::vector<Vec3>& values,
+                                 std::size_t num_dofs,
+                                 std::size_t output_stride,
+                                 std::size_t q,
+                                 Real* SVMP_RESTRICT values_out) {
+    if (values_out == nullptr) return;
+    BASIS_CHECK_CONSTRUCTION(values.size() == num_dofs,
+                 "vector value evaluation returned the wrong number of DOFs");
+    std::size_t row = 0;  // Output row index: dof * 3 + component.
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        const Vec3& value = values[dof];
+        for (std::size_t c = 0; c < 3u; ++c, ++row)
+            values_out[row * output_stride + q] = value[c];
+    }
+}
+
+// Scatters one quadrature point's Jacobians into the strided output:
+//   jacobians_out[(dof * 9 + component * 3 + derivative) * output_stride + q].
+// A null jacobians_out makes this a no-op.
+void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
+                                    std::size_t num_dofs,
+                                    std::size_t output_stride,
+                                    std::size_t q,
+                                    Real* SVMP_RESTRICT jacobians_out) {
+    if (jacobians_out == nullptr) return;
+    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
+                 "vector Jacobian evaluation returned the wrong number of DOFs");
+    std::size_t row = 0;  // Output row index: dof * 9 + component * 3 + derivative.
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        const VectorJacobian& jac = jacobians[dof];
+        for (std::size_t entry = 0; entry < 9u; ++entry, ++row) {
+            // entry / 3 is the vector component, entry % 3 the derivative axis.
+            jacobians_out[row * output_stride + q] = jac(entry / 3u, entry % 3u);
+        }
+    }
+}
+
+// Scatters one quadrature point's curls into the strided output:
+//   curls_out[(dof * 3 + component) * output_stride + q]. Null output: no-op.
+void write_vector_curl_strided(const std::vector<Vec3>& curl,
+                               std::size_t num_dofs,
+                               std::size_t output_stride,
+                               std::size_t q,
+                               Real* SVMP_RESTRICT curls_out) {
+    if (curls_out == nullptr) return;
+    BASIS_CHECK_CONSTRUCTION(curl.size() == num_dofs,
+                 "vector curl evaluation returned the wrong number of DOFs");
+    std::size_t row = 0;  // Output row index: dof * 3 + component.
+    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+        const Vec3& entry = curl[dof];
+        for (std::size_t c = 0; c < 3u; ++c, ++row)
+            curls_out[row * output_stride + q] = entry[c];
+    }
+}
+
+// Scatters one quadrature point's divergences into the strided output:
+//   divergence_out[dof * output_stride + q]. A null output makes this a no-op.
+void write_vector_divergence_strided(const std::vector<Real>& divergence,
+                                     std::size_t num_dofs,
+                                     std::size_t output_stride,
+                                     std::size_t q,
+                                     Real* SVMP_RESTRICT divergence_out) {
+    if (divergence_out == nullptr) return;
+    BASIS_CHECK_CONSTRUCTION(divergence.size() == num_dofs,
+                 "vector divergence evaluation returned the wrong number of DOFs");
+    std::size_t slot = q;  // Flat index of (dof, q) in the strided output.
+    for (std::size_t dof = 0; dof < num_dofs; ++dof, slot += output_stride)
+        divergence_out[slot] = divergence[dof];
+}
+
+// Derives curl and/or divergence from one quadrature point's Jacobians and
+// scatters them into column q; a null output skips that quantity entirely.
+void write_curl_and_divergence_from_jacobians_strided(
+    const std::vector<VectorJacobian>& jacobians,
+    std::size_t num_dofs,
+    std::size_t output_stride,
+    std::size_t q,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out) {
+    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
+                 "vector Jacobian evaluation returned the wrong number of DOFs");
+    if (curls_out != nullptr) {
+        for (std::size_t dof = 0; dof < num_dofs; ++dof) {
+            const Vec3 curl = curl_from_jacobian(jacobians[dof]);
+            for (std::size_t c = 0; c < 3u; ++c)
+                curls_out[(dof * 3u + c) * output_stride + q] = curl[c];
+        }
+    }
+    if (divergence_out != nullptr) {
+        for (std::size_t dof = 0; dof < num_dofs; ++dof)
+            divergence_out[dof * output_stride + q] = divergence_from_jacobian(jacobians[dof]);
+    }
+}
+
+Vec3 lerp(const Vec3& a, const Vec3& b, Real s) {
+    const Real t = (s + Real(1)) * Real(0.5);  // Maps s = -1 to t = 0 and s = +1 to t = 1.
+    return a * (Real(1) - t) + b * t;          // t = 0 yields a, t = 1 yields b.
+}
+
+// Bilinear blend of v[0..3] placed at (-1,-1), (+1,-1), (+1,+1), (-1,+1).
+Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w) {
+    const Real quarter = Real(0.25), lo_u = Real(1) - u, hi_u = Real(1) + u;
+    const Real lo_w = Real(1) - w, hi_w = Real(1) + w;
+    return v[0] * (quarter * lo_u * lo_w) + v[1] * (quarter * hi_u * lo_w) +
+           v[2] * (quarter * hi_u * hi_w) + v[3] * (quarter * lo_u * hi_w);
+}
+
+// Partial derivative of bilinear(v, u, w) with respect to u (independent of u).
+Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w) {
+    static_cast<void>(u);
+    const Real lo_w = Real(1) - w;
+    const Real hi_w = Real(1) + w;
+    return v[0] * (-Real(0.25) * lo_w) + v[1] * (Real(0.25) * lo_w) +
+           v[2] * (Real(0.25) * hi_w) + v[3] * (-Real(0.25) * hi_w);
+}
+
+// Partial derivative of bilinear(v, u, w) with respect to w (independent of w).
+Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w) {
+    static_cast<void>(w);
+    const Real lo_u = Real(1) - u;
+    const Real hi_u = Real(1) + u;
+    return v[0] * (-Real(0.25) * lo_u) + v[1] * (-Real(0.25) * hi_u) +
+           v[2] * (Real(0.25) * hi_u) + v[3] * (Real(0.25) * lo_u);
+}
+
+Vec3 cross3(const Vec3& a, const Vec3& b) {
+    return Vec3{a[1] * b[2] - a[2] * b[1],   // x component of a x b
+                a[2] * b[0] - a[0] * b[2],   // y component of a x b
+                a[0] * b[1] - a[1] * b[0]};  // z component of a x b
+}
+
+Vec3 normalize3(const Vec3& v) {
+    const Real n = v.norm();
+    BASIS_CHECK_CONSTRUCTION(n > std::numeric_limits<Real>::epsilon(),  // The norm must exceed epsilon.
+                 "normalize3: zero-length vector");
+    return v / n;  // v scaled by the reciprocal of its norm.
+}
+
+// Largest exponent per axis over all candidates ({component, px, py, pz});
+// every limit starts at 0, so an empty candidate list yields {0, 0, 0}.
+std::array<int, 3> component_monomial_power_limits(
+    const std::vector<std::array<int, 4>>& candidates) {
+    std::array<int, 3> highest{{0, 0, 0}};
+    for (const std::array<int, 4>& entry : candidates)
+        for (std::size_t axis = 0; axis < 3u; ++axis)
+            highest[axis] = std::max(highest[axis], entry[axis + 1u]);
+    return highest;
+}
+
+// Count of bivariate monomials of total degree <= k: (k + 1)(k + 2) / 2.
+// Two consecutive integers always multiply to an even number.
+std::size_t triangle_poly_dim(std::size_t k) { return (k + 2u) * (k + 1u) / 2u; }
+
+// Count of trivariate monomials of total degree <= k: (k + 1)(k + 2)(k + 3) / 6.
+// Three consecutive integers always multiply to a multiple of 6.
+std::size_t tetra_poly_dim(std::size_t k) { return (k + 3u) * (k + 2u) * (k + 1u) / 6u; }
+
+// Face DOFs 2 * triangle_poly_dim(k) + 3 (k+1)^2 plus, for k >= 1, interior
+// DOFs 3 k (k+1)^2 / 2, where k is `order`.
+std::size_t rt_wedge_size(int order) {
+    const auto k = static_cast<std::size_t>(order);
+    const std::size_t side = k + 1u;
+    const std::size_t on_faces = 3u * side * side + 2u * triangle_poly_dim(k);
+    return (k == 0u) ? on_faces : on_faces + 3u * k * side * side / 2u;
+}
+
+// Face DOFs (k+1)^2 + 4 * triangle_poly_dim(k) plus interior DOFs 3 k^3.
+std::size_t rt_pyramid_size(int order) {
+    const auto k = static_cast<std::size_t>(order);
+    const std::size_t on_faces = 4u * triangle_poly_dim(k) + (k + 1u) * (k + 1u);
+    return on_faces + ((k == 0u) ? 0u : 3u * k * k * k);
+}
+
+// Edge 9(k+1) + face 8k(k+1) [k >= 1] + interior 3k(k-1)(k+1)/2 [k >= 2] DOFs.
+std::size_t nd_wedge_size(int order) {
+    const auto k = static_cast<std::size_t>(order);
+    std::size_t total = 9u * (k + 1u);
+    if (k >= 1u) total += 8u * k * (k + 1u);
+    if (k >= 2u) total += 3u * k * (k - 1u) * (k + 1u) / 2u;
+    return total;
+}
+
+// Edge 8(k+1) + face 6k(k+1) [k >= 1] + interior k(k-1)(k+1)/2 [k >= 2] DOFs.
+std::size_t nd_pyramid_size(int order) {
+    const auto k = static_cast<std::size_t>(order);
+    std::size_t total = 8u * (k + 1u);
+    if (k >= 1u) total += 6u * k * (k + 1u);
+    if (k >= 2u) total += k * (k - 1u) * (k + 1u) / 2u;
+    return total;
+}
+
+// Currently a no-op: no (type, order) combination is rejected here.
+void ensure_supported_hybrid_vector_order(ElementType type, int order,
+                                          const char* family_name) {
+    static_cast<void>(type);
+    static_cast<void>(order);
+    static_cast<void>(family_name);
+}
+
+// Lists {component, px, py, pz} for components 0..2 and every exponent triple
+// with px + py + pz <= max_total_degree, ordered by total degree, pz, then py.
+std::vector<std::array<int, 4>> make_component_monomial_candidates(
+    int max_total_degree) {
+    BASIS_CHECK_CONSTRUCTION(max_total_degree >= 0,
+                 "make_component_monomial_candidates: negative total degree");
+    std::vector<std::array<int, 4>> result;
+    for (int component = 0; component < 3; ++component) {
+        for (int degree = 0; degree <= max_total_degree; ++degree) {
+            for (int pz = 0; pz <= degree; ++pz) {
+                // px counts down from degree - pz while py counts up from 0.
+                for (int px = degree - pz, py = 0; px >= 0; --px, ++py)
+                    result.push_back({component, px, py, pz});
+            }
+        }
+    }
+    return result;
+}
+
+// Extra candidate monomials ({component, px, py, pz}) for the RT construction:
+//   order >= 3          -> all monomials up to total degree 3 * order,
+//   pyramid and order 2 -> all monomials up to total degree 2,
+//   anything else       -> none.
+std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
+                                                                  int order) {
+    if (order >= 3) {
+        return make_component_monomial_candidates(3 * order);
+    }
+
+    std::vector<std::array<int, 4>> extras;
+    const bool pyramid_order_two = is_pyramid(type) && order == 2;
+    if (pyramid_order_two) {
+        constexpr int kMaxDegree = 2;
+        for (int component = 0; component < 3; ++component)
+            for (int pz = 0; pz <= kMaxDegree; ++pz)
+                for (int py = 0; py + pz <= kMaxDegree; ++py)
+                    for (int px = 0; px + py + pz <= kMaxDegree; ++px)
+                        extras.push_back({component, px, py, pz});
+    }
+    return extras;
+}
+
+// x^mono[1] * y^mono[2] * z^mono[3] from the power tables; mono[0] is unused.
+Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz) {
+    const auto at = [&mono](std::size_t i) { return static_cast<std::size_t>(mono[i]); };
+    return px[at(1)] * py[at(2)] * pz[at(3)];
+}
+
+// Divergence of the vector monomial whose only non-zero component is
+// x^a y^b z^c, with mono = {component, a, b, c}. Only the derivative of that
+// component along its own axis contributes:
+//   component 0      -> a * x^(a-1) y^b     z^c
+//   component 1      -> b * x^a     y^(b-1) z^c
+//   any other value  -> c * x^a     y^b     z^(c-1)
+//
+// px, py and pz are read as per-axis power tables indexed by exponent. When
+// the differentiated exponent is 0 the result is exactly Real(0) and no
+// table entry is read, so index "exponent - 1" never goes negative.
+Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
+                                             const std::vector<Real>& px,
+                                             const std::vector<Real>& py,
+                                             const std::vector<Real>& pz) {
+    // Component values other than 0 and 1 select the z axis.
+    const int component = mono[0];
+    std::size_t axis = 2u;
+    if (component == 0) {
+        axis = 0u;
+    } else if (component == 1) {
+        axis = 1u;
+    }
+
+    std::array<int, 3> exponents{{mono[1], mono[2], mono[3]}};
+    const int differentiated = exponents[axis];
+    if (differentiated == 0) {
+        return Real(0);
+    }
+    // Power rule: the exponent comes down as a factor and drops by one.
+    exponents[axis] = differentiated - 1;
+    const Real x_factor = px[static_cast<std::size_t>(exponents[0])];
+    const Real y_factor = py[static_cast<std::size_t>(exponents[1])];
+    const Real z_factor = pz[static_cast<std::size_t>(exponents[2])];
+    return Real(differentiated) * x_factor * y_factor * z_factor;
+}
+
+// Adds coefficient * grad(x^px_pow y^py_pow z^pz_pow) to row `component` of J.
+// px/py/pz are read as per-axis power tables indexed by exponent. An axis
+// whose exponent is not positive contributes nothing and is skipped, so the
+// "exponent - 1" lookup is only made for exponents >= 1.
+void add_component_monomial_jacobian(VectorJacobian& J,
+                                     int component,
+                                     int px_pow,
+                                     int py_pow,
+                                     int pz_pow,
+                                     Real coefficient,
+                                     const std::vector<Real>& px,
+                                     const std::vector<Real>& py,
+                                     const std::vector<Real>& pz) {
+    const auto row = static_cast<std::size_t>(component);
+    const std::array<int, 3> exponents{{px_pow, py_pow, pz_pow}};
+
+    for (std::size_t axis = 0; axis < 3u; ++axis) {
+        if (exponents[axis] <= 0) {
+            continue;
+        }
+
+        std::array<int, 3> lowered = exponents;
+        --lowered[axis];  // Power rule: differentiate along this axis only.
+        J(row, axis) += coefficient * Real(exponents[axis]) *
+                        px[static_cast<std::size_t>(lowered[0])] *
+                        py[static_cast<std::size_t>(lowered[1])] *
+                        pz[static_cast<std::size_t>(lowered[2])];
+    }
+}
+
+// Jacobian of the unit-coefficient vector monomial mono = {component, px, py,
+// pz}, accumulated into a value-initialized VectorJacobian.
+VectorJacobian eval_transformed_component_monomial_jacobian(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px, const std::vector<Real>& py, const std::vector<Real>& pz) {
+    VectorJacobian jacobian{};
+    add_component_monomial_jacobian(
+        jacobian, mono[0], mono[1], mono[2], mono[3], Real(1), px, py, pz);
+    return jacobian;
+}
+
+// Adds the curl of coefficient * x^px_pow y^py_pow z^pz_pow, placed in vector
+// component `component`, to `curl`; a zero exponent gives an exact zero partial.
+void add_component_monomial_curl(Vec3& curl,
+                                 int component,
+                                 int px_pow,
+                                 int py_pow,
+                                 int pz_pow,
+                                 Real coefficient,
+                                 const std::vector<Real>& px,
+                                 const std::vector<Real>& py,
+                                 const std::vector<Real>& pz) {
+    const std::array<int, 3> exponents{{px_pow, py_pow, pz_pow}};
+    // coefficient * d/d(axis) of the scalar monomial, via the power rule.
+    const auto partial = [&](std::size_t axis) -> Real {
+        if (exponents[axis] == 0) return Real(0);
+        std::array<int, 3> lowered = exponents;
+        --lowered[axis];
+        return coefficient * Real(exponents[axis]) *
+               px[static_cast<std::size_t>(lowered[0])] *
+               py[static_cast<std::size_t>(lowered[1])] *
+               pz[static_cast<std::size_t>(lowered[2])];
+    };
+    const Real dphidx = partial(0);
+    const Real dphidy = partial(1);
+    const Real dphidz = partial(2);
+    switch (component) {
+        case 0:  // (phi, 0, 0) -> (0, dphi/dz, -dphi/dy)
+            curl[1] += dphidz;
+            curl[2] -= dphidy;
+            break;
+        case 1:  // (0, phi, 0) -> (-dphi/dz, 0, dphi/dx)
+            curl[0] -= dphidz;
+            curl[2] += dphidx;
+            break;
+        default:  // (0, 0, phi) -> (dphi/dy, -dphi/dx, 0)
+            curl[0] += dphidy;
+            curl[1] -= dphidx;
+    }
+}
+
+// Extra candidate monomials ({component, px, py, pz}) for the ND construction.
+// The element type is not consulted. The total-degree cap is
+//   order >= 3 -> 3 * order,
+//   order == 1 -> 4,
+//   otherwise  -> 5 (order 2, and also any order <= 0).
+//
+// The enumeration is delegated to make_component_monomial_candidates so that
+// the candidate ordering (by component, then total degree, then pz, then py)
+// is defined in exactly one place; callers that depend on candidate order
+// therefore see the same sequence for every order.
+std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType,
+                                                                  int order) {
+    int max_total_degree = 5;
+    if (order >= 3) {
+        max_total_degree = 3 * order;
+    } else if (order == 1) {
+        max_total_degree = 4;
+    }
+    return make_component_monomial_candidates(max_total_degree);
+}
+
+// x^mono[1] * y^mono[2] * z^mono[3] from the power tables; mono[0] is unused.
+Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
+                                         const std::vector<Real>& px,
+                                         const std::vector<Real>& py,
+                                         const std::vector<Real>& pz) {
+    const auto at = [&mono](std::size_t i) { return static_cast<std::size_t>(mono[i]); };
+    return px[at(1)] * py[at(2)] * pz[at(3)];
+}
+
+// Curl of the vector monomial whose only non-zero component is
+// phi = x^a y^b z^c, with mono = {component, a, b, c}:
+//   component 0      -> (0,        dphi/dz, -dphi/dy)
+//   component 1      -> (-dphi/dz, 0,        dphi/dx)
+//   any other value  -> (dphi/dy, -dphi/dx,  0)
+// px/py/pz are read as per-axis power tables indexed by exponent; a zero
+// exponent makes the matching partial exactly Real(0) without a lookup.
+Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
+                                       const std::vector<Real>& px,
+                                       const std::vector<Real>& py,
+                                       const std::vector<Real>& pz) {
+    const std::array<int, 3> exponents{{mono[1], mono[2], mono[3]}};
+    // d/d(axis) of phi via the power rule.
+    const auto partial = [&](std::size_t axis) -> Real {
+        if (exponents[axis] == 0) {
+            return Real(0);
+        }
+        std::array<int, 3> lowered = exponents;
+        --lowered[axis];
+        return Real(exponents[axis]) *
+               px[static_cast<std::size_t>(lowered[0])] *
+               py[static_cast<std::size_t>(lowered[1])] *
+               pz[static_cast<std::size_t>(lowered[2])];
+    };
+    const Real dphidx = partial(0);
+    const Real dphidy = partial(1);
+    const Real dphidz = partial(2);
+    switch (mono[0]) {
+        case 0:
+            return Vec3{Real(0), dphidz, -dphidy};
+        case 1:
+            return Vec3{-dphidz, Real(0), dphidx};
+        default:
+            return Vec3{dphidy, -dphidx, Real(0)};
+    }
+}
+
+} // namespace vector_common
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
new file mode 100644
index 000000000..e0e6daa10
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
@@ -0,0 +1,751 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
+#define SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
+
+#include "VectorBasis.h"
+#include "Basis/BasisTraits.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace detail {
+namespace vector_common {
+
+using Vec3 = math::Vector<Real, 3>;
+
+struct VectorBasisScratch {  // Reusable work buffers handed out by vector_basis_scratch().
+    std::vector<Real> px;  // Powers of xi[0], written by fill_power_tables().
+    std::vector<Real> py;  // Powers of xi[1], written by fill_power_tables().
+    std::vector<Real> pz;  // Powers of xi[2], written by fill_power_tables().
+    std::vector<Real> batched_px;  // Axis-0 powers for a point batch (fill_batched_power_tables()).
+    std::vector<Real> batched_py;  // Axis-1 powers for a point batch (fill_batched_power_tables()).
+    std::vector<Real> batched_pz;  // Axis-2 powers for a point batch (fill_batched_power_tables()).
+    std::vector<Real> candidate_values;
+    std::vector<Real> candidate_dx;
+    std::vector<Real> candidate_dy;
+    std::vector<Real> candidate_dz;
+    std::vector<Real> modal_values_batched;      // prewarm() reserves 3 entries per batched slot.
+    std::vector<Real> modal_jacobians_batched;   // prewarm() reserves 9 entries per batched slot.
+    std::vector<Real> modal_curls_batched;       // prewarm() reserves 3 entries per batched slot.
+    std::vector<Real> modal_divergence_batched;  // prewarm() reserves 1 entry per batched slot.
+    std::vector<Vec3> vector_values;
+    std::vector<VectorJacobian> vector_jacobians;
+    std::vector<Real> scalars;
+    std::vector<Vec3> api_values;               // Staging for evaluate_vector_public_api_strided().
+    std::vector<VectorJacobian> api_jacobians;  // Staging for evaluate_vector_public_api_strided().
+    std::vector<Vec3> api_curl;                 // Staging for evaluate_vector_public_api_strided().
+    std::vector<Real> api_divergence;           // Staging for evaluate_vector_public_api_strided().
+
+    void prewarm(std::size_t max_size, std::size_t max_qpts) {  // Reserves capacity only; sizes are unchanged.
+        const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);  // max_qpts == 0 counts as 1.
+        px.reserve(max_size);
+        py.reserve(max_size);
+        pz.reserve(max_size);
+        batched_px.reserve(batched_size);
+        batched_py.reserve(batched_size);
+        batched_pz.reserve(batched_size);
+        candidate_values.reserve(max_size);
+        candidate_dx.reserve(max_size);
+        candidate_dy.reserve(max_size);
+        candidate_dz.reserve(max_size);
+        modal_values_batched.reserve(batched_size * 3u);  // 3 entries per batched slot.
+        modal_jacobians_batched.reserve(batched_size * 9u);  // 9 entries per batched slot.
+        modal_curls_batched.reserve(batched_size * 3u);  // 3 entries per batched slot.
+        modal_divergence_batched.reserve(batched_size);
+        vector_values.reserve(max_size);
+        vector_jacobians.reserve(max_size);
+        scalars.reserve(max_size);
+        api_values.reserve(max_size);
+        api_jacobians.reserve(max_size);
+        api_curl.reserve(max_size);
+        api_divergence.reserve(max_size);
+    }
+};
+
+VectorBasisScratch& vector_basis_scratch();
+void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts = 0);
+
+void fill_powers(Real x, int max_p, std::vector<Real>& out);
+void fill_power_tables(const Vec3& xi,
+                       const std::array<int, 3>& limits,
+                       VectorBasisScratch& scratch);
+void fill_batched_power_tables(const std::vector<Vec3>& points,
+                               const std::array<int, 3>& limits,
+                               VectorBasisScratch& scratch);
+void validate_vector_strided_outputs(std::size_t num_qpts,
+                                     std::size_t output_stride,
+                                     const char* family_name);
+void zero_active_strided_rows(Real* output,
+                              std::size_t rows,
+                              std::size_t output_stride,
+                              std::size_t num_qpts);
+SparseModalCoefficientMatrix build_sparse_modal_coefficients(
+    const std::vector<Real>& dense_coefficients,
+    std::size_t rows,
+    std::size_t cols);
+Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept;
+Real divergence_from_jacobian(const VectorJacobian& J) noexcept;
+
+// x^px_pow * y^py_pow * z^pz_pow at point q; exponent p is at [p * stride + q].
+inline Real batched_power_product(const std::vector<Real>& px,
+                                  const std::vector<Real>& py,
+                                  const std::vector<Real>& pz,
+                                  std::size_t stride,
+                                  int px_pow, int py_pow, int pz_pow,
+                                  std::size_t q) noexcept {
+    const Real x_factor = px[static_cast<std::size_t>(px_pow) * stride + q];
+    const Real y_factor = py[static_cast<std::size_t>(py_pow) * stride + q];
+    const Real z_factor = pz[static_cast<std::size_t>(pz_pow) * stride + q];
+    return x_factor * y_factor * z_factor;
+}
+
+// Partial derivative of x^px_pow y^py_pow z^pz_pow along one axis at point q.
+//
+// px, py and pz are read as power-major tables in which the entry for
+// exponent p and point q sits at index p * stride + q.
+//
+// derivative_axis selects the variable: 0 -> x, 1 -> y, any other value -> z.
+//
+// When the exponent along that axis is 0 the result is exactly Real(0) and
+// no table entry is read. Otherwise the power rule applies: the exponent
+// becomes a factor and the table row for (exponent - 1) is used along that
+// axis.
+inline Real batched_component_partial(const std::vector<Real>& px,
+                                      const std::vector<Real>& py,
+                                      const std::vector<Real>& pz,
+                                      std::size_t stride,
+                                      int px_pow,
+                                      int py_pow,
+                                      int pz_pow,
+                                      int derivative_axis,
+                                      std::size_t q) noexcept {
+    const std::size_t axis =
+        (derivative_axis == 0) ? 0u : (derivative_axis == 1) ? 1u : 2u;
+    std::array<int, 3> exponents{{px_pow, py_pow, pz_pow}};
+    const int differentiated = exponents[axis];
+    if (differentiated == 0) {
+        return Real(0);
+    }
+    exponents[axis] = differentiated - 1;
+
+    const auto entry = [stride, q](const std::vector<Real>& table, int power) {
+        return table[static_cast<std::size_t>(power) * stride + q];
+    };
+    return Real(differentiated) *
+           entry(px, exponents[0]) * entry(py, exponents[1]) * entry(pz, exponents[2]);
+}
+
+// Curl of a field with a single non-zero component `component` whose
+// gradient is (dphidx, dphidy, dphidz); values other than 0 and 1 mean z.
+inline Vec3 curl_from_component_gradient(int component,
+                                         Real dphidx,
+                                         Real dphidy,
+                                         Real dphidz) noexcept {
+    switch (component) {
+        case 0: return Vec3{Real(0), dphidz, -dphidy};
+        case 1: return Vec3{-dphidz, Real(0), dphidx};
+        default: return Vec3{dphidy, -dphidx, Real(0)};
+    }
+}
+
+// target[q] += coefficient * source[q] for q in [0, num_qpts).
+inline void axpy_qpoints(Real* target, const Real* source,
+                         Real coefficient, std::size_t num_qpts) noexcept {
+    const Real* const source_end = source + num_qpts;
+    for (; source != source_end; ++source, ++target) {
+        *target += coefficient * *source;
+    }
+}
+
+void write_vector_values_strided(const std::vector<Vec3>& values,
+                                 std::size_t num_dofs,
+                                 std::size_t output_stride,
+                                 std::size_t q,
+                                 Real* SVMP_RESTRICT values_out);
+void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
+                                    std::size_t num_dofs,
+                                    std::size_t output_stride,
+                                    std::size_t q,
+                                    Real* SVMP_RESTRICT jacobians_out);
+void write_vector_curl_strided(const std::vector<Vec3>& curl,
+                               std::size_t num_dofs,
+                               std::size_t output_stride,
+                               std::size_t q,
+                               Real* SVMP_RESTRICT curls_out);
+void write_vector_divergence_strided(const std::vector<Real>& divergence,
+                                     std::size_t num_dofs,
+                                     std::size_t output_stride,
+                                     std::size_t q,
+                                     Real* SVMP_RESTRICT divergence_out);
+void write_curl_and_divergence_from_jacobians_strided(
+    const std::vector<VectorJacobian>& jacobians,
+    std::size_t num_dofs,
+    std::size_t output_stride,
+    std::size_t q,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out);
+
+// Generic strided evaluation built on a basis' single-point API. For each
+// point q the requested quantities are evaluated into thread-local scratch
+// and scattered into column q of the strided outputs; null outputs are
+// skipped. If Jacobians are needed (requested, or curl/divergence requested
+// without its direct evaluator), curl and divergence are derived from them.
+template <typename BasisLike>
+void evaluate_vector_public_api_strided(
+    const BasisLike& basis,
+    const std::vector<Vec3>& points,
+    std::size_t output_stride,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out,
+    bool use_direct_curl,
+    bool use_direct_divergence,
+    const char* family_name) {
+    const std::size_t num_qpts = points.size();
+    const std::size_t num_dofs = basis.size();
+    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
+
+    // Depends only on the arguments, so decide once instead of per point.
+    const bool needs_jacobians =
+        jacobians_out != nullptr ||
+        (curls_out != nullptr && !use_direct_curl) ||
+        (divergence_out != nullptr && !use_direct_divergence);
+
+    auto& scratch = vector_basis_scratch();
+    for (std::size_t q = 0; q < num_qpts; ++q) {
+        if (values_out != nullptr) {
+            basis.evaluate_vector_values(points[q], scratch.api_values);
+            write_vector_values_strided(
+                scratch.api_values, num_dofs, output_stride, q, values_out);
+        }
+        if (needs_jacobians) {
+            basis.evaluate_vector_jacobians(points[q], scratch.api_jacobians);
+            write_vector_jacobians_strided(
+                scratch.api_jacobians, num_dofs, output_stride, q, jacobians_out);
+            write_curl_and_divergence_from_jacobians_strided(
+                scratch.api_jacobians, num_dofs, output_stride, q,
+                curls_out, divergence_out);
+            continue;
+        }
+        if (curls_out != nullptr) {
+            basis.evaluate_curl(points[q], scratch.api_curl);
+            write_vector_curl_strided(
+                scratch.api_curl, num_dofs, output_stride, q, curls_out);
+        }
+        if (divergence_out != nullptr) {
+            basis.evaluate_divergence(points[q], scratch.api_divergence);
+            write_vector_divergence_strided(
+                scratch.api_divergence, num_dofs, output_stride, q, divergence_out);
+        }
+    }
+}
+
+// Small Vec3 helpers (declarations only).
+// NOTE(review): semantics are read off the names -- linear interpolation,
+// bilinear interpolation over four corner vectors with its u- and
+// w-derivatives, 3D cross product, normalisation. Corner ordering and the
+// zero-length behaviour of normalize3() must be confirmed in the definitions.
+Vec3 lerp(const Vec3& a, const Vec3& b, Real s);
+
+Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w);
+Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w);
+Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w);
+
+Vec3 cross3(const Vec3& a, const Vec3& b);
+Vec3 normalize3(const Vec3& v);
+
+/**
+ * @brief Largest x, y and z exponents used by the active terms of a set of
+ *        modal polynomials.
+ *
+ * Only the first `num_terms` entries of each polynomial's `terms` are visited.
+ *
+ * @param monomials Iterable of polynomials exposing `terms` and `num_terms`.
+ * @return {max px, max py, max pz}. Each entry starts at 0, so the result is
+ *         all zeros when there are no terms.
+ */
+template <typename ModalPolynomials>
+std::array<int, 3> modal_power_limits(const ModalPolynomials& monomials) {
+    int highest_px = 0;
+    int highest_py = 0;
+    int highest_pz = 0;
+    for (const auto& polynomial : monomials) {
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            highest_px = std::max(highest_px, term.px);
+            highest_py = std::max(highest_py, term.py);
+            highest_pz = std::max(highest_pz, term.pz);
+        }
+    }
+    return std::array<int, 3>{{highest_px, highest_py, highest_pz}};
+}
+
+// ---- Hybrid-element / monomial helpers (declarations only) ----------------
+// NOTE(review): none of these are defined in this header section. The notes
+// below are inferred from names and signatures and must be confirmed against
+// the definitions.
+
+// Presumably the per-axis maximum exponents over `candidates`.
+std::array<int, 3> component_monomial_power_limits(
+    const std::vector<std::array<int, 4>>& candidates);
+
+// Presumably polynomial-space dimensions for degree `k`.
+std::size_t triangle_poly_dim(std::size_t k);
+std::size_t tetra_poly_dim(std::size_t k);
+
+// Presumably basis sizes of the RT / ND families on wedges and pyramids.
+std::size_t rt_wedge_size(int order);
+std::size_t rt_pyramid_size(int order);
+std::size_t nd_wedge_size(int order);
+std::size_t nd_pyramid_size(int order);
+
+void ensure_supported_hybrid_vector_order(
+    ElementType type, int order, const char* family_name);
+
+// Candidate monomials are carried as std::array<int, 4>.
+// NOTE(review): the meaning of the four integers is not visible here.
+std::vector<std::array<int, 4>> make_component_monomial_candidates(
+    int max_total_degree);
+std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(
+    ElementType type, int order);
+
+// The px / py / pz arguments below are tables of powers of the point
+// coordinates; the modal evaluators in this header pass the tables filled by
+// fill_power_tables().
+Real eval_transformed_rt_monomial_scalar(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+Real eval_transformed_rt_monomial_divergence(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+// Called once per modal term by the Jacobian evaluator in this header, with
+// the term's component, exponents and coefficient; accumulates into `J`.
+void add_component_monomial_jacobian(
+    VectorJacobian& J,
+    int component,
+    int px_pow,
+    int py_pow,
+    int pz_pow,
+    Real coefficient,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+VectorJacobian eval_transformed_component_monomial_jacobian(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+// Called once per modal term by the curl evaluator in this header, with the
+// term's component, exponents and coefficient; accumulates into `curl`.
+void add_component_monomial_curl(
+    Vec3& curl,
+    int component,
+    int px_pow,
+    int py_pow,
+    int pz_pow,
+    Real coefficient,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+/**
+ * @brief Evaluate nodal vector basis values at one point from a modal
+ *        (monomial) expansion.
+ *
+ * The first @p n modal polynomials are evaluated at @p xi with the power
+ * tables filled by fill_power_tables(). The results are then accumulated
+ * through the sparse coefficient rows: for every entry of row p,
+ * values[dof] += coefficient * modal_value[p].
+ *
+ * @param monomials     Indexable polynomials exposing `terms` / `num_terms`.
+ * @param sparse_coeffs Sparse coefficients (row_offsets / dofs / coefficients);
+ *                      checked to be n x n with n + 1 row offsets.
+ * @param n             Number of modal polynomials and of output DOFs.
+ * @param xi            Evaluation point.
+ * @param power_limits  Per-axis limits handed to fill_power_tables().
+ * @param values        Output; resized to @p n and overwritten.
+ *
+ * Working storage comes from the object returned by vector_basis_scratch().
+ */
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_values_with_limits(const ModalPolynomials& monomials,
+                                                    const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                    std::size_t n,
+                                                    const Vec3& xi,
+                                                    const std::array<int, 3>& power_limits,
+                                                    std::vector<Vec3>& values) {
+    values.assign(n, Vec3{});
+
+    auto& workspace = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, workspace);
+    const auto& x_powers = workspace.px;
+    const auto& y_powers = workspace.py;
+    const auto& z_powers = workspace.pz;
+
+    // Pass 1: value of every modal polynomial at xi.
+    auto& modal = workspace.vector_values;
+    modal.assign(n, Vec3{});
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const auto& polynomial = monomials[mode];
+        auto& accumulated = modal[mode];
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            const Real monomial =
+                x_powers[static_cast<std::size_t>(term.px)] *
+                y_powers[static_cast<std::size_t>(term.py)] *
+                z_powers[static_cast<std::size_t>(term.pz)];
+            accumulated[static_cast<std::size_t>(term.component)] += term.coefficient * monomial;
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.rows == n && sparse_coeffs.cols == n &&
+            sparse_coeffs.row_offsets.size() == n + 1u,
+        "evaluate_nodal_modal_vector_values: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+        "evaluate_nodal_modal_vector_values: sparse coefficient entry mismatch");
+
+    // Pass 2: scatter each modal value into the DOFs listed in its sparse row.
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const Vec3& source = modal[mode];
+        const std::size_t entry_end = sparse_coeffs.row_offsets[mode + 1u];
+        for (std::size_t entry = sparse_coeffs.row_offsets[mode]; entry < entry_end; ++entry) {
+            const Real weight = sparse_coeffs.coefficients[entry];
+            Vec3& target = values[sparse_coeffs.dofs[entry]];
+            for (std::size_t axis = 0; axis < 3u; ++axis) {
+                target[axis] += weight * source[axis];
+            }
+        }
+    }
+}
+
+/**
+ * @brief Evaluate nodal vector basis Jacobians at one point from a modal
+ *        (monomial) expansion.
+ *
+ * Each of the first @p n modal polynomials contributes its terms through
+ * add_component_monomial_jacobian(). The modal Jacobians are then accumulated
+ * through the sparse coefficient rows: for every entry of row p,
+ * jacobians[dof] += coefficient * modal_jacobian[p] (entry by entry, 3 x 3).
+ *
+ * @param monomials     Indexable polynomials exposing `terms` / `num_terms`.
+ * @param sparse_coeffs Sparse coefficients (row_offsets / dofs / coefficients);
+ *                      checked to be n x n with n + 1 row offsets.
+ * @param n             Number of modal polynomials and of output DOFs.
+ * @param xi            Evaluation point.
+ * @param power_limits  Per-axis limits handed to fill_power_tables().
+ * @param jacobians     Output; resized to @p n and overwritten.
+ *
+ * Working storage comes from the object returned by vector_basis_scratch().
+ */
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_jacobians_with_limits(const ModalPolynomials& monomials,
+                                                       const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                       std::size_t n,
+                                                       const Vec3& xi,
+                                                       const std::array<int, 3>& power_limits,
+                                                       std::vector<VectorJacobian>& jacobians) {
+    jacobians.assign(n, VectorJacobian{});
+
+    auto& workspace = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, workspace);
+    const auto& x_powers = workspace.px;
+    const auto& y_powers = workspace.py;
+    const auto& z_powers = workspace.pz;
+
+    // Pass 1: Jacobian of every modal polynomial at xi.
+    auto& modal = workspace.vector_jacobians;
+    modal.assign(n, VectorJacobian{});
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const auto& polynomial = monomials[mode];
+        auto& accumulated = modal[mode];
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            add_component_monomial_jacobian(accumulated,
+                                            term.component,
+                                            term.px,
+                                            term.py,
+                                            term.pz,
+                                            term.coefficient,
+                                            x_powers,
+                                            y_powers,
+                                            z_powers);
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.rows == n && sparse_coeffs.cols == n &&
+            sparse_coeffs.row_offsets.size() == n + 1u,
+        "evaluate_nodal_modal_vector_jacobians: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+        "evaluate_nodal_modal_vector_jacobians: sparse coefficient entry mismatch");
+
+    // Pass 2: scatter each modal Jacobian into the DOFs listed in its sparse row.
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const auto& source = modal[mode];
+        const std::size_t entry_end = sparse_coeffs.row_offsets[mode + 1u];
+        for (std::size_t entry = sparse_coeffs.row_offsets[mode]; entry < entry_end; ++entry) {
+            const Real weight = sparse_coeffs.coefficients[entry];
+            auto& target = jacobians[sparse_coeffs.dofs[entry]];
+            for (std::size_t row = 0; row < 3; ++row) {
+                for (std::size_t column = 0; column < 3; ++column) {
+                    target(row, column) += weight * source(row, column);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief Evaluate the curl of the nodal vector basis functions at one point
+ *        from a modal (monomial) expansion.
+ *
+ * Each of the first @p n modal polynomials contributes its terms through
+ * add_component_monomial_curl(). The modal curls are then accumulated through
+ * the sparse coefficient rows: for every entry of row p,
+ * curl[dof] += coefficient * modal_curl[p].
+ *
+ * @param monomials     Indexable polynomials exposing `terms` / `num_terms`.
+ * @param sparse_coeffs Sparse coefficients (row_offsets / dofs / coefficients);
+ *                      checked to be n x n with n + 1 row offsets.
+ * @param n             Number of modal polynomials and of output DOFs.
+ * @param xi            Evaluation point.
+ * @param power_limits  Per-axis limits handed to fill_power_tables().
+ * @param curl          Output; resized to @p n and overwritten.
+ *
+ * Working storage comes from the object returned by vector_basis_scratch();
+ * the modal curls reuse its `vector_values` buffer.
+ */
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_curl_with_limits(const ModalPolynomials& monomials,
+                                                  const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                  std::size_t n,
+                                                  const Vec3& xi,
+                                                  const std::array<int, 3>& power_limits,
+                                                  std::vector<Vec3>& curl) {
+    curl.assign(n, Vec3{});
+
+    auto& workspace = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, workspace);
+    const auto& x_powers = workspace.px;
+    const auto& y_powers = workspace.py;
+    const auto& z_powers = workspace.pz;
+
+    // Pass 1: curl of every modal polynomial at xi.
+    auto& modal = workspace.vector_values;
+    modal.assign(n, Vec3{});
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const auto& polynomial = monomials[mode];
+        auto& accumulated = modal[mode];
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            add_component_monomial_curl(accumulated,
+                                        term.component,
+                                        term.px,
+                                        term.py,
+                                        term.pz,
+                                        term.coefficient,
+                                        x_powers,
+                                        y_powers,
+                                        z_powers);
+        }
+    }
+
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.rows == n && sparse_coeffs.cols == n &&
+            sparse_coeffs.row_offsets.size() == n + 1u,
+        "evaluate_nodal_modal_vector_curl: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+        "evaluate_nodal_modal_vector_curl: sparse coefficient entry mismatch");
+
+    // Pass 2: scatter each modal curl into the DOFs listed in its sparse row.
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const Vec3& source = modal[mode];
+        const std::size_t entry_end = sparse_coeffs.row_offsets[mode + 1u];
+        for (std::size_t entry = sparse_coeffs.row_offsets[mode]; entry < entry_end; ++entry) {
+            const Real weight = sparse_coeffs.coefficients[entry];
+            Vec3& target = curl[sparse_coeffs.dofs[entry]];
+            for (std::size_t axis = 0; axis < 3u; ++axis) {
+                target[axis] += weight * source[axis];
+            }
+        }
+    }
+}
+
+/**
+ * @brief Evaluate the divergence of the nodal vector basis functions at one
+ *        point from a modal (monomial) expansion.
+ *
+ * A term in component 0 / 1 / 2 contributes its x / y / z partial derivative,
+ * and only when the matching exponent is positive. The modal divergences are
+ * then accumulated through the sparse coefficient rows; rows whose modal
+ * divergence is exactly zero are skipped.
+ *
+ * @param monomials     Indexable polynomials exposing `terms` / `num_terms`.
+ * @param sparse_coeffs Sparse coefficients (row_offsets / dofs / coefficients);
+ *                      checked to be n x n with n + 1 row offsets.
+ * @param n             Number of modal polynomials and of output DOFs.
+ * @param xi            Evaluation point.
+ * @param power_limits  Per-axis limits handed to fill_power_tables().
+ * @param divergence    Output; resized to @p n and overwritten.
+ *
+ * Working storage comes from the object returned by vector_basis_scratch().
+ */
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_divergence_with_limits(const ModalPolynomials& monomials,
+                                                 const SparseModalCoefficientMatrix& sparse_coeffs,
+                                                 std::size_t n,
+                                                 const Vec3& xi,
+                                                 const std::array<int, 3>& power_limits,
+                                                 std::vector<Real>& divergence) {
+    divergence.assign(n, Real(0));
+
+    auto& workspace = vector_basis_scratch();
+    fill_power_tables(xi, power_limits, workspace);
+    const auto& x_powers = workspace.px;
+    const auto& y_powers = workspace.py;
+    const auto& z_powers = workspace.pz;
+
+    // Pass 1: divergence of every modal polynomial at xi.
+    auto& modal = workspace.scalars;
+    modal.assign(n, Real(0));
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const auto& polynomial = monomials[mode];
+        Real total = Real(0);
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            if (term.component == 0) {
+                if (term.px > 0) {
+                    // d/dx of the x-component monomial.
+                    total += term.coefficient * Real(term.px) *
+                             x_powers[static_cast<std::size_t>(term.px - 1)] *
+                             y_powers[static_cast<std::size_t>(term.py)] *
+                             z_powers[static_cast<std::size_t>(term.pz)];
+                }
+            } else if (term.component == 1) {
+                if (term.py > 0) {
+                    // d/dy of the y-component monomial.
+                    total += term.coefficient * Real(term.py) *
+                             x_powers[static_cast<std::size_t>(term.px)] *
+                             y_powers[static_cast<std::size_t>(term.py - 1)] *
+                             z_powers[static_cast<std::size_t>(term.pz)];
+                }
+            } else if (term.component == 2) {
+                if (term.pz > 0) {
+                    // d/dz of the z-component monomial.
+                    total += term.coefficient * Real(term.pz) *
+                             x_powers[static_cast<std::size_t>(term.px)] *
+                             y_powers[static_cast<std::size_t>(term.py)] *
+                             z_powers[static_cast<std::size_t>(term.pz - 1)];
+                }
+            }
+        }
+        modal[mode] = total;
+    }
+
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.rows == n && sparse_coeffs.cols == n &&
+            sparse_coeffs.row_offsets.size() == n + 1u,
+        "evaluate_nodal_modal_divergence: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+        "evaluate_nodal_modal_divergence: sparse coefficient entry mismatch");
+
+    // Pass 2: scatter the non-zero modal divergences through the sparse rows.
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        const Real source = modal[mode];
+        if (source != Real(0)) {
+            const std::size_t entry_end = sparse_coeffs.row_offsets[mode + 1u];
+            for (std::size_t entry = sparse_coeffs.row_offsets[mode]; entry < entry_end; ++entry) {
+                divergence[sparse_coeffs.dofs[entry]] +=
+                    sparse_coeffs.coefficients[entry] * source;
+            }
+        }
+    }
+}
+
+/**
+ * @brief Batched evaluation of a nodal vector basis from its modal expansion,
+ *        accumulating values, Jacobians, curls and divergences into strided
+ *        output arrays.
+ *
+ * Every non-null output first has the rows of all @p n DOFs cleared with
+ * zero_active_strided_rows(). Then, one modal polynomial at a time, the
+ * polynomial (and, when any derivative output is requested, the gradient of
+ * each term) is evaluated at all points into scratch rows, and those rows are
+ * added to the outputs with axpy_qpoints(), weighted by the entries of the
+ * matching sparse coefficient row.
+ *
+ * Output rows start at `out + row * output_stride`, with row index
+ *   - values / curls: dof * 3 + component
+ *   - Jacobians:      dof * 9 + component * 3 + derivative axis
+ *   - divergence:     dof
+ *
+ * @param monomials      Indexable polynomials exposing `terms` / `num_terms`.
+ * @param sparse_coeffs  Sparse coefficients (row_offsets / dofs / coefficients);
+ *                       checked to be n x n with n + 1 row offsets.
+ * @param n              Number of modal polynomials and of output DOFs.
+ * @param points         Evaluation points.
+ * @param output_stride  Distance between consecutive output rows.
+ * @param power_limits   Per-axis limits handed to fill_batched_power_tables().
+ * @param values_out     Destination for values, or nullptr to skip.
+ * @param jacobians_out  Destination for Jacobians, or nullptr to skip.
+ * @param curls_out      Destination for curls, or nullptr to skip.
+ * @param divergence_out Destination for divergences, or nullptr to skip.
+ * @param family_name    Forwarded to validate_vector_strided_outputs().
+ *
+ * Working storage comes from the object returned by vector_basis_scratch().
+ */
+template <typename ModalPolynomials>
+void evaluate_nodal_modal_vector_strided_with_limits(
+    const ModalPolynomials& monomials,
+    const SparseModalCoefficientMatrix& sparse_coeffs,
+    std::size_t n,
+    const std::vector<Vec3>& points,
+    std::size_t output_stride,
+    const std::array<int, 3>& power_limits,
+    Real* SVMP_RESTRICT values_out,
+    Real* SVMP_RESTRICT jacobians_out,
+    Real* SVMP_RESTRICT curls_out,
+    Real* SVMP_RESTRICT divergence_out,
+    const char* family_name) {
+    const std::size_t point_count = points.size();
+    validate_vector_strided_outputs(point_count, output_stride, family_name);
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.rows == n && sparse_coeffs.cols == n &&
+            sparse_coeffs.row_offsets.size() == n + 1u,
+        "evaluate_nodal_modal_vector_strided: sparse coefficient size mismatch");
+    BASIS_CHECK_CONSTRUCTION(
+        sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
+        "evaluate_nodal_modal_vector_strided: sparse coefficient entry mismatch");
+
+    auto& workspace = vector_basis_scratch();
+    const bool want_values = values_out != nullptr;
+    const bool want_jacobians = jacobians_out != nullptr;
+    const bool want_curls = curls_out != nullptr;
+    const bool want_divergence = divergence_out != nullptr;
+
+    // Outputs are accumulated into below, so start every requested one from zero.
+    if (want_values) {
+        zero_active_strided_rows(values_out, n * 3u, output_stride, point_count);
+    }
+    if (want_jacobians) {
+        zero_active_strided_rows(jacobians_out, n * 9u, output_stride, point_count);
+    }
+    if (want_curls) {
+        zero_active_strided_rows(curls_out, n * 3u, output_stride, point_count);
+    }
+    if (want_divergence) {
+        zero_active_strided_rows(divergence_out, n, output_stride, point_count);
+    }
+    if (point_count == 0 || n == 0) {
+        return;
+    }
+
+    fill_batched_power_tables(points, power_limits, workspace);
+    const auto& x_powers = workspace.batched_px;
+    const auto& y_powers = workspace.batched_py;
+    const auto& z_powers = workspace.batched_pz;
+    const std::size_t power_stride = point_count;
+    const bool want_gradient = want_jacobians || want_curls || want_divergence;
+
+    auto& value_rows = workspace.modal_values_batched;
+    auto& jacobian_rows = workspace.modal_jacobians_batched;
+    auto& curl_rows = workspace.modal_curls_batched;
+    auto& divergence_row = workspace.modal_divergence_batched;
+
+    for (std::size_t mode = 0; mode < n; ++mode) {
+        // Fresh scratch rows for this modal polynomial.
+        if (want_values) {
+            value_rows.assign(3u * point_count, Real(0));
+        }
+        if (want_jacobians) {
+            jacobian_rows.assign(9u * point_count, Real(0));
+        }
+        if (want_curls) {
+            curl_rows.assign(3u * point_count, Real(0));
+        }
+        if (want_divergence) {
+            divergence_row.assign(point_count, Real(0));
+        }
+
+        const auto& polynomial = monomials[mode];
+        for (int k = 0; k < polynomial.num_terms; ++k) {
+            const auto& term = polynomial.terms[static_cast<std::size_t>(k)];
+            const std::size_t component = static_cast<std::size_t>(term.component);
+
+            if (want_values) {
+                Real* const value_dst = value_rows.data() + component * point_count;
+                for (std::size_t q = 0; q < point_count; ++q) {
+                    value_dst[q] +=
+                        term.coefficient *
+                        batched_power_product(
+                            x_powers, y_powers, z_powers, power_stride, term.px, term.py, term.pz, q);
+                }
+            }
+
+            if (!want_gradient) {
+                continue;
+            }
+
+            // Scratch Jacobian rows are laid out as (component * 3 + axis).
+            Real* const jacobian_dst =
+                want_jacobians ? jacobian_rows.data() + component * 3u * point_count : nullptr;
+            Real* const curl_dst = want_curls ? curl_rows.data() : nullptr;
+            Real* const divergence_dst = want_divergence ? divergence_row.data() : nullptr;
+
+            // Coefficient-scaled partial derivative of this term along `axis` at point q.
+            const auto scaled_partial = [&](int axis, std::size_t q) -> Real {
+                return term.coefficient *
+                       batched_component_partial(
+                           x_powers, y_powers, z_powers, power_stride, term.px, term.py, term.pz, axis, q);
+            };
+
+            for (std::size_t q = 0; q < point_count; ++q) {
+                const Real ddx = scaled_partial(0, q);
+                const Real ddy = scaled_partial(1, q);
+                const Real ddz = scaled_partial(2, q);
+
+                if (want_jacobians) {
+                    jacobian_dst[q] += ddx;
+                    jacobian_dst[point_count + q] += ddy;
+                    jacobian_dst[2u * point_count + q] += ddz;
+                }
+                if (want_curls) {
+                    const Vec3 term_curl =
+                        curl_from_component_gradient(term.component, ddx, ddy, ddz);
+                    curl_dst[q] += term_curl[0];
+                    curl_dst[point_count + q] += term_curl[1];
+                    curl_dst[2u * point_count + q] += term_curl[2];
+                }
+                if (want_divergence) {
+                    // The divergence picks the derivative along the term's own component.
+                    Real own_axis_derivative = ddz;
+                    if (term.component == 0) {
+                        own_axis_derivative = ddx;
+                    } else if (term.component == 1) {
+                        own_axis_derivative = ddy;
+                    }
+                    divergence_dst[q] += own_axis_derivative;
+                }
+            }
+        }
+
+        // Scatter this polynomial's scratch rows into every DOF of its sparse row.
+        const std::size_t entry_begin = sparse_coeffs.row_offsets[mode];
+        const std::size_t entry_end = sparse_coeffs.row_offsets[mode + 1u];
+        for (std::size_t entry = entry_begin; entry < entry_end; ++entry) {
+            const std::size_t dof = sparse_coeffs.dofs[entry];
+            const Real weight = sparse_coeffs.coefficients[entry];
+
+            // Adds weight * scratch row j to output row (dof * width + j), j = 0 .. width-1.
+            const auto scatter_rows = [&](Real* out, auto& scratch_rows, std::size_t width) {
+                for (std::size_t j = 0; j < width; ++j) {
+                    axpy_qpoints(out + (dof * width + j) * output_stride,
+                                 scratch_rows.data() + j * point_count,
+                                 weight,
+                                 point_count);
+                }
+            };
+
+            if (want_values) {
+                scatter_rows(values_out, value_rows, 3u);
+            }
+            if (want_jacobians) {
+                scatter_rows(jacobians_out, jacobian_rows, 9u);
+            }
+            if (want_curls) {
+                scatter_rows(curls_out, curl_rows, 3u);
+            }
+            if (want_divergence) {
+                scatter_rows(divergence_out, divergence_row, 1u);
+            }
+        }
+    }
+}
+
+// ND-family counterparts of the RT monomial helpers (declarations only).
+// NOTE(review): the meaning of the four integers in a candidate and the
+// "transformed" evaluation are defined elsewhere -- confirm there. The
+// px / py / pz arguments are tables of coordinate powers.
+std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(
+    ElementType type, int order);
+
+Real eval_transformed_nd_monomial_scalar(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+Vec3 eval_transformed_nd_monomial_curl(
+    const std::array<int, 4>& mono,
+    const std::vector<Real>& px,
+    const std::vector<Real>& py,
+    const std::vector<Real>& pz);
+
+
+} // namespace vector_common
+} // namespace detail
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
new file mode 100644
index 000000000..6e1a7202b
--- /dev/null
+++ b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
@@ -0,0 +1,77 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
+#define SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
+
+#include "Types.h"
+
+#include <algorithm>
+#include <array>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+
+/// One monomial contribution to a single component of a vector-valued
+/// polynomial: coefficient * x^px * y^py * z^pz, placed in `component`.
+struct VectorBasisModalTerm {
+    int component = 0;           ///< Vector component: 0 = x, 1 = y, 2 = z.
+    int px = 0;                  ///< Exponent of x.
+    int py = 0;                  ///< Exponent of y.
+    int pz = 0;                  ///< Exponent of z.
+    Real coefficient = Real(1);  ///< Scalar multiplier of the monomial.
+};
+
+/// Fixed-capacity list of modal terms. Only the first `num_terms` entries of
+/// `terms` are active; the helpers in this header ignore the rest.
+struct VectorBasisModalPolynomial {
+    std::array<VectorBasisModalTerm, 4> terms = {};  ///< Term storage (capacity 4).
+    int num_terms = 0;                               ///< Number of active terms.
+};
+
+/// Exact equality of two modal terms: same component, same exponents, and
+/// coefficients that compare equal with `==` (no tolerance).
+inline bool modal_terms_equal(const VectorBasisModalTerm& lhs,
+                              const VectorBasisModalTerm& rhs) noexcept {
+    if (lhs.component != rhs.component) {
+        return false;
+    }
+    const bool same_exponents =
+        lhs.px == rhs.px && lhs.py == rhs.py && lhs.pz == rhs.pz;
+    return same_exponents && lhs.coefficient == rhs.coefficient;
+}
+
+/// Two polynomials are equal when they have the same number of active terms
+/// and those terms match pairwise, in order, under modal_terms_equal().
+/// Storage beyond `num_terms` is not compared.
+inline bool modal_polynomials_equal(const VectorBasisModalPolynomial& lhs,
+                                    const VectorBasisModalPolynomial& rhs) noexcept {
+    if (lhs.num_terms != rhs.num_terms) {
+        return false;
+    }
+    int term = 0;
+    while (term < lhs.num_terms) {
+        const std::size_t slot = static_cast<std::size_t>(term);
+        if (!modal_terms_equal(lhs.terms[slot], rhs.terms[slot])) {
+            return false;
+        }
+        ++term;
+    }
+    return true;
+}
+
+/// Append `polynomial` to `polynomials` unless an equal one (per
+/// modal_polynomials_equal()) is already stored.
+///
+/// @return true if the polynomial was appended, false if it was a duplicate.
+inline bool append_unique_modal_polynomial(
+    std::vector<VectorBasisModalPolynomial>& polynomials,
+    const VectorBasisModalPolynomial& polynomial) {
+    for (const VectorBasisModalPolynomial& existing : polynomials) {
+        if (modal_polynomials_equal(existing, polynomial)) {
+            return false;  // Already present: leave the list untouched.
+        }
+    }
+    polynomials.push_back(polynomial);
+    return true;
+}
+
+} // namespace basis
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
diff --git a/Code/Source/solver/FE/Common/Alignment.h b/Code/Source/solver/FE/Common/Alignment.h
new file mode 100644
index 000000000..8d33a7a7a
--- /dev/null
+++ b/Code/Source/solver/FE/Common/Alignment.h
@@ -0,0 +1,23 @@
+#ifndef SVMP_FE_CORE_ALIGNMENT_H
+#define SVMP_FE_CORE_ALIGNMENT_H
+
+/**
+ * @file Alignment.h
+ * @brief Global alignment constants used across FE modules.
+ */
+
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+
+/// Byte alignment preferred for performance-critical arrays
+/// (chosen with cache lines and SIMD loads in mind).
+inline constexpr std::size_t kFEPreferredAlignmentBytes{64u};
+
+/// Byte alignment for small fixed-size math objects that are commonly
+/// passed by value.
+inline constexpr std::size_t kFEFixedObjectAlignmentBytes{32u};
+
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_CORE_ALIGNMENT_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
new file mode 100644
index 000000000..60312a524
--- /dev/null
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -0,0 +1,532 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See Copyright-SimVascular.txt for additional details.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SVMP_FE_TYPES_H
+#define SVMP_FE_TYPES_H
+
+/**
+ * @file Types.h
+ * @brief Fundamental type definitions for the finite element library
+ *
+ * This header provides core type aliases, enumerations, and strong type
+ * definitions used throughout the FE library. It establishes a consistent
+ * type system that integrates with the Mesh library while maintaining
+ * independence from backend-specific types.
+ */
+
+#if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
+#  include "Mesh/Core/MeshTypes.h"
+#  define SVMP_FE_HAS_MESH_TYPES 1
+#else
+// Build FE without Mesh types unless explicitly enabled.
+#  define SVMP_FE_HAS_MESH_TYPES 0
+#endif
+
+#if !SVMP_FE_HAS_MESH_TYPES
+namespace svmp {
+// Minimal fallback when the Mesh library is not available.
+// Keeps FE compilation self-contained while preserving the same namespace.
+#ifndef SVMP_CELL_FAMILY_DEFINED
+#define SVMP_CELL_FAMILY_DEFINED 1
+enum class CellFamily {
+    Point,
+    Line,
+    Triangle,
+    Quad,
+    Tetra,
+    Hex,
+    Wedge,
+    Pyramid,
+    Polygon,
+    Polyhedron
+};
+#endif
+} // namespace svmp
+#endif
+#include <cstdint>
+#include <array>
+#include <string>
+#include <type_traits>
+#include <limits>
+
+#if defined(_MSC_VER)
+#  define SVMP_RESTRICT __restrict
+#elif defined(__clang__) || defined(__GNUC__)
+#  define SVMP_RESTRICT __restrict__
+#else
+#  define SVMP_RESTRICT
+#endif
+
+namespace svmp {
+namespace FE {
+
+// ============================================================================
+// Index Types
+// ============================================================================
+
+/**
+ * @brief Local index type for element-level operations
+ *
+ * Used for local node numbering within elements, local DOF indices,
+ * and other element-local indexing. Unsigned for safety.
+ */
+using LocalIndex = std::uint32_t;
+
+/**
+ * @brief Global index type for distributed DOF numbering
+ *
+ * Signed 64-bit for compatibility with PETSc and Trilinos.
+ * Negative values can indicate special conditions or invalid indices.
+ */
+using GlobalIndex = std::int64_t;
+
+/**
+ * @brief DOF-specific index type
+ *
+ * Strong type alias to prevent mixing DOF indices with other indices.
+ * Provides type safety at compile time.
+ */
+struct DofIndex {
+    GlobalIndex value;  // wrapped global DOF number; negative values fail is_valid()
+
+    constexpr explicit DofIndex(GlobalIndex v = -1) noexcept : value(v) {}  // default-constructs to -1 (not valid)
+    constexpr operator GlobalIndex() const noexcept { return value; }  // implicit conversion back to the raw index
+    constexpr bool is_valid() const noexcept { return value >= 0; }  // true only for non-negative indices
+};
+
+/**
+ * @brief Field identifier type
+ *
+ * Used to distinguish between different physical fields in multi-field problems.
+ */
+using FieldId = std::uint16_t;
+
+/**
+ * @brief Block identifier for block-structured systems
+ */
+using BlockId = std::uint16_t;
+
+// Import mesh library scalar/index types when available (optional dependency).
+#if SVMP_FE_HAS_MESH_TYPES
+using MeshIndex = svmp::index_t;
+using MeshOffset = svmp::offset_t;
+using MeshGlobalId = svmp::gid_t;
+using Real = svmp::real_t;  // Use same precision as Mesh library
+#else
+using MeshIndex = std::int32_t;
+using MeshOffset = std::int64_t;
+using MeshGlobalId = std::int64_t;
+using Real = double;
+#endif
+
+// ============================================================================
+// Constants
+// ============================================================================
+
+constexpr LocalIndex INVALID_LOCAL_INDEX = std::numeric_limits<LocalIndex>::max();
+constexpr GlobalIndex INVALID_GLOBAL_INDEX = -1;
+constexpr FieldId INVALID_FIELD_ID = std::numeric_limits<FieldId>::max();
+/// Sentinel FieldId for geometry-only quantities (no DOF dependence).
+/// Uses first registered field's space for quadrature, but logically decoupled
+/// from any specific field's DOFs.
+constexpr FieldId GEOMETRY_FIELD_ID = std::numeric_limits<FieldId>::max() - 1;
+constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
+
+/**
+ * @brief Sentinel FieldId representing "the current solution state" in tangent forms.
+ *
+ * When differentiating a residual form to obtain the tangent (Jacobian), undifferentiated
+ * TrialFunction occurrences are rewritten to StateField nodes. Those that represent the
+ * block's own primary unknown (rather than a named external field) use this sentinel
+ * FieldId. The assembler maps it to the current solution coefficients at each quadrature
+ * point, regardless of which physics or field variables are involved.
+ *
+ * This is distinct from INVALID_FIELD_ID, which means "uninitialized / no field."
+ * CURRENT_SOLUTION_FIELD_ID uses the same numeric value for backward compatibility
+ * with existing KernelIR encodings, but carries explicit semantic intent.
+ */
+constexpr FieldId CURRENT_SOLUTION_FIELD_ID = std::numeric_limits<FieldId>::max();
+
+// ============================================================================
+// Field Value Entry (for point evaluation of field-dependent expressions)
+// ============================================================================
+
+/// Maximum number of components in a FieldValueEntry (3x3 tensor).
+constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
+
+/**
+ * @brief Field value at an evaluation point — scalar, vector, or tensor.
+ *
+ * Used by PointEvaluator and the auxiliary assembly path to supply FE
+ * field values at entity locations (e.g., nodal DOF values for
+ * Node-scoped auxiliary models with Lagrange Kronecker delta).
+ */
+struct FieldValueEntry {
+    FieldId field{INVALID_FIELD_ID};
+    int n_components{0};
+    Real components[MAX_FIELD_VALUE_COMPONENTS]{};
+};
+
+// ============================================================================
+// Element Type Enumerations
+// ============================================================================
+
+/**
+ * @brief Reference element types supported by the FE library
+ *
+ * Maps to svmp::CellFamily from the Mesh library but provides
+ * FE-specific categorization including higher-order variants.
+ */
+enum class ElementType : std::uint8_t {
+    // Linear elements
+    Line2      = 0,   // 2-node line
+    Triangle3  = 1,   // 3-node triangle
+    Quad4      = 2,   // 4-node quadrilateral
+    Tetra4     = 3,   // 4-node tetrahedron
+    Hex8       = 4,   // 8-node hexahedron
+    Wedge6     = 5,   // 6-node wedge/prism
+    Pyramid5   = 6,   // 5-node pyramid
+
+    // Quadratic elements
+    Line3      = 10,  // 3-node line
+    Triangle6  = 11,  // 6-node triangle
+    Quad9      = 12,  // 9-node quadrilateral (bi-quadratic)
+    Quad8      = 13,  // 8-node quadrilateral (serendipity)
+    Tetra10    = 14,  // 10-node tetrahedron
+    Hex27      = 15,  // 27-node hexahedron (tri-quadratic)
+    Hex20      = 16,  // 20-node hexahedron (serendipity)
+    Wedge15    = 17,  // 15-node wedge
+    Wedge18    = 18,  // 18-node wedge (complete quadratic)
+    Pyramid13  = 19,  // 13-node pyramid
+    Pyramid14  = 20,  // 14-node pyramid
+
+    // Special elements
+    Point1     = 30,  // 1-node point element
+
+    Unknown    = 255
+};
+
+/**
+ * @brief Quadrature rule types
+ */
+enum class QuadratureType : std::uint8_t {
+    GaussLegendre,     // Standard Gaussian quadrature
+    GaussLobatto,      // Includes endpoints (for spectral elements)
+    Newton,            // Newton-Cotes rules
+    Reduced,           // Order-based reduced integration for locking
+    PositionBased,     // Position-based reduced integration (legacy compatible)
+    Composite,         // Composite rules for adaptivity
+    Custom             // User-defined quadrature points
+};
+
+/**
+ * @brief Basis function families
+ */
+enum class BasisType : std::uint8_t {
+    Lagrange,          // Standard nodal Lagrange basis
+    Hierarchical,      // Hierarchical/modal basis
+    Bernstein,         // Bernstein polynomials
+    NURBS,             // Non-uniform rational B-splines
+    BSpline,           // Non-rational B-spline basis
+    Spectral,          // Spectral element basis
+    Serendipity,       // Serendipity elements
+    Hermite,           // Hermite C1 continuity basis
+    RaviartThomas,     // H(div) Raviart-Thomas family
+    Nedelec,           // H(curl) Nedelec edge elements
+    BDM,               // H(div) Brezzi-Douglas-Marini family
+    Bubble,            // Interior bubble functions for enrichment
+    Custom             // User-defined basis
+};
+
+/**
+ * @brief Field types for function spaces
+ */
+enum class FieldType : std::uint8_t {
+    Scalar,            // Scalar field (temperature, pressure)
+    Vector,            // Vector field (velocity, displacement)
+    Tensor,            // Tensor field (stress, strain)
+    SymmetricTensor,   // Symmetric tensor field
+    Mixed              // Mixed/composite field
+};
+
+/**
+ * @brief Continuity requirements for function spaces
+ */
+enum class Continuity : std::uint8_t {
+    C0,                // Continuous (standard FEM)
+    C1,                // C1 continuous (for plates/shells)
+    L2,                // L2 (discontinuous)
+    H_div,             // H(div) conforming
+    H_curl,            // H(curl) conforming
+    Custom
+};
+
+/**
+ * @brief Assembly strategies
+ */
+enum class AssemblyStrategy : std::uint8_t {
+    ElementByElement,  // Traditional element loop
+    Vectorized,        // SIMD vectorized assembly
+    MatrixFree,        // Matrix-free operators
+    Hybrid             // Mixed strategy
+};
+
+/**
+ * @brief Status codes for FE operations
+ */
+enum class FEStatus : std::uint8_t {
+    Success           = 0,
+    InvalidArgument   = 1,
+    InvalidElement    = 2,
+    SingularMapping   = 3,
+    QuadratureError   = 4,
+    AssemblyError     = 5,
+    BackendError      = 6,
+    NotImplemented    = 7,
+    ConvergenceError  = 8,
+    AllocationError   = 9,
+    MPIError          = 10,
+    IOError           = 11,
+    Unknown           = 255
+};
+
+// ============================================================================
+// Geometric Types
+// ============================================================================
+
+/**
+ * @brief Point in reference element coordinates
+ */
+template<int Dim>
+using ReferencePoint = std::array<Real, static_cast<std::size_t>(Dim)>;
+
+/**
+ * @brief Point in physical coordinates
+ */
+using PhysicalPoint = std::array<Real, 3>;
+
+/**
+ * @brief Jacobian matrix type
+ */
+template<int SpatialDim, int ReferenceDim = SpatialDim>
+using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
+
+// ============================================================================
+// Strong Type Wrappers (C++17 idiom for type safety)
+// ============================================================================
+
+/**
+ * @brief Strong type wrapper template for type-safe programming
+ *
+ * Prevents accidental mixing of conceptually different types that have
+ * the same underlying representation.
+ */
+template<typename T, typename Tag>
+class StrongType {
+public:
+    using ValueType = T;
+    // Value-initializes the wrapped object.
+    constexpr StrongType() noexcept(std::is_nothrow_default_constructible_v<T>)
+        : value_{} {}
+    // Wraps an existing value; explicit so a bare T never converts silently.
+    constexpr explicit StrongType(T value) noexcept(std::is_nothrow_move_constructible_v<T>)
+        : value_(std::move(value)) {}
+    // Direct access to the wrapped value (mutable and read-only overloads).
+    constexpr T& get() noexcept { return value_; }
+    constexpr const T& get() const noexcept { return value_; }
+
+    // Explicit conversion returns a copy of T, so it is noexcept only when that copy cannot throw.
+    constexpr explicit operator T() const noexcept(std::is_nothrow_copy_constructible_v<T>) { return value_; }
+
+    // Comparison operators forward to the wrapped values; only same-Tag wrappers can be compared.
+    constexpr bool operator==(const StrongType& other) const noexcept {
+        return value_ == other.value_;
+    }
+    constexpr bool operator!=(const StrongType& other) const noexcept {
+        return value_ != other.value_;
+    }
+    constexpr bool operator<(const StrongType& other) const noexcept {
+        return value_ < other.value_;
+    }
+
+private:
+    T value_;
+};
+
+// Specific strong types for common use cases
+struct QuadraturePointTag {};
+struct QuadratureWeightTag {};
+struct BasisValueTag {};
+struct BasisGradientTag {};
+
+using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
+using QuadratureWeight = StrongType<Real, QuadratureWeightTag>;
+
+// ============================================================================
+// Type Traits
+// ============================================================================
+
+/**
+ * @brief Check if a type is a valid index type
+ */
+template<typename T>
+struct is_index_type : std::false_type {};
+
+template<>
+struct is_index_type<LocalIndex> : std::true_type {};
+
+template<>
+struct is_index_type<GlobalIndex> : std::true_type {};
+
+template<>
+struct is_index_type<DofIndex> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_index_type_v = is_index_type<T>::value;
+
+/**
+ * @brief Check if a type represents a field type
+ */
+template<typename T>
+struct is_field_type : std::false_type {};
+
+template<>
+struct is_field_type<FieldType> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_field_type_v = is_field_type<T>::value;
+
+// ============================================================================
+// Utility Functions
+// ============================================================================
+
+/**
+ * @brief Convert FE ElementType to Mesh CellFamily
+ */
+constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
+    using Family = svmp::CellFamily;
+    switch (elem) {
+        // Two- and three-node segments.
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return Family::Line;
+        // Linear and quadratic triangles.
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return Family::Triangle;
+        // Every quadrilateral variant, including the serendipity Quad8.
+        case ElementType::Quad4:
+        case ElementType::Quad8:
+        case ElementType::Quad9:
+            return Family::Quad;
+        // Linear and quadratic tetrahedra.
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return Family::Tetra;
+        // Every hexahedral variant, including the serendipity Hex20.
+        case ElementType::Hex8:
+        case ElementType::Hex20:
+        case ElementType::Hex27:
+            return Family::Hex;
+        // Wedges / prisms.
+        case ElementType::Wedge6:
+        case ElementType::Wedge15:
+        case ElementType::Wedge18:
+            return Family::Wedge;
+        // Pyramids.
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            return Family::Pyramid;
+        // Point elements and every value not listed above (e.g. Unknown) map to Point.
+        case ElementType::Point1:
+        default:
+            return Family::Point;
+    }
+}
+
+/**
+ * @brief Get spatial dimension of element type
+ */
+constexpr int element_dimension(ElementType elem) noexcept {
+    switch(elem) {
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+        case ElementType::Hex8:
+        case ElementType::Hex20:
+        case ElementType::Hex27:
+        case ElementType::Wedge6:
+        case ElementType::Wedge15:
+        case ElementType::Wedge18:
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            return 3;  // volume elements
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+        case ElementType::Quad4:
+        case ElementType::Quad8:
+        case ElementType::Quad9:
+            return 2;  // surface elements
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return 1;  // segments
+        case ElementType::Point1:
+            return 0;  // a single point has no extent
+        default:
+            return -1;  // Unknown or any unlisted element type
+    }
+}
+
+/**
+ * @brief Convert status code to string for error reporting
+ */
+inline const char* status_to_string(FEStatus status) noexcept {
+    // Messages indexed by the numeric value of each FEStatus enumerator (Success = 0 ... IOError = 11).
+    static constexpr const char* kMessages[] = {
+        "Success",          "Invalid argument", "Invalid element",
+        "Singular mapping", "Quadrature error", "Assembly error",
+        "Backend error",    "Not implemented",  "Convergence error",
+        "Allocation error", "MPI error",        "I/O error",
+    };
+    constexpr std::size_t kCount = sizeof(kMessages) / sizeof(kMessages[0]);
+    static_assert(kCount == static_cast<std::size_t>(FEStatus::IOError) + 1u,
+                  "message table must cover Success through IOError");
+
+    // Anything outside the table (FEStatus::Unknown or an unnamed value) gets the generic text.
+    const auto code = static_cast<std::size_t>(status);
+    if (code < kCount) { return kMessages[code]; }
+    return "Unknown error";
+}
+
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_TYPES_H
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
new file mode 100644
index 000000000..7d909fa0c
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -0,0 +1,480 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#include "DenseLinearAlgebra.h"
+
+#include "FEException.h"
+
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+#include <Eigen/Dense>
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <utility>
+
+#define DENSE_LINALG_CHECK(condition, message) /* throws FEException when condition is false; (message) is built only then */ \
+    do { if (!(condition)) { ::svmp::FE::throw_if<::svmp::FE::FEException>(true, SVMP_HERE, (message)); } } while (false)
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+namespace {
+
+constexpr std::size_t kDenseSolveRhsBlock = 32u;
+
+void materialize_inverse_from_solver(const DenseLUSolver& solver,
+                                     std::vector<Real>& inverse) {
+    // Seed the output with a row-major identity: its diagonal entries sit order + 1 apart.
+    const std::size_t order = solver.n;
+    inverse.assign(order * order, Real(0));
+    for (std::size_t flat = 0; flat < inverse.size(); flat += order + 1u) { inverse[flat] = Real(1); }
+    // Solving in place against those identity columns leaves the inverse in `inverse`.
+    solver.solve_in_place(std::span<Real>(inverse.data(), inverse.size()), order);
+}
+
+} // namespace
+
+Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept {
+    Real largest = Real(0);  // also the result for an empty span
+    for (const Real entry : matrix) {
+        if (const Real magnitude = std::abs(entry); largest < magnitude) { largest = magnitude; }
+    }
+    return largest;
+}
+
+Real dense_matrix_pivot_tolerance(std::size_t rows,
+                                  std::size_t cols,
+                                  Real max_abs,
+                                  Real multiplier) noexcept {
+    const Real extent = static_cast<Real>(std::max<std::size_t>(rows, cols));
+    Real tolerance = multiplier * std::numeric_limits<Real>::epsilon();  // machine-epsilon budget
+    tolerance *= std::max(Real(1), extent);         // grows with the larger matrix dimension
+    return tolerance * std::max(Real(1), max_abs);  // and with the entry scale, floored at one
+}
+
+Real dense_matrix_singular_value_tolerance(std::size_t rows,
+                                           std::size_t cols,
+                                           Real largest_singular_value,
+                                           Real multiplier) noexcept {
+    const Real extent = static_cast<Real>(std::max<std::size_t>(rows, cols));
+    Real tolerance = multiplier * std::numeric_limits<Real>::epsilon();  // machine-epsilon budget
+    tolerance *= std::max(Real(1), extent);  // grows with the larger matrix dimension
+    return tolerance * std::max(Real(1), largest_singular_value);  // and with sigma_max, floored at one
+}
+
+Real dense_matrix_condition_fallback_threshold() noexcept {
+    return Real(1.0e12);  // condition estimate above which invert_dense_matrix_with_diagnostics takes its SVD path (Eigen builds)
+}
+
+Real dense_matrix_condition_error_threshold() noexcept {
+    return Real(1.0e14);  // NOTE(review): presumably the default max_condition for validate_dense_inverse_diagnostics; confirm in the header
+}
+
+void DenseLUSolver::solve_in_place(std::span<Real> rhs) const {
+    solve_in_place(rhs, 1u);  // single right-hand side: forward to the multi-RHS overload with one column
+}
+
+void DenseLUSolver::solve_in_place(std::span<Real> rhs,  // Overwrites rhs with the solution for rhs_count right-hand sides.
+                                   std::size_t rhs_count) const {  // rhs is indexed as an n x rhs_count row-major block
+    DENSE_LINALG_CHECK(rhs_count > 0,
+                             label + ": dense solve requires at least one right-hand side");
+    DENSE_LINALG_CHECK(rhs.size() == n * rhs_count,
+                             label + ": dense multi-RHS solve size mismatch");
+    DENSE_LINALG_CHECK(lu.size() == n * n && pivots.size() == n,
+                             label + ": dense solver is not factorized");
+    // Step 1: replay the recorded row interchanges on rhs, in the order k = 0 .. n-1.
+    for (std::size_t k = 0; k < n; ++k) {
+        if (pivots[k] != k) {
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {  // RHS columns are visited in chunks of kDenseSolveRhsBlock
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    std::swap(rhs[k * rhs_count + r],
+                              rhs[pivots[k] * rhs_count + r]);
+                }
+            }
+        }
+    }
+    // Step 2: forward substitution with the strictly-lower entries of lu (no division, i.e. a unit diagonal).
+    for (std::size_t row = 0; row < n; ++row) {
+        for (std::size_t col = 0; col < row; ++col) {
+            const Real factor = lu[row * n + col];  // stored elimination multiplier
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
+                }
+            }
+        }
+    }
+    // Step 3: back substitution with the upper triangle of lu, walking rows from n-1 down to 0.
+    for (std::size_t rev = 0; rev < n; ++rev) {
+        const std::size_t row = n - 1u - rev;  // reverse index; avoids counting an unsigned value below zero
+        for (std::size_t col = row + 1u; col < n; ++col) {
+            const Real factor = lu[row * n + col];
+            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+                const std::size_t end =
+                    std::min(rhs_count, block + kDenseSolveRhsBlock);
+                for (std::size_t r = block; r < end; ++r) {
+                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
+                }
+            }
+        }
+        const Real pivot = lu[row * n + row];  // diagonal entry of the upper factor
+        DENSE_LINALG_CHECK(
+            std::abs(pivot) > pivot_tolerance,
+            label + ": zero pivot during dense solve");
+        for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
+            const std::size_t end =
+                std::min(rhs_count, block + kDenseSolveRhsBlock);
+            for (std::size_t r = block; r < end; ++r) {
+                rhs[row * rhs_count + r] /= pivot;  // finish the row by dividing by the diagonal
+            }
+        }
+    }
+}
+
+std::vector<Real> DenseLUSolver::solve(std::span<const Real> rhs) const {
+    std::vector<Real> solution(rhs.begin(), rhs.end());  // private copy, so the caller's rhs is never modified
+    solve_in_place(std::span<Real>(solution));           // single right-hand side, overwritten with the answer
+    return solution;
+}
+
+DenseMatrixDiagnostics dense_matrix_diagnostics(  // Reports rank, tolerance and conditioning data for a rows x cols matrix.
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             std::string(label) + ": diagnostic size mismatch");
+    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
+                             std::string(label) + ": diagnostics require a nonempty matrix");
+    // Eigen builds: singular values from a Jacobi SVD drive the rank and the condition estimate.
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),  // the input buffer is interpreted as row-major
+                                             static_cast<Eigen::Index>(rows),
+                                             static_cast<Eigen::Index>(cols));
+    const Matrix dense = A;  // copy into Eigen's default storage order for the decomposition
+    Eigen::JacobiSVD<Matrix> svd(dense);  // no U/V requested: only singular values are read below
+
+    DenseMatrixDiagnostics diagnostics;
+    const auto& singular_values = svd.singularValues();
+    diagnostics.largest_singular_value =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);  // first entry is taken as the largest
+    diagnostics.tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols,
+                                              diagnostics.largest_singular_value);
+    // Count the singular values above the tolerance; the last one counted is kept as the smallest retained.
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        const Real sigma = singular_values[i];
+        if (sigma <= diagnostics.tolerance) {
+            continue;
+        }
+        ++diagnostics.rank;
+        diagnostics.smallest_retained_singular_value = sigma;
+    }
+    // A condition estimate is formed only when every singular value was retained (full rank).
+    const std::size_t full_rank = std::min(rows, cols);
+    if (diagnostics.rank == full_rank &&
+        diagnostics.smallest_retained_singular_value > Real(0)) {
+        diagnostics.condition_estimate =
+            diagnostics.largest_singular_value /
+            diagnostics.smallest_retained_singular_value;
+    }
+    return diagnostics;
+#else
+    DenseMatrixDiagnostics diagnostics;
+    diagnostics.largest_singular_value = dense_matrix_max_abs(matrix);  // max |entry| stands in for the largest singular value
+    diagnostics.tolerance =
+        dense_matrix_pivot_tolerance(rows, cols, diagnostics.largest_singular_value);
+    diagnostics.rank =
+        dense_matrix_rank(std::vector<Real>(matrix.begin(), matrix.end()), rows, cols);  // rank is computed on a copy
+    const std::size_t full_rank = std::min(rows, cols);
+    if (diagnostics.rank == full_rank) {
+        diagnostics.smallest_retained_singular_value = diagnostics.tolerance;  // placeholder: no singular values exist on this path
+    }
+    // Exact condition estimates require SVD diagnostics. In Eigen-disabled
+    // builds this stays explicit instead of relying on a misleading estimate.
+    diagnostics.condition_estimate = std::numeric_limits<Real>::infinity();
+    return diagnostics;
+#endif
+}
+
+DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,  // Factors an n x n row-major matrix in place using row pivoting.
+                                  std::size_t n,
+                                  std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == n * n,
+                             std::string(label) + ": dense factorization size mismatch");
+    // The solver takes over the matrix storage; the pivot tolerance is derived from its largest |entry|.
+    DenseLUSolver solver;
+    solver.n = n;
+    solver.lu = std::move(matrix);
+    solver.pivots.resize(n);
+    const Real max_abs = dense_matrix_max_abs(solver.lu);
+    solver.pivot_tolerance =
+        dense_matrix_pivot_tolerance(n, n, max_abs);
+    solver.label = std::string(label);
+    // Track the extreme pivot magnitudes; they feed the pivot-based condition estimate at the end.
+    Real max_pivot_abs = Real(0);
+    Real min_pivot_abs = std::numeric_limits<Real>::infinity();
+    for (std::size_t col = 0; col < n; ++col) {
+        std::size_t pivot_row = col;
+        Real pivot_abs = std::abs(solver.lu[col * n + col]);
+        for (std::size_t row = col + 1; row < n; ++row) {  // pick the largest |entry| at or below the diagonal
+            const Real candidate = std::abs(solver.lu[row * n + col]);
+            if (candidate > pivot_abs) {
+                pivot_abs = candidate;
+                pivot_row = row;
+            }
+        }
+        // NOTE(review): std::to_string prints fixed six-decimal text, so a tiny tolerance shows up as 0.000000.
+        DENSE_LINALG_CHECK(
+            pivot_abs > solver.pivot_tolerance,
+            solver.label + ": rank-deficient matrix (rank " +
+                std::to_string(col) + " of " + std::to_string(n) +
+                ", pivot below scale-aware tolerance " +
+                std::to_string(solver.pivot_tolerance) + ")");
+
+        solver.pivots[col] = pivot_row;  // recorded so solve_in_place can replay the same interchange
+        if (pivot_row != col) {
+            for (std::size_t j = 0; j < n; ++j) {
+                std::swap(solver.lu[col * n + j], solver.lu[pivot_row * n + j]);
+            }
+        }
+        // NOTE(review): |pivot| equals pivot_abs, which passed the check above, so this guard looks redundant; confirm.
+        const Real pivot = solver.lu[col * n + col];
+        DENSE_LINALG_CHECK(
+            std::abs(pivot) > solver.pivot_tolerance,
+            solver.label + ": zero pivot after row exchange");
+        const Real pivot_magnitude = std::abs(pivot);
+        max_pivot_abs = std::max(max_pivot_abs, pivot_magnitude);
+        min_pivot_abs = std::min(min_pivot_abs, pivot_magnitude);
+        // Eliminate below the pivot; each multiplier is stored in the slot it zeroes out.
+        for (std::size_t row = col + 1; row < n; ++row) {
+            const Real factor = solver.lu[row * n + col] / pivot;
+            solver.lu[row * n + col] = factor;
+            for (std::size_t j = col + 1; j < n; ++j) {
+                solver.lu[row * n + j] -= factor * solver.lu[col * n + j];
+            }
+        }
+    }
+    // Pivot-based proxies: the "singular value" fields hold max |entry| and min |pivot|, not true singular values.
+    solver.diagnostics.rank = n;
+    solver.diagnostics.tolerance = solver.pivot_tolerance;
+    solver.diagnostics.largest_singular_value = max_abs;
+    solver.diagnostics.smallest_retained_singular_value =
+        std::isfinite(min_pivot_abs) ? min_pivot_abs : Real(0);  // stays infinite only when n == 0
+    if (solver.diagnostics.smallest_retained_singular_value > Real(0)) {
+        solver.diagnostics.condition_estimate =
+            max_pivot_abs / solver.diagnostics.smallest_retained_singular_value;
+    }
+    return solver;
+}
+
+DenseInverseResult invert_dense_matrix_with_diagnostics(  // Inverts an n x n row-major matrix and returns diagnostics with it.
+    std::vector<Real> matrix,
+    std::size_t n,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == n * n,
+                             std::string(label) + ": dense inverse size mismatch");
+    std::vector<Real> matrix_for_lu = matrix;  // LU consumes a copy; `matrix` stays intact for diagnostics and the SVD path
+    const DenseLUSolver solver =
+        factor_dense_matrix(std::move(matrix_for_lu), n, label);
+    // Diagnostics are computed from the untouched input, not from the LU factors.
+    DenseInverseResult result;
+    result.diagnostics =
+        dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
+                                 n, n, label);
+    // Eigen builds only: above the fallback threshold the inverse is rebuilt as V * diag(1/sigma) * U^T.
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    if (std::isfinite(solver.diagnostics.condition_estimate) &&
+        std::isfinite(result.diagnostics.condition_estimate) &&
+        result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
+        using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+        using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+        const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
+                                                 static_cast<Eigen::Index>(n),
+                                                 static_cast<Eigen::Index>(n));
+        const Matrix dense = A;
+        Eigen::JacobiSVD<Matrix> svd(dense,
+                                     Eigen::ComputeFullU | Eigen::ComputeFullV);
+        Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(n),
+                                            static_cast<Eigen::Index>(n));
+        const auto& singular_values = svd.singularValues();
+        for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+            DENSE_LINALG_CHECK(
+                singular_values[i] > solver.diagnostics.tolerance,  // NOTE(review): LU pivot tolerance, not result.diagnostics.tolerance; confirm intended
+                std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
+            sigma_inverse(i, i) = Real(1) / singular_values[i];
+        }
+        const Matrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+        result.inverse.assign(n * n, Real(0));
+        for (std::size_t row = 0; row < n; ++row) {  // copy back into the row-major output buffer
+            for (std::size_t col = 0; col < n; ++col) {
+                result.inverse[row * n + col] =
+                    inverse(static_cast<Eigen::Index>(row), static_cast<Eigen::Index>(col));
+            }
+        }
+        result.used_svd_fallback = true;
+        return result;
+    }
+#endif
+    // Default path: solve against the identity with the LU factors.
+    materialize_inverse_from_solver(solver, result.inverse);
+    return result;
+}
+
+void validate_dense_inverse_diagnostics(
+    const DenseInverseResult& result,
+    std::size_t expected_rank,
+    std::string_view label,
+    Real max_condition) {
+    // Checks the recorded rank first, then the condition estimate when it is finite.
+    const DenseMatrixDiagnostics& diag = result.diagnostics;
+    DENSE_LINALG_CHECK(
+        diag.rank == expected_rank,
+        std::string(label) + ": rank-deficient matrix (rank " +
+            std::to_string(diag.rank) + " of " + std::to_string(expected_rank) + ")");
+
+    // A non-finite estimate is not compared against the threshold.
+    if (std::isfinite(diag.condition_estimate)) {
+        DENSE_LINALG_CHECK(
+            diag.condition_estimate <= max_condition,
+            std::string(label) + ": condition estimate " +
+                std::to_string(diag.condition_estimate) +
+                " exceeds supported threshold " + std::to_string(max_condition));
+    }
+}
+
+std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+                                      std::size_t n,
+                                      std::string_view label) {
+    std::vector<Real> result;  // filled by materialize_inverse_from_solver below
+    const DenseLUSolver lu = factor_dense_matrix(std::move(matrix), n, label);
+    materialize_inverse_from_solver(lu, result);
+    return result;
+}
+
+std::size_t dense_matrix_rank(std::vector<Real> matrix,
+                              std::size_t rows,
+                              std::size_t cols) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             "dense_matrix_rank: size mismatch");
+    const Real tolerance =
+        dense_matrix_pivot_tolerance(rows, cols, dense_matrix_max_abs(matrix));
+    // Row reduction with partial pivoting on the by-value copy; the rank is the number of accepted pivots.
+    std::size_t rank = 0;
+    std::size_t pivot_row = 0;
+    for (std::size_t col = 0; col < cols && pivot_row < rows; ++col) {
+        std::size_t best_row = pivot_row;  // row holding the largest |entry| of this column
+        Real best_abs = std::abs(matrix[pivot_row * cols + col]);
+        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
+            const Real candidate = std::abs(matrix[row * cols + col]);
+            if (candidate > best_abs) {
+                best_abs = candidate;
+                best_row = row;
+            }
+        }
+        if (best_abs <= tolerance) {
+            continue;  // no usable pivot in this column; try the next one with the same pivot_row
+        }
+        // Columns left of `col` are never read again, so only [col, cols) needs swapping.
+        if (best_row != pivot_row) {
+            for (std::size_t c = col; c < cols; ++c) {
+                std::swap(matrix[pivot_row * cols + c], matrix[best_row * cols + c]);
+            }
+        }
+        // Eliminate below the pivot; partial pivoting keeps |factor| <= 1.
+        const Real pivot = matrix[pivot_row * cols + col];
+        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
+            const Real factor = matrix[row * cols + col] / pivot;
+            matrix[row * cols + col] = Real(0);
+            if (factor == Real(0)) {
+                continue;  // exact zero: nothing to subtract from this row
+            }
+            // `factor` is a ratio, so it is deliberately not compared with the absolute `tolerance`.
+            for (std::size_t c = col + 1; c < cols; ++c) {
+                matrix[row * cols + c] -= factor * matrix[pivot_row * cols + c];
+            }
+        }
+
+        ++rank;
+        ++pivot_row;
+    }
+    return rank;
+}
+
+DensePseudoInverseResult rank_revealing_pseudo_inverse(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label) {
+    DENSE_LINALG_CHECK(matrix.size() == rows * cols,
+                             std::string(label) + ": pseudo-inverse size mismatch");
+    DENSE_LINALG_CHECK(rows > 0 && cols > 0,
+                             std::string(label) + ": pseudo-inverse requires a nonempty matrix");
+    // With Eigen: SVD-based pseudo-inverse of the row-major rows x cols input; otherwise the #else branch applies.
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
+                                             static_cast<Eigen::Index>(rows),
+                                             static_cast<Eigen::Index>(cols));
+    const Matrix dense = A;  // owning copy of the mapped input handed to the SVD
+    Eigen::JacobiSVD<Matrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
+
+    DensePseudoInverseResult result;
+    result.inverse.assign(cols * rows, Real(0));  // the result is cols x rows, stored row-major
+    // Index 0 is taken as the largest singular value (Eigen documents them as sorted in decreasing order).
+    const auto& singular_values = svd.singularValues();
+    result.largest_singular_value =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+    result.tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols, result.largest_singular_value);
+    // Sigma^+ (cols x rows): reciprocals of the singular values above the tolerance, zero elsewhere.
+    Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(cols),
+                                        static_cast<Eigen::Index>(rows));
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        const Real sigma = singular_values[i];
+        if (sigma <= result.tolerance) {
+            continue;  // treated as zero: contributes neither to the rank nor to the inverse
+        }
+        sigma_inverse(i, i) = Real(1) / sigma;
+        ++result.rank;
+        result.smallest_retained_singular_value = sigma;  // overwritten until the last retained value
+    }
+    // pinv(A) = V * Sigma^+ * U^T, copied entry by entry into the row-major result.
+    const Matrix pseudo_inverse =
+        svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+    for (std::size_t r = 0; r < cols; ++r) {
+        for (std::size_t c = 0; c < rows; ++c) {
+            result.inverse[r * rows + c] =
+                pseudo_inverse(static_cast<Eigen::Index>(r), static_cast<Eigen::Index>(c));
+        }
+    }
+    return result;
+#else
+    DENSE_LINALG_CHECK(
+        false,
+        std::string(label) +
+            ": rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN");
+    return {};  // only reached if DENSE_LINALG_CHECK does not leave the function — TODO confirm macro behavior
+#endif
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#undef DENSE_LINALG_CHECK
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
new file mode 100644
index 000000000..7684439b5
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -0,0 +1,119 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_DENSELINEARALGEBRA_H
+#define SVMP_FE_MATH_DENSELINEARALGEBRA_H
+
+#include "Types.h"
+
+#include <cstddef>
+#include <limits>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+// Dense solve, inverse, rank, and pseudo-inverse support for FE construction
+// utilities. Matrices are row-major: matrix[row * cols + col].
+[[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
+
+[[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
+                                                std::size_t cols,
+                                                Real max_abs,
+                                                Real multiplier = Real(64)) noexcept;
+
+[[nodiscard]] Real dense_matrix_singular_value_tolerance(std::size_t rows,
+                                                         std::size_t cols,
+                                                         Real largest_singular_value,
+                                                         Real multiplier = Real(64)) noexcept;
+
+struct DensePseudoInverseResult {
+    std::vector<Real> inverse;  // row-major, cols x rows of the input (filled by rank_revealing_pseudo_inverse)
+    std::size_t rank{0};        // number of singular values strictly above `tolerance`
+    Real tolerance{0};          // singular-value cutoff used when counting `rank`
+    Real largest_singular_value{0};
+    Real smallest_retained_singular_value{0};  // last singular value kept; stays 0 when none is kept
+};
+
+struct DenseMatrixDiagnostics {
+    std::size_t rank{0};  // compared with the expected rank by validate_dense_inverse_diagnostics
+    Real tolerance{0};    // used as the singular-value floor by the SVD fallback of invert_dense_matrix_with_diagnostics
+    Real largest_singular_value{0};
+    Real smallest_retained_singular_value{0};
+    Real condition_estimate{std::numeric_limits<Real>::infinity()};  // non-finite values skip the condition check
+};
+
+struct DenseInverseResult {
+    std::vector<Real> inverse;  // n x n, row-major: inverse[row * n + col]
+    DenseMatrixDiagnostics diagnostics;
+    bool used_svd_fallback{false};  // set by invert_dense_matrix_with_diagnostics when it takes its SVD path
+};
+
+[[nodiscard]] Real dense_matrix_condition_fallback_threshold() noexcept;
+[[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
+
+struct DenseLUSolver {
+    std::size_t n{0};                 // presumably the matrix order; 0 means "empty" (see empty())
+    std::vector<Real> lu;             // presumably the packed LU factors — confirm in factor_dense_matrix
+    std::vector<std::size_t> pivots;  // presumably the pivoting row permutation — confirm in factor_dense_matrix
+    DenseMatrixDiagnostics diagnostics;
+    Real pivot_tolerance{0};
+    std::string label;                // owning string; presumably the label given to factor_dense_matrix — confirm
+
+    [[nodiscard]] bool empty() const noexcept { return n == 0; }  // true for a default-constructed solver
+
+    void solve_in_place(std::span<Real> rhs) const;
+    void solve_in_place(std::span<Real> rhs, std::size_t rhs_count) const;
+    [[nodiscard]] std::vector<Real> solve(std::span<const Real> rhs) const;
+};
+
+// Inverses and pseudo-inverses keep the same row-major convention for their
+// returned dimensions.
+[[nodiscard]] DenseMatrixDiagnostics dense_matrix_diagnostics(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label = "dense matrix");
+
+[[nodiscard]] DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
+                                                std::size_t n,
+                                                std::string_view label = "dense matrix");
+
+[[nodiscard]] std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
+                                                    std::size_t n,
+                                                    std::string_view label = "dense matrix");
+
+[[nodiscard]] DenseInverseResult invert_dense_matrix_with_diagnostics(
+    std::vector<Real> matrix,
+    std::size_t n,
+    std::string_view label = "dense matrix");
+
+void validate_dense_inverse_diagnostics(
+    const DenseInverseResult& result,
+    std::size_t expected_rank,
+    std::string_view label = "dense matrix",
+    Real max_condition = dense_matrix_condition_error_threshold());
+
+[[nodiscard]] std::size_t dense_matrix_rank(std::vector<Real> matrix,
+                                            std::size_t rows,
+                                            std::size_t cols);
+
+[[nodiscard]] DensePseudoInverseResult rank_revealing_pseudo_inverse(
+    std::span<const Real> matrix,
+    std::size_t rows,
+    std::size_t cols,
+    std::string_view label = "dense matrix");
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_DENSELINEARALGEBRA_H
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
new file mode 100644
index 000000000..8bf83ec0b
--- /dev/null
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -0,0 +1,78 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
+#define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
+
+#include "Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+constexpr std::size_t dense_transform_blocked_min_rows() noexcept { return std::size_t{32}; }  // blocked path needs >= 32 rows
+constexpr std::size_t dense_transform_blocked_min_rhs() noexcept { return std::size_t{4}; }    // ...and >= 4 right-hand sides
+
+inline void dense_transform_batched_row_major(
+    const Real* SVMP_RESTRICT matrix,
+    std::size_t rows,
+    std::size_t cols,
+    const Real* SVMP_RESTRICT input,
+    std::size_t input_row_stride,
+    Real* SVMP_RESTRICT output,
+    std::size_t output_row_stride,
+    std::size_t rhs_count) {
+    // output(r, j) = sum_c matrix(r, c) * input(c, j); nothing is written when any extent is zero.
+    if (rows == 0u || cols == 0u || rhs_count == 0u) {
+        return;
+    }
+    const bool use_blocked = rows >= dense_transform_blocked_min_rows() &&
+                             rhs_count >= dense_transform_blocked_min_rhs();
+    if (!use_blocked) {
+        // Small problem: one scalar dot product per output entry.
+        for (std::size_t r = 0; r < rows; ++r) {
+            const Real* coeffs = matrix + r * cols;
+            Real* out = output + r * output_row_stride;
+            for (std::size_t j = 0; j < rhs_count; ++j) {
+                Real dot = Real(0);
+                for (std::size_t c = 0; c < cols; ++c) {
+                    dot += coeffs[c] * input[c * input_row_stride + j];
+                }
+                out[j] = dot;
+            }
+        }
+        return;
+    }
+    // Blocked path: accumulate up to kRhsBlock right-hand sides per matrix row at once.
+    constexpr std::size_t kRhsBlock = 32u;
+    for (std::size_t r = 0; r < rows; ++r) {
+        const Real* coeffs = matrix + r * cols;
+        Real* out = output + r * output_row_stride;
+        for (std::size_t first = 0; first < rhs_count; first += kRhsBlock) {
+            const std::size_t width = std::min(kRhsBlock, rhs_count - first);
+            std::array<Real, kRhsBlock> partial{};
+            for (std::size_t c = 0; c < cols; ++c) {
+                const Real weight = coeffs[c];
+                const Real* in = input + c * input_row_stride + first;
+                for (std::size_t j = 0; j < width; ++j) {
+                    partial[j] += weight * in[j];
+                }
+            }
+            std::copy_n(partial.begin(), width, out + first);
+        }
+    }
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
diff --git a/Code/Source/solver/FE/Math/ExpressionOps.h b/Code/Source/solver/FE/Math/ExpressionOps.h
new file mode 100644
index 000000000..96cea1037
--- /dev/null
+++ b/Code/Source/solver/FE/Math/ExpressionOps.h
@@ -0,0 +1,99 @@
+#ifndef SVMP_FE_MATH_EXPRESSION_OPS_H
+#define SVMP_FE_MATH_EXPRESSION_OPS_H
+
+/**
+ * @file ExpressionOps.h
+ * @brief Common expression template operators for vector and matrix expressions
+ *
+ * This header provides shared operator functors used by both VectorExpr.h and
+ * MatrixExpr.h to avoid code duplication and namespace conflicts. All operators
+ * are defined in the detail::ops namespace for internal use by expression templates.
+ */
+
+#include <cmath>
+
+namespace svmp {
+namespace FE {
+namespace math {
+namespace detail {
+namespace ops {
+
+/**
+ * @brief Functor returning the sum of its two operands
+ */
+struct Add {
+    template<typename Lhs, typename Rhs>
+    constexpr auto operator()(const Lhs& lhs, const Rhs& rhs) const {
+        return lhs + rhs;
+    }
+};
+
+/**
+ * @brief Functor returning the left operand minus the right operand
+ */
+struct Sub {
+    template<typename Lhs, typename Rhs>
+    constexpr auto operator()(const Lhs& lhs, const Rhs& rhs) const {
+        return lhs - rhs;
+    }
+};
+
+/**
+ * @brief Functor returning the product of its two operands
+ */
+struct Mul {
+    template<typename Lhs, typename Rhs>
+    constexpr auto operator()(const Lhs& lhs, const Rhs& rhs) const {
+        return lhs * rhs;
+    }
+};
+
+/**
+ * @brief Functor returning the left operand divided by the right operand
+ */
+struct Div {
+    template<typename Lhs, typename Rhs>
+    constexpr auto operator()(const Lhs& lhs, const Rhs& rhs) const {
+        return lhs / rhs;
+    }
+};
+
+/**
+ * @brief Functor applying unary minus to its operand
+ */
+struct Negate {
+    template<typename Operand>
+    constexpr auto operator()(const Operand& value) const {
+        return -value;
+    }
+};
+
+/**
+ * @brief Functor returning abs() of its operand (std::abs, or an overload found by ADL)
+ */
+struct Abs {
+    template<typename Operand>
+    constexpr auto operator()(const Operand& value) const {
+        using std::abs;
+        return abs(value);
+    }
+};
+
+/**
+ * @brief Functor returning sqrt() of its operand (std::sqrt, or an overload found by ADL)
+ */
+struct Sqrt {
+    template<typename Operand>
+    constexpr auto operator()(const Operand& value) const {
+        using std::sqrt;
+        return sqrt(value);
+    }
+};
+
+} // namespace ops
+} // namespace detail
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_EXPRESSION_OPS_H
diff --git a/Code/Source/solver/FE/Math/IntegerMath.h b/Code/Source/solver/FE/Math/IntegerMath.h
new file mode 100644
index 000000000..52a50117f
--- /dev/null
+++ b/Code/Source/solver/FE/Math/IntegerMath.h
@@ -0,0 +1,98 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_MATH_INTEGERMATH_H
+#define SVMP_FE_MATH_INTEGERMATH_H
+
+#include "Types.h"
+
+#include <cstddef>
+#include <limits>
+#include <numeric>
+#include <stdexcept>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+[[nodiscard]] constexpr Real pow_int_nonnegative(Real base, int exponent) noexcept {
+    // Binary (square-and-multiply) exponentiation; exponents <= 0 yield Real(1).
+    Real accumulated = Real(1);
+    Real square = base;
+    for (int remaining = exponent; remaining > 0;) {
+        if ((remaining % 2) == 1) {
+            accumulated *= square;
+        }
+        remaining /= 2;
+        if (remaining != 0) {  // skip the final squaring, whose result would be unused
+            square *= square;
+        }
+    }
+    return accumulated;
+}
+
+[[nodiscard]] constexpr Real pow_int(Real base, int exponent) noexcept {
+    if (exponent >= 0) return pow_int_nonnegative(base, exponent);
+    // Negative exponent: reciprocal of the positive power. -INT_MIN overflows (UB), so it is split as base^INT_MAX * base.
+    if (exponent > std::numeric_limits<int>::min()) return Real(1) / pow_int_nonnegative(base, -exponent);
+    return Real(1) / (pow_int_nonnegative(base, std::numeric_limits<int>::max()) * base);
+}
+
+[[nodiscard]] constexpr std::size_t binomial_size(int n, int k) {
+    // Exact C(n, k) in std::size_t; arguments outside 0 <= k <= n give 0.
+    const bool in_range = n >= 0 && k >= 0 && k <= n;
+    if (!in_range) {
+        return 0u;
+    }
+    const int terms = (k > n - k) ? n - k : k;  // symmetry: C(n, k) == C(n, n - k)
+    constexpr std::size_t limit = std::numeric_limits<std::size_t>::max();
+    std::size_t value = 1u;
+    for (int step = 1; step <= terms; ++step) {
+        // Multiply by (n - terms + step) / step, cancelling common factors
+        // before the multiplication so the running value stays an integer.
+        auto top = static_cast<std::size_t>(n - (terms - step));
+        auto bottom = static_cast<std::size_t>(step);
+        const auto shared = std::gcd(top, bottom);
+        top /= shared;
+        bottom /= shared;
+
+        const auto carried = std::gcd(value, bottom);
+        value /= carried;
+        bottom /= carried;
+        if (bottom != 1u) {
+            throw std::overflow_error(
+                "binomial_size: failed to reduce exact binomial factor");
+        }
+        if (top != 0u && value > limit / top) {
+            throw std::overflow_error("binomial_size: result does not fit in size_t");
+        }
+        value *= top;
+    }
+    return value;
+}
+
+[[nodiscard]] constexpr Real binomial_real(int n, int k) noexcept {
+    // Floating-point C(n, k); returns 0 when k lies outside [0, n].
+    if (!(k >= 0 && k <= n)) {
+        return Real(0);
+    }
+    const int terms = (k > n - k) ? n - k : k;  // use the shorter of the two equivalent products
+    Real value = Real(1);
+    for (int step = 1; step <= terms; ++step) {
+        const Real top = static_cast<Real>(n - (terms - step));
+        const Real bottom = static_cast<Real>(step);
+        value *= top;
+        value /= bottom;
+    }
+    return value;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_INTEGERMATH_H
diff --git a/Code/Source/solver/FE/Math/MathConstants.h b/Code/Source/solver/FE/Math/MathConstants.h
new file mode 100644
index 000000000..145520ab2
--- /dev/null
+++ b/Code/Source/solver/FE/Math/MathConstants.h
@@ -0,0 +1,388 @@
+#ifndef SVMP_FE_MATH_CONSTANTS_H
+#define SVMP_FE_MATH_CONSTANTS_H
+
+/**
+ * @file MathConstants.h
+ * @brief Mathematical constants and numerical tolerances for FE computations
+ *
+ * This header provides mathematical constants (π, e, √2, etc.) and numerical
+ * tolerances used throughout the FE library. All constants are templated
+ * to support different precision types.
+ */
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+#include <algorithm>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Mathematical constants templated by type
+ * @tparam T The numeric type (float, double, long double)
+ */
+template<typename T>
+struct Constants {
+    static_assert(std::is_floating_point_v<T>,
+                  "Constants only defined for floating-point types");
+
+    // Circle constants: pi, its multiples/fractions and reciprocals (long double literals converted to T)
+    static constexpr T pi           = T(3.14159265358979323846264338327950288419716939937510L);
+    static constexpr T two_pi       = T(6.28318530717958647692528676655900576839433879875021L);
+    static constexpr T half_pi      = T(1.57079632679489661923132169163975144209858469968755L);
+    static constexpr T quarter_pi   = T(0.78539816339744830961566084581987572104929234984378L);
+    static constexpr T inv_pi       = T(0.31830988618379067153776752674502872406891929148091L);
+    static constexpr T inv_two_pi   = T(0.15915494309189533576888376337251436203445964574046L);
+    // Euler's number and logarithm conversion factors
+    static constexpr T e            = T(2.71828182845904523536028747135266249775724709369995L);
+    static constexpr T log2e        = T(1.44269504088896340735992468100189213742664595415299L);
+    static constexpr T log10e       = T(0.43429448190325182765112891891660508229439700580367L);
+    static constexpr T ln2          = T(0.69314718055994530941723212145817656807550013436026L);
+    static constexpr T ln10         = T(2.30258509299404568401799145468436420760110148862877L);
+    // Square roots of 2 and 3 and their reciprocals
+    static constexpr T sqrt2        = T(1.41421356237309504880168872420969807856967187537694L);
+    static constexpr T sqrt3        = T(1.73205080756887729352744634150587236694280525381038L);
+    static constexpr T inv_sqrt2    = T(0.70710678118654752440084436210484903928483593768847L);
+    static constexpr T inv_sqrt3    = T(0.57735026918962576450914878050195745564760175127013L);
+
+    // Golden ratio
+    static constexpr T phi          = T(1.61803398874989484820458683436563811772030917980576L);
+
+    // Degree/radian conversion factors, derived from pi above
+    static constexpr T deg_to_rad   = pi / T(180);
+    static constexpr T rad_to_deg   = T(180) / pi;
+};
+
+/**
+ * @brief Numerical tolerances and machine epsilon
+ * @tparam T The numeric type
+ */
+template<typename T>
+struct Tolerances {
+    static_assert(std::is_floating_point_v<T>,
+                  "Tolerances only defined for floating-point types");
+
+    // Machine epsilon
+    static constexpr T epsilon      = std::numeric_limits<T>::epsilon();
+
+    // Default tolerance (1000 * machine epsilon)
+    static constexpr T tolerance    = T(1000) * epsilon;
+
+    // Strict tolerance (10 * machine epsilon)
+    static constexpr T strict       = T(10) * epsilon;
+
+    // Loose tolerance (10000 * machine epsilon)
+    static constexpr T loose        = T(10000) * epsilon;
+
+    // Square root of epsilon (useful for finite differences); const rather than constexpr
+    static inline const T sqrt_epsilon = std::sqrt(epsilon);
+
+    // Cube root of epsilon (useful for numerical derivatives); const rather than constexpr
+    static inline const T cbrt_epsilon = std::cbrt(epsilon);
+
+    // Smallest positive normalized value
+    static constexpr T min_positive = std::numeric_limits<T>::min();
+
+    // Largest representable value
+    static constexpr T max_value    = std::numeric_limits<T>::max();
+
+    // Infinity
+    static constexpr T infinity     = std::numeric_limits<T>::infinity();
+
+    // Not-a-Number (quiet NaN)
+    static constexpr T nan          = std::numeric_limits<T>::quiet_NaN();
+};
+
+/**
+ * @brief Variable-template shorthands for Constants<T> members, e.g. pi<double>
+ */
+template<typename T> inline constexpr T pi           = Constants<T>::pi;
+template<typename T> inline constexpr T two_pi       = Constants<T>::two_pi;
+template<typename T> inline constexpr T half_pi      = Constants<T>::half_pi;
+template<typename T> inline constexpr T quarter_pi   = Constants<T>::quarter_pi;
+template<typename T> inline constexpr T inv_pi       = Constants<T>::inv_pi;
+template<typename T> inline constexpr T inv_two_pi   = Constants<T>::inv_two_pi;
+
+template<typename T> inline constexpr T e            = Constants<T>::e;
+template<typename T> inline constexpr T log2e        = Constants<T>::log2e;
+template<typename T> inline constexpr T log10e       = Constants<T>::log10e;
+template<typename T> inline constexpr T ln2          = Constants<T>::ln2;
+template<typename T> inline constexpr T ln10         = Constants<T>::ln10;
+
+template<typename T> inline constexpr T sqrt2        = Constants<T>::sqrt2;
+template<typename T> inline constexpr T sqrt3        = Constants<T>::sqrt3;
+template<typename T> inline constexpr T inv_sqrt2    = Constants<T>::inv_sqrt2;
+template<typename T> inline constexpr T inv_sqrt3    = Constants<T>::inv_sqrt3;
+
+template<typename T> inline constexpr T phi          = Constants<T>::phi;
+
+template<typename T> inline constexpr T deg_to_rad   = Constants<T>::deg_to_rad;
+template<typename T> inline constexpr T rad_to_deg   = Constants<T>::rad_to_deg;
+
+template<typename T> inline constexpr T epsilon      = Tolerances<T>::epsilon;
+template<typename T> inline constexpr T tolerance    = Tolerances<T>::tolerance;
+template<typename T> inline constexpr T strict_tol   = Tolerances<T>::strict;
+template<typename T> inline constexpr T loose_tol    = Tolerances<T>::loose;
+template<typename T> inline const T& sqrt_epsilon = Tolerances<T>::sqrt_epsilon;  // reference, not a copy: the source is not constexpr and
+template<typename T> inline const T& cbrt_epsilon = Tolerances<T>::cbrt_epsilon;  // templated statics have no guaranteed initialization order
+template<typename T> inline constexpr T min_positive = Tolerances<T>::min_positive;
+template<typename T> inline constexpr T max_value    = Tolerances<T>::max_value;
+template<typename T> inline constexpr T infinity     = Tolerances<T>::infinity;
+
+/**
+ * @brief Comparison functions with tolerance
+ */
+
+/**
+ * @brief Compare two values with a tolerance scaled by their magnitude (never below 1)
+ * @param a First value
+ * @param b Second value
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if |a - b| <= tol * max(|a|, |b|, 1)
+ */
+template<typename T>
+inline constexpr bool approx_equal(T a, T b, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>,
+                  "approx_equal only defined for floating-point types");
+    const T difference = std::abs(a - b);
+    return difference <= tol * std::max({std::abs(a), std::abs(b), T(1)});
+}
+
+/**
+ * @brief Test whether a value lies within an absolute tolerance of zero
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if |a| <= tol
+ */
+template<typename T>
+inline constexpr bool approx_zero(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>, "approx_zero only defined for floating-point types");
+    const T magnitude = std::abs(a);
+    return magnitude <= tol;
+}
+
+/**
+ * @brief Test whether a value exceeds the tolerance, i.e. is positive beyond round-off
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if a > tol
+ */
+template<typename T>
+inline constexpr bool is_positive(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>, "is_positive only defined for floating-point types");
+    const bool above_tolerance = tol < a;
+    return above_tolerance;
+}
+
+/**
+ * @brief Test whether a value lies below the negated tolerance
+ * @param a Value to check
+ * @param tol Tolerance (default: 1000 * epsilon)
+ * @return true if a < -tol
+ */
+template<typename T>
+inline constexpr bool is_negative(T a, T tol = tolerance<T>) {
+    static_assert(std::is_floating_point_v<T>, "is_negative only defined for floating-point types");
+    const T lower_bound = -tol;
+    return a < lower_bound;
+}
+
+/**
+ * @brief Test that a value is neither NaN nor an infinity
+ * @param a Value to check
+ * @return true if value is finite
+ */
+template<typename T>
+inline constexpr bool is_finite(T a) {
+    static_assert(std::is_floating_point_v<T>, "is_finite only defined for floating-point types");
+    const bool nan_or_infinite = std::isnan(a) || std::isinf(a);
+    return !nan_or_infinite;
+}
+
+/**
+ * @brief Convert an angle from degrees to radians
+ * @param degrees Angle in degrees
+ * @return Angle in radians
+ */
+template<typename T>
+inline constexpr T to_radians(T degrees) {
+    static_assert(std::is_floating_point_v<T>, "to_radians only defined for floating-point types");
+    constexpr T radians_per_degree = deg_to_rad<T>;
+    return degrees * radians_per_degree;
+}
+
+/**
+ * @brief Convert an angle from radians to degrees
+ * @param radians Angle in radians
+ * @return Angle in degrees
+ */
+template<typename T>
+inline constexpr T to_degrees(T radians) {
+    static_assert(std::is_floating_point_v<T>, "to_degrees only defined for floating-point types");
+    constexpr T degrees_per_radian = rad_to_deg<T>;
+    return radians * degrees_per_radian;
+}
+
+// =============================================================================
+// Constants namespace for compatibility with test expectations
+// =============================================================================
+namespace constants {
+
+// Mathematical constants (double precision defaults)
+inline constexpr double PI         = Constants<double>::pi;
+inline constexpr double PI_2       = Constants<double>::half_pi;
+inline constexpr double PI_4       = Constants<double>::quarter_pi;
+inline constexpr double TWO_PI     = Constants<double>::two_pi;
+inline constexpr double INV_PI     = Constants<double>::inv_pi;
+
+inline constexpr double E          = Constants<double>::e;
+inline constexpr double LN_2       = Constants<double>::ln2;
+inline constexpr double LN_10      = Constants<double>::ln10;
+inline constexpr double LOG10_E    = Constants<double>::log10e;
+inline constexpr double LOG2_E     = Constants<double>::log2e;
+
+inline constexpr double SQRT_2     = Constants<double>::sqrt2;
+inline constexpr double SQRT_3     = Constants<double>::sqrt3;
+inline constexpr double SQRT_5     = 2.2360679774997896964091736687312L;
+inline constexpr double INV_SQRT_2  = Constants<double>::inv_sqrt2;
+inline constexpr double INV_SQRT_3  = Constants<double>::inv_sqrt3;
+
+inline constexpr double PHI        = Constants<double>::phi;
+
+// Angle conversion functions
+template<typename T>
+inline constexpr T deg_to_rad(T degrees) {
+    return degrees * Constants<T>::deg_to_rad;
+}
+
+template<typename T>
+inline constexpr T rad_to_deg(T radians) {
+    return radians * Constants<T>::rad_to_deg;
+}
+
+// Templated tolerances
+template<typename T>
+inline constexpr T tolerance() {
+    return Tolerances<T>::tolerance;
+}
+
+template<typename T>
+inline constexpr T machine_epsilon() {
+    return Tolerances<T>::epsilon;
+}
+
+// Additional constants and utility functions for tests
+inline constexpr double DEFAULT_TOLERANCE = Tolerances<double>::tolerance;
+inline constexpr double DEFAULT_REL_TOLERANCE = 1e-12;
+inline constexpr double GEOMETRY_TOLERANCE = 1e-10;
+inline constexpr double SOLVER_TOLERANCE = Tolerances<double>::strict;
+inline constexpr double EPSILON = Tolerances<double>::epsilon;
+inline constexpr double INF_VALUE = Tolerances<double>::infinity;  // Renamed from INFINITY
+inline constexpr double NOT_A_NUMBER = Tolerances<double>::nan;  // Renamed from NAN
+inline constexpr double MAX_DOUBLE = Tolerances<double>::max_value;
+inline constexpr double MIN_DOUBLE = Tolerances<double>::min_positive;
+inline constexpr double LOWEST_DOUBLE = -Tolerances<double>::max_value;
+
+// Physical constants
+inline constexpr double SPEED_OF_LIGHT = 299792458.0;         // m/s
+inline constexpr double GRAVITATIONAL_CONSTANT = 6.67430e-11;  // m³/(kg·s²)
+inline constexpr double PLANCK_CONSTANT = 6.62607015e-34;      // J·s
+inline constexpr double AVOGADRO_NUMBER = 6.02214076e23;       // mol⁻¹
+inline constexpr double BOLTZMANN_CONSTANT = 1.380649e-23;     // J/K
+inline constexpr double STANDARD_GRAVITY = 9.80665;            // m/s²
+
+// Float and long double versions
+inline constexpr float PI_F = static_cast<float>(PI);
+inline constexpr float E_F = static_cast<float>(E);
+inline constexpr float SQRT_2_F = static_cast<float>(SQRT_2);
+inline constexpr float EPSILON_F = Tolerances<float>::epsilon;
+// Long double values come from Constants<long double>; widening the rounded double constants would lose precision.
+inline constexpr long double PI_L = Constants<long double>::pi;
+inline constexpr long double E_L = Constants<long double>::e;
+inline constexpr long double SQRT_2_L = Constants<long double>::sqrt2;
+inline constexpr long double EPSILON_L = Tolerances<long double>::epsilon;
+
+// Additional mathematical constants
+inline constexpr double SQRT_PI = 1.7724538509055160272981674833411L;
+
+// Utility functions
+template<typename T>
+inline constexpr int sign(T value) {
+    return (T(0) < value) - (value < T(0));
+}
+// NOTE(review): the default tolerances below are double values converted to T — confirm this is intended for float/integral T.
+template<typename T>
+inline constexpr bool is_zero(T value, T tol = DEFAULT_TOLERANCE) {
+    return std::abs(value) <= tol;
+}
+
+template<typename T>
+inline bool near(T a, T b, T tol = DEFAULT_TOLERANCE) {
+    return std::abs(a - b) <= tol;
+}
+
+template<typename T>
+inline bool near_relative(T a, T b, T rel_tol = DEFAULT_REL_TOLERANCE) {
+    T scale = std::max(std::abs(a), std::abs(b));
+    return std::abs(a - b) <= rel_tol * scale;
+}
+
+template<typename T>
+inline constexpr T clamp(T value, T min_val, T max_val) {
+    return value < min_val ? min_val : (value > max_val ? max_val : value);
+}
+
+template<typename T>
+inline constexpr T lerp(T a, T b, T t) {
+    return a + t * (b - a);
+}
+
+template<typename T>
+inline T safe_divide(T numerator, T denominator, T default_val = T(0)) {
+    return is_zero(denominator) ? default_val : numerator / denominator;
+}
+
+template<typename T>
+inline bool isinf(T value) {
+    return std::isinf(value);
+}
+
+template<typename T>
+inline bool isnan(T value) {
+    return std::isnan(value);
+}
+
+} // namespace constants
+
+// Physical constants for FE analysis
+namespace physical_constants {
+
+// Material properties (SI units)
+inline constexpr double water_density = 1000.0;         // kg/m³
+inline constexpr double steel_density = 7850.0;         // kg/m³
+inline constexpr double aluminum_density = 2700.0;      // kg/m³
+
+inline constexpr double water_viscosity = 0.001;        // Pa·s at 20°C
+inline constexpr double air_viscosity = 1.81e-5;        // Pa·s at 20°C
+
+inline constexpr double steel_youngs_modulus = 200e9;   // Pa
+inline constexpr double aluminum_youngs_modulus = 70e9; // Pa
+
+inline constexpr double steel_poisson_ratio = 0.3;      // dimensionless
+inline constexpr double aluminum_poisson_ratio = 0.33;  // dimensionless
+
+// Physical constants (gravity, boltzmann and avogadro repeat values also defined in namespace constants)
+inline constexpr double gravity = 9.80665;              // m/s²; same value as constants::STANDARD_GRAVITY
+inline constexpr double gas_constant = 8.314462618;     // J/(mol·K)
+inline constexpr double boltzmann = 1.380649e-23;       // J/K; same value as constants::BOLTZMANN_CONSTANT
+inline constexpr double avogadro = 6.02214076e23;       // mol⁻¹; same value as constants::AVOGADRO_NUMBER
+
+} // namespace physical_constants
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_CONSTANTS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
new file mode 100644
index 000000000..0b80091f9
--- /dev/null
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -0,0 +1,1487 @@
+#ifndef SVMP_FE_MATH_MATRIX_H
+#define SVMP_FE_MATH_MATRIX_H
+
+/**
+ * @file Matrix.h
+ * @brief Fixed-size matrices with expression templates and specializations for FE computations
+ *
+ * This header provides optimized fixed-size matrix operations for element-level
+ * computations. Includes specialized analytical formulas for 2x2 and 3x3 matrices
+ * (determinant, inverse using Cramer's rule) and Gauss elimination for larger matrices.
+ * All operations use expression templates to eliminate temporaries.
+ */
+
+#include "MatrixExpr.h"
+#include "Vector.h"
+#include "MathConstants.h"
+#include "../Common/Alignment.h"
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <type_traits>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Fixed-size matrix for element-level computations
+ * @tparam T Scalar type (float, double)
+ * @tparam M Number of rows
+ * @tparam N Number of columns
+ *
+ * Storage is row-major for cache efficiency. Memory is aligned for SIMD operations.
+ * Specializations exist for 2x2, 3x3, 4x4 matrices with analytical algorithms.
+ */
+template<typename T, std::size_t M, std::size_t N>
+class Matrix : public MatrixExpr<Matrix<T, M, N>> {
+    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
+    static_assert(M > 0 && N > 0, "Matrix dimensions must be positive");
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[M * N];  // Row-major, SIMD-friendly storage
+
+    // Helper to compute linear index from (i,j)
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * N + j;
+    }
+
+public:
+    // Type definitions
+    using value_type = T;
+    using size_type = std::size_t;
+    using reference = T&;
+    using const_reference = const T&;
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    /**
+     * @brief Default constructor - zero initializes all elements
+     */
+    constexpr Matrix() : data_{} {}
+
+    /**
+     * @brief Fill constructor - initializes all elements with same value
+     * @param value Value to fill matrix with
+     */
+    constexpr explicit Matrix(T value) : data_{} {  // data_{} keeps this a valid C++17 constexpr ctor
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Initializer list constructor for row-wise initialization
+     * @param init Nested initializer lists {{row0}, {row1}, ...}
+     */
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= M) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= N) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    /**
+     * @brief Constructor from expression template
+     * @tparam Expr Expression type
+     * @param expr Matrix expression to evaluate
+     */
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    /**
+     * @brief Copy constructor
+     */
+    constexpr Matrix(const Matrix&) = default;
+
+    /**
+     * @brief Move constructor
+     */
+    constexpr Matrix(Matrix&&) noexcept = default;
+
+    /**
+     * @brief Copy assignment
+     */
+    Matrix& operator=(const Matrix&) = default;
+
+    /**
+     * @brief Move assignment
+     */
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    /**
+     * @brief Assignment from expression template (entries are written in place as evaluated)
+     * @tparam Expr Expression type
+     * @param expr Matrix expression to evaluate (NOTE(review): aliasing with *this is not guarded)
+     * @return Reference to this
+     */
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Get number of rows (compile-time constant)
+     * @return Number of rows
+     */
+    static constexpr size_type rows() { return M; }
+
+    /**
+     * @brief Get number of columns (compile-time constant)
+     * @return Number of columns
+     */
+    static constexpr size_type cols() { return N; }
+
+    /**
+     * @brief Get total number of elements
+     * @return M * N
+     */
+    static constexpr size_type size() { return M * N; }
+
+    /**
+     * @brief Element access (no bounds checking)
+     * @param i Row index
+     * @param j Column index
+     * @return Reference to element
+     */
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+
+    /**
+     * @brief Element access (no bounds checking) - const version
+     * @param i Row index
+     * @param j Column index
+     * @return Const reference to element
+     */
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    /**
+     * @brief Element access with bounds checking
+     * @param i Row index
+     * @param j Column index
+     * @return Reference to element
+     * @throws std::out_of_range if indices are out of bounds
+     */
+    T& at(size_type i, size_type j) {
+        if (i >= M || j >= N) {
+            throw std::out_of_range("Matrix::at: index out of range");
+        }
+        return (*this)(i, j);
+    }
+
+    /**
+     * @brief Element access with bounds checking - const version
+     * @param i Row index
+     * @param j Column index
+     * @return Const reference to element
+     * @throws std::out_of_range if indices are out of bounds
+     */
+    const T& at(size_type i, size_type j) const {
+        if (i >= M || j >= N) {
+            throw std::out_of_range("Matrix::at: index out of range");
+        }
+        return (*this)(i, j);
+    }
+
+    /**
+     * @brief Get row as vector
+     * @param i Row index
+     * @return Vector containing row elements
+     */
+    Vector<T, N> row(size_type i) const {
+        Vector<T, N> result;
+        for (size_type j = 0; j < N; ++j) {
+            result[j] = (*this)(i, j);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column as vector
+     * @param j Column index
+     * @return Vector containing column elements
+     */
+    Vector<T, M> column(size_type j) const {
+        Vector<T, M> result;
+        for (size_type i = 0; i < M; ++i) {
+            result[i] = (*this)(i, j);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column as vector (alias for column)
+     * @param j Column index
+     * @return Vector containing column elements
+     */
+    Vector<T, M> col(size_type j) const {
+        return column(j);
+    }
+
+    /**
+     * @brief Set row from vector
+     * @param i Row index
+     * @param v Vector of values
+     */
+    void set_row(size_type i, const Vector<T, N>& v) {
+        for (size_type j = 0; j < N; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    /**
+     * @brief Set column from vector
+     * @param j Column index
+     * @param v Vector of values
+     */
+    void set_column(size_type j, const Vector<T, M>& v) {
+        for (size_type i = 0; i < M; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    /**
+     * @brief Set column from vector (alias for set_column)
+     * @param j Column index
+     * @param v Vector of values
+     */
+    void set_col(size_type j, const Vector<T, M>& v) {
+        set_column(j, v);
+    }
+
+    /**
+     * @brief Get pointer to underlying data
+     * @return Pointer to first element
+     */
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    /**
+     * @brief Fill matrix with value
+     * @param value Value to fill with
+     */
+    void fill(T value) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Set all elements to zero
+     */
+    void set_zero() {
+        fill(T{0});
+    }
+
+    // Arithmetic operators
+
+    /**
+     * @brief In-place addition
+     * @param other Matrix to add
+     * @return Reference to this
+     */
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place subtraction
+     * @param other Matrix to subtract
+     * @return Reference to this
+     */
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar multiplication
+     * @param scalar Scalar to multiply by
+     * @return Reference to this
+     */
+    Matrix& operator*=(T scalar) {
+        for (size_type i = 0; i < M * N; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    /** @brief In-place scalar division (integral T divides element-wise to avoid truncation)
+     * @param scalar Scalar to divide by
+     * @return Reference to this
+     */
+    Matrix& operator/=(T scalar) {
+        if constexpr (std::is_floating_point_v<T>) return (*this) *= (T(1) / scalar);
+        for (size_type i = 0; i < M * N; ++i) data_[i] /= scalar;
+        return *this;
+    }
+
+    // Matrix operations
+
+    /**
+     * @brief Compute transpose
+     * @return Transposed matrix
+     */
+    Matrix<T, N, M> transpose() const {
+        Matrix<T, N, M> result;
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                result(j, i) = (*this)(i, j);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute trace (sum of diagonal elements)
+     * @return Trace (only valid for square matrices)
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, T> trace() const {
+        T result = T(0);
+        for (size_type i = 0; i < M; ++i) {
+            result += (*this)(i, i);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute Frobenius norm squared
+     * @return Sum of squares of all elements
+     */
+    T frobenius_norm_squared() const {
+        T result = T(0);
+        for (size_type i = 0; i < M * N; ++i) {
+            result += data_[i] * data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute Frobenius norm
+     * @return Square root of sum of squares
+     */
+    T frobenius_norm() const {
+        using std::sqrt;
+        return sqrt(frobenius_norm_squared());
+    }
+
+    /**
+     * @brief Compute infinity norm (maximum absolute row sum)
+     * @return Infinity norm
+     */
+    T infinity_norm() const {
+        T max_row_sum = T(0);
+        for (size_type i = 0; i < M; ++i) {
+            T row_sum = T(0);
+            for (size_type j = 0; j < N; ++j) {
+                using std::abs;
+                row_sum += abs((*this)(i, j));
+            }
+            max_row_sum = std::max(max_row_sum, row_sum);
+        }
+        return max_row_sum;
+    }
+
+    /**
+     * @brief Compute one norm (maximum absolute column sum)
+     * @return One norm
+     */
+    T one_norm() const {
+        T max_col_sum = T(0);
+        for (size_type j = 0; j < N; ++j) {
+            T col_sum = T(0);
+            for (size_type i = 0; i < M; ++i) {
+                using std::abs;
+                col_sum += abs((*this)(i, j));
+            }
+            max_col_sum = std::max(max_col_sum, col_sum);
+        }
+        return max_col_sum;
+    }
+
+    /**
+     * @brief Get minimum element
+     * @return Minimum value
+     */
+    T min() const {
+        return *std::min_element(data_, data_ + M * N);
+    }
+
+    /**
+     * @brief Get maximum element
+     * @return Maximum value
+     */
+    T max() const {
+        return *std::max_element(data_, data_ + M * N);
+    }
+
+    /**
+     * @brief Get sum of all elements
+     * @return Sum of elements
+     */
+    T sum() const {
+        T result = T(0);
+        for (size_type i = 0; i < M * N; ++i) {
+            result += data_[i];
+        }
+        return result;
+    }
+
+    // Static factory functions
+
+    /**
+     * @brief Create zero matrix
+     * @return Matrix with all elements zero
+     */
+    static constexpr Matrix zeros() {
+        return Matrix();
+    }
+
+    /**
+     * @brief Create matrix with all elements one
+     * @return Matrix with all elements one
+     */
+    static constexpr Matrix ones() {
+        return Matrix(T(1));
+    }
+
+    /**
+     * @brief Create identity matrix (only for square matrices)
+     * @return Identity matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    static std::enable_if_t<M2 == N2, Matrix> identity() {
+        Matrix result;
+        for (size_type i = 0; i < M; ++i) {
+            result(i, i) = T(1);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Create diagonal matrix from vector (only for square matrices)
+     * @param diag Vector of diagonal elements
+     * @return Diagonal matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    static std::enable_if_t<M2 == N2, Matrix> diagonal(const Vector<T, M>& diag) {
+        Matrix result;
+        for (size_type i = 0; i < M; ++i) {
+            result(i, i) = diag[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Create zero matrix (alias for zeros)
+     * @return Zero matrix
+     */
+    static constexpr Matrix zero() {
+        return zeros();
+    }
+
+    // Property checking methods
+
+    /**
+     * @brief Check if matrix is symmetric (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if symmetric
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_symmetric(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = i + 1; j < N; ++j) {
+                using std::abs;
+                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Check if matrix is skew-symmetric (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if skew-symmetric
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_skew_symmetric(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            // Diagonal must be zero
+            using std::abs;
+            if (abs((*this)(i, i)) > tol) {
+                return false;
+            }
+            for (size_type j = i + 1; j < N; ++j) {
+                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Check if matrix is diagonal (only for square matrices)
+     * @param tol Tolerance for comparison
+     * @return true if diagonal
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2, bool> is_diagonal(T tol = tolerance<T>) const {
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < N; ++j) {
+                if (i != j) {
+                    using std::abs;
+                    if (abs((*this)(i, j)) > tol) {
+                        return false;
+                    }
+                }
+            }
+        }
+        return true;
+    }
+
+    // Determinant (general template; 2x2 and 3x3 use the class specializations)
+    /**
+     * @brief Compute determinant (only for square matrices)
+     * @return Determinant value
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, T> determinant() const {
+        // Every square size except 2x2/3x3 (1x1, 4x4 and larger): pivoted elimination
+        return determinant_lu();
+    }
+
+    // Inverse (general template; 2x2 and 3x3 use the class specializations)
+    /**
+     * @brief Compute matrix inverse (only for square matrices)
+     * @return Inverse matrix
+     */
+    template<std::size_t M2 = M, std::size_t N2 = N>
+    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, Matrix> inverse() const {
+        // Every square size except 2x2/3x3 (1x1, 4x4 and larger): Gauss-Jordan elimination
+        return inverse_gauss_jordan();
+    }
+
+private:
+    // Determinant by elimination with partial pivoting (product of pivots, sign per row swap)
+    T determinant_lu() const {
+        Matrix<T, M, M> lu = *this;
+        T det = T(1);
+
+        for (size_type k = 0; k < M - 1; ++k) {
+            // Find pivot
+            size_type pivot = k;
+            T max_val = std::abs(lu(k, k));
+            for (size_type i = k + 1; i < M; ++i) {
+                T val = std::abs(lu(i, k));
+                if (val > max_val) {
+                    max_val = val;
+                    pivot = i;
+                }
+            }
+
+            // Swap rows if needed
+            if (pivot != k) {
+                for (size_type j = 0; j < M; ++j) {
+                    std::swap(lu(k, j), lu(pivot, j));
+                }
+                det = -det;  // Row swap changes sign
+            }
+
+            // Check for singularity
+            if (approx_zero(lu(k, k))) {
+                return T(0);
+            }
+
+            // Eliminate column
+            for (size_type i = k + 1; i < M; ++i) {
+                T factor = lu(i, k) / lu(k, k);
+                for (size_type j = k + 1; j < M; ++j) {
+                    lu(i, j) -= factor * lu(k, j);
+                }
+            }
+
+            det *= lu(k, k);
+        }
+        det *= lu(M - 1, M - 1);
+
+        return det;
+    }
+
+    // Gauss-Jordan elimination for inverse; throws std::runtime_error on a near-zero pivot
+    Matrix inverse_gauss_jordan() const {
+        Matrix<T, M, M> aug;  // Augmented matrix [A | I]
+        Matrix<T, M, M> result = Matrix::identity();
+
+        // Copy this matrix to augmented matrix
+        for (size_type i = 0; i < M; ++i) {
+            for (size_type j = 0; j < M; ++j) {
+                aug(i, j) = (*this)(i, j);
+            }
+        }
+
+        // Forward elimination with partial pivoting
+        for (size_type k = 0; k < M; ++k) {
+            // Find pivot
+            size_type pivot = k;
+            T max_val = std::abs(aug(k, k));
+            for (size_type i = k + 1; i < M; ++i) {
+                T val = std::abs(aug(i, k));
+                if (val > max_val) {
+                    max_val = val;
+                    pivot = i;
+                }
+            }
+
+            // Swap rows
+            if (pivot != k) {
+                for (size_type j = 0; j < M; ++j) {
+                    std::swap(aug(k, j), aug(pivot, j));
+                    std::swap(result(k, j), result(pivot, j));
+                }
+            }
+
+            // Check for singularity
+            if (approx_zero(aug(k, k))) {
+                throw std::runtime_error("Matrix is singular");
+            }
+
+            // Scale pivot row
+            T pivot_val = aug(k, k);
+            for (size_type j = 0; j < M; ++j) {
+                aug(k, j) /= pivot_val;
+                result(k, j) /= pivot_val;
+            }
+
+            // Eliminate column
+            for (size_type i = 0; i < M; ++i) {
+                if (i != k) {
+                    T factor = aug(i, k);
+                    for (size_type j = 0; j < M; ++j) {
+                        aug(i, j) -= factor * aug(k, j);
+                        result(i, j) -= factor * result(k, j);
+                    }
+                }
+            }
+        }
+
+        return result;
+    }
+
+    // Iterators
+public:
+    T* begin() { return data_; }
+    T* end() { return data_ + M * N; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + M * N; }
+    const T* cbegin() const { return data_; }
+    const T* cend() const { return data_ + M * N; }
+};
+
+template<typename T>  // 2x2 determinant in closed form: m00 * m11 - m01 * m10
+inline T determinant_2x2(const Matrix<T, 2, 2>& m) {
+    const auto anti_diagonal = m(0, 1) * m(1, 0);
+    return m(0, 0) * m(1, 1) - anti_diagonal;
+}
+
+// 2x2 inverse in closed form; throws std::runtime_error when approx_zero(det) reports singular
+template<typename T>
+inline Matrix<T, 2, 2> inverse_2x2(const Matrix<T, 2, 2>& m) {
+    const T det_value = determinant_2x2(m);
+    if (approx_zero(det_value)) {
+        throw std::runtime_error("Matrix is singular");
+    }
+
+    // Swap the diagonal entries, negate the off-diagonal ones, scale by the reciprocal
+    const T scale = T(1) / det_value;
+    Matrix<T, 2, 2> inverted{{ m(1, 1) * scale, -m(0, 1) * scale},
+                             {-m(1, 0) * scale,  m(0, 0) * scale}};
+    return inverted;
+}
+
+template<typename T>  // 3x3 determinant by cofactor expansion along the first row
+inline T determinant_3x3(const Matrix<T, 3, 3>& m) {
+    const auto minor0 = m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1);
+    const auto minor1 = m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0);
+    const auto minor2 = m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0);
+    return m(0, 0) * minor0 - m(0, 1) * minor1 + m(0, 2) * minor2;
+}
+
+// 3x3 inverse via the adjugate (transposed cofactor matrix) scaled by 1/det
+template<typename T>
+inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
+    const T det_value = determinant_3x3(m);
+    if (approx_zero(det_value)) {
+        throw std::runtime_error("Matrix is singular");
+    }
+
+    // Filled column by column; entry (r, c) is the signed minor with row c and column r removed
+    Matrix<T, 3, 3> adjugate;
+    adjugate(0, 0) =  (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1));
+    adjugate(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0));
+    adjugate(2, 0) =  (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
+
+    adjugate(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1));
+    adjugate(1, 1) =  (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0));
+    adjugate(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0));
+
+    adjugate(0, 2) =  (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1));
+    adjugate(1, 2) = -(m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0));
+    adjugate(2, 2) =  (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0));
+
+    // The product uses the matrix-times-scalar operator declared outside this function
+    T reciprocal = T(1) / det_value;
+    return adjugate * reciprocal;
+}
+
+// Partial specialization for 2x2 matrices (closed-form determinant and inverse)
+template<typename T>
+class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
+    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
+    static constexpr std::size_t M = 2;
+    static constexpr std::size_t N = 2;
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[4];  // Row-major: m00, m01, m10, m11
+
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * 2 + j;
+    }
+
+public:
+    // Same member aliases as the general template
+    using value_type = T;
+    using size_type = std::size_t;
+    using reference = T&;
+    using const_reference = const T&;
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    // Constructors and assignment mirror the general template
+    constexpr Matrix() : data_{} {}
+    constexpr explicit Matrix(T value) : data_{value, value, value, value} {}
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= 2) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= 2) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 2; ++i) {
+            for (size_type j = 0; j < 2; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    constexpr Matrix(const Matrix&) = default;
+    constexpr Matrix(Matrix&&) noexcept = default;
+    Matrix& operator=(const Matrix&) = default;
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 2; ++i) {
+            for (size_type j = 0; j < 2; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    static constexpr size_type rows() { return 2; }
+    static constexpr size_type cols() { return 2; }
+    static constexpr size_type size() { return 4; }
+
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    // Bounds-checked element access; throws std::out_of_range like the general template
+    T& at(size_type i, size_type j) {
+        if (i >= 2 || j >= 2) throw std::out_of_range("Matrix::at: index out of range");
+        return (*this)(i, j);
+    }
+    const T& at(size_type i, size_type j) const {
+        if (i >= 2 || j >= 2) throw std::out_of_range("Matrix::at: index out of range");
+        return (*this)(i, j);
+    }
+
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    void fill(T value) { for (T& entry : data_) entry = value; }
+    void set_zero() { fill(T{0}); }
+
+    void set_row(size_type i, const Vector<T, 2>& v) {
+        for (size_type j = 0; j < 2; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    void set_column(size_type j, const Vector<T, 2>& v) {
+        for (size_type i = 0; i < 2; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    void set_col(size_type j, const Vector<T, 2>& v) { set_column(j, v); }
+
+    Vector<T, 2> row(size_type i) const {
+        return Vector<T, 2>{(*this)(i, 0), (*this)(i, 1)};
+    }
+
+    Vector<T, 2> column(size_type j) const {
+        return Vector<T, 2>{(*this)(0, j), (*this)(1, j)};
+    }
+
+    Vector<T, 2> col(size_type j) const { return column(j); }
+
+    static constexpr Matrix zeros() { return Matrix(); }
+    static constexpr Matrix ones() { return Matrix(T(1)); }
+    static constexpr Matrix zero() { return zeros(); }
+
+    static Matrix identity() {
+        Matrix result;
+        result(0, 0) = T(1);
+        result(1, 1) = T(1);
+        return result;
+    }
+
+    static Matrix diagonal(const Vector<T, 2>& diag) {
+        Matrix result;
+        result(0, 0) = diag[0];
+        result(1, 1) = diag[1];
+        return result;
+    }
+
+    bool is_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        return abs((*this)(0, 1) - (*this)(1, 0)) <= tol;
+    }
+
+    bool is_skew_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        // Diagonal must be zero
+        if (abs((*this)(0, 0)) > tol || abs((*this)(1, 1)) > tol) {
+            return false;
+        }
+        // Off-diagonal must be opposite
+        return abs((*this)(0, 1) + (*this)(1, 0)) <= tol;
+    }
+
+    bool is_diagonal(T tol = tolerance<T>) const {
+        using std::abs;
+        return abs((*this)(0, 1)) <= tol && abs((*this)(1, 0)) <= tol;
+    }
+
+    T frobenius_norm_squared() const {
+        T sum_sq = T(0);
+        for (size_type i = 0; i < 4; ++i) {
+            sum_sq += data_[i] * data_[i];
+        }
+        return sum_sq;
+    }
+
+    T frobenius_norm() const {
+        using std::sqrt;
+        return sqrt(frobenius_norm_squared());
+    }
+
+    T infinity_norm() const {
+        using std::abs;
+        T row0 = abs((*this)(0, 0)) + abs((*this)(0, 1));
+        T row1 = abs((*this)(1, 0)) + abs((*this)(1, 1));
+        return std::max(row0, row1);
+    }
+
+    T one_norm() const {
+        using std::abs;
+        T col0 = abs((*this)(0, 0)) + abs((*this)(1, 0));
+        T col1 = abs((*this)(0, 1)) + abs((*this)(1, 1));
+        return std::max(col0, col1);
+    }
+
+    // Reductions over all four entries (same API as the general template)
+    T min() const { return *std::min_element(data_, data_ + 4); }
+    T max() const { return *std::max_element(data_, data_ + 4); }
+    T sum() const { return data_[0] + data_[1] + data_[2] + data_[3]; }
+
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    Matrix& operator*=(T scalar) {
+        for (size_type i = 0; i < 4; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    // Integral T divides element-wise: T(1) / scalar would truncate to zero for |scalar| > 1
+    Matrix& operator/=(T scalar) {
+        if constexpr (std::is_floating_point_v<T>) return (*this) *= (T(1) / scalar);
+        for (T& entry : data_) entry /= scalar;
+        return *this;
+    }
+
+    Matrix<T, 2, 2> transpose() const {
+        return Matrix<T, 2, 2>{
+            {(*this)(0, 0), (*this)(1, 0)},
+            {(*this)(0, 1), (*this)(1, 1)}
+        };
+    }
+
+    T trace() const { return (*this)(0, 0) + (*this)(1, 1); }
+
+    // Closed-form 2x2 determinant and inverse, delegated to the free helper templates
+    T determinant() const { return determinant_2x2(*this); }
+    Matrix inverse() const { return inverse_2x2(*this); }
+
+    T* begin() { return data_; }
+    T* end() { return data_ + 4; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + 4; }
+    const T* cbegin() const { return data_; }
+    const T* cend() const { return data_ + 4; }
+};
+
+// Template specialization for 3x3 Matrix
+template<typename T>
+class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
+    static constexpr std::size_t M = 3;
+    static constexpr std::size_t N = 3;
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[9];
+
+    static constexpr std::size_t index(std::size_t i, std::size_t j) {
+        return i * 3 + j;
+    }
+
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+
+    constexpr Matrix() : data_{} {}
+    constexpr explicit Matrix(T value) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] = value;
+        }
+    }
+    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
+        size_type row = 0;
+        for (auto row_init : init) {
+            if (row >= 3) break;
+            size_type col = 0;
+            for (auto val : row_init) {
+                if (col >= 3) break;
+                (*this)(row, col) = val;
+                ++col;
+            }
+            ++row;
+        }
+    }
+
+    template<typename Expr>
+    Matrix(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+    }
+
+    constexpr Matrix(const Matrix&) = default;
+    constexpr Matrix(Matrix&&) noexcept = default;
+    Matrix& operator=(const Matrix&) = default;
+    Matrix& operator=(Matrix&&) noexcept = default;
+
+    template<typename Expr>
+    Matrix& operator=(const MatrixExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < 3; ++i) {
+            for (size_type j = 0; j < 3; ++j) {
+                (*this)(i, j) = e(i, j);
+            }
+        }
+        return *this;
+    }
+
+    static constexpr size_type rows() { return 3; }
+    static constexpr size_type cols() { return 3; }
+    static constexpr size_type size() { return 9; }
+
+    constexpr T& operator()(size_type i, size_type j) {
+        return data_[index(i, j)];
+    }
+    constexpr const T& operator()(size_type i, size_type j) const {
+        return data_[index(i, j)];
+    }
+
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    void fill(T value) {
+        for (size_type i = 0; i < 9; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    void set_zero() { fill(T{0}); }
+
+    void set_row(size_type i, const Vector<T, 3>& v) {
+        for (size_type j = 0; j < 3; ++j) {
+            (*this)(i, j) = v[j];
+        }
+    }
+
+    void set_column(size_type j, const Vector<T, 3>& v) {
+        for (size_type i = 0; i < 3; ++i) {
+            (*this)(i, j) = v[i];
+        }
+    }
+
+    void set_col(size_type j, const Vector<T, 3>& v) {
+        set_column(j, v);
+    }
+
+    Vector<T, 3> col(size_type j) const {
+        return column(j);
+    }
+
+    static Matrix zero() {
+        return zeros();
+    }
+
+    static Matrix diagonal(const Vector<T, 3>& diag) {
+        Matrix result;
+        result(0, 0) = diag[0];
+        result(1, 1) = diag[1];
+        result(2, 2) = diag[2];
+        return result;
+    }
+
+    bool is_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        // Compare each entry above the diagonal with its mirror below it.
+        for (size_type r = 0; r < 3; ++r) {
+            for (size_type c = r + 1; c < 3; ++c) {
+                const auto gap = abs((*this)(r, c) - (*this)(c, r));
+                if (gap > tol) return false;
+            }
+        }
+        return true;
+    }
+
+    bool is_skew_symmetric(T tol = tolerance<T>) const {
+        using std::abs;
+        // Skew symmetry means A(i, j) == -A(j, i); taking j == i forces the
+        // diagonal to vanish. Both conditions are tested row by row.
+        for (size_type r = 0; r < 3; ++r) {
+            // Diagonal entry of this row must be zero within tol.
+            if (abs((*this)(r, r)) > tol) {
+                return false;
+            }
+            // Entries right of the diagonal must cancel their mirrors.
+            for (size_type c = r + 1; c < 3; ++c) {
+                if (abs((*this)(r, c) + (*this)(c, r)) > tol) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool is_diagonal(T tol = tolerance<T>) const {
+        using std::abs;
+        // Only off-diagonal entries are tested; each must lie within tol of zero.
+        for (size_type r = 0; r < 3; ++r) {
+            for (size_type c = 0; c < 3; ++c) {
+                if (r == c) continue;
+                if (abs((*this)(r, c)) > tol) return false;
+            }
+        }
+        return true;
+    }
+
+    T frobenius_norm() const {
+        using std::sqrt;
+        T sum_sq = T(0);  // running sum of squared entries, in storage order
+        for (const T* p = data_; p != data_ + 9; ++p) {
+            sum_sq += (*p) * (*p);
+        }
+        return sqrt(sum_sq);
+    }
+
+    T infinity_norm() const {
+        using std::abs;
+        T best = T(0);  // largest absolute row sum seen so far
+        for (size_type r = 0; r < 3; ++r) {
+            T acc = T(0);
+            for (size_type c = 0; c < 3; ++c) {
+                acc += abs((*this)(r, c));
+            }
+            if (best < acc) best = acc;  // same selection rule as std::max(best, acc)
+        }
+        return best;
+    }
+
+    T one_norm() const {
+        using std::abs;
+        T best = T(0);  // largest absolute column sum seen so far
+        for (size_type c = 0; c < 3; ++c) {
+            T acc = T(0);
+            for (size_type r = 0; r < 3; ++r) {
+                acc += abs((*this)(r, c));
+            }
+            if (best < acc) best = acc;  // same selection rule as std::max(best, acc)
+        }
+        return best;
+    }
+
+    Matrix& operator+=(const Matrix& other) {
+        for (size_type k = 0; k != 9; ++k) {
+            data_[k] += other.data_[k];  // entry-wise over the raw storage
+        }
+        return *this;
+    }
+
+    Matrix& operator-=(const Matrix& other) {
+        for (size_type k = 0; k != 9; ++k) {
+            data_[k] -= other.data_[k];  // entry-wise over the raw storage
+        }
+        return *this;
+    }
+
+    Matrix& operator*=(T scalar) {
+        for (T* entry = data_; entry != data_ + 9; ++entry) {
+            *entry *= scalar;  // scale every stored entry
+        }
+        return *this;
+    }
+
+    Matrix& operator/=(T scalar) {
+        for (size_type i = 0; i < 9; ++i) data_[i] /= scalar;  // per entry: T(1)/scalar is 0 for integral T
+        return *this;
+    }
+
+    Matrix<T, 3, 3> transpose() const {
+        Matrix<T, 3, 3> flipped;
+        for (size_type r = 0; r < 3; ++r) {
+            for (size_type c = 0; c < 3; ++c) {
+                flipped(r, c) = (*this)(c, r);  // mirror across the main diagonal
+            }
+        }
+        return flipped;
+    }
+
+    T trace() const {  // sum of the three diagonal entries
+        return (*this)(0, 0) + (*this)(1, 1) + (*this)(2, 2);
+    }
+
+    static Matrix identity() {
+        Matrix eye;  // default-constructed; only the diagonal is assigned below
+        for (size_type k = 0; k < 3; ++k) {
+            eye(k, k) = T(1);
+        }
+        return eye;
+    }
+
+    static Matrix zeros() {  // returns a default-constructed matrix
+        return Matrix();
+    }
+
+    static Matrix ones() {  // built through the single-value constructor with T(1)
+        return Matrix(T(1));
+    }
+
+    // Determinant of this 3x3 matrix; forwards to the determinant_3x3() helper
+    T determinant() const {
+        return determinant_3x3(*this);
+    }
+
+    // Inverse; forwards to inverse_3x3(). NOTE(review): singular-input handling is up to that helper - confirm
+    Matrix inverse() const {
+        return inverse_3x3(*this);
+    }
+
+    Vector<T, 3> row(size_type i) const {  // copy of row i as a 3-vector
+        return Vector<T, 3>{(*this)(i, 0), (*this)(i, 1), (*this)(i, 2)};
+    }
+
+    Vector<T, 3> column(size_type j) const {  // copy of column j as a 3-vector
+        return Vector<T, 3>{(*this)(0, j), (*this)(1, j), (*this)(2, j)};
+    }
+
+    T* begin() { return data_; }  // iteration covers the 9 stored entries in storage order
+    T* end() { return data_ + 9; }  // one past the last stored entry
+    const T* begin() const { return data_; }  // read-only counterpart of begin()
+    const T* end() const { return data_ + 9; }  // read-only counterpart of end()
+};
+
+// Shorthand aliases Matrix<Rows>x<Cols><T> for commonly used fixed shapes
+template<typename T> using Matrix2x2 = Matrix<T, 2, 2>;
+template<typename T> using Matrix3x3 = Matrix<T, 3, 3>;
+template<typename T> using Matrix4x4 = Matrix<T, 4, 4>;
+template<typename T> using Matrix2x3 = Matrix<T, 2, 3>;
+template<typename T> using Matrix3x2 = Matrix<T, 3, 2>;
+template<typename T> using Matrix3x4 = Matrix<T, 3, 4>;
+template<typename T> using Matrix4x3 = Matrix<T, 4, 3>;
+
+// Double precision aliases (square shapes only)
+using Matrix2x2d = Matrix2x2<double>;
+using Matrix3x3d = Matrix3x3<double>;
+using Matrix4x4d = Matrix4x4<double>;
+
+// Single precision aliases (square shapes only)
+using Matrix2x2f = Matrix2x2<float>;
+using Matrix3x3f = Matrix3x3<float>;
+using Matrix4x4f = Matrix4x4<float>;
+
+// Matrix-vector product y = A * x (x taken as a column vector)
+template<typename T, std::size_t M, std::size_t N>
+inline Vector<T, M> operator*(const Matrix<T, M, N>& A, const Vector<T, N>& x) {
+    Vector<T, M> y;
+    for (std::size_t row = 0; row < M; ++row) {
+        T dot = T(0);  // row `row` of A dotted with x
+        for (std::size_t col = 0; col < N; ++col) {
+            dot += A(row, col) * x[col];
+        }
+        y[row] = dot;
+    }
+    return y;
+}
+
+// Row-vector times matrix: y = x^T * A
+template<typename T, std::size_t M, std::size_t N>
+inline Vector<T, N> operator*(const Vector<T, M>& x, const Matrix<T, M, N>& A) {
+    Vector<T, N> y;
+    for (std::size_t col = 0; col < N; ++col) {
+        T dot = T(0);  // x dotted with column `col` of A
+        for (std::size_t row = 0; row < M; ++row) {
+            dot += x[row] * A(row, col);
+        }
+        y[col] = dot;
+    }
+    return y;
+}
+
+// Matrix-matrix product C = A * B
+template<typename T, std::size_t M, std::size_t N, std::size_t P>
+inline Matrix<T, M, P> operator*(const Matrix<T, M, N>& A, const Matrix<T, N, P>& B) {
+    Matrix<T, M, P> C;  // accumulated in place from its default-constructed state
+    for (std::size_t row = 0; row < M; ++row) {
+        for (std::size_t inner = 0; inner < N; ++inner) {
+            const T scale = A(row, inner);  // reused across one whole row of B
+            for (std::size_t col = 0; col < P; ++col) {
+                C(row, col) += scale * B(inner, col);
+            }
+        }
+    }
+    return C;
+}
+
+// Free functions
+
+/**
+ * @brief Free-function form of Matrix::transpose(); returns the N x M result by value
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, N, M> transpose(const Matrix<T, M, N>& m) {
+    return m.transpose();
+}
+
+/**
+ * @brief Free-function form of Matrix::trace() for square matrices
+ */
+template<typename T, std::size_t N>
+inline T trace(const Matrix<T, N, N>& m) {
+    return m.trace();
+}
+
+/**
+ * @brief Free-function form of Matrix::determinant() for square matrices
+ */
+template<typename T, std::size_t N>
+inline T determinant(const Matrix<T, N, N>& m) {
+    return m.determinant();
+}
+
+/**
+ * @brief Free-function form of Matrix::inverse() for square matrices
+ */
+template<typename T, std::size_t N>
+inline Matrix<T, N, N> inverse(const Matrix<T, N, N>& m) {
+    return m.inverse();
+}
+
+/**
+ * @brief Free-function form of Matrix::frobenius_norm() for any M x N matrix
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline T frobenius_norm(const Matrix<T, M, N>& m) {
+    return m.frobenius_norm();
+}
+
+/**
+ * @brief Entry-wise absolute value, returned as a new matrix
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> abs(const Matrix<T, M, N>& m) {
+    using std::abs;  // the scalar overloads are the ones called below
+    Matrix<T, M, N> magnitudes;
+    for (std::size_t r = 0; r < M; ++r) {
+        for (std::size_t c = 0; c < N; ++c) {
+            magnitudes(r, c) = abs(m(r, c));
+        }
+    }
+    return magnitudes;
+}
+
+/**
+ * @brief Entry-wise minimum of two equally sized matrices
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> min(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
+    Matrix<T, M, N> lower;
+    for (std::size_t r = 0; r < M; ++r) {
+        for (std::size_t c = 0; c < N; ++c) {
+            lower(r, c) = std::min(a(r, c), b(r, c));
+        }
+    }
+    return lower;
+}
+
+/**
+ * @brief Entry-wise maximum of two equally sized matrices
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> max(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
+    Matrix<T, M, N> upper;
+    for (std::size_t r = 0; r < M; ++r) {
+        for (std::size_t c = 0; c < N; ++c) {
+            upper(r, c) = std::max(a(r, c), b(r, c));
+        }
+    }
+    return upper;
+}
+
+/**
+ * @brief Outer product u * v^T: entry (i, j) equals u[i] * v[j]
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline Matrix<T, M, N> outer_product(const Vector<T, M>& u, const Vector<T, N>& v) {
+    Matrix<T, M, N> dyad;
+    for (std::size_t r = 0; r < M; ++r) {
+        for (std::size_t c = 0; c < N; ++c) {
+            dyad(r, c) = u[r] * v[c];
+        }
+    }
+    return dyad;
+}
+
+/**
+ * @brief Entry-wise approximate equality; true only if every pair of entries passes
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline bool approx_equal(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b, T tol = tolerance<T>) {
+    for (std::size_t r = 0; r < M; ++r) {
+        for (std::size_t c = 0; c < N; ++c) {
+            // The scalar approx_equal overload decides each entry.
+            const bool entry_ok = approx_equal(a(r, c), b(r, c), tol);
+            if (!entry_ok) return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Write a matrix as nested brackets, one bracketed row per line
+ * @tparam T Scalar type
+ * @tparam M Number of rows
+ * @tparam N Number of columns
+ * @param os Stream that receives the text
+ * @param m Matrix to print
+ * @return The same stream, to allow chaining
+ */
+template<typename T, std::size_t M, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const Matrix<T, M, N>& m) {
+    os << "[";
+    for (std::size_t r = 0; r != M; ++r) {
+        if (r != 0) os << "\n ";
+        os << "[";
+        for (std::size_t c = 0; c != N; ++c) {
+            if (c != 0) os << ", ";
+            os << m(r, c);
+        }
+        os << "]";
+    }
+    os << "]";
+    return os;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_MATRIX_H
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
new file mode 100644
index 000000000..da2f8c8d6
--- /dev/null
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -0,0 +1,626 @@
+#ifndef SVMP_FE_MATH_MATRIX_EXPR_H
+#define SVMP_FE_MATH_MATRIX_EXPR_H
+
+/**
+ * @file MatrixExpr.h
+ * @brief Expression template infrastructure for lazy evaluation of matrix operations
+ *
+ * This header provides expression templates that enable compound matrix operations
+ * without creating temporary objects. Operations are evaluated lazily at the point
+ * of assignment, eliminating intermediate allocations and improving performance.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+#include <cmath>
+#include "ExpressionOps.h"
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief CRTP base shared by every matrix expression node
+ * @tparam Derived The concrete expression type that inherits from this base
+ *
+ * Every member forwards to Derived through a static_cast, so dispatch is
+ * resolved at compile time; the class declares no virtual functions.
+ */
+template<typename Derived>
+class MatrixExpr {
+public:
+    /**
+     * @brief View this object as its concrete expression type
+     * @return Const reference to the derived object
+     */
+    const Derived& derived() const {
+        return static_cast<const Derived&>(*this);
+    }
+
+    /**
+     * @brief View this object as its concrete expression type (non-const)
+     * @return Mutable reference to the derived object
+     */
+    Derived& derived() {
+        return static_cast<Derived&>(*this);
+    }
+
+    /**
+     * @brief Evaluate the entry at (i, j) through the derived type
+     * @param i Row index
+     * @param j Column index
+     * @return The entry by value (the `auto` return type drops references)
+     */
+    auto operator()(std::size_t i, std::size_t j) const {
+        return derived()(i, j);
+    }
+
+    /**
+     * @brief Row count reported by the derived type
+     * @return Number of rows
+     */
+    std::size_t rows() const {
+        return derived().rows();
+    }
+
+    /**
+     * @brief Column count reported by the derived type
+     * @return Number of columns
+     */
+    std::size_t cols() const {
+        return derived().cols();
+    }
+};
+
+/**
+ * @brief Lazy entry-wise combination of two matrix expressions through a functor
+ * @tparam LHS Type of the left operand expression
+ * @tparam RHS Type of the right operand expression
+ * @tparam Op Callable applied as op(left_entry, right_entry)
+ */
+template<typename LHS, typename RHS, typename Op>
+class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
+private:
+    const LHS& left_;   // non-owning: the operand must outlive this node
+    const RHS& right_;  // non-owning: the operand must outlive this node
+    Op combine_;
+
+public:
+    /**
+     * @brief Bind both operands by reference and keep a copy of the functor
+     * @param lhs Left operand
+     * @param rhs Right operand
+     * @param op Functor to apply (value-initialized when omitted)
+     */
+    constexpr MatrixBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
+        : left_(lhs), right_(rhs), combine_(op) {}
+
+    /**
+     * @brief Evaluate one entry on demand
+     * @param i Row index
+     * @param j Column index
+     * @return The functor applied to both operands' entries at (i, j)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return combine_(left_(i, j), right_(i, j));
+    }
+
+    /**
+     * @brief Row count, taken from the left operand only
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return left_.rows();
+    }
+
+    /**
+     * @brief Column count, taken from the left operand only
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return left_.cols();
+    }
+};
+
+/**
+ * @brief Lazy entry-wise transformation of one matrix expression through a functor
+ * @tparam Expr Type of the operand expression
+ * @tparam Op Callable applied as op(entry)
+ */
+template<typename Expr, typename Op>
+class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
+private:
+    const Expr& operand_;  // non-owning: the operand must outlive this node
+    Op apply_;
+
+public:
+    /**
+     * @brief Bind the operand by reference and keep a copy of the functor
+     * @param expr Operand expression
+     * @param op Functor to apply (value-initialized when omitted)
+     */
+    constexpr MatrixUnaryExpr(const Expr& expr, Op op = Op{})
+        : operand_(expr), apply_(op) {}
+
+    /**
+     * @brief Evaluate one entry on demand
+     * @param i Row index
+     * @param j Column index
+     * @return The functor applied to the operand's entry at (i, j)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return apply_(operand_(i, j));
+    }
+
+    /**
+     * @brief Row count of the operand
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return operand_.rows();
+    }
+
+    /**
+     * @brief Column count of the operand
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return operand_.cols();
+    }
+};
+
+/**
+ * @brief Lazy product of a matrix expression with a scalar
+ * @tparam Expr Type of the matrix operand
+ * @tparam Scalar Type of the scalar factor
+ */
+template<typename Expr, typename Scalar>
+class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
+private:
+    const Expr& operand_;  // non-owning: the operand must outlive this node
+    Scalar factor_;        // stored by value
+
+public:
+    /**
+     * @brief Bind the matrix operand by reference and copy the scalar
+     * @param expr Matrix expression
+     * @param scalar Factor applied to every entry
+     */
+    constexpr MatrixScalarExpr(const Expr& expr, Scalar scalar)
+        : operand_(expr), factor_(scalar) {}
+
+    /**
+     * @brief Evaluate one entry on demand
+     * @param i Row index
+     * @param j Column index
+     * @return Operand entry at (i, j) times the scalar (entry on the left)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return operand_(i, j) * factor_;
+    }
+
+    /**
+     * @brief Row count of the operand
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return operand_.rows();
+    }
+
+    /**
+     * @brief Column count of the operand
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return operand_.cols();
+    }
+};
+
+/**
+ * @brief Lazy quotient of a matrix expression by a scalar
+ * @tparam Expr Type of the matrix operand
+ * @tparam Scalar Type of the scalar divisor
+ */
+template<typename Expr, typename Scalar>
+class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
+private:
+    const Expr& operand_;  // non-owning: the operand must outlive this node
+    Scalar divisor_;       // stored by value
+
+public:
+    /**
+     * @brief Bind the matrix operand by reference and copy the divisor
+     * @param expr Matrix expression
+     * @param scalar Divisor applied to every entry
+     */
+    constexpr MatrixScalarDivExpr(const Expr& expr, Scalar scalar)
+        : operand_(expr), divisor_(scalar) {}
+
+    /**
+     * @brief Evaluate one entry on demand
+     * @param i Row index
+     * @param j Column index
+     * @return Operand entry at (i, j) divided by the scalar (one division per access)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return operand_(i, j) / divisor_;
+    }
+
+    /**
+     * @brief Row count of the operand
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return operand_.rows();
+    }
+
+    /**
+     * @brief Column count of the operand
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return operand_.cols();
+    }
+};
+
+/**
+ * @brief Lazy matrix product A*B
+ * @tparam LHS Type of the left factor
+ * @tparam RHS Type of the right factor
+ *
+ * Each entry is recomputed from scratch on every access; nothing is cached.
+ */
+template<typename LHS, typename RHS>
+class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
+private:
+    const LHS& left_;   // non-owning: the factor must outlive this node
+    const RHS& right_;  // non-owning: the factor must outlive this node
+
+public:
+    /**
+     * @brief Bind both factors by reference
+     * @param lhs Left factor
+     * @param rhs Right factor
+     */
+    constexpr MatrixMulExpr(const LHS& lhs, const RHS& rhs)
+        : left_(lhs), right_(rhs) {}
+
+    /**
+     * @brief Evaluate one entry of the product on demand
+     * @param i Row index
+     * @param j Column index
+     * @return Row i of the left factor dotted with column j of the right factor
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        using result_type = decltype(left_(0, 0) * right_(0, 0));
+        result_type dot = result_type{0};
+        const auto inner = left_.cols();  // shared dimension, taken from the left factor
+        for (std::size_t k = 0; k != inner; ++k) {
+            dot += left_(i, k) * right_(k, j);
+        }
+        return dot;
+    }
+
+    /**
+     * @brief Row count, taken from the left factor
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return left_.rows();
+    }
+
+    /**
+     * @brief Column count, taken from the right factor
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return right_.cols();
+    }
+};
+
+/**
+ * @brief Lazy transpose: a view that swaps the two indices on access
+ * @tparam Expr Type of the matrix being viewed
+ */
+template<typename Expr>
+class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
+private:
+    const Expr& source_;  // non-owning: the viewed expression must outlive this node
+
+public:
+    /**
+     * @brief Bind the viewed expression by reference
+     * @param expr Matrix expression to transpose
+     */
+    constexpr explicit TransposeExpr(const Expr& expr)
+        : source_(expr) {}
+
+    /**
+     * @brief Evaluate one entry of the transposed view
+     * @param i Row index of the view (column of the source)
+     * @param j Column index of the view (row of the source)
+     * @return Source entry at (j, i)
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        return source_(j, i);
+    }
+
+    /**
+     * @brief Row count of the view, i.e. the source's column count
+     * @return Number of rows
+     */
+    constexpr std::size_t rows() const {
+        return source_.cols();
+    }
+
+    /**
+     * @brief Column count of the view, i.e. the source's row count
+     * @return Number of columns
+     */
+    constexpr std::size_t cols() const {
+        return source_.rows();
+    }
+};
+
+/**
+ * @brief Lazy square matrix whose diagonal is read from a vector; all other entries are zero
+ * @tparam VecExpr Vector expression type (must provide size() and operator[])
+ */
+template<typename VecExpr>
+class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
+private:
+    const VecExpr& vec_;  // non-owning: the vector must outlive this node
+    std::size_t n_;       // reported dimension; may differ from vec_.size()
+
+public:
+    /**
+     * @brief Construct diagonal matrix from vector
+     * @param vec Vector of diagonal elements (held by reference)
+     * @param n Matrix dimension; 0 (the default) means use vec.size()
+     */
+    constexpr explicit DiagonalExpr(const VecExpr& vec, std::size_t n = 0)
+        : vec_(vec), n_(n > 0 ? n : vec.size()) {}
+
+    /**
+     * @brief Access element
+     * @param i Row index
+     * @param j Column index
+     * @return vec[i] by value if i == j and i < vec.size(), zero otherwise
+     */
+    constexpr auto operator()(std::size_t i, std::size_t j) const {
+        using result_type = std::decay_t<decltype(vec_[0])>;  // value type: operator[] may yield a reference
+        return (i == j && i < vec_.size()) ? vec_[i] : result_type{0};
+    }
+
+    /**
+     * @brief Get number of rows
+     * @return The dimension fixed at construction
+     */
+    constexpr std::size_t rows() const {
+        return n_;
+    }
+
+    /**
+     * @brief Get number of columns
+     * @return The dimension fixed at construction
+     */
+    constexpr std::size_t cols() const {
+        return n_;
+    }
+};
+
+/**
+ * @brief Build a lazy entry-wise sum node; operands are referenced, not copied
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator+(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    using Sum = MatrixBinaryExpr<LHS, RHS, detail::ops::Add>;
+    // Nothing is evaluated here; entries are computed when the node is indexed.
+    return Sum(lhs.derived(), rhs.derived(), detail::ops::Add{});
+}
+
+/**
+ * @brief Build a lazy entry-wise difference node; operands are referenced, not copied
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator-(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    using Difference = MatrixBinaryExpr<LHS, RHS, detail::ops::Sub>;
+    // Nothing is evaluated here; entries are computed when the node is indexed.
+    return Difference(lhs.derived(), rhs.derived(), detail::ops::Sub{});
+}
+
+/**
+ * @brief Build a lazy matrix-product node (MatrixMulExpr) referencing both operands
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto operator*(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    return MatrixMulExpr<LHS, RHS>(lhs.derived(), rhs.derived());
+}
+
+/**
+ * @brief Build a lazy entry-wise (Hadamard) product node
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    using Product = MatrixBinaryExpr<LHS, RHS, detail::ops::Mul>;
+    // Entry-wise, unlike operator* above which builds a true matrix product.
+    return Product(lhs.derived(), rhs.derived(), detail::ops::Mul{});
+}
+
+/**
+ * @brief Build a lazy entry-wise quotient node
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
+             std::is_base_of_v<MatrixExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard_div(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
+    using Quotient = MatrixBinaryExpr<LHS, RHS, detail::ops::Div>;
+    // Each entry of lhs is divided by the entry of rhs at the same position.
+    return Quotient(lhs.derived(), rhs.derived(), detail::ops::Div{});
+}
+
+/**
+ * @brief Build a lazy entry-wise negation node
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto operator-(const MatrixExpr<Expr>& expr) {
+    using Negated = MatrixUnaryExpr<Expr, detail::ops::Negate>;
+    // The node references expr's derived object; nothing is evaluated here.
+    return Negated(expr.derived(), detail::ops::Negate{});
+}
+
+/**
+ * @brief Lazy matrix * scalar; enabled only for arithmetic Scalar types
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator*(const MatrixExpr<Expr>& expr, Scalar scalar) {
+    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Lazy scalar * matrix; builds the same node as matrix * scalar (entry * scalar)
+ */
+template<typename Scalar, typename Expr,
+         typename = std::enable_if_t<
+             std::is_arithmetic_v<Scalar> &&
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto operator*(Scalar scalar, const MatrixExpr<Expr>& expr) {
+    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Lazy matrix / scalar; enabled only for arithmetic Scalar types
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator/(const MatrixExpr<Expr>& expr, Scalar scalar) {
+    return MatrixScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
+}
+
+/**
+ * @brief Build a lazy transpose view (TransposeExpr) that references expr
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto transpose(const MatrixExpr<Expr>& expr) {
+    return TransposeExpr<Expr>(expr.derived());
+}
+
+/**
+ * @brief Build a lazy entry-wise absolute-value node using detail::ops::Abs
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto abs(const MatrixExpr<Expr>& expr) {
+    return MatrixUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
+}
+
+/**
+ * @brief Build a lazy entry-wise square-root node using detail::ops::Sqrt
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto sqrt(const MatrixExpr<Expr>& expr) {
+    return MatrixUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
+}
+
+/**
+ * @brief Sum of the squares of all entries of a matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression; every entry is evaluated exactly once
+ * @return Squared Frobenius norm, accumulated row by row
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto frobenius_norm_squared(const MatrixExpr<Expr>& expr) {
+    const auto& node = expr.derived();
+    using result_type = decltype(node(0, 0) * node(0, 0));
+    result_type total = result_type{0};
+    const auto row_count = expr.rows(), col_count = expr.cols();
+    for (std::size_t r = 0; r < row_count; ++r) {
+        for (std::size_t c = 0; c < col_count; ++c) {
+            const auto entry = node(r, c);
+            total += entry * entry;
+        }
+    }
+    return total;
+}
+
+/**
+ * @brief Frobenius norm of a matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression
+ * @return Square root of frobenius_norm_squared(expr)
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto frobenius_norm(const MatrixExpr<Expr>& expr) {
+    using std::sqrt;  // scalar overloads; hides the matrix-expression sqrt declared above
+    return sqrt(frobenius_norm_squared(expr));
+}
+
+/**
+ * @brief Compute trace of a matrix expression
+ * @tparam Expr Matrix expression type
+ * @param expr Matrix expression; if not square, only the first min(rows, cols) diagonal entries are summed
+ * @return Sum of diagonal elements, by value (const and reference are stripped from the entry type)
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<MatrixExpr<Expr>, Expr>
+         >>
+constexpr auto trace(const MatrixExpr<Expr>& expr) {
+    using result_type = std::decay_t<decltype(expr.derived()(0, 0))>;  // operator() may return const T&
+    result_type sum = result_type{0};
+    const auto n = std::min(expr.rows(), expr.cols());
+    for (std::size_t i = 0; i < n; ++i) {
+        sum += expr.derived()(i, i);
+    }
+    return sum;
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_MATRIX_EXPR_H
\ No newline at end of file
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
new file mode 100644
index 000000000..e272bd6dd
--- /dev/null
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -0,0 +1,831 @@
+#ifndef SVMP_FE_MATH_VECTOR_H
+#define SVMP_FE_MATH_VECTOR_H
+
+/**
+ * @file Vector.h
+ * @brief Fixed-size vectors with expression templates for FE computations
+ *
+ * This header provides fixed-size vector operations for element-level computations.
+ * Arithmetic operators are lazy expression templates (see VectorExpr.h) that avoid
+ * temporaries; everything is header-only for inlining. Memory is aligned for SIMD.
+ */
+
+#include "VectorExpr.h"
+#include "MathConstants.h"
+#include "../Common/Alignment.h"
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <type_traits>
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Fixed-size vector for element-level computations
+ * @tparam T Arithmetic scalar type (float, double, int, ...)
+ * @tparam N Vector dimension (must be positive)
+ *
+ * This class provides small vector operations optimized for
+ * compile-time known dimensions. Memory is aligned for SIMD operations.
+ */
+template<typename T, std::size_t N>
+class Vector : public VectorExpr<Vector<T, N>> {
+    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
+    static_assert(N > 0, "Vector dimension must be positive");
+
+private:
+    alignas(kFEFixedObjectAlignmentBytes) T data_[N];  // SIMD-friendly alignment
+
+public:
+    // Type definitions
+    using value_type = T;
+    using size_type = std::size_t;
+    using reference = T&;
+    using const_reference = const T&;
+    using pointer = T*;
+    using const_pointer = const T*;
+
+    /**
+     * @brief Default constructor - zero initializes all components
+     */
+    constexpr Vector() : data_{} {}
+
+    /**
+     * @brief Fill constructor - initializes all components with same value
+     * @param value Value to fill vector with (use parentheses; braces select the initializer-list constructor)
+     */
+    constexpr explicit Vector(T value) : data_{} {  // mem-initializer keeps this usable in C++17 constant expressions
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Initializer list constructor
+     * @param init List of values; entries beyond N are ignored, missing entries stay zero
+     */
+    constexpr Vector(std::initializer_list<T> init) : data_{} {
+        auto it = init.begin();
+        for (size_type i = 0; i < N && it != init.end(); ++i, ++it) {
+            data_[i] = *it;
+        }
+    }
+
+    /**
+     * @brief Constructor from expression template
+     * @tparam Expr Expression type
+     * @param expr Vector expression to evaluate (indices 0..N-1 are read; its size is not checked)
+     */
+    template<typename Expr>
+    Vector(const VectorExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = e[i];
+        }
+    }
+
+    /**
+     * @brief Copy constructor
+     */
+    constexpr Vector(const Vector&) = default;
+
+    /**
+     * @brief Move constructor
+     */
+    constexpr Vector(Vector&&) noexcept = default;
+
+    /**
+     * @brief Copy assignment
+     */
+    Vector& operator=(const Vector&) = default;
+
+    /**
+     * @brief Move assignment
+     */
+    Vector& operator=(Vector&&) noexcept = default;
+
+    /**
+     * @brief Assignment from expression template
+     * @tparam Expr Expression type
+     * @param expr Vector expression to evaluate (indices 0..N-1 are read; its size is not checked)
+     * @return Reference to this
+     */
+    template<typename Expr>
+    Vector& operator=(const VectorExpr<Expr>& expr) {
+        const auto& e = expr.derived();
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = e[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Get vector size (compile-time constant)
+     * @return Number of elements
+     */
+    static constexpr size_type size() { return N; }
+
+    /**
+     * @brief Element access (no bounds checking)
+     * @param i Element index
+     * @return Reference to element
+     */
+    constexpr T& operator[](size_type i) {
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access (no bounds checking) - const version
+     * @param i Element index
+     * @return Const reference to element
+     */
+    constexpr const T& operator[](size_type i) const {
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access with bounds checking
+     * @param i Element index
+     * @return Reference to element
+     * @throws std::out_of_range if i >= N
+     */
+    T& at(size_type i) {
+        if (i >= N) {
+            throw std::out_of_range("Vector::at: index out of range");
+        }
+        return data_[i];
+    }
+
+    /**
+     * @brief Element access with bounds checking - const version
+     * @param i Element index
+     * @return Const reference to element
+     * @throws std::out_of_range if i >= N
+     */
+    const T& at(size_type i) const {
+        if (i >= N) {
+            throw std::out_of_range("Vector::at: index out of range");
+        }
+        return data_[i];
+    }
+
+    /**
+     * @brief Access first element
+     * @return Reference to first element
+     */
+    T& front() { return data_[0]; }
+    const T& front() const { return data_[0]; }
+
+    /**
+     * @brief Access last element
+     * @return Reference to last element
+     */
+    T& back() { return data_[N-1]; }
+    const T& back() const { return data_[N-1]; }
+
+    /**
+     * @brief Get pointer to underlying data
+     * @return Pointer to first element
+     */
+    T* data() { return data_; }
+    const T* data() const { return data_; }
+
+    /**
+     * @brief Fill vector with value
+     * @param value Value to fill with
+     */
+    void fill(T value) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] = value;
+        }
+    }
+
+    /**
+     * @brief Set all components to zero
+     */
+    void set_zero() {
+        fill(T{0});
+    }
+
+    // Arithmetic operators
+
+    /**
+     * @brief In-place addition
+     * @param other Vector to add
+     * @return Reference to this
+     */
+    Vector& operator+=(const Vector& other) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place subtraction
+     * @param other Vector to subtract
+     * @return Reference to this
+     */
+    Vector& operator-=(const Vector& other) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    /**
+     * @brief In-place scalar multiplication
+     * @param scalar Scalar to multiply by
+     * @return Reference to this
+     */
+    Vector& operator*=(T scalar) {
+        for (size_type i = 0; i < N; ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    /** @brief In-place scalar division (true per-component division for integral T)
+     * @param scalar Scalar to divide by
+     * @return Reference to this
+     */
+    Vector& operator/=(T scalar) {
+        if constexpr (std::is_floating_point_v<T>) return (*this) *= (T(1) / scalar);
+        for (size_type i = 0; i < N; ++i) data_[i] /= scalar;  // integral T: T(1) / scalar would truncate to 0
+        return *this;
+    }
+
+    // Vector operations
+
+    /**
+     * @brief Compute dot product
+     * @param other Other vector
+     * @return Dot product
+     */
+    T dot(const Vector& other) const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            result += data_[i] * other.data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute squared Euclidean norm
+     * @return Squared norm
+     */
+    T norm_squared() const {
+        return dot(*this);
+    }
+
+    /**
+     * @brief Compute Euclidean norm
+     * @return Norm (the square root is converted back to T)
+     */
+    T norm() const {
+        using std::sqrt;
+        return sqrt(norm_squared());
+    }
+
+    /**
+     * @brief Get normalized vector
+     * @return Unit vector in same direction, or the zero vector if the norm is approximately zero
+     */
+    Vector normalized() const {
+        const T n = norm();
+        if (approx_zero(n)) {
+            return Vector();  // Return zero vector
+        }
+        return (*this) / n;
+    }
+
+    /**
+     * @brief Normalize this vector in place
+     * @return Reference to this (left unchanged if the norm is approximately zero)
+     */
+    Vector& normalize() {
+        const T n = norm();
+        if (!approx_zero(n)) {
+            (*this) /= n;
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Compute L1 norm (Manhattan norm)
+     * @return Sum of absolute values
+     */
+    T norm_l1() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            result += abs(data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Compute L-infinity norm (maximum norm)
+     * @return Maximum absolute value
+     */
+    T norm_inf() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            result = std::max(result, abs(data_[i]));
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get minimum component
+     * @return Minimum value
+     */
+    T min() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result = std::min(result, data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get maximum component
+     * @return Maximum value
+     */
+    T max() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result = std::max(result, data_[i]);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get sum of all components
+     * @return Sum of components
+     */
+    T sum() const {
+        T result = T(0);
+        for (size_type i = 0; i < N; ++i) {
+            result += data_[i];
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get product of all components
+     * @return Product of components
+     */
+    T product() const {
+        T result = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            result *= data_[i];
+        }
+        return result;
+    }
+
+    // Static factory functions
+
+    /**
+     * @brief Create zero vector
+     * @return Vector with all components zero
+     */
+    static constexpr Vector zeros() {
+        return Vector();
+    }
+
+    /**
+     * @brief Create vector with all components one
+     * @return Vector with all components one
+     */
+    static constexpr Vector ones() {
+        return Vector(T(1));
+    }
+
+    /**
+     * @brief Create unit vector along axis
+     * @param axis Axis index (0-based)
+     * @return Unit vector, or the zero vector if axis >= N
+     */
+    static Vector unit(size_type axis) {
+        Vector v;
+        if (axis < N) {
+            v[axis] = T(1);
+        }
+        return v;
+    }
+
+    /**
+     * @brief Create basis vector (alias for unit)
+     * @param i Axis index (0-based)
+     * @return Basis vector
+     */
+    static Vector basis(size_type i) {
+        return unit(i);
+    }
+
+    /**
+     * @brief Create zero vector (alias for zeros)
+     * @return Zero vector
+     */
+    static constexpr Vector zero() {
+        return zeros();
+    }
+
+    /**
+     * @brief Get index of minimum element
+     * @return Index of minimum value (first occurrence on ties)
+     */
+    size_type min_index() const {
+        size_type idx = 0;
+        T min_val = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            if (data_[i] < min_val) {
+                min_val = data_[i];
+                idx = i;
+            }
+        }
+        return idx;
+    }
+
+    /**
+     * @brief Get index of maximum element
+     * @return Index of maximum value (first occurrence on ties)
+     */
+    size_type max_index() const {
+        size_type idx = 0;
+        T max_val = data_[0];
+        for (size_type i = 1; i < N; ++i) {
+            if (data_[i] > max_val) {
+                max_val = data_[i];
+                idx = i;
+            }
+        }
+        return idx;
+    }
+
+    /**
+     * @brief Compute mean of all components
+     * @return Average value (computed in T, so integral T uses integer division)
+     */
+    T mean() const {
+        return sum() / static_cast<T>(N);
+    }
+
+    /**
+     * @brief Cross product for 3D vectors
+     * @param other Other vector
+     * @return Cross product
+     * @note Only available for 3D vectors
+     */
+    template<typename U = T>
+    std::enable_if_t<N == 3, Vector<U, 3>> cross(const Vector<U, 3>& other) const {
+        return Vector<U, 3>{
+            data_[1] * other[2] - data_[2] * other[1],
+            data_[2] * other[0] - data_[0] * other[2],
+            data_[0] * other[1] - data_[1] * other[0]
+        };
+    }
+
+    /**
+     * @brief Check if vectors are approximately equal
+     * @param other Other vector
+     * @param tol Absolute per-component tolerance
+     * @return true if every component differs by at most tol
+     */
+    bool approx_equal(const Vector& other, T tol = tolerance<T>) const {
+        for (size_type i = 0; i < N; ++i) {
+            using std::abs;
+            if (abs(data_[i] - other.data_[i]) > tol) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Equality comparison
+     * @param other Other vector
+     * @return true if exactly equal
+     */
+    bool operator==(const Vector& other) const {
+        for (size_type i = 0; i < N; ++i) {
+            if (data_[i] != other.data_[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @brief Inequality comparison
+     * @param other Other vector
+     * @return true if not equal
+     */
+    bool operator!=(const Vector& other) const {
+        return !(*this == other);
+    }
+
+    // Iterators (raw pointers into the contiguous storage)
+    T* begin() { return data_; }
+    T* end() { return data_ + N; }
+    const T* begin() const { return data_; }
+    const T* end() const { return data_ + N; }
+    const T* cbegin() const { return data_; }
+    const T* cend() const { return data_ + N; }
+};
+
+// Dimension-specific aliases; the scalar type stays a template parameter
+template<typename T> using Vector2 = Vector<T, 2>;
+template<typename T> using Vector3 = Vector<T, 3>;
+template<typename T> using Vector4 = Vector<T, 4>;
+
+// double components
+using Vector2d = Vector<double, 2>;
+using Vector3d = Vector<double, 3>;
+using Vector4d = Vector<double, 4>;
+
+// float components
+using Vector2f = Vector<float, 2>;
+using Vector3f = Vector<float, 3>;
+using Vector4f = Vector<float, 4>;
+
+// int components
+using Vector2i = Vector<int, 2>;
+using Vector3i = Vector<int, 3>;
+using Vector4i = Vector<int, 4>;
+
+/**
+ * @brief Cross product of two 3D vectors
+ * @tparam T Scalar type
+ * @param a Left operand
+ * @param b Right operand
+ * @return Cross product a × b
+ */
+template<typename T>
+inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
+    // Each output component combines the other two axes of a and b.
+    const T cx = a[1] * b[2] - a[2] * b[1];
+    const T cy = a[2] * b[0] - a[0] * b[2];
+    const T cz = a[0] * b[1] - a[1] * b[0];
+    return Vector3<T>{cx, cy, cz};
+}
+
+/**
+ * @brief 2D cross product: the scalar a[0]*b[1] - a[1]*b[0]
+ * @tparam T Scalar type
+ * @param a First vector
+ * @param b Second vector
+ * @return Scalar cross product (equals the z component of the 3D cross product)
+ */
+template<typename T>
+inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
+    return a[0] * b[1] - a[1] * b[0];
+}
+
+/**
+ * @brief Triple scalar product (a · (b × c))
+ * @tparam T Scalar type
+ * @param a Vector dotted with the cross product
+ * @param b Left operand of the cross product
+ * @param c Right operand of the cross product
+ * @return Scalar triple product
+ */
+template<typename T>
+inline T triple_product(const Vector3<T>& a, const Vector3<T>& b, const Vector3<T>& c) {
+    return a.dot(cross(b, c));  // cross(b, c) is evaluated first, then dotted with a
+}
+
+// Free functions for common operations
+
+/**
+ * @brief Dot product of two vectors of equal size; forwards to a.dot(b)
+ */
+template<typename T, std::size_t N>
+inline T dot(const Vector<T, N>& a, const Vector<T, N>& b) {
+    return a.dot(b);
+}
+
+/**
+ * @brief Euclidean norm of a vector; forwards to v.norm()
+ */
+template<typename T, std::size_t N>
+inline T norm(const Vector<T, N>& v) {
+    return v.norm();
+}
+
+/**
+ * @brief Squared Euclidean norm of a vector; forwards to v.norm_squared()
+ */
+template<typename T, std::size_t N>
+inline T norm_squared(const Vector<T, N>& v) {
+    return v.norm_squared();
+}
+
+/**
+ * @brief Normalized copy of a vector; forwards to v.normalized() and leaves v untouched
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> normalize(const Vector<T, N>& v) {
+    return v.normalized();
+}
+
+/**
+ * @brief Component-wise absolute value, returned as a new vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> abs(const Vector<T, N>& v) {
+    Vector<T, N> result;
+    // The using-declaration inside the lambda brings the scalar std::abs
+    // overloads into scope for the unqualified call.
+    std::transform(v.begin(), v.end(), result.begin(),
+                   [](const T& x) { using std::abs; return abs(x); });
+    return result;
+}
+
+/**
+ * @brief Component-wise minimum of two vectors
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> min(const Vector<T, N>& a, const Vector<T, N>& b) {
+    Vector<T, N> result;
+    // Pair a[i] with b[i] and keep the smaller value of each pair.
+    std::transform(a.begin(), a.end(), b.begin(), result.begin(),
+                   [](const T& x, const T& y) { return std::min(x, y); });
+    return result;
+}
+
+/**
+ * @brief Component-wise maximum of two vectors
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> max(const Vector<T, N>& a, const Vector<T, N>& b) {
+    Vector<T, N> result;
+    // Pair a[i] with b[i] and keep the larger value of each pair.
+    std::transform(a.begin(), a.end(), b.begin(), result.begin(),
+                   [](const T& x, const T& y) { return std::max(x, y); });
+    return result;
+}
+
+/**
+ * @brief Component-wise clamp of v into [min_v[i], max_v[i]]
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> clamp(const Vector<T, N>& v, const Vector<T, N>& min_v, const Vector<T, N>& max_v) {
+    Vector<T, N> result = v;  // start from a copy of v, then limit each slot
+    for (std::size_t k = 0; k < N; ++k) {
+        result[k] = std::clamp(result[k], min_v[k], max_v[k]);
+    }
+    return result;
+}
+
+/**
+ * @brief Linear interpolation between vectors, computed as a + t * (b - a)
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param t Interpolation parameter [0, 1] (not clamped by this function)
+ * @param a Start vector (at t=0)
+ * @param b End vector (at t=1)
+ * @return Interpolated vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> lerp(T t, const Vector<T, N>& a, const Vector<T, N>& b) {
+    return a + t * (b - a);  // lazy expression, evaluated when converted to the returned Vector
+}
+
+/**
+ * @brief Spherical linear interpolation between two unit 3D vectors
+ * @tparam T Scalar type
+ * @param t Interpolation parameter [0, 1]
+ * @param a Start unit vector
+ * @param b End unit vector
+ * @return Interpolated vector; nearly opposite inputs are not guarded (sine near zero)
+ */
+template<typename T>
+inline Vector3<T> slerp(T t, const Vector3<T>& a, const Vector3<T>& b) {
+    // Clamp so rounding error cannot push the cosine outside the domain of acos
+    const T cos_theta = std::clamp(a.dot(b), T(-1), T(1));
+
+    // Nearly parallel inputs: the sine used below would be tiny, so blend
+    // linearly and renormalize instead
+    if (cos_theta > T(0.9995)) {
+        return normalize(lerp(t, a, b));
+    }
+
+    const T theta = std::acos(cos_theta);
+    const T sin_theta = std::sin(theta);
+
+    // Endpoint weights sin((1 - t) * theta) / sin(theta) and sin(t * theta) / sin(theta)
+    const T weight_a = std::sin((T(1) - t) * theta) / sin_theta;
+    const T weight_b = std::sin(t * theta) / sin_theta;
+
+    return weight_a * a + weight_b * b;
+}
+
+/**
+ * @brief Reflect vector about normal, computed as v - 2 * dot(v, n) * n
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Incident vector
+ * @param n Normal vector (should be unit; it is not normalized here)
+ * @return Reflected vector
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> reflect(const Vector<T, N>& v, const Vector<T, N>& n) {
+    return v - T(2) * dot(v, n) * n;
+}
+
+/**
+ * @brief Project vector onto another vector
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Vector to project
+ * @param onto Vector to project onto
+ * @return Projection of v onto 'onto'; the zero vector if 'onto' has approximately zero length
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> project(const Vector<T, N>& v, const Vector<T, N>& onto) {
+    const T onto_len_sq = onto.norm_squared();
+    if (!approx_zero(onto_len_sq)) {
+        return (dot(v, onto) / onto_len_sq) * onto;
+    }
+    return Vector<T, N>::zeros();
+}
+
+/**
+ * @brief Get perpendicular component of vector, computed as v - project(v, direction)
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param v Vector
+ * @param direction Direction to remove (need not be unit length)
+ * @return Component of v perpendicular to direction
+ */
+template<typename T, std::size_t N>
+inline Vector<T, N> perpendicular(const Vector<T, N>& v, const Vector<T, N>& direction) {
+    return v - project(v, direction);
+}
+
+/**
+ * @brief Compute angle between two vectors
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param a First vector (must be non-zero: the cosine divides by norm(a) * norm(b) unguarded)
+ * @param b Second vector (must be non-zero, same reason)
+ * @return Angle in radians [0, π]
+ */
+template<typename T, std::size_t N>
+inline T angle(const Vector<T, N>& a, const Vector<T, N>& b) {
+    T cos_angle = dot(a, b) / (norm(a) * norm(b));
+    cos_angle = std::clamp(cos_angle, T(-1), T(1));  // keep rounding error inside the domain of acos
+    return std::acos(cos_angle);
+}
+
+/**
+ * @brief Component-wise approximate equality of two vectors
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param a First vector
+ * @param b Second vector
+ * @param tol Tolerance forwarded to the scalar approx_equal
+ * @return true if every component pair passes the scalar check
+ */
+template<typename T, std::size_t N>
+inline bool approx_equal(const Vector<T, N>& a, const Vector<T, N>& b, T tol = tolerance<T>) {
+    // Each component pair is delegated to the scalar approx_equal overload;
+    // std::equal reports false as soon as one pair fails the check.
+    const auto close_enough = [tol](const T& lhs, const T& rhs) {
+        return approx_equal(lhs, rhs, tol);
+    };
+    return std::equal(a.begin(), a.end(), b.begin(), close_enough);
+}
+
+/**
+ * @brief Stream output operator, formatting the vector as "[x0, x1, ...]"
+ * @tparam T Scalar type
+ * @tparam N Vector dimension
+ * @param os Output stream
+ * @param v Vector to output
+ * @return Reference to output stream
+ */
+template<typename T, std::size_t N>
+inline std::ostream& operator<<(std::ostream& os, const Vector<T, N>& v) {
+    os << "[";
+    const char* separator = "";  // nothing is written before the first component
+    for (const T& component : v) {
+        os << separator << component;
+        separator = ", ";
+    }
+    return os << "]";
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_VECTOR_H
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
new file mode 100644
index 000000000..8b9c8e382
--- /dev/null
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -0,0 +1,418 @@
+#ifndef SVMP_FE_MATH_VECTOR_EXPR_H
+#define SVMP_FE_MATH_VECTOR_EXPR_H
+
+/**
+ * @file VectorExpr.h
+ * @brief Expression template infrastructure for lazy evaluation of vector operations
+ *
+ * This header provides expression templates that enable compound vector operations
+ * without creating temporary objects. Operations are evaluated lazily at the point
+ * of assignment, eliminating intermediate allocations and improving performance.
+ */
+
+#include <cstddef>
+#include <type_traits>
+#include <cmath>
+#include "ExpressionOps.h"
+
+namespace svmp {
+namespace FE {
+namespace math {
+
+/**
+ * @brief Base class for all vector expressions using CRTP
+ * @tparam Derived The derived expression type
+ *
+ * This uses the Curiously Recurring Template Pattern (CRTP) to provide static
+ * polymorphism; accessors are constexpr so the constexpr free functions can use them.
+ */
+template<typename Derived>
+class VectorExpr {
+public:
+    /**
+     * @brief Get the derived expression
+     * @return Const reference to this object viewed as Derived
+     */
+    constexpr const Derived& derived() const {
+        return static_cast<const Derived&>(*this);
+    }
+
+    /**
+     * @brief Get the derived expression (non-const)
+     * @return Mutable reference to this object viewed as Derived
+     */
+    constexpr Derived& derived() {
+        return static_cast<Derived&>(*this);
+    }
+
+    /**
+     * @brief Access element by index (forwards to Derived::operator[])
+     * @param i Element index
+     * @return Value at index i, returned by value
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return derived()[i];
+    }
+
+    /**
+     * @brief Get the size of the vector expression (forwards to Derived::size())
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return derived().size();
+    }
+};
+
+/**
+ * @brief Binary expression for element-wise operations between two vector expressions
+ * @tparam LHS Left-hand side expression type
+ * @tparam RHS Right-hand side expression type
+ * @tparam Op Binary operation functor, invoked as op(lhs[i], rhs[i])
+ */
+template<typename LHS, typename RHS, typename Op>
+class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
+private:
+    const LHS& lhs_;  // held by reference: the operand must outlive this expression
+    const RHS& rhs_;  // held by reference: the operand must outlive this expression
+    Op op_;
+
+public:
+    /**
+     * @brief Construct binary expression
+     * @param lhs Left operand (only a reference is stored)
+     * @param rhs Right operand (only a reference is stored)
+     * @param op Operation to apply (stored by value)
+     */
+    constexpr VectorBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
+        : lhs_(lhs), rhs_(rhs), op_(op) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Result of operation on elements at index i (computed on every call)
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return op_(lhs_[i], rhs_[i]);
+    }
+
+    /**
+     * @brief Get size of expression (from left operand; the right operand's size is not checked)
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return lhs_.size();
+    }
+};
+
+/**
+ * @brief Unary expression for element-wise operations on a single vector expression
+ * @tparam Expr Expression type
+ * @tparam Op Unary operation functor, invoked as op(expr[i])
+ */
+template<typename Expr, typename Op>
+class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
+private:
+    const Expr& expr_;  // held by reference: the operand must outlive this expression
+    Op op_;
+
+public:
+    /**
+     * @brief Construct unary expression
+     * @param expr Operand expression (only a reference is stored)
+     * @param op Operation to apply (stored by value)
+     */
+    constexpr VectorUnaryExpr(const Expr& expr, Op op = Op{})
+        : expr_(expr), op_(op) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Result of operation on element at index i (computed on every call)
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return op_(expr_[i]);
+    }
+
+    /**
+     * @brief Get size of expression (same as the operand's size)
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Scalar multiplication expression
+ * @tparam Expr Vector expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;  // held by reference: the operand must outlive this expression
+    Scalar scalar_;     // stored by value
+
+public:
+    /**
+     * @brief Construct scalar multiplication expression
+     * @param expr Vector expression (only a reference is stored)
+     * @param scalar Scalar value
+     */
+    constexpr VectorScalarExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Element multiplied by scalar (element on the left, scalar on the right)
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return expr_[i] * scalar_;
+    }
+
+    /**
+     * @brief Get size of expression (same as the operand's size)
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Scalar division expression
+ * @tparam Expr Vector expression type
+ * @tparam Scalar Scalar type
+ */
+template<typename Expr, typename Scalar>
+class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
+private:
+    const Expr& expr_;  // held by reference: the operand must outlive this expression
+    Scalar scalar_;     // stored by value
+
+public:
+    /**
+     * @brief Construct scalar division expression
+     * @param expr Vector expression (only a reference is stored)
+     * @param scalar Scalar divisor (not checked against zero)
+     */
+    constexpr VectorScalarDivExpr(const Expr& expr, Scalar scalar)
+        : expr_(expr), scalar_(scalar) {}
+
+    /**
+     * @brief Access element at index
+     * @param i Element index
+     * @return Element divided by scalar (a true division per element access)
+     */
+    constexpr auto operator[](std::size_t i) const {
+        return expr_[i] / scalar_;
+    }
+
+    /**
+     * @brief Get size of expression (same as the operand's size)
+     * @return Number of elements
+     */
+    constexpr std::size_t size() const {
+        return expr_.size();
+    }
+};
+
+/**
+ * @brief Addition operator: builds a lazy VectorBinaryExpr over detail::ops::Add
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto operator+(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Add>(
+        lhs.derived(), rhs.derived(), detail::ops::Add{}  // result refers to both operands
+    );
+}
+
+/**
+ * @brief Subtraction operator: builds a lazy VectorBinaryExpr over detail::ops::Sub
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto operator-(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Sub>(
+        lhs.derived(), rhs.derived(), detail::ops::Sub{}  // result refers to both operands
+    );
+}
+
+/**
+ * @brief Element-wise (Hadamard) product: a named function building a lazy VectorBinaryExpr over detail::ops::Mul
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Mul>(
+        lhs.derived(), rhs.derived(), detail::ops::Mul{}  // result refers to both operands
+    );
+}
+
+/**
+ * @brief Element-wise division: a named function building a lazy VectorBinaryExpr over detail::ops::Div
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto hadamard_div(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    return VectorBinaryExpr<LHS, RHS, detail::ops::Div>(
+        lhs.derived(), rhs.derived(), detail::ops::Div{}  // result refers to both operands
+    );
+}
+
+/**
+ * @brief Negation operator: builds a lazy VectorUnaryExpr over detail::ops::Negate
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto operator-(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Negate>(
+        expr.derived(), detail::ops::Negate{}  // result refers to the operand
+    );
+}
+
+/**
+ * @brief Scalar multiplication operator (vector * scalar); returns a lazy VectorScalarExpr
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator*(const VectorExpr<Expr>& expr, Scalar scalar) {
+    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);  // scalar copied, vector referenced
+}
+
+/**
+ * @brief Scalar multiplication operator (scalar * vector); same expression type as vector * scalar
+ */
+template<typename Scalar, typename Expr,
+         typename = std::enable_if_t<
+             std::is_arithmetic_v<Scalar> &&
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto operator*(Scalar scalar, const VectorExpr<Expr>& expr) {
+    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);  // elements are evaluated as expr[i] * scalar
+}
+
+/**
+ * @brief Scalar division operator (vector / scalar); returns a lazy VectorScalarDivExpr
+ */
+template<typename Expr, typename Scalar,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
+             std::is_arithmetic_v<Scalar>
+         >>
+constexpr auto operator/(const VectorExpr<Expr>& expr, Scalar scalar) {
+    return VectorScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);  // scalar copied, vector referenced
+}
+
+/**
+ * @brief Element-wise absolute value as a lazy VectorUnaryExpr over detail::ops::Abs
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto abs(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
+}
+
+/**
+ * @brief Element-wise square root as a lazy VectorUnaryExpr over detail::ops::Sqrt
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto sqrt(const VectorExpr<Expr>& expr) {
+    return VectorUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
+}
+
+/**
+ * @brief Dot product of two vector expressions, evaluated eagerly
+ * @tparam LHS Left vector expression type
+ * @tparam RHS Right vector expression type
+ * @param lhs Left operand (its size sets the number of terms)
+ * @param rhs Right operand (its size is not checked)
+ * @return Sum of lhs[i] * rhs[i], in the type of one such product
+ */
+template<typename LHS, typename RHS,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
+             std::is_base_of_v<VectorExpr<RHS>, RHS>
+         >>
+constexpr auto dot(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
+    const auto& left = lhs.derived();
+    const auto& right = rhs.derived();
+    decltype(left[0] * right[0]) acc{0};  // accumulator typed as an element product
+    for (std::size_t k = 0, count = lhs.size(); k < count; ++k) {
+        acc += left[k] * right[k];
+    }
+    return acc;
+}
+
+/**
+ * @brief Compute norm squared of vector expression as dot(expr, expr)
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression (each element is evaluated twice by the dot product)
+ * @return Square of the Euclidean norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto norm_squared(const VectorExpr<Expr>& expr) {
+    return dot(expr, expr);
+}
+
+/**
+ * @brief Compute norm of vector expression as sqrt(norm_squared(expr))
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression
+ * @return Euclidean norm
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto norm(const VectorExpr<Expr>& expr) {
+    using std::sqrt;  // the argument is a scalar, so the call resolves to the scalar std::sqrt
+    return sqrt(norm_squared(expr));
+}
+
+/**
+ * @brief Normalize vector expression by dividing it by its norm
+ * @tparam Expr Vector expression type
+ * @param expr Vector expression (must outlive the result; a zero norm is not guarded)
+ * @return Lazy scalar-division expression that refers to expr
+ */
+template<typename Expr,
+         typename = std::enable_if_t<
+             std::is_base_of_v<VectorExpr<Expr>, Expr>
+         >>
+constexpr auto normalize(const VectorExpr<Expr>& expr) {
+    return expr / norm(expr);  // the norm is computed now; the division happens on element access
+}
+
+} // namespace math
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_MATH_VECTOR_EXPR_H
\ No newline at end of file
diff --git a/Code/Source/solver/FE/Quadrature/QuadratureRule.h b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
new file mode 100644
index 000000000..f7d186891
--- /dev/null
+++ b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
@@ -0,0 +1,237 @@
+/* Copyright (c) Stanford University, The Regents of the University of California, and others.
+ *
+ * All Rights Reserved.
+ *
+ * See License file.
+ */
+
+#ifndef SVMP_FE_QUADRATURE_RULE_H
+#define SVMP_FE_QUADRATURE_RULE_H
+
+/**
+ * @file QuadratureRule.h
+ * @brief Abstracted quadrature rule representation for FE integration
+ *
+ * This header defines the base class for all quadrature rules used by the
+ * finite element infrastructure. Rules are expressed in reference element
+ * space only; mapping to physical space is handled by the Geometry module.
+ *
+ * The interface is intentionally lightweight and header-only to avoid coupling
+ * Quadrature to other modules while remaining compatible with the Mesh library
+ * through shared type aliases provided by FE/Common/Types.h.
+ */
+
+#include "Types.h"
+#include "FEException.h"
+#include "Math/Vector.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace quadrature {
+
+/// Convenience alias for quadrature point representation in reference space
+using QuadPoint = math::Vector<Real, 3>;
+
+struct QuadraturePointFingerprint {
+    int dimension = 0;                ///< Rule dimension recorded when the fingerprint was built
+    std::size_t num_points = 0;       ///< Number of quadrature points that were hashed
+    std::uint64_t points_hash_a = 0;  ///< First hash over the coordinate bit patterns
+    std::uint64_t points_hash_b = 0;  ///< Second hash over the same bits, seeded differently
+};
+
+/**
+ * @brief Base class for quadrature rules over reference elements
+ *
+ * Derived classes populate the point/weight data via the protected set_data().
+ * set_data() enforces matching point/weight counts; is_valid() additionally checks
+ * the weight sum against reference_measure(). Other checks are left to callers.
+ */
+class QuadratureRule {
+public:
+    virtual ~QuadratureRule() = default;
+
+    /// Number of quadrature points
+    std::size_t num_points() const noexcept { return points_.size(); }
+
+    /// Polynomial exactness degree reported by the rule
+    int order() const noexcept { return order_; }
+
+    /// Spatial dimension of the reference domain
+    int dimension() const noexcept { return dimension_; }
+
+    /// Cell family the rule integrates over (line, tri, quad, ...)
+    svmp::CellFamily cell_family() const noexcept { return cell_family_; }
+
+    /// Copy of quadrature point i (i is not range-checked)
+    QuadPoint point(std::size_t i) const noexcept { return points_[i]; }
+
+    /// Quadrature weight i (i is not range-checked)
+    Real weight(std::size_t i) const noexcept { return weights_[i]; }
+
+    /// Bulk accessors
+    const std::vector<QuadPoint>& points() const noexcept { return points_; }
+    const std::vector<Real>& weights() const noexcept { return weights_; }
+
+    /// Cached coordinate-only fingerprint for consumers whose values depend on
+    /// reference points but not quadrature weights.
+    QuadraturePointFingerprint point_fingerprint() const noexcept { return point_fingerprint_; }
+
+    /// Identity string for BasisCache; the default encodes dimension, point count and coordinates
+    virtual std::string cache_identity() const;
+
+    /**
+     * @brief Validate rule data for basic consistency
+     * @param tol Tolerance on the weight sum, relative to max(1, |reference_measure()|)
+     * @return True if the rule is non-empty, sizes agree, weights are finite and sum correctly
+     */
+    virtual bool is_valid(Real tol = 1e-12) const;
+
+    /**
+     * @brief Reference-domain measure for the element family
+     *
+     * Length/area/volume of the canonical reference element:
+     * - Line   [-1,1]            -> 2
+     * - Quad   [-1,1]^2          -> 4
+     * - Hex    [-1,1]^3          -> 8
+     * - Tri    (0,0)-(1,0)-(0,1) -> 0.5
+     * - Tet    simplex at origin -> 1/6
+     * - Wedge  (triangle x line) -> 1
+     * - Pyramid (x,y in [-1,1], z in [0,1]) -> 4/3
+     */
+    Real reference_measure() const noexcept;
+
+protected:
+    QuadratureRule(svmp::CellFamily family, int dimension, int order = 0)
+        : cell_family_(family), dimension_(dimension), order_(order) {}
+
+    /// Store points/weights (throws FEException on size mismatch) and rebuild both caches
+    void set_data(std::vector<QuadPoint> pts, std::vector<Real> wts);
+
+    /// Replace the exactness order given to the constructor
+    void set_order(int ord) noexcept { order_ = ord; }
+
+private:
+    std::string build_cache_identity() const;
+    QuadraturePointFingerprint build_point_fingerprint() const noexcept;
+
+    svmp::CellFamily cell_family_;
+    int dimension_;
+    int order_;
+    std::vector<QuadPoint> points_;
+    std::vector<Real> weights_;
+    std::string cache_identity_;
+    QuadraturePointFingerprint point_fingerprint_;
+};
+
+// --------------------------------------------------------------------------------
+// Inline implementations
+// --------------------------------------------------------------------------------
+
+inline void QuadratureRule::set_data(std::vector<QuadPoint> pts, std::vector<Real> wts) {
+    // Points and weights are paired index-by-index, so a size mismatch is rejected before storing.
+    if (wts.size() != pts.size()) {
+        throw FEException("QuadratureRule: points/weights size mismatch", StatusCode::InvalidArgument,
+                          __FILE__, __LINE__, __func__);
+    }
+    points_ = std::move(pts);
+    weights_ = std::move(wts);
+    point_fingerprint_ = build_point_fingerprint();  // both caches are rebuilt from the new points_
+    cache_identity_ = build_cache_identity();
+}
+
+inline bool QuadratureRule::is_valid(Real tol) const {
+    // A usable rule has at least one point and exactly one weight per point.
+    if (points_.empty() || weights_.size() != points_.size()) return false;
+    Real total = Real(0);
+    for (std::size_t i = 0; i < weights_.size(); ++i) {
+        if (!std::isfinite(weights_[i])) {
+            return false;  // a NaN or infinite weight invalidates the rule outright
+        }
+        total += weights_[i];
+    }
+    // The weight sum must match the reference measure, relative to max(1, |measure|).
+    const Real measure = reference_measure();
+    const Real scale = std::max(Real(1), std::abs(measure));
+    return std::abs(total - measure) <= tol * scale;
+}
+
+inline std::string QuadratureRule::cache_identity() const {
+    // set_data() stores the identity string; it is empty only when set_data() has not
+    // run, because build_cache_identity() always emits at least "dim=...|npts=...".
+    // In that case the string is assembled on demand from the current members.
+    return cache_identity_.empty() ? build_cache_identity() : cache_identity_;
+}
+
+inline std::string QuadratureRule::build_cache_identity() const {
+    // Layout: "dim=<d>|npts=<n>" followed by one "|pt=<x>,<y>,<z>" entry per point.
+    std::ostringstream text;
+    text << "dim=" << dimension_ << "|npts=" << points_.size();
+    // max_digits10 significant digits are enough for a Real to round-trip through text.
+    text << std::setprecision(std::numeric_limits<Real>::max_digits10);
+    for (const QuadPoint& p : points_) {
+        text << "|pt=" << p[0] << ',' << p[1] << ',' << p[2];
+    }
+    return text.str();
+}
+
+inline QuadraturePointFingerprint QuadratureRule::build_point_fingerprint() const noexcept {
+    const auto to_bits = [](Real value) noexcept {  // coordinate bytes copied into a zeroed word
+        static_assert(sizeof(Real) <= sizeof(std::uint64_t),
+                      "Quadrature point fingerprints assume Real fits in 64 bits");
+        std::uint64_t bits = 0;
+        std::memcpy(&bits, &value, sizeof(Real));
+        return bits;
+    };
+    // Order-sensitive combine step: folds `value` into the running `seed`.
+    const auto combine = [](std::uint64_t& seed, std::uint64_t value) noexcept {
+        seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
+    };
+    QuadraturePointFingerprint result;
+    result.dimension = dimension_;
+    result.num_points = points_.size();
+    result.points_hash_a = 1469598103934665603ULL;
+    result.points_hash_b = 1099511628211ULL;
+    // The two hashes start from different seeds and absorb the header fields in opposite order.
+    combine(result.points_hash_a, static_cast<std::uint64_t>(result.dimension));
+    combine(result.points_hash_a, static_cast<std::uint64_t>(result.num_points));
+    combine(result.points_hash_b, static_cast<std::uint64_t>(result.num_points));
+    combine(result.points_hash_b, static_cast<std::uint64_t>(result.dimension));
+    for (const QuadPoint& pt : points_) {
+        for (std::size_t axis = 0; axis < 3u; ++axis) {
+            const std::uint64_t bits = to_bits(pt[axis]);
+            combine(result.points_hash_a, bits);
+            combine(result.points_hash_b, bits ^ (0xbf58476d1ce4e5b9ULL + axis));
+        }
+    }
+    return result;
+}
+
+inline Real QuadratureRule::reference_measure() const noexcept {
+    switch (cell_family_) {
+        case svmp::CellFamily::Line:      return Real(2);          // length of [-1,1]
+        case svmp::CellFamily::Triangle:  return Real(0.5);
+        case svmp::CellFamily::Quad:      return Real(4);
+        case svmp::CellFamily::Tetra:     return Real(1.0 / 6.0);
+        case svmp::CellFamily::Pyramid:   return Real(4.0 / 3.0);
+        case svmp::CellFamily::Wedge:     return Real(1.0);        // 0.5 area * length 2
+        case svmp::CellFamily::Hex:       return Real(8);
+        case svmp::CellFamily::Point:                              // shares the fallback value
+        default:                          return Real(1.0);
+    }
+}
+
+} // namespace quadrature
+} // namespace FE
+} // namespace svmp
+
+#endif // SVMP_FE_QUADRATURE_RULE_H
diff --git a/Code/Source/solver/fs.cpp b/Code/Source/solver/fs.cpp
index d592a8b96..abe1992df 100644
--- a/Code/Source/solver/fs.cpp
+++ b/Code/Source/solver/fs.cpp
@@ -5,10 +5,66 @@
 
 #include "fs.h"
 #include "consts.h"
+#include "FE/Common/FEException.h"
 #include "nn.h"
 
+#include <algorithm>
+#include <string>
+
 namespace fs {
 
+namespace {
+
+namespace fe = svmp::FE;
+
+// Element name for diagnostics; types missing from the name table report their integer value.
+std::string element_name(consts::ElementType eType)
+{
+  const auto& names = consts::element_type_to_string;
+  if (const auto entry = names.find(eType); entry != names.end()) {
+    return entry->second;
+  }
+  return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
+}
+
+bool supports_reference_hessians(consts::ElementType eType)
+{
+  using consts::ElementType;
+
+  // Element types for which this file requests reference second derivatives.
+  static constexpr ElementType kHessianElements[] = {
+    ElementType::LIN1,  ElementType::LIN2,
+    ElementType::TRI3,  ElementType::TRI6,
+    ElementType::QUD4,  ElementType::QUD8,  ElementType::QUD9,
+    ElementType::TET4,  ElementType::TET10,
+    ElementType::HEX8,  ElementType::HEX20, ElementType::HEX27,
+    ElementType::WDG,
+  };
+
+  // Linear scan: the list is short and fixed.
+  for (const ElementType supported : kHessianElements) {
+    if (supported == eType) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Fills fs.Nxx at every Gauss point when Nxx has storage and the element type is supported.
+void populate_reference_hessians_if_supported(fsType& fs, const int insd)
+{
+  if (fs.Nxx.size() != 0 && supports_reference_hessians(fs.eType)) {
+    // Packed second-derivative count: 1, 3 or 6 for insd = 1, 2 or 3.
+    const int num_packed = std::max(3 * (insd - 1), 1);
+    for (int gauss = 0; gauss < fs.nG; ++gauss) {
+      nn::get_gn_nxx(insd, num_packed, fs.eType, fs.eNoN, gauss, fs.xi, fs.Nxx);
+    }
+  }
+}
+
+} // namespace
+
 
 /// @brief Allocates arrays within the function space type. Assumes that 
 /// fs%eNoN and fs%nG are already defined
@@ -103,6 +159,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[1].eType, fs[1].eNoN, g, fs[1].xi, fs[1].N, fs[1].Nx);
       }
       nn::get_nn_bnds(nsd, fs[1].eType, fs[1].eNoN, fs[1].xib, fs[1].Nb);
+      populate_reference_hessians_if_supported(fs[1], nsd);
 
     } else if (iOpt == 2) {
       fs[1].nG    = lM.fs[1].nG;
@@ -133,6 +190,7 @@ void get_thood_fs(ComMod& com_mod, std::array<fsType,2>& fs, const mshType& lM,
         nn::get_gnn(nsd, fs[0].eType, fs[0].eNoN, g, fs[0].xi, fs[0].N, fs[0].Nx);
       }
       nn::get_nn_bnds(nsd, fs[0].eType, fs[0].eNoN, fs[0].xib, fs[0].Nb);
+      populate_reference_hessians_if_supported(fs[0], nsd);
     }
   }
 }
@@ -275,14 +333,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
     lM.fs[0].Nb  = lM.Nb;
     lM.fs[0].Nx  = lM.Nx;
   }
-  // Second order derivatives for vector function space
-  //
-  if (!lM.fs[0].lShpF) {
-    int ind2 = std::max(3*(insd-1), 1);
-    for (int g = 0; g < lM.fs[0].nG; g++) {
-      nn::get_gn_nxx(insd, ind2, lM.fs[0].eType, lM.fs[0].eNoN, g, lM.fs[0].xi, lM.fs[0].Nxx);
-    }
-  }
+  populate_reference_hessians_if_supported(lM.fs[0], insd);
 
   // Sets Taylor-Hood basis [fluid, stokes, ustruct, FSI)
   if (lM.nFs == 2) {
@@ -291,6 +342,7 @@ void init_fs_msh(const ComMod& com_mod, mshType& lM)
 
     // Initialize the function space
     init_fs(lM.fs[1], nsd, insd);
+    populate_reference_hessians_if_supported(lM.fs[1], insd);
   }
 }
 
@@ -343,7 +395,8 @@ void set_thood_fs(fsType& fs, consts::ElementType eType)
     break;
 
     default:
-      throw std::runtime_error("Cannot choose Taylor-Hood basis");
+      throw fe::InvalidElementException("Cannot choose Taylor-Hood basis",
+          element_name(eType), __FILE__, __LINE__, __func__);
     break;
   }
 }
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 9f12d64e4..51c126708 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -15,15 +15,28 @@
 #include "Array.h"
 #include "Vector.h"
 
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Common/FEException.h"
+
 #include "consts.h"
 #include "mat_fun.h"
 #include "utils.h"
 
 #include "lapack_defs.h"
 
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <cctype>
+#include <exception>
 #include <functional>
 #include <iostream> 
 #include <math.h> 
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
 
 namespace nn {
 
@@ -46,13 +59,510 @@ using namespace consts;
 // Define a map type used to set the bounds of element shape functions.
 #include "nn_elem_nn_bnds.h"
 
+namespace {
+
+namespace fe = svmp::FE;
+namespace febasis = svmp::FE::basis;
+
+struct BasisSelection {
+  fe::ElementType element;  // FE element type copied into BasisRequest::element_type
+  fe::BasisType basis;      // basis family copied into BasisRequest::basis_type
+  int order;                // polynomial order copied into BasisRequest::order
+};
+
+enum class BasisMode {
+  Auto,    // default when SVMP_BASIS_MODE is unset or empty; FE adapter allowed
+  Legacy,  // FE adapter disabled (basis_mode_allows_fe_adapter() returns false)
+  Fe       // FE adapter allowed; handled like Auto in the code visible here
+};
+
+// Lower-cases `value` byte by byte (std::tolower) so mode names compare case-insensitively.
+std::string normalize_basis_mode_name(std::string value)
+{
+  for (char& ch : value) { ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); }
+  return value;
+}
+
+BasisMode parse_basis_mode()
+{
+  const char* raw = std::getenv("SVMP_BASIS_MODE");
+  if (raw == nullptr || raw[0] == '\0') {
+    return BasisMode::Auto;  // unset or empty selects the default
+  }
+  struct NamedMode { const char* name; BasisMode mode; };
+  static constexpr NamedMode kModes[] = {
+    {"auto", BasisMode::Auto}, {"legacy", BasisMode::Legacy}, {"fe", BasisMode::Fe}};
+
+  const std::string requested = normalize_basis_mode_name(raw);
+  for (const NamedMode& candidate : kModes) {
+    if (requested == candidate.name) {
+      return candidate.mode;
+    }
+  }
+
+  // The message quotes the value exactly as supplied, not the lower-cased copy.
+  throw febasis::BasisConfigurationException(
+      "Invalid SVMP_BASIS_MODE='" + std::string(raw) +
+          "'. Expected one of: auto, legacy, fe",
+      __FILE__, __LINE__, __func__);
+}
+
+BasisMode active_basis_mode()
+{
+  static const BasisMode cached = parse_basis_mode();  // parsed on first use, then reused
+  return cached;
+}
+
+const char* basis_mode_name(BasisMode mode)
+{
+  // Spellings match the values accepted by parse_basis_mode().
+  if (mode == BasisMode::Auto) {
+    return "auto";
+  }
+  if (mode == BasisMode::Legacy) {
+    return "legacy";
+  }
+  // Anything that is not one of the three enumerators is reported as "unknown".
+  return mode == BasisMode::Fe ? "fe" : "unknown";
+}
+
+void log_basis_mode_once()
+{
+  // A function-local static is initialized once, so the banner is printed once per process.
+  static const bool banner_printed = [] {
+    std::cout << "[svMultiPhysics] SVMP_BASIS_MODE=" << basis_mode_name(active_basis_mode()) << std::endl;
+    return true;
+  }();
+  static_cast<void>(banner_printed);
+}
+
+bool basis_mode_allows_fe_adapter()
+{
+  return !(active_basis_mode() == BasisMode::Legacy);  // Auto and Fe both allow the adapter
+}
+
+std::string solver_element_name(consts::ElementType eType)
+{
+  // The numeric enum value is always appended, so unmapped types stay identifiable.
+  const std::string code = " (" + std::to_string(static_cast<int>(eType)) + ")";
+  const auto entry = consts::element_type_to_string.find(eType);
+  const bool known = entry != consts::element_type_to_string.end();
+  return (known ? entry->second : std::string("unknown")) + code;
+}
+
+std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
+{
+  using Solver = consts::ElementType;
+  using Elem = fe::ElementType;
+  constexpr auto lagrange = fe::BasisType::Lagrange;
+  constexpr auto serendipity = fe::BasisType::Serendipity;
+
+  // Each row maps a solver element to {FE element, basis family, polynomial order}.
+  switch (eType) {
+    // Lines
+    case Solver::LIN1:  return BasisSelection{Elem::Line2, lagrange, 1};
+    case Solver::LIN2:  return BasisSelection{Elem::Line3, lagrange, 2};
+    // Triangles
+    case Solver::TRI3:  return BasisSelection{Elem::Triangle3, lagrange, 1};
+    case Solver::TRI6:  return BasisSelection{Elem::Triangle6, lagrange, 2};
+    // Quadrilaterals (the 8-node variant uses the serendipity family)
+    case Solver::QUD4:  return BasisSelection{Elem::Quad4, lagrange, 1};
+    case Solver::QUD8:  return BasisSelection{Elem::Quad8, serendipity, 2};
+    case Solver::QUD9:  return BasisSelection{Elem::Quad9, lagrange, 2};
+    // Tetrahedra
+    case Solver::TET4:  return BasisSelection{Elem::Tetra4, lagrange, 1};
+    case Solver::TET10: return BasisSelection{Elem::Tetra10, lagrange, 2};
+    // Hexahedra (the 20-node variant uses the serendipity family)
+    case Solver::HEX8:  return BasisSelection{Elem::Hex8, lagrange, 1};
+    case Solver::HEX20: return BasisSelection{Elem::Hex20, serendipity, 2};
+    case Solver::HEX27: return BasisSelection{Elem::Hex27, lagrange, 2};
+    // Wedge
+    case Solver::WDG:   return BasisSelection{Elem::Wedge6, lagrange, 1};
+
+    // Every other solver element has no FE Basis counterpart in this table.
+    default:            return std::nullopt;
+  }
+}
+
+bool use_basis_adapter_for(consts::ElementType eType)
+{
+  return basis_mode_allows_fe_adapter() ? to_basis_selection(eType).has_value() : false;
+}
+
+bool supports_basis_hessian_adapter_for(consts::ElementType eType)
+{
+  return use_basis_adapter_for(eType);  // same rule: adapter allowed and element has a mapping
+}
+
+bool supports_face_basis_adapter_for(consts::ElementType eType)
+{
+  // The mode is queried first, for every element type, before the face-type filter is applied.
+  const bool adapter_enabled = basis_mode_allows_fe_adapter();
+  bool is_face_type = false;
+  switch (eType) {
+    case consts::ElementType::LIN1:
+    case consts::ElementType::LIN2:
+    case consts::ElementType::TRI3:
+    case consts::ElementType::TRI6:
+    case consts::ElementType::QUD4:
+    case consts::ElementType::QUD8:
+    case consts::ElementType::QUD9:
+      is_face_type = true;
+      break;
+    default: break;
+  }
+  return adapter_enabled && is_face_type && to_basis_selection(eType).has_value();
+}
+
+std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::ElementType eType)
+{
+  const std::optional<BasisSelection> choice = to_basis_selection(eType);
+  if (!choice.has_value()) {
+    throw febasis::BasisElementCompatibilityException(
+        "No FE Basis selection for solver element " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+  // Copy the three selection fields into a factory request and let the factory build the basis.
+  febasis::BasisRequest request;
+  request.element_type = choice->element;
+  request.basis_type = choice->basis;
+  request.order = choice->order;
+  return febasis::basis_factory::create(request);
+}
+
+template <std::size_t NumNodes>
+std::size_t mapped_basis_index(const std::array<std::size_t, NumNodes>& map,
+                               consts::ElementType eType,
+                               const int solver_node)
+{
+  const bool in_range = solver_node >= 0 && static_cast<std::size_t>(solver_node) < NumNodes;
+  if (!in_range) {
+    throw febasis::BasisNodeOrderingException(
+        "Solver node " + std::to_string(solver_node) + " is outside node map for " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+  // The table entry at the solver's local node number is the matching FE Basis index.
+  return map[static_cast<std::size_t>(solver_node)];
+}
+
+std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
+{
+  if (solver_node < 0) {
+    throw febasis::BasisNodeOrderingException(
+        "Solver node " + std::to_string(solver_node) +
+            " is outside node map for " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
+  }
+
+  // Each table is indexed by the solver's local node number and holds the FE Basis
+  // index that is read for that solver node.
+  using Solver = consts::ElementType;
+
+  switch (eType) {
+    case Solver::TRI3: {
+      static constexpr std::array<std::size_t, 3> kTri3{1, 2, 0};
+      return mapped_basis_index(kTri3, eType, solver_node);
+    }
+    // TRI6 and WDG use the same six-entry permutation.
+    case Solver::TRI6:
+    case Solver::WDG: {
+      static constexpr std::array<std::size_t, 6> kSixNode{1, 2, 0, 4, 5, 3};
+      return mapped_basis_index(kSixNode, eType, solver_node);
+    }
+    case Solver::TET4: {
+      static constexpr std::array<std::size_t, 4> kTet4{1, 2, 3, 0};
+      return mapped_basis_index(kTet4, eType, solver_node);
+    }
+    case Solver::TET10: {
+      static constexpr std::array<std::size_t, 10> kTet10{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
+      return mapped_basis_index(kTet10, eType, solver_node);
+    }
+    case Solver::HEX27: {
+      // Nodes 0-19 and 26 map to themselves; nodes 20-25 are permuted.
+      static constexpr std::array<std::size_t, 27> kHex27{
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+        25, 23, 22, 24, 20, 21, 26
+      };
+      return mapped_basis_index(kHex27, eType, solver_node);
+    }
+    default:
+      // Every other element type returns the solver node number unchanged.
+      return static_cast<std::size_t>(solver_node);
+  }
+}
+
+fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
+                                               const int g,
+                                               const Array<double>& xi)
+{
+  const auto ref_dim = basis.dimension();
+  if (xi.nrows() < ref_dim) {
+    throw febasis::BasisConfigurationException(
+        "xi has " + std::to_string(xi.nrows()) + " rows but FE Basis element requires " +
+            std::to_string(ref_dim) + " reference coordinates",
+        __FILE__, __LINE__, __func__);
+  }
+  // Column g of xi supplies the first ref_dim coordinates; the remaining ones keep their initial value.
+  fe::math::Vector<fe::Real, 3> point{};
+  for (int axis = 0; axis < ref_dim; ++axis) {
+    point[static_cast<std::size_t>(axis)] = xi(axis, g);
+  }
+  return point;
+}
+
+void copy_basis_values_to_solver_arrays(consts::ElementType eType,
+                                        const int eNoN,
+                                        const int g,
+                                        const std::vector<fe::Real>& values,
+                                        const std::vector<febasis::Gradient>& gradients,
+                                        Array<double>& N,
+                                        Array3<double>& Nx)
+{
+  const auto expected_count = static_cast<std::size_t>(eNoN);
+  if (values.size() != expected_count) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis value count " + std::to_string(values.size()) + " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+  if (gradients.size() != expected_count) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis gradient count " + std::to_string(gradients.size()) +
+            " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const int num_rows = Nx.nrows();
+  for (int node = 0; node < eNoN; ++node) {
+    const std::size_t fe_node = basis_index_for_solver_node(eType, node);
+    if (!(fe_node < values.size() && fe_node < gradients.size())) {
+      throw febasis::BasisNodeOrderingException(
+          "Solver node " + std::to_string(node) + " maps to FE Basis node " +
+              std::to_string(fe_node) + " outside basis output for " + solver_element_name(eType),
+          __FILE__, __LINE__, __func__);
+    }
+
+    // Shape-function value for this solver node, read at its FE Basis index.
+    N(node, g) = values[fe_node];
+
+    // Rows 0-2 of Nx receive the gradient components; any further rows are set
+    // to zero.
+    const auto& gradient = gradients[fe_node];
+    for (int row = 0; row < num_rows; ++row) {
+      Nx(row, node, g) = (row < 3) ? static_cast<double>(gradient[static_cast<std::size_t>(row)]) : 0.0;
+    }
+  }
+}
+
+void evaluate_basis_values_and_gradients(const int insd,
+                                         consts::ElementType eType,
+                                         const int eNoN,
+                                         const int g,
+                                         Array<double>& xi,
+                                         Array<double>& N,
+                                         Array3<double>& Nx)
+{
+  const auto basis = make_basis_for_solver_element(eType);
+  const auto ref_dim = basis->dimension();
+  if (insd < ref_dim) {
+    throw febasis::BasisConfigurationException(
+        "solver insd " + std::to_string(insd) + " is smaller than FE Basis reference dimension " + std::to_string(ref_dim),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const auto point = make_basis_point(*basis, g, xi);
+  std::vector<fe::Real> values;
+  basis->evaluate_values(point, values);
+  std::vector<febasis::Gradient> gradients;
+  basis->evaluate_gradients(point, gradients);
+
+  // The FE Basis supplies the numbers; N and Nx stay in the solver's own array layout.
+  copy_basis_values_to_solver_arrays(eType, eNoN, g, values, gradients, N, Nx);
+}
+
+void evaluate_face_basis_values_and_gradients(const int gaus_pt, faceType& face)
+{
+  // Face adapter: reuse the element routine with the face's own storage
+  // (face.xi, face.N, face.Nx).
+  // The reference dimension handed over is the row count of the face's
+  // Gauss-point coordinate array.
+  const int face_insd = face.xi.nrows();
+
+  evaluate_basis_values_and_gradients(face_insd, face.eType, face.eNoN, gaus_pt,
+                                      face.xi, face.N, face.Nx);
+}
+
+int required_nxx_components_for_dimension(const int dimension)
+{
+  // Packed second-derivative counts for reference dimensions 1, 2 and 3.
+  constexpr int kPackedCount[] = {1, 3, 6};
+  if (dimension >= 1 && dimension <= 3) {
+    return kPackedCount[dimension - 1];
+  }
+
+  // Any other dimension has no packing defined here and is reported as a
+  // configuration error.
+  throw febasis::BasisConfigurationException(
+      "Unsupported FE Basis reference dimension " + std::to_string(dimension),
+      __FILE__, __LINE__,
+      __func__);
+}
+
+void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
+                                       const int eNoN,
+                                       const int g,
+                                       const int dimension,
+                                       const std::vector<febasis::Hessian>& hessians,
+                                       Array3<double>& Nxx)
+{
+  if (static_cast<std::size_t>(eNoN) != hessians.size()) {
+    throw febasis::BasisEvaluationException(
+        "FE Basis Hessian count " + std::to_string(hessians.size()) +
+            " does not match solver eNoN " + std::to_string(eNoN),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const int packed_rows = required_nxx_components_for_dimension(dimension);
+  if (!(Nxx.nrows() >= packed_rows)) {
+    throw febasis::BasisConfigurationException(
+        "solver Nxx has " + std::to_string(Nxx.nrows()) +
+            " rows but FE Basis Hessian packing requires " + std::to_string(packed_rows),
+        __FILE__, __LINE__, __func__);
+  }
+
+  for (int node = 0; node < eNoN; ++node) {
+    for (int row = 0; row < Nxx.nrows(); ++row) {  // clear every row before the index check
+      Nxx(row, node, g) = 0.0;
+    }
+
+    const std::size_t fe_node = basis_index_for_solver_node(eType, node);
+    if (!(fe_node < hessians.size())) {
+      throw febasis::BasisNodeOrderingException(
+          "Solver node " + std::to_string(node) + " maps to FE Basis Hessian node " +
+              std::to_string(fe_node) + " outside basis output for " +
+              solver_element_name(eType),
+          __FILE__, __LINE__, __func__);
+    }
+    // Packed rows: 1D -> (0,0); 2D -> (0,0),(1,1),(0,1); 3D -> (0,0),(1,1),(2,2),(0,1),(1,2),(0,2).
+    const auto& hess = hessians[fe_node];
+    Nxx(0, node, g) = hess(0, 0);
+    if (dimension == 2) {
+      Nxx(1, node, g) = hess(1, 1);
+      Nxx(2, node, g) = hess(0, 1);
+    } else if (dimension == 3) {
+      Nxx(1, node, g) = hess(1, 1);
+      Nxx(2, node, g) = hess(2, 2);
+      Nxx(3, node, g) = hess(0, 1);
+      Nxx(4, node, g) = hess(1, 2);
+      Nxx(5, node, g) = hess(0, 2);
+    }
+  }
+}
+
+void evaluate_basis_hessians(const int insd,
+                             const int ind2,
+                             consts::ElementType eType,
+                             const int eNoN,
+                             const int gaus_pt,
+                             const Array<double>& xi,
+                             Array3<double>& Nxx)
+{
+  const auto basis = make_basis_for_solver_element(eType);
+  const auto ref_dim = basis->dimension();
+  if (insd < ref_dim) {
+    throw febasis::BasisConfigurationException(
+        "solver insd " + std::to_string(insd) + " is smaller than FE Basis reference dimension " + std::to_string(ref_dim),
+        __FILE__, __LINE__, __func__);
+  }
+
+  const int packed_rows = required_nxx_components_for_dimension(ref_dim);
+  if (ind2 < packed_rows) {
+    throw febasis::BasisConfigurationException(
+        "solver ind2 " + std::to_string(ind2) +
+            " is smaller than packed Hessian component count " + std::to_string(packed_rows),
+        __FILE__, __LINE__, __func__);
+  }
+
+  std::vector<febasis::Hessian> hessians;
+  const auto point = make_basis_point(*basis, gaus_pt, xi);
+  basis->evaluate_hessians(point, hessians);
+
+  // Nxx row order: dxx, dyy, dxy in 2D; dxx, dyy, dzz, dxy, dyz, dxz in 3D.
+  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, ref_dim, hessians, Nxx);
+}
+
+void call_legacy_get_gnn(const int insd,
+                         consts::ElementType eType,
+                         const int eNoN,
+                         const int g,
+                         Array<double>& xi,
+                         Array<double>& N,
+                         Array3<double>& Nx,
+                         const std::string& basis_failure = "")
+{
+  try {
+    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
+  } catch (const std::bad_function_call&) {
+    // No callable legacy entry: report it, appending the FE Basis failure text when one was given.
+    const std::string element = solver_element_name(eType);
+    const std::string suffix = basis_failure.empty() ? std::string() : " after FE Basis failure: " + basis_failure;
+    throw fe::InvalidElementException(
+        "[get_gnn] No FE Basis or legacy shape support for element " + element +
+            "; legacy fallback was attempted" + suffix,
+        element, __FILE__, __LINE__, __func__);
+  }
+}
+
+void call_legacy_get_gn_nxx(const int insd,
+                            const int ind2,
+                            consts::ElementType eType,
+                            const int eNoN,
+                            const int gaus_pt,
+                            const Array<double>& xi,
+                            Array3<double>& Nxx,
+                            const std::string& basis_failure = "",
+                            const bool allow_missing_legacy_table = false)
+{
+  try {
+    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
+  } catch (const std::bad_function_call&) {
+    // Callers may declare a missing legacy entry acceptable; Nxx is then left untouched.
+    if (allow_missing_legacy_table) {
+      return;
+    }
+
+    const std::string element = solver_element_name(eType);
+    const std::string suffix = basis_failure.empty() ? std::string() : " after FE Basis failure: " + basis_failure;
+    throw fe::InvalidElementException(
+        "[get_gn_nxx] No FE Basis or legacy second-derivative support for element " + element +
+            "; legacy fallback was attempted" + suffix,
+        element, __FILE__, __LINE__, __func__);
+  }
+}
+
+void call_legacy_face_shape_data(const int gaus_pt, faceType& face)
+{
+  const auto entry = set_face_shape_data.find(face.eType);
+  if (entry == set_face_shape_data.end()) {
+    throw fe::InvalidElementException(
+        "[get_gnn(face)] No FE Basis or legacy face shape support",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+  }
+  // The table entry fills the face's shape-function data for this Gauss point.
+  entry->second(gaus_pt, face);
+}
+
+} // namespace
+
 void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<double>& w, Array<double>& xi) 
 {
+  log_basis_mode_once();
+  // Gauss data is looked up in get_element_gauss_int_data, keyed by element type.
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for element etype " + std::to_string(static_cast<int>(eType)) + 
-        " in 'get_element_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'get_element_gauss_int_data'",
+        solver_element_name(eType), __FILE__, __LINE__, __func__);
   }
 }
 
@@ -62,19 +572,27 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
 //
 void get_gip(mshType& mesh)
 {
+  log_basis_mode_once();
+  // Gauss data is looked up in set_element_gauss_int_data, keyed by mesh.eType.
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for mesh etype " + std::to_string(static_cast<int>(mesh.eType)) + " in 'set_element_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'set_element_gauss_int_data'",
+        solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
   }
 }
 
 void get_gip(Simulation* simulation, faceType& face)
 {
+  log_basis_mode_once();
+  // Gauss data is looked up in set_face_gauss_int_data, keyed by face.eType.
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for face type " + std::to_string(static_cast<int>(face.eType)) + " in 'set_face_gauss_int_data'.");
+    throw fe::InvalidElementException(
+        "No support in 'set_face_gauss_int_data'",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
   }
 }
 
@@ -83,11 +601,26 @@ void get_gip(Simulation* simulation, faceType& face)
 void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const int g, Array<double>& xi, 
     Array<double>& N, Array3<double>& Nx)
 {
-  try {
-    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gnn] No support for element type " + std::to_string(static_cast<int>(eType)) + " in 'get_element_shape_data'.");
+  log_basis_mode_once();
+
+  if (use_basis_adapter_for(eType)) {  // adapter path: values and gradients come from FE Basis
+    try {
+      evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
+      return;
+    } catch (const fe::NotImplementedException& exception) {  // not implemented in FE Basis: use the legacy path, forwarding the reason text
+      call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx, exception.what());
+      return;
+    } catch (const std::exception& exception) {  // any other adapter failure is rethrown wrapped; no legacy fallback is attempted
+      throw febasis::BasisEvaluationException(
+          "[get_gnn] FE Basis adapter failed for element " +
+              solver_element_name(eType) +
+              "; legacy fallback was not attempted for this approved element: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
   }
+
+  call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx);  // reached only when the adapter is not used for eType
 }
 
 /// @brief A big fat hack because the Fortran GETNN() operates on primitive types but
@@ -111,20 +644,48 @@ void get_gnn(const int nsd, consts::ElementType eType, const int eNoN, Vector<do
 
 void get_gnn(int gaus_pt, mshType& mesh)
 {
-  try {
-    set_element_shape_data[mesh.eType](gaus_pt, mesh);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gnn] No support for element type " + std::to_string(static_cast<int>(mesh.eType)) + " in 'set_element_shape_data'.");
-  }
+  nn::get_gnn(mesh.xi.nrows(), mesh.eType, mesh.eNoN, gaus_pt, mesh.xi, mesh.N, mesh.Nx);  // forwards to the generic overload; insd is the row count of mesh.xi
 }
 
 void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
-  try {
-    set_face_shape_data[face.eType](gaus_pt, face);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for face type " + std::to_string(static_cast<int>(face.eType)) + " in 'set_face_shape_data'.");
+  using consts::ElementType;
+
+  log_basis_mode_once();
+
+  if (active_basis_mode() == BasisMode::Legacy) {  // legacy mode bypasses every FE Basis check below, including the NRB rejection
+    call_legacy_face_shape_data(gaus_pt, face);
+    return;
+  }
+
+  if (face.eType == ElementType::NRB) {
+    throw fe::NotImplementedException(
+        "[get_gnn(face)] NRB face shape functions remain unsupported by FE Basis and the legacy face table",
+        __FILE__, __LINE__, __func__);
+  }
+
+  if (supports_face_basis_adapter_for(face.eType)) {
+    try {
+      // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
+      evaluate_face_basis_values_and_gradients(gaus_pt, face);
+      return;
+    } catch (const std::exception& exception) {  // every adapter failure is rethrown wrapped; no legacy fallback on this path
+      throw febasis::BasisEvaluationException(
+          "[get_gnn(face)] FE Basis face adapter failed for mapped face element " +
+              solver_element_name(face.eType) + "; legacy fallback was not attempted: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
   }
+
+  if (face.eType == ElementType::PNT) {  // NOTE(review): this branch and the fall-through below make the same call
+    // Point faces have no mapped FE Basis representation in this pass; keep the legacy scalar value path.
+    call_legacy_face_shape_data(gaus_pt, face);
+    return;
+  }
+
+  // The legacy face table is retained only for explicitly unsupported paths and future cleanup.
+  call_legacy_face_shape_data(gaus_pt, face);
 }
 
 /// @brief Returns second order derivatives at given natural coords
@@ -136,19 +697,40 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
 {
   using namespace consts;
 
-  // Element types that don't have 2nd derivatives computed for them.
-  static std::set<ElementType> no_derivs{ElementType::NRB, ElementType::QUD4, ElementType::HEX8, 
-                                         ElementType::HEX20, ElementType::HEX27};
+  log_basis_mode_once();
 
-  if (no_derivs.count(eType) != 0) {
+  // NRB/PNT and face-only Hessian paths remain intentionally unsupported here.
+  if (eType == ElementType::NRB || eType == ElementType::PNT) {
     return;
   }
 
-  try {
-    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
-  } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("[get_gn_nxx] No support for element type " + std::to_string(static_cast<int>(eType)) + " in 'get_element_2nd_derivs'.");
+  if (active_basis_mode() == BasisMode::Legacy) {
+    call_legacy_get_gn_nxx(
+        insd, ind2, eType, eNoN, gaus_pt, xi, Nxx, "", true);
+    return;
   }
+
+  if (supports_basis_hessian_adapter_for(eType)) {
+    try {
+      evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
+      return;
+    } catch (const fe::NotImplementedException& exception) {
+      throw fe::NotImplementedException(
+          "[get_gn_nxx] FE Basis Hessian support is required for mapped volume element " +
+              solver_element_name(eType) + " but is not implemented: " + exception.what(),
+          __FILE__, __LINE__, __func__);
+    } catch (const std::exception& exception) {
+      throw febasis::BasisEvaluationException(
+          "[get_gn_nxx] FE Basis Hessian adapter failed for element " +
+              solver_element_name(eType) +
+              "; legacy fallback was not attempted for this approved element: " +
+              exception.what(),
+          __FILE__, __LINE__, __func__);
+    }
+  }
+
+  // Legacy Hessian tables are reserved for intentionally unsupported families.
+  call_legacy_get_gn_nxx(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
 }
 
 /// @brief Sets bounds on Gauss integration points in parametric space and
@@ -333,7 +915,9 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
   l1 = (l1 && l2 && l3 && l4);
 
   if (!l1) {
-    throw std::runtime_error("Error in computing shape functions");
+    throw fe::InvalidArgumentException(
+        "Error in computing shape functions",
+        __FILE__, __LINE__, __func__);
   }
 }
 
@@ -582,8 +1166,11 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      throw std::runtime_error("[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " + std::to_string(Ac) + 
-          " could not be matched to a node in the '" + msh.name + "' volume mesh.");
+      throw fe::InvalidArgumentException(
+          "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
+              std::to_string(Ac) + " could not be matched to a node in the '" +
+              msh.name + "' volume mesh.",
+          __FILE__, __LINE__, __func__);
     }
 
     ptr(a) = b;
@@ -632,7 +1219,9 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          throw std::runtime_error("gnnb: invalid MechanicalConfigurationType provided");
+          throw fe::InvalidArgumentException(
+              "gnnb: invalid MechanicalConfigurationType provided",
+              __FILE__, __LINE__, __func__);
       }
     }
   }
@@ -821,7 +1410,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
     if (INFO != 0) {
-      throw std::runtime_error("[gn_nxx] Error in Lapack");
+      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
+          __FILE__, __LINE__, __func__);
     }
 
     Nxx = B;
@@ -892,7 +1482,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
     if (INFO != 0) {
-      throw std::runtime_error("[gn_nxx] Error in Lapack");
+      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
+          __FILE__, __LINE__, __func__);
     }
 
     Nxx = B;
@@ -940,8 +1531,10 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      throw std::runtime_error("[select_ele] No support for " + std::to_string(mesh.eNoN) + " noded " + 
-          std::to_string(insd) + "D elements.");
+      throw fe::InvalidElementException(
+          "[select_ele] No support for " + std::to_string(mesh.eNoN) +
+              " noded " + std::to_string(insd) + "D elements.",
+          solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
   }
 
   // Set mesh 'w' and 'xi' arrays used for Gauss integration.
@@ -997,8 +1590,10 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    throw std::runtime_error("No support for " + std::to_string(face.eNoN) + " noded " +
-      std::to_string(insd) + "D elements in 'set_face_element_props'.");
+    throw fe::InvalidElementException(
+        "No support for " + std::to_string(face.eNoN) + " noded " +
+            std::to_string(insd) + "D elements in 'set_face_element_props'.",
+        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
   }
 
   // Set face 'w' and 'xi' arrays used for Gauss integration.
@@ -1015,4 +1610,3 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
 }
 
 };
-
diff --git a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
new file mode 100644
index 000000000..216fd0401
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
@@ -0,0 +1,256 @@
+/**
+ * @file test_BasisCacheFactory.cpp
+ * @brief Tests for the migrated Basis cache and factory subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisCache.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/SerendipityBasis.h"
+#include "FE/Quadrature/QuadratureRule.h"
+
+#include <memory>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+using namespace svmp::FE::quadrature;
+
+namespace {
+
+class CustomQuadratureRule final : public QuadratureRule {  // test helper: a rule whose points and weights are given by the caller
+public:
+    CustomQuadratureRule(svmp::CellFamily family,
+                         int dimension,
+                         int order,
+                         std::vector<QuadPoint> points,
+                         std::vector<Real> weights)
+        : QuadratureRule(family, dimension, order)
+    {
+        set_data(std::move(points), std::move(weights));  // hands both containers to QuadratureRule::set_data
+    }
+};
+
+CustomQuadratureRule line_rule() {  // two-point rule on the Line family, declared with dimension 1 and order 3
+    return CustomQuadratureRule(
+        svmp::CellFamily::Line, 1, 3,
+        {
+            QuadPoint{Real(-0.5), Real(0), Real(0)},
+            QuadPoint{Real(0.5), Real(0), Real(0)}
+        },
+        {Real(1), Real(1)});  // one weight per point
+}
+
+CustomQuadratureRule quad_rule(Real first_weight = Real(1)) {  // three-point Quad rule; only the first weight is configurable
+    return CustomQuadratureRule(
+        svmp::CellFamily::Quad, 2, 3,
+        {
+            QuadPoint{Real(-0.5), Real(-0.5), Real(0)},
+            QuadPoint{Real(0.5), Real(-0.25), Real(0)},
+            QuadPoint{Real(0.0), Real(0.5), Real(0)}
+        },
+        {first_weight, Real(1), Real(2)});  // the coordinates above are fixed, so two calls differ at most in this first weight
+}
+
+class TestCustomScalarBasis final : public BasisFunction {  // two-function custom basis on Line2 whose values depend on an integer tag
+public:
+    explicit TestCustomScalarBasis(int tag)
+        : tag_(tag)
+    {
+    }
+
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    std::string cache_identity() const override {  // appends the tag so instances with different tags report different identities
+        return BasisFunction::cache_identity() + "|tag=" + std::to_string(tag_);
+    }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(2u);
+        const Real shift = Real(tag_) * Real(0.125);  // tag-dependent offset, added to one value and subtracted from the other
+        values[0] = Real(0.5) * (Real(1) - xi[0]) + shift;
+        values[1] = Real(0.5) * (Real(1) + xi[0]) - shift;
+    }
+
+    void evaluate_gradients(const math::Vector<Real, 3>&,
+                            std::vector<Gradient>& gradients) const override
+    {
+        gradients.assign(2u, Gradient{});  // only component 0 is set below; it does not depend on the point or the tag
+        gradients[0][0] = Real(-0.5);
+        gradients[1][0] = Real(0.5);
+    }
+
+private:
+    int tag_{0};
+};
+
+class StructuredIdentityScalarBasis final : public BasisFunction {  // custom basis that supplies identity words and counts string-identity calls
+public:
+    explicit StructuredIdentityScalarBasis(int tag)
+        : tag_(tag)
+    {
+    }
+
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    bool cache_identity_words(std::vector<std::uint64_t>& words) const override {  // appends a fixed marker word followed by the tag
+        words.push_back(0x7374727563746964ULL);
+        words.push_back(static_cast<std::uint64_t>(tag_));
+        return true;
+    }
+
+    std::string cache_identity() const override {
+        ++string_identity_calls;  // lets a test observe whether this string path was used
+        return BasisFunction::cache_identity() + "|structured-tag=" + std::to_string(tag_);
+    }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(2u);
+        values[0] = Real(1) - xi[0] + Real(tag_);  // tag shifts the two values in opposite directions
+        values[1] = xi[0] - Real(tag_);
+    }
+
+    mutable std::size_t string_identity_calls{0};  // mutable because it is incremented inside a const member function
+
+private:
+    int tag_{0};
+};
+
+} // namespace
+
+TEST(BasisFactory, CreatesLagrangeAndSerendipityBases) {  // valid requests yield non-null bases that report what was requested
+    auto lagrange = basis_factory::create(
+        BasisRequest{ElementType::Line2, BasisType::Lagrange, 2});
+    ASSERT_NE(lagrange, nullptr);  // ASSERT so the dereferences below are never reached with a null pointer
+    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
+    EXPECT_EQ(lagrange->element_type(), ElementType::Line2);
+    EXPECT_EQ(lagrange->order(), 2);
+
+    auto serendipity = basis_factory::create(
+        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+    EXPECT_EQ(serendipity->element_type(), ElementType::Quad8);
+    EXPECT_EQ(serendipity->size(), 8u);
+}
+
+TEST(BasisFactory, RejectsOutOfScopeAndInvalidRequests) {  // each request below must raise BasisConfigurationException
+    EXPECT_THROW(  // request that gives no order
+        (void)basis_factory::create(BasisRequest{ElementType::Line2, BasisType::Lagrange}),
+        BasisConfigurationException);
+    EXPECT_THROW(  // negative order
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
+        BasisConfigurationException);
+    EXPECT_THROW(  // Bernstein basis type
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
+        BasisConfigurationException);
+    EXPECT_THROW(  // Lagrange request combined with H_div continuity and a vector field
+        (void)basis_factory::create(
+            BasisRequest{ElementType::Line2,
+                         BasisType::Lagrange,
+                         1,
+                         Continuity::H_div,
+                         FieldType::Vector}),
+        BasisConfigurationException);
+}
+
+TEST(BasisFactory, SupportsCustomFactoryRegistration) {  // register, create through, then unregister a custom factory
+    basis_factory::clear_custom_registry_for_tests();
+    basis_factory::register_custom(
+        "test-custom",
+        [](const BasisRequest& req) {
+            const int tag = req.order.value_or(0);  // the requested order is reused as the test basis tag
+            return std::make_shared<TestCustomScalarBasis>(tag);
+        });
+
+    BasisRequest req{ElementType::Line2, BasisType::Custom, 7};
+    req.custom_id = "test-custom";  // same id string as passed to register_custom above
+    auto custom = basis_factory::create(req);
+    ASSERT_NE(custom, nullptr);
+    EXPECT_EQ(custom->basis_type(), BasisType::Custom);
+    EXPECT_EQ(custom->size(), 2u);
+
+    basis_factory::unregister_custom("test-custom");
+    EXPECT_THROW((void)basis_factory::create(req), BasisConfigurationException);  // the same request must fail once unregistered
+    basis_factory::clear_custom_registry_for_tests();
+}
+
+TEST(BasisCache, ReusesEntriesForSameBasisAndQuadratureCoordinates) {  // two identical lookups must return the same entry
+    LagrangeBasis basis(ElementType::Line2, 2);
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();  // the cache.size() expectation below relies on starting empty
+    const auto& entry1 = cache.get_or_compute(basis, quad, true, true);
+    const auto& entry2 = cache.get_or_compute(basis, quad, true, true);
+
+    EXPECT_EQ(&entry1, &entry2);  // address comparison: both references name one stored entry
+    EXPECT_EQ(entry1.num_qpts, quad.num_points());
+    EXPECT_EQ(entry1.num_dofs, basis.size());
+    ASSERT_EQ(entry1.scalar_values.size(), basis.size() * quad.num_points());
+    ASSERT_EQ(entry1.gradients.size(), basis.size() * 3u * quad.num_points());  // 3 components per dof per point
+    ASSERT_EQ(entry1.hessians.size(), basis.size() * 9u * quad.num_points());  // 9 components per dof per point
+    EXPECT_EQ(cache.size(), 1u);
+}
+
+TEST(BasisCache, ReusesCoordinateIdenticalQuadratureRulesIgnoringWeights) {  // rules differing only in a weight must share one entry
+    SerendipityBasis basis(ElementType::Quad8, 2);
+    const auto quad_a = quad_rule(Real(1));
+    const auto quad_b = quad_rule(Real(0.25));  // same points as quad_a, different first weight
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(basis, quad_a, true, false);
+    const auto& entry_b = cache.get_or_compute(basis, quad_b, true, false);
+
+    EXPECT_EQ(&entry_a, &entry_b);  // address comparison: both references name one stored entry
+    EXPECT_EQ(cache.size(), 1u);
+}
+
+TEST(BasisCache, SeparatesStringIdentityCustomBases) {  // bases differing only in their tag must get distinct entries
+    TestCustomScalarBasis custom_a(1);
+    TestCustomScalarBasis custom_b(2);  // the tag is part of this class's cache_identity() string
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
+    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
+
+    EXPECT_NE(&entry_a, &entry_b);
+    EXPECT_NE(entry_a.scalar_values, entry_b.scalar_values);  // the tag shifts the values, so the stored data must differ too
+    EXPECT_EQ(cache.size(), 2u);
+}
+
+TEST(BasisCache, StructuredIdentityAvoidsStringFallbackAndSeparatesBases) {  // identity words alone must distinguish the two bases
+    StructuredIdentityScalarBasis custom_a(1);
+    StructuredIdentityScalarBasis custom_b(2);
+    const auto quad = line_rule();
+
+    auto& cache = BasisCache::instance();
+    cache.clear();
+    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
+    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
+
+    EXPECT_NE(&entry_a, &entry_b);
+    EXPECT_EQ(custom_a.string_identity_calls, 0u);  // cache_identity() increments this counter, so zero means it was never called
+    EXPECT_EQ(custom_b.string_identity_calls, 0u);
+    EXPECT_EQ(cache.size(), 2u);
+}
+
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
new file mode 100644
index 000000000..967f078aa
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -0,0 +1,203 @@
+/**
+ * @file test_BasisErrorPaths.cpp
+ * @brief Error-path coverage for the migrated Lagrange-focused Basis subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/BasisFunction.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+class MinimalScalarBasis : public BasisFunction {  // overrides evaluate_values only; derivative calls use whatever BasisFunction provides
+public:
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Line2; }
+    int dimension() const noexcept override { return 1; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    void evaluate_values(const math::Vector<Real, 3>&,
+                         std::vector<Real>& values) const override
+    {
+        values.assign(size(), Real(0));  // every value is zero regardless of the point
+    }
+};
+
+class CompleteFallbackBasis : public BasisFunction {  // overrides only the vector-returning evaluators for values, gradients, Hessians
+public:
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Triangle3; }
+    int dimension() const noexcept override { return 2; }
+    int order() const noexcept override { return 1; }
+    std::size_t size() const noexcept override { return 2u; }
+
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        values.resize(size());
+        values[0] = Real(1) + xi[0];  // each value depends on a different coordinate
+        values[1] = Real(2) + xi[1];
+    }
+
+    void evaluate_gradients(const math::Vector<Real, 3>&,
+                            std::vector<Gradient>& gradients) const override
+    {
+        gradients.assign(size(), Gradient{});
+        gradients[0][0] = Real(1);  // matches d/dxi[0] of values[0]
+        gradients[1][1] = Real(1);  // matches d/dxi[1] of values[1]
+    }
+
+    void evaluate_hessians(const math::Vector<Real, 3>& xi,
+                           std::vector<Hessian>& hessians) const override
+    {
+        hessians.assign(size(), Hessian{});
+        for (std::size_t d = 0; d < hessians.size(); ++d) {
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    hessians[d](r, c) = Real(100) * static_cast<Real>(d + 1u) +  // synthetic entries, not derivatives of the values above
+                                        Real(10) * static_cast<Real>(r) +  // each (d, r, c) gets a distinct number
+                                        static_cast<Real>(c) + xi[2];  // xi[2] makes the entries depend on the evaluation point
+                }
+            }
+        }
+    }
+};
+
+} // namespace
+
+TEST(BasisErrorPaths, LagrangeInvalidRequestsThrowBasisExceptions) {  // constructor failures must use the Basis exception types
+    EXPECT_THROW(LagrangeBasis(ElementType::Unknown, 1),  // unknown element type
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(LagrangeBasis(ElementType::Line2, -1),  // negative order
+                 BasisConfigurationException);
+    EXPECT_THROW(LagrangeBasis(ElementType::Quad8, 2),  // Quad8 is expected to be rejected by LagrangeBasis
+                 BasisElementCompatibilityException);
+}
+
+TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {  // constructor failures must use the Basis exception types
+    EXPECT_THROW(SerendipityBasis(ElementType::Unknown, 2),  // unknown element type
+                 BasisElementCompatibilityException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3),  // order 3 on Quad8 is expected to be a configuration error
+                 BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2),  // Pyramid14 is expected to be rejected by SerendipityBasis
+                 BasisElementCompatibilityException);
+}
+
+TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {  // invalid factory requests throw; a valid one still works
+    EXPECT_THROW((void)basis_factory::create(  // request that gives no order
+                     BasisRequest{ElementType::Line2, BasisType::Lagrange}),
+                 BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(  // negative order
+                     BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
+                 BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(  // Bernstein basis type
+                     BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
+                 BasisConfigurationException);
+
+    auto serendipity = basis_factory::create(  // control case: this request is expected to succeed
+        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+}
+
+TEST(BasisErrorPaths, BasisExceptionsUseCommonStatusCodes) {  // Basis exceptions must be catchable as FEException with a fixed status
+    try {
+        throw BasisConfigurationException("invalid config", __FILE__, __LINE__, __func__);
+    } catch (const FEException& e) {  // catching by the FEException type also checks the exception is convertible to it
+        EXPECT_EQ(e.status(), svmp::StatusCode::InvalidArgument);
+    }
+
+    try {
+        throw BasisConstructionException("construction failure", __FILE__, __LINE__, __func__);
+    } catch (const FEException& e) {
+        EXPECT_EQ(e.status(), svmp::StatusCode::InternalError);
+    }
+}
+
+TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {  // both lookups below must raise BasisNodeOrderingException
+    EXPECT_THROW((void)ReferenceNodeLayout::get_node_coords(ElementType::Quad8, 99u),  // node index 99 on Quad8
+                 BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),  // Lagrange node lookup on Quad8
+                 BasisNodeOrderingException);
+}
+
+TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {  // non-overridden derivative evaluators must throw
+    MinimalScalarBasis basis;  // this class overrides evaluate_values only
+    const math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+
+    EXPECT_THROW(basis.evaluate_gradients(xi, gradients), BasisEvaluationException);
+    EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
+}
+
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {  // raw-pointer outputs must match evaluate_all exactly
+    CompleteFallbackBasis basis;  // overrides only the vector-returning evaluators, not the *_to or strided entry points
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.25), Real(0.5), Real(-0.25)},
+        {Real(-0.5), Real(0.75), Real(0.125)}
+    };
+    prewarm_basis_function_scratch(basis.size(), points.size());
+
+    std::vector<Real> flat_values(basis.size());
+    std::vector<Real> flat_gradients(basis.size() * 3u);  // 3 components per dof
+    std::vector<Real> flat_hessians(basis.size() * 9u);  // 9 components per dof
+    basis.evaluate_values_to(points.front(), flat_values.data());
+    basis.evaluate_gradients_to(points.front(), flat_gradients.data());
+    basis.evaluate_hessians_to(points.front(), flat_hessians.data());
+
+    std::vector<Real> expected_values;
+    std::vector<Gradient> expected_gradients;
+    std::vector<Hessian> expected_hessians;
+    basis.evaluate_all(points.front(), expected_values, expected_gradients, expected_hessians);  // reference result for the first point
+    for (std::size_t d = 0; d < basis.size(); ++d) {
+        EXPECT_EQ(flat_values[d], expected_values[d]);
+        for (std::size_t c = 0; c < 3u; ++c) {
+            EXPECT_EQ(flat_gradients[d * 3u + c], expected_gradients[d][c]);  // flat layout: dof-major, then component
+        }
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_EQ(flat_hessians[d * 9u + r * 3u + c], expected_hessians[d](r, c));  // flat layout: dof, then row, then column
+            }
+        }
+    }
+
+    constexpr std::size_t output_stride = 3u;  // one more than the number of points, leaving a padding slot per row
+    std::vector<Real> values(basis.size() * output_stride, Real(-99));  // -99 is a sentinel for "never written"
+    std::vector<Real> gradients(basis.size() * 3u * output_stride, Real(-99));
+    std::vector<Real> hessians(basis.size() * 9u * output_stride, Real(-99));
+    basis.evaluate_at_quadrature_points_strided(
+        points, output_stride, values.data(), gradients.data(), hessians.data());
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        basis.evaluate_all(points[q], expected_values, expected_gradients, expected_hessians);
+        for (std::size_t d = 0; d < basis.size(); ++d) {
+            EXPECT_EQ(values[d * output_stride + q], expected_values[d]);  // strided layout: point index q is the fastest index
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_EQ(gradients[(d * 3u + c) * output_stride + q],
+                          expected_gradients[d][c]);
+            }
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_EQ(hessians[(d * 9u + r * 3u + c) * output_stride + q],
+                              expected_hessians[d](r, c));
+                }
+            }
+        }
+    }
+
+    for (std::size_t d = 0; d < basis.size(); ++d) {
+        EXPECT_EQ(values[d * output_stride + 2u], Real(-99));  // padding slot past the two points must keep the sentinel
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
new file mode 100644
index 000000000..0899ce358
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -0,0 +1,314 @@
+/**
+ * @file test_BasisHessians.cpp
+ * @brief Analytical Hessian coverage for the migrated Lagrange and Serendipity bases.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <array>
+#include <limits>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+// Central-difference reference: H[n](i, j) = (d_i N_n(xi + eps*e_j) - d_i N_n(xi - eps*e_j)) / (2*eps).
+void numerical_hessian_helper(const BasisFunction& basis,
+                              const math::Vector<Real, 3>& xi,
+                              std::vector<Hessian>& hessians,
+                              Real eps = Real(1e-5))
+{
+    hessians.assign(basis.size(), Hessian{});
+    const int dim = basis.dimension();
+
+    // The perturbed gradients depend only on the axis j, so evaluate them once per
+    // axis and fill the whole column j instead of re-evaluating them for every row i.
+    for (int j = 0; j < dim; ++j) {
+        const std::size_t sj = static_cast<std::size_t>(j);
+        math::Vector<Real, 3> xi_p = xi;
+        math::Vector<Real, 3> xi_m = xi;
+        xi_p[sj] += eps;
+        xi_m[sj] -= eps;
+        std::vector<Gradient> g_p;
+        std::vector<Gradient> g_m;
+        basis.evaluate_gradients(xi_p, g_p);
+        basis.evaluate_gradients(xi_m, g_m);
+        for (std::size_t n = 0; n < basis.size(); ++n) {
+            for (std::size_t si = 0; si < static_cast<std::size_t>(dim); ++si) {
+                hessians[n](si, sj) = (g_p[n][si] - g_m[n][si]) / (Real(2) * eps);
+            }
+        }
+    }
+}
+
+std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {  // hard-coded sample points, two per listed element type
+    switch (type) {
+        case ElementType::Line2:
+            return {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}};
+        case ElementType::Triangle3:
+            return {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}};
+        case ElementType::Quad4:
+            return {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+        case ElementType::Tetra4:
+            return {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}};
+        case ElementType::Hex8:
+            return {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+        case ElementType::Wedge6:
+            return {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}};
+        case ElementType::Pyramid5:  // first point sits on the x = y = 0 axis, second is off-axis
+            return {{Real(0.0), Real(0.0), Real(0.2)}, {Real(0.12), Real(-0.08), Real(0.24)}};
+        default:  // any element type not listed above gets a single point at the origin
+            return {{Real(0), Real(0), Real(0)}};
+    }
+}
+
+void expect_hessians_match_numerical(const LagrangeBasis& basis,
+                                     const std::vector<math::Vector<Real, 3>>& points,
+                                     Real tol,
+                                     Real eps = Real(1e-5))  // eps: step forwarded to numerical_hessian_helper
+{  // Compares evaluate_hessians() against the finite-difference reference at every point, entrywise within tol.
+    for (const auto& xi : points) {
+        std::vector<Hessian> analytical;
+        std::vector<Hessian> numerical;
+        basis.evaluate_hessians(xi, analytical);
+        numerical_hessian_helper(basis, xi, numerical, eps);
+        // Fatal size check first: the loops below index both vectors with the same n.
+        ASSERT_EQ(analytical.size(), numerical.size());
+        for (std::size_t n = 0; n < analytical.size(); ++n) {
+            for (int i = 0; i < basis.dimension(); ++i) {  // only the leading dimension() x dimension() block is compared
+                for (int j = 0; j < basis.dimension(); ++j) {
+                    const std::size_t si = static_cast<std::size_t>(i);
+                    const std::size_t sj = static_cast<std::size_t>(j);
+                    EXPECT_NEAR(analytical[n](si, sj), numerical[n](si, sj), tol)
+                        << "basis " << n << ", component (" << i << "," << j
+                        << "), element " << static_cast<int>(basis.element_type())
+                        << ", order " << basis.order();
+                }
+            }
+        }
+    }
+}
+
+void expect_partition_hessian_sum_zero(const LagrangeBasis& basis,
+                                       const math::Vector<Real, 3>& xi,
+                                       Real tol)
+{  // Expects the Hessians of all basis functions at xi to sum to zero within tol.
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+    // Accumulate every 3x3 entry; only the leading dimension() x dimension() block is asserted below.
+    Hessian sum{};
+    for (const auto& hessian : hessians) {
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                sum(r, c) += hessian(r, c);
+            }
+        }
+    }
+    // If the basis functions sum to a constant, every second derivative of that sum must vanish.
+    for (int r = 0; r < basis.dimension(); ++r) {
+        for (int c = 0; c < basis.dimension(); ++c) {
+            EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
+                        Real(0),
+                        tol)
+                << "element " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order();
+        }
+    }
+}
+
+// Checks that every basis Hessian at xi is symmetric (within tol) over the leading
+// dim x dim block, reporting basis index, component, element type and order on failure.
+void expect_hessians_symmetric(const LagrangeBasis& basis, const math::Vector<Real, 3>& xi, Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+    const std::size_t dim = static_cast<std::size_t>(basis.dimension());
+    for (std::size_t n = 0; n < hessians.size(); ++n) {
+        for (std::size_t sr = 0; sr < dim; ++sr) {
+            for (std::size_t sc = sr + 1; sc < dim; ++sc) {
+                EXPECT_NEAR(hessians[n](sr, sc), hessians[n](sc, sr), tol)
+                    << "basis " << n << ", component (" << sr << "," << sc << "), element "
+                    << static_cast<int>(basis.element_type()) << ", order " << basis.order();
+            }
+        }
+    }
+}
+
+void expect_partition_hessian_sum_zero(const BasisFunction& basis,
+                                       const math::Vector<Real, 3>& xi,
+                                       Real tol)
+{  // BasisFunction overload: expects the Hessians of all basis functions at xi to sum to zero within tol.
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+    // Accumulate every 3x3 entry; only the leading dimension() x dimension() block is asserted below.
+    Hessian sum{};
+    for (const auto& hessian : hessians) {
+        for (std::size_t r = 0; r < 3u; ++r) {
+            for (std::size_t c = 0; c < 3u; ++c) {
+                sum(r, c) += hessian(r, c);
+            }
+        }
+    }
+    // If the basis functions sum to a constant, every second derivative of that sum must vanish.
+    for (int r = 0; r < basis.dimension(); ++r) {
+        for (int c = 0; c < basis.dimension(); ++c) {
+            EXPECT_NEAR(sum(static_cast<std::size_t>(r), static_cast<std::size_t>(c)),
+                        Real(0),
+                        tol)
+                << "element " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order();
+        }
+    }
+}
+
+// BasisFunction overload: checks that every basis Hessian at xi is symmetric (within tol) over the
+// leading dim x dim block, reporting basis index, component, element type and order on failure.
+void expect_hessians_symmetric(const BasisFunction& basis, const math::Vector<Real, 3>& xi, Real tol)
+{
+    std::vector<Hessian> hessians;
+    basis.evaluate_hessians(xi, hessians);
+    const std::size_t dim = static_cast<std::size_t>(basis.dimension());
+    for (std::size_t n = 0; n < hessians.size(); ++n) {
+        for (std::size_t sr = 0; sr < dim; ++sr) {
+            for (std::size_t sc = sr + 1; sc < dim; ++sc) {
+                EXPECT_NEAR(hessians[n](sr, sc), hessians[n](sc, sr), tol)
+                    << "basis " << n << ", component (" << sr << "," << sc << "), element "
+                    << static_cast<int>(basis.element_type()) << ", order " << basis.order();
+            }
+        }
+    }
+}
+
+} // namespace
+
+TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;  // absolute tolerance for analytical vs. finite-difference entries
+        Real eps;  // finite-difference step
+    } cases[] = {
+        {ElementType::Line2, 3, Real(1e-7), Real(1e-5)},
+        {ElementType::Triangle3, 3, Real(2e-6), Real(1e-5)},
+        {ElementType::Quad4, 3, Real(1e-6), Real(1e-5)},
+        {ElementType::Tetra4, 2, Real(1e-6), Real(1e-5)},
+        {ElementType::Hex8, 2, Real(1e-6), Real(1e-5)},
+        {ElementType::Wedge6, 2, Real(1e-5), Real(1e-5)},
+        {ElementType::Pyramid5, 1, Real(2e-6), Real(1e-5)},
+        {ElementType::Pyramid5, 3, Real(4e-4), Real(2e-5)},  // loosest tolerance; only case with a step other than 1e-5
+    };
+    // Sample points are chosen per element type by sample_points_for().
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_hessians_match_numerical(basis, sample_points_for(c.type), c.tol, c.eps);
+    }
+}
+
+TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
+    const struct Case {
+        ElementType type;
+        int order;
+        math::Vector<Real, 3> xi;  // single evaluation point for this case
+        Real tol;  // symmetry tolerance; the sum check uses 10x this value
+    } cases[] = {
+        {ElementType::Line2, 3, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle3, 3, {Real(0.2), Real(0.25), Real(0)}, Real(1e-10)},
+        {ElementType::Quad4, 3, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Tetra4, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
+        {ElementType::Hex8, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Wedge6, 2, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-10)},
+        {ElementType::Pyramid5, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-8)},
+    };
+    // The summed-Hessian check gets 10x the per-entry symmetry tolerance.
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_partition_hessian_sum_zero(basis, c.xi, Real(10) * c.tol);
+        expect_hessians_symmetric(basis, c.xi, c.tol);
+    }
+}
+
+TEST(BasisHessians, LagrangePyramidExactApexHessianThrows) {
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+    // At xi = (0, 0, 1) evaluate_hessians() must throw BasisEvaluationException instead of returning values.
+    const math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Hessian> hessians;
+        EXPECT_THROW(basis.evaluate_hessians(apex, hessians), BasisEvaluationException)
+            << "order " << c.order;
+    }
+}
+
+TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
+    const struct Case {
+        ElementType type;
+        int order;
+        math::Vector<Real, 3> xi;  // single evaluation point for this case
+        Real tol;  // used unchanged for both the sum check and the symmetry check
+    } cases[] = {
+        {ElementType::Quad8, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
+        {ElementType::Hex20, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
+        {ElementType::Wedge15, 2, {Real(0.2), Real(0.3), Real(0.1)}, Real(1e-10)},
+        {ElementType::Pyramid13, 2, {Real(0.1), Real(-0.2), Real(0.4)}, Real(1e-8)},
+    };
+    // SerendipityBasis is passed to the BasisFunction overloads of the two helpers.
+    for (const auto& c : cases) {
+        SerendipityBasis basis(c.type, c.order);
+        expect_partition_hessian_sum_zero(basis, c.xi, c.tol);
+        expect_hessians_symmetric(basis, c.xi, c.tol);
+    }
+}
+
+TEST(BasisHessians, SerendipityPyramidExactApexHessianThrows) {  // Pyramid13 counterpart of the Lagrange apex test
+    SerendipityBasis basis(ElementType::Pyramid13, 2);
+    std::vector<Hessian> hessians;
+    EXPECT_THROW(basis.evaluate_hessians({Real(0), Real(0), Real(1)}, hessians),  // xi = (0, 0, 1)
+                 BasisEvaluationException);
+}
+
+TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
+    const struct Case {
+        ElementType type;
+        BasisType basis_type;  // Serendipity for Quad8 and Hex20, Lagrange for every other row
+        int order;
+        math::Vector<Real, 3> xi;
+        Real tol;
+    } cases[] = {
+        {ElementType::Line2, BasisType::Lagrange, 1, {Real(0.15), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Line3, BasisType::Lagrange, 2, {Real(-0.25), Real(0), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle3, BasisType::Lagrange, 1, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
+        {ElementType::Triangle6, BasisType::Lagrange, 2, {Real(0.2), Real(0.25), Real(0)}, Real(1e-12)},
+        {ElementType::Quad4, BasisType::Lagrange, 1, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Quad8, BasisType::Serendipity, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
+        {ElementType::Quad9, BasisType::Lagrange, 2, {Real(0.3), Real(-0.2), Real(0)}, Real(1e-12)},
+        {ElementType::Tetra4, BasisType::Lagrange, 1, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-12)},
+        {ElementType::Tetra10, BasisType::Lagrange, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
+        {ElementType::Hex8, BasisType::Lagrange, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Hex20, BasisType::Serendipity, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
+        {ElementType::Hex27, BasisType::Lagrange, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
+        {ElementType::Wedge6, BasisType::Lagrange, 1, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-12)},
+    };
+    // Each basis is obtained through basis_factory::create() rather than by constructing a concrete class.
+    int covered = 0;
+    for (const auto& c : cases) {
+        auto basis = basis_factory::create(BasisRequest{c.type, c.basis_type, c.order});
+        expect_partition_hessian_sum_zero(*basis, c.xi, c.tol);
+        expect_hessians_symmetric(*basis, c.xi, c.tol);
+        ++covered;
+    }
+    // Pins the number of table rows exercised (13); update it together with the cases[] table.
+    EXPECT_EQ(covered, 13);
+}
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
new file mode 100644
index 000000000..a1031fa76
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -0,0 +1,226 @@
+/**
+ * @file test_ConstexprBasis.cpp
+ * @brief Compile-time and lightweight runtime checks for migrated Basis helpers.
+ */
+
+#include "FE/Basis/BasisTolerance.h"
+#include "FE/Basis/BasisTraits.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/LagrangeBasisFast.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <limits>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace svmp {
+namespace FE {
+namespace basis {
+namespace {
+
+static_assert(is_line(ElementType::Line2));  // the static_asserts below require these helpers to be usable in constant expressions
+static_assert(is_line(ElementType::Line3));
+static_assert(is_triangle(ElementType::Triangle6));
+static_assert(is_quadrilateral(ElementType::Quad8));
+static_assert(is_tetrahedron(ElementType::Tetra10));
+static_assert(is_hexahedron(ElementType::Hex20));
+static_assert(is_wedge(ElementType::Wedge18));
+static_assert(is_pyramid(ElementType::Pyramid14));
+static_assert(is_simplex(ElementType::Triangle3));
+static_assert(is_simplex(ElementType::Tetra4));
+static_assert(!is_simplex(ElementType::Wedge6));
+static_assert(is_tensor_product(ElementType::Line2));
+static_assert(is_tensor_product(ElementType::Quad9));
+static_assert(is_tensor_product(ElementType::Hex27));
+static_assert(!is_tensor_product(ElementType::Pyramid5));
+static_assert(reference_dimension(ElementType::Pyramid14) == 3);
+static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
+static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);  // Pyramid13 maps to itself, unlike Hex27 -> Hex8
+static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
+static_assert(complete_lagrange_alias_order(ElementType::Hex20) == -1);  // NOTE(review): -1 presumably means "not a complete-Lagrange alias" - confirm in BasisTraits.h
+static_assert(line_lagrange_size(2) == 3u);  // order-2 Lagrange sizes: 3, 6, 9, 10, 27, 18, 14
+static_assert(triangle_lagrange_size(2) == 6u);
+static_assert(quad_lagrange_size(2) == 9u);
+static_assert(tetra_lagrange_size(2) == 10u);
+static_assert(hex_lagrange_size(2) == 27u);
+static_assert(wedge_lagrange_size(2) == 18u);
+static_assert(pyramid_lagrange_size(2) == 14u);
+static_assert(detail::basis_abs(Real(-2)) == Real(2));
+static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
+static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));  // 32*eps is treated as zero
+static_assert(detail::basis_nearly_equal(
+    Real(1),
+    Real(1) + std::numeric_limits<Real>::epsilon() * Real(32)));
+
+constexpr auto kLineFastValues = [] {  // LagrangeLineFast<1> values evaluated at compile time at xi = (0, 0, 0)
+    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::array<Real, LagrangeLineFast<1>::n_dofs> values{};
+    LagrangeLineFast<1>::evaluate(xi, values);
+    return values;
+}();
+static_assert(kLineFastValues[0] == Real(0.5));  // both entries must equal 1/2 exactly
+static_assert(kLineFastValues[1] == Real(0.5));
+
+constexpr auto kLineP2FastHessians = [] {  // LagrangeLineFast<2> Hessians evaluated at compile time at xi = (0, 0, 0)
+    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
+    std::array<Hessian, LagrangeLineFast<2>::n_dofs> hessians{};
+    LagrangeLineFast<2>::evaluate_hessians(xi, hessians);
+    return hessians;
+}();
+static_assert(kLineP2FastHessians[0](0, 0) == Real(1));  // the three (0, 0) entries are 1, 1, -2 and sum to zero
+static_assert(kLineP2FastHessians[1](0, 0) == Real(1));
+static_assert(kLineP2FastHessians[2](0, 0) == Real(-2));
+
+constexpr auto kTriP2FastValues = [] {  // LagrangeTriFast<2> values evaluated at compile time at xi = (0.25, 0.25, 0)
+    math::Vector<Real, 3> xi{Real(0.25), Real(0.25), Real(0)};
+    std::array<Real, LagrangeTriFast<2>::n_dofs> values{};
+    LagrangeTriFast<2>::evaluate(xi, values);
+    return values;
+}();
+static_assert(kTriP2FastValues[0] == Real(0));  // only entries 0, 3 and 4 of the n_dofs values are pinned
+static_assert(kTriP2FastValues[3] == Real(0.5));
+static_assert(kTriP2FastValues[4] == Real(0.25));
+
+template<typename Basis>  // true when Basis's strided-evaluation member pointer differs in type from BasisFunction's
+constexpr bool overrides_scalar_strided_v =
+    !std::is_same_v<decltype(&Basis::evaluate_at_quadrature_points_strided),  // an inherited member keeps the base-class pointer type
+                    decltype(&BasisFunction::evaluate_at_quadrature_points_strided)>;
+
+template<typename FastBasis>  // FastBasis: static evaluator exposing n_dofs, evaluate, evaluate_gradients, evaluate_hessians
+void expect_fast_matches_lagrange(ElementType type,
+                                  int order,
+                                  const std::vector<math::Vector<Real, 3>>& points)
+{
+    LagrangeBasis basis(type, order);
+    for (const auto& xi : points) {
+        std::vector<Real> expected_values;
+        std::vector<Gradient> expected_gradients;
+        std::vector<Hessian> expected_hessians;
+        basis.evaluate_all(xi, expected_values, expected_gradients, expected_hessians);
+        // Same quantities from the compile-time-sized FastBasis, written into std::array outputs.
+        std::array<Real, FastBasis::n_dofs> values{};
+        std::array<Gradient, FastBasis::n_dofs> gradients{};
+        std::array<Hessian, FastBasis::n_dofs> hessians{};
+        FastBasis::evaluate(xi, values);
+        FastBasis::evaluate_gradients(xi, gradients);
+        FastBasis::evaluate_hessians(xi, hessians);
+        // All 3 gradient components and all 3x3 Hessian entries are compared, regardless of element dimension.
+        ASSERT_EQ(expected_values.size(), values.size());
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_NEAR(values[i], expected_values[i], Real(1e-14));
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(gradients[i][d], expected_gradients[i][d], Real(1e-14));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(hessians[i](d, e), expected_hessians[i](d, e), Real(1e-14));
+                }
+            }
+        }
+    }
+}
+
+TEST(ConstexprBasis, FixedNodeTableSizes) {
+    const std::vector<std::pair<ElementType, std::size_t>> expected = {  // (element type, expected node count)
+        {ElementType::Line2, 2u},
+        {ElementType::Line3, 3u},
+        {ElementType::Triangle3, 3u},
+        {ElementType::Triangle6, 6u},
+        {ElementType::Quad4, 4u},
+        {ElementType::Quad8, 8u},
+        {ElementType::Quad9, 9u},
+        {ElementType::Tetra4, 4u},
+        {ElementType::Tetra10, 10u},
+        {ElementType::Hex8, 8u},
+        {ElementType::Hex20, 20u},
+        {ElementType::Hex27, 27u},
+        {ElementType::Wedge6, 6u},
+        {ElementType::Wedge15, 15u},
+        {ElementType::Wedge18, 18u},
+        {ElementType::Pyramid5, 5u},
+        {ElementType::Pyramid13, 13u},
+        {ElementType::Pyramid14, 14u},
+    };
+    // num_nodes() must report the node count that appears in each element type's name.
+    for (const auto& [type, size] : expected) {
+        EXPECT_EQ(ReferenceNodeLayout::num_nodes(type), size);
+    }
+}
+
+TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {  // brackets the basis tolerance between 32*eps and 128*eps
+    const Real eps = std::numeric_limits<Real>::epsilon();
+    EXPECT_GT(detail::basis_scaled_tolerance(), eps);
+    EXPECT_TRUE(detail::basis_near_zero(eps * Real(32)));  // 32*eps counts as zero ...
+    EXPECT_FALSE(detail::basis_near_zero(eps * Real(128)));  // ... while 128*eps does not
+    EXPECT_TRUE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(32)));
+    EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(128)));
+}
+
+TEST(ConstexprBasis, LagrangeOverridesStridedEvaluation) {  // LagrangeBasis must declare its own evaluate_at_quadrature_points_strided
+    EXPECT_TRUE(overrides_scalar_strided_v<LagrangeBasis>);
+}
+
+TEST(ConstexprBasis, FastSidecarsMatchRuntimeLagrangeBasis) {  // each Fast evaluator vs. LagrangeBasis(type, order) at two points
+    expect_fast_matches_lagrange<LagrangeLineFast<1>>(
+        ElementType::Line2, 1,
+        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeLineFast<2>>(
+        ElementType::Line2, 2,  // order-2 cases pass the linear element type plus order 2
+        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeQuadFast<1>>(
+        ElementType::Quad4, 1,
+        {{Real(-0.2), Real(0.3), Real(0)}, {Real(0.35), Real(-0.45), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeHexFast<1>>(
+        ElementType::Hex8, 1,
+        {{Real(-0.2), Real(0.3), Real(0.1)}, {Real(0.35), Real(-0.45), Real(0.25)}});
+    expect_fast_matches_lagrange<LagrangeTriFast<1>>(
+        ElementType::Triangle3, 1,
+        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeTriFast<2>>(
+        ElementType::Triangle3, 2,
+        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
+    expect_fast_matches_lagrange<LagrangeTetFast<1>>(
+        ElementType::Tetra4, 1,
+        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
+    expect_fast_matches_lagrange<LagrangeTetFast<2>>(
+        ElementType::Tetra4, 2,
+        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
+}
+
+TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {  // (alias type, canonical type, order)
+        {ElementType::Line2, ElementType::Line2, 1},
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle3, ElementType::Triangle3, 1},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad4, ElementType::Quad4, 1},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra4, ElementType::Tetra4, 1},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex8, ElementType::Hex8, 1},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge6, ElementType::Wedge6, 1},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
+        {ElementType::Pyramid5, ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
+    };
+    // Exact (==) comparison: the per-alias node table and the generated nodes must agree bit-for-bit.
+    for (const auto& [alias, canonical_type, order] : aliases) {
+        const auto nodes = ReferenceNodeLayout::get_lagrange_node_coords(canonical_type, order);
+        ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(alias));
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            const auto direct = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_EQ(nodes[i][0], direct[0]);
+            EXPECT_EQ(nodes[i][1], direct[1]);
+            EXPECT_EQ(nodes[i][2], direct[2]);
+        }
+    }
+}
+
+} // namespace
+} // namespace basis
+} // namespace FE
+} // namespace svmp
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
new file mode 100644
index 000000000..26efc4070
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
@@ -0,0 +1,173 @@
+/**
+ * @file test_HigherOrderWedgePyramid.cpp
+ * @brief Focused higher-order wedge and pyramid checks for LagrangeBasis.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+
+#include <cmath>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+void expect_nodes_close(const std::vector<math::Vector<Real, 3>>& lhs,
+                        const std::vector<math::Vector<Real, 3>>& rhs,
+                        Real tol)
+{  // Fatal size check, then per-coordinate EXPECT_NEAR with the node index in the failure message.
+    ASSERT_EQ(lhs.size(), rhs.size());
+    for (std::size_t i = 0; i < lhs.size(); ++i) {
+        EXPECT_NEAR(lhs[i][0], rhs[i][0], tol) << "node " << i;
+        EXPECT_NEAR(lhs[i][1], rhs[i][1], tol) << "node " << i;
+        EXPECT_NEAR(lhs[i][2], rhs[i][2], tol) << "node " << i;
+    }
+}
+
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)
+{  // Expects basis function i to be 1 at node i and 0 at every other node, within tol.
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());
+    // values is reused across nodes; its size is re-checked after every evaluate_values() call.
+    std::vector<Real> values;
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            const Real expected = (i == node) ? Real(1) : Real(0);
+            EXPECT_NEAR(values[i], expected, tol)
+                << "node " << node << ", basis " << i;
+        }
+    }
+}
+
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
+                                            const std::vector<math::Vector<Real, 3>>& points,
+                                            Real value_tol,
+                                            Real derivative_tol)  // derivative_tol is shared by the gradient and Hessian sums
+{
+    for (const auto& xi : points) {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(xi, values, gradients, hessians);
+        // Sum values, gradients and Hessians over all basis functions (all 3 components / 3x3 entries).
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            value_sum += values[i];
+            for (std::size_t d = 0; d < 3u; ++d) {
+                gradient_sum[d] += gradients[i][d];
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    hessian_sum(d, e) += hessians[i](d, e);
+                }
+            }
+        }
+        // Values must sum to 1; derivative sums are checked only over the first dimension() components.
+        EXPECT_NEAR(value_sum, Real(1), value_tol);
+        for (int d = 0; d < basis.dimension(); ++d) {
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            for (int e = 0; e < basis.dimension(); ++e) {
+                EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),
+                                        static_cast<std::size_t>(e)),
+                            Real(0),
+                            derivative_tol);
+            }
+        }
+    }
+}
+
+void expect_all_entries_finite(const LagrangeBasis& basis,
+                               const math::Vector<Real, 3>& xi)
+{  // Evaluates values, gradients and Hessians at xi and expects std::isfinite for every entry.
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(xi, values, gradients, hessians);
+    // Every value, gradient component and Hessian entry is checked, not only the first dimension() ones.
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        EXPECT_TRUE(std::isfinite(static_cast<double>(values[i]))) << "value " << i;
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_TRUE(std::isfinite(static_cast<double>(gradients[i][d])))
+                << "gradient " << i << ", " << d;
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_TRUE(std::isfinite(static_cast<double>(hessians[i](d, e))))
+                    << "hessian " << i << ", " << d << ", " << e;
+            }
+        }
+    }
+}
+
+} // namespace
+
+TEST(HigherOrderWedgePyramid, CompleteAliasesMatchGeneratedNodeLayouts) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> cases = {  // (alias type, canonical type, order)
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
+        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
+    };
+    // A basis built from the alias type must expose the nodes generated for the canonical type at that order.
+    for (const auto& [alias, canonical, order] : cases) {
+        LagrangeBasis alias_basis(alias, order);
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
+        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
+        // Also compare against the public per-node accessor, get_node_coords(alias, i).
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node " << i;
+            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node " << i;
+            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node " << i;
+        }
+    }
+}
+
+TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
+    LagrangeBasis wedge(ElementType::Wedge6, 3);
+    // Tolerances: 2e-10 for nodal interpolation, 1e-12 for the value sum, 1e-9 for the derivative sums.
+    expect_kronecker_at_nodes(wedge, Real(2e-10));
+    expect_partition_gradient_hessian_sums(
+        wedge,
+        {
+            {Real(0.18), Real(0.22), Real(-0.2)},
+            {Real(0.12), Real(0.16), Real(0.1)},
+            {Real(0.25), Real(0.15), Real(0.45)},
+        },
+        Real(1e-12),
+        Real(1e-9));
+}
+
+TEST(HigherOrderWedgePyramid, PyramidOrderThreeIsNodalAndPartitionsUnity) {
+    LagrangeBasis pyramid(ElementType::Pyramid5, 3);
+    // Looser tolerances than the wedge case: 5e-8 nodal, 1e-11 value sum, 5e-7 derivative sums.
+    expect_kronecker_at_nodes(pyramid, Real(5e-8));
+    expect_partition_gradient_hessian_sums(
+        pyramid,
+        {
+            {Real(0), Real(0), Real(0.2)},
+            {Real(0.12), Real(-0.08), Real(0.24)},
+            {Real(-0.08), Real(0.1), Real(0.55)},
+        },
+        Real(1e-11),
+        Real(5e-7));
+}
+
+TEST(HigherOrderWedgePyramid, PyramidNearApexDerivativeQueriesRemainFinite) {
+    const std::vector<std::pair<ElementType, int>> cases = {  // (element type, order)
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},
+    };
+    // Both query points lie close to (0, 0, 1): z = 0.92 and z = 0.98 with small x, y offsets.
+    for (const auto& [type, order] : cases) {
+        LagrangeBasis basis(type, order);
+        expect_all_entries_finite(basis, {Real(0.01), Real(-0.005), Real(0.92)});
+        expect_all_entries_finite(basis, {Real(-0.004), Real(0.007), Real(0.98)});
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
new file mode 100644
index 000000000..a88d860e9
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -0,0 +1,3028 @@
+/**
+ * @file test_LagrangeBasis.cpp
+ * @brief Unit tests for Lagrange basis functions
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+#include "fs.h"
+#include "nn.h"
+#include <array>
+#include <cmath>
+#include <functional>
+#include <limits>
+#include <map>
+#include <math.h>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace legacy_solver_nn {  // the three headers below are included textually inside this namespace
+using namespace consts;  // presumably lets the included headers use unqualified consts names -- confirm
+#include "nn_elem_gip.h"  // NOTE(review): set_element_gauss_int_data / set_face_gauss_int_data used below presumably come from here
+#include "nn_elem_gnn.h"
+#include "nn_elem_gnnxx.h"
+} // namespace legacy_solver_nn
+
+using svmp::FE::basis::LagrangeBasis;
+using svmp::FE::ElementType;
+using svmp::FE::Real;
+using svmp::FE::basis::Gradient;
+using svmp::FE::basis::Hessian;
+using svmp::FE::basis::ReferenceNodeLayout;
+
+namespace {
+
+using Point = svmp::FE::math::Vector<Real, 3>;
+
+struct SolverBasisAdapterCase {  // one row of the solver-adapter test tables below
+    consts::ElementType type;  // element type handed to nn::get_gnn / nn::get_nn_bnds and stored as eType
+    consts::ElementType quadrature_type;  // element type used to select the quadrature rule (may differ from type)
+    int insd;  // number of reference coordinates; row count of the xi arrays
+    int eNoN;  // number of basis functions per element; row count of N
+    int nG;  // number of quadrature points; column count of xi and N
+};
+
+std::vector<SolverBasisAdapterCase> solver_basis_adapter_cases() {  // case table covering line, surface and volume elements
+    using consts::ElementType;  // the legacy solver enum; shadows the file-level svmp::FE::ElementType here
+    return {  // columns: type, quadrature_type, insd, eNoN, nG
+        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
+        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
+        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::QUD8, ElementType::QUD9, 2, 8, 9},  // only row whose quadrature_type differs from type
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+        {ElementType::TET4, ElementType::TET4, 3, 4, 4},
+        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
+        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
+        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
+        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
+        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
+    };
+}
+
+std::vector<SolverBasisAdapterCase> solver_face_basis_adapter_cases() {  // case table limited to 1D and 2D element types
+    using consts::ElementType;  // the legacy solver enum; shadows the file-level svmp::FE::ElementType here
+    return {  // columns: type, quadrature_type, insd, eNoN, nG
+        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
+        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
+        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::QUD8, ElementType::QUD8, 2, 8, 9},  // here QUD8 keeps QUD8 as quadrature_type (the volume table uses QUD9)
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+    };
+}
+
+std::vector<SolverBasisAdapterCase> solver_hessian_adapter_cases() {  // cases used by the Hessian adapter tests
+    return solver_basis_adapter_cases();  // identical to the general case table; kept as a separate named entry point
+}
+
+std::vector<SolverBasisAdapterCase> solver_legacy_hessian_parity_cases() {  // three-row subset of the general case table
+    using consts::ElementType;  // the legacy solver enum; shadows the file-level svmp::FE::ElementType here
+    return {  // columns: type, quadrature_type, insd, eNoN, nG (same values as the matching rows of solver_basis_adapter_cases)
+        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
+        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
+        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
+    };
+}
+
+// Number of packed second-derivative rows for `insd` reference dimensions:
+// 1 in 1D, 3 in 2D, and 6 for every other value of insd.
+int packed_hessian_components(int insd) {
+    switch (insd) {
+        case 1: return 1;
+        case 2: return 3;
+        default: return 6;
+    }
+}
+
+// Runs the legacy Gauss-rule setter registered for c.quadrature_type on a scratch
+// mesh, then copies that mesh's weights into w and its points into xi.
+void fill_legacy_quadrature(const SolverBasisAdapterCase& c, Vector<double>& w, Array<double>& xi) {
+    mshType scratch;
+    scratch.eType = c.quadrature_type;
+    scratch.eNoN = c.eNoN;
+    scratch.nG = c.nG;
+    scratch.w.resize(c.nG);
+    scratch.xi.resize(c.insd, c.nG);
+    legacy_solver_nn::set_element_gauss_int_data.at(c.quadrature_type)(scratch);
+    w = scratch.w;
+    xi = scratch.xi;
+}
+
+faceType initialized_face_for_case(const SolverBasisAdapterCase& c) {  // face sized for case c, with legacy Gauss data applied
+    faceType fa;
+    fa.eType = c.type;
+    fa.eNoN = c.eNoN;
+    fa.nG = c.nG;
+    fa.w.resize(c.nG);
+    fa.xi.resize(c.insd, c.nG);
+    legacy_solver_nn::set_face_gauss_int_data.at(c.quadrature_type)(fa);  // setter is looked up by quadrature_type, not type
+    fa.N.resize(c.eNoN, c.nG);  // shape tables are only sized here, after the Gauss setter has run
+    fa.Nx.resize(c.insd, c.eNoN, c.nG);
+    return fa;
+}
+
+void expect_arrays_near(const Array<double>& actual,  // array under test
+                        const Array<double>& expected,  // reference values
+                        double tol) {  // absolute tolerance forwarded to EXPECT_NEAR
+    ASSERT_EQ(actual.nrows(), expected.nrows());  // a shape mismatch is fatal: the helper returns before comparing entries
+    ASSERT_EQ(actual.ncols(), expected.ncols());
+    for (int col = 0; col < actual.ncols(); ++col) {
+        for (int row = 0; row < actual.nrows(); ++row) {
+            EXPECT_NEAR(actual(row, col), expected(row, col), tol)  // non-fatal, so every mismatching entry is reported
+                << "row=" << row << ", col=" << col;
+        }
+    }
+}
+
+void expect_vectors_near(const Vector<double>& actual,  // vector under test
+                         const Vector<double>& expected,  // reference values
+                         double tol) {  // absolute tolerance forwarded to EXPECT_NEAR
+    ASSERT_EQ(actual.size(), expected.size());  // a length mismatch is fatal: no element is compared
+    for (int i = 0; i < actual.size(); ++i) {
+        EXPECT_NEAR(actual(i), expected(i), tol) << "index=" << i;  // non-fatal; failures are tagged with the index
+    }
+}
+
+void expect_array3_near(const Array3<double>& actual,  // rank-3 array under test
+                        const Array3<double>& expected,  // reference values
+                        double tol) {  // absolute tolerance forwarded to EXPECT_NEAR
+    ASSERT_EQ(actual.nrows(), expected.nrows());  // any extent mismatch is fatal: no entry is compared
+    ASSERT_EQ(actual.ncols(), expected.ncols());
+    ASSERT_EQ(actual.nslices(), expected.nslices());
+    for (int slice = 0; slice < actual.nslices(); ++slice) {
+        for (int col = 0; col < actual.ncols(); ++col) {
+            for (int row = 0; row < actual.nrows(); ++row) {
+                EXPECT_NEAR(actual(row, col, slice), expected(row, col, slice), tol)  // non-fatal; all mismatches are reported
+                    << "row=" << row << ", col=" << col << ", slice=" << slice;
+            }
+        }
+    }
+}
+
+void fill_array3(Array3<double>& values, double value) {  // overwrite every entry of `values` with `value`
+    for (int k = 0; k < values.nslices(); ++k) {
+        for (int j = 0; j < values.ncols(); ++j) {
+            for (int i = 0; i < values.nrows(); ++i) {
+                values(i, j, k) = value;
+            }
+        }
+    }
+}
+
+void expect_face_partition_identities(const SolverBasisAdapterCase& c,  // supplies eNoN, insd and the element id used in messages
+                                      const faceType& face,  // face whose N and Nx tables are checked
+                                      int g,  // quadrature-point index (last index of N and Nx)
+                                      double tol) {  // absolute tolerance for both sum checks
+    double partition = 0.0;  // running sum of N(a, g) over all nodes
+    std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};  // running sum of Nx(d, a, g) per direction; holds at most 3 directions
+    // Accumulate both sums while checking that every visited entry is finite.
+    for (int a = 0; a < c.eNoN; ++a) {
+        EXPECT_TRUE(std::isfinite(face.N(a, g)))
+            << "element=" << static_cast<int>(c.type)
+            << ", node=" << a
+            << ", g=" << g;
+        partition += face.N(a, g);
+
+        for (int d = 0; d < c.insd; ++d) {
+            EXPECT_TRUE(std::isfinite(face.Nx(d, a, g)))
+                << "element=" << static_cast<int>(c.type)
+                << ", d=" << d
+                << ", node=" << a
+                << ", g=" << g;
+            gradient_sum[static_cast<std::size_t>(d)] += face.Nx(d, a, g);
+        }
+    }
+    // The shape functions must sum to one and each gradient component must sum to zero.
+    EXPECT_NEAR(partition, 1.0, tol)
+        << "element=" << static_cast<int>(c.type) << ", g=" << g;
+    for (int d = 0; d < c.insd; ++d) {
+        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
+            << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
+    }
+}
+
+// True when any entry values(row, col, slice), over all columns and slices of the
+// given row, has magnitude strictly greater than tol.
+bool array3_has_nonzero_component(const Array3<double>& values, int row, double tol) {
+    for (int k = 0; k < values.nslices(); ++k) {
+        for (int j = 0; j < values.ncols(); ++j) {
+            if (std::abs(values(row, j, k)) > tol) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// Copies the first c.insd rows of column g of xi into a new c.insd x 1 array, so a
+// single quadrature point can be passed where an array of points is expected.
+Array<double> single_point_xi(const SolverBasisAdapterCase& c, const Array<double>& xi, int g) {
+    Array<double> column(c.insd, 1);
+    for (int row = 0; row < c.insd; ++row) {
+        column(row, 0) = xi(row, g);
+    }
+    return column;
+}
+
+// Central-difference estimate, for every basis function, of the derivative of
+// gradient row `gradient_component` with respect to reference coordinate
+// `coordinate_component` at `point`, using nn::get_gnn at point +/- eps.
+std::vector<double> finite_difference_solver_second_derivative(
+    const SolverBasisAdapterCase& c,
+    const Array<double>& point,
+    int gradient_component,
+    int coordinate_component,
+    double eps) {
+    Array<double> forward_xi = point;
+    forward_xi(coordinate_component, 0) += eps;
+    Array<double> backward_xi = point;
+    backward_xi(coordinate_component, 0) -= eps;
+
+    Array<double> forward_N(c.eNoN, 1), backward_N(c.eNoN, 1);
+    Array3<double> forward_Nx(c.insd, c.eNoN, 1), backward_Nx(c.insd, c.eNoN, 1);
+    nn::get_gnn(c.insd, c.type, c.eNoN, 0, forward_xi, forward_N, forward_Nx);
+    nn::get_gnn(c.insd, c.type, c.eNoN, 0, backward_xi, backward_N, backward_Nx);
+
+    std::vector<double> estimate(static_cast<std::size_t>(c.eNoN));
+    const double step = 2.0 * eps;  // distance between the two evaluation points
+    for (int a = 0; a < c.eNoN; ++a) {
+        const double delta = forward_Nx(gradient_component, a, 0) - backward_Nx(gradient_component, a, 0);
+        estimate[static_cast<std::size_t>(a)] = delta / step;
+    }
+    return estimate;
+}
+
+void expect_packed_hessian_component_matches_finite_difference(  // checks one packed Hessian row against finite differences
+    const SolverBasisAdapterCase& c,  // element description forwarded to the finite-difference helper
+    const Array<double>& point,  // single evaluation point (one column)
+    const Array3<double>& Nxx,  // packed Hessians, indexed (packed_row, node, g)
+    int g,  // quadrature-point index into Nxx that corresponds to `point`
+    int packed_row,  // row of Nxx under test
+    int first_derivative_component,  // gradient row that is differenced
+    int second_derivative_component,  // coordinate along which it is differenced
+    double tol) {  // absolute tolerance forwarded to EXPECT_NEAR
+    const double eps = 2e-6;  // finite-difference step handed to the helper
+    const auto numerical = finite_difference_solver_second_derivative(
+        c, point, first_derivative_component, second_derivative_component, eps);
+    for (int a = 0; a < c.eNoN; ++a) {
+        EXPECT_NEAR(Nxx(packed_row, a, g), numerical[static_cast<std::size_t>(a)], tol)
+            << "element=" << static_cast<int>(c.type)
+            << ", packed_row=" << packed_row
+            << ", node=" << a
+            << ", g=" << g;
+    }
+    // Mixed derivatives: repeat with the two component roles swapped, so the same packed row must match both orders.
+    if (first_derivative_component != second_derivative_component) {
+        const auto symmetric_numerical = finite_difference_solver_second_derivative(
+            c, point, second_derivative_component, first_derivative_component, eps);
+        for (int a = 0; a < c.eNoN; ++a) {
+            EXPECT_NEAR(Nxx(packed_row, a, g),
+                        symmetric_numerical[static_cast<std::size_t>(a)],
+                        tol)
+                << "element=" << static_cast<int>(c.type)
+                << ", symmetry packed_row=" << packed_row
+                << ", node=" << a
+                << ", g=" << g;
+        }
+    }
+}
+
+// Compares every packed Hessian row of Nxx at point g with finite differences of the solver gradients.
+void expect_solver_hessian_matches_gradient_finite_difference(
+    const SolverBasisAdapterCase& c, const Array<double>& xi, int g,
+    const Array3<double>& Nxx, double tol) {
+    const Array<double> point = single_point_xi(c, xi, g);
+    const auto check = [&](int packed_row, int first, int second) {
+        expect_packed_hessian_component_matches_finite_difference(
+            c, point, Nxx, g, packed_row, first, second, tol);
+    };
+    check(0, 0, 0);  // row 0 is the (0, 0) second derivative for every insd
+    if (c.insd == 2) {
+        check(1, 1, 1);
+        check(2, 0, 1);  // 2D packing: rows are (0,0), (1,1), (0,1)
+    } else if (c.insd >= 3) {
+        check(1, 1, 1);
+        check(2, 2, 2);
+        check(3, 0, 1);  // 3D packing: rows are (0,0), (1,1), (2,2), (0,1), (1,2), (0,2)
+        check(4, 1, 2);
+        check(5, 0, 2);
+    }
+}
+
+void expect_partition_hessian_identity(const SolverBasisAdapterCase& c,  // supplies eNoN and the element id used in messages
+                                       const Array3<double>& Nxx,  // packed Hessians, indexed (packed_row, node, g)
+                                       int g,  // quadrature-point index
+                                       double tol) {  // absolute tolerance for each row sum
+    for (int row = 0; row < Nxx.nrows(); ++row) {
+        double sum = 0.0;  // sum over nodes of this packed second-derivative row
+        for (int a = 0; a < c.eNoN; ++a) {
+            sum += Nxx(row, a, g);
+        }
+        EXPECT_NEAR(sum, 0.0, tol)  // each packed row must sum to zero over the nodes
+            << "element=" << static_cast<int>(c.type)
+            << ", packed_row=" << row
+            << ", g=" << g;
+    }
+}
+
+void expect_all_hessians_zero(const SolverBasisAdapterCase& c,  // supplies eNoN and the element id used in messages
+                              const Array3<double>& Nxx,  // packed Hessians, indexed (packed_row, node, g)
+                              int g,  // quadrature-point index
+                              double tol) {  // absolute tolerance around zero
+    for (int row = 0; row < Nxx.nrows(); ++row) {
+        for (int a = 0; a < c.eNoN; ++a) {
+            EXPECT_NEAR(Nxx(row, a, g), 0.0, tol)  // every individual entry, not just the row sum, must vanish
+                << "element=" << static_cast<int>(c.type)
+                << ", packed_row=" << row
+                << ", node=" << a
+                << ", g=" << g;
+        }
+    }
+}
+
+// Mesh for case c filled through the nn:: entry points (get_gip, get_gnn per point, get_nn_bnds).
+mshType initialized_mesh_for_case(const SolverBasisAdapterCase& c, bool force_lShpF) {
+    mshType msh;
+    msh.nFs = 1;
+    msh.eType = c.type;
+    msh.eNoN = c.eNoN;
+    msh.nG = c.nG;
+    msh.lShpF = force_lShpF;
+    msh.w.resize(c.nG);
+    msh.xi.resize(c.insd, c.nG);
+    msh.N.resize(c.eNoN, c.nG);
+    msh.Nx.resize(c.insd, c.eNoN, c.nG);
+    msh.xib.resize(2, c.insd);
+    msh.Nb.resize(2, c.eNoN);
+    nn::get_gip(c.insd, c.quadrature_type, c.nG, msh.w, msh.xi);  // rule chosen by quadrature_type, not type
+    for (int point = 0; point < c.nG; ++point) {
+        nn::get_gnn(c.insd, c.type, c.eNoN, point, msh.xi, msh.N, msh.Nx);
+    }
+    nn::get_nn_bnds(c.insd, c.type, c.eNoN, msh.xib, msh.Nb);
+    return msh;
+}
+
+enum class PyramidFace {  // identifies one of the five faces of a pyramid
+    Base,  // presumably the quadrilateral face at z = 0 -- confirm against the users of this enum
+    South,  // NOTE(review): the compass-to-coordinate mapping of the four side faces is not visible in this excerpt
+    East,
+    North,
+    West
+};
+
+enum class PyramidEdge {  // identifies one of the eight edges of a pyramid
+    BaseSouth,  // four edges of the base; compass-to-coordinate mapping not visible in this excerpt
+    BaseEast,
+    BaseNorth,
+    BaseWest,
+    VerticalSW,  // four edges presumably joining a base corner to the apex -- confirm against the users of this enum
+    VerticalSE,
+    VerticalNE,
+    VerticalNW
+};
+
+struct LagrangeAccuracyCase {  // bundles a basis description with a list of points; its users are outside this excerpt
+    ElementType type;  // svmp::FE::ElementType (not the legacy consts enum)
+    int order;  // presumably the order passed to LagrangeBasis -- confirm at the use site
+    std::vector<Point> points;  // presumably the evaluation points for this case -- confirm at the use site
+};
+
+// Closed-form count of Lagrange basis functions of degree `order` for each
+// reference-shape family; element types not listed below yield 0.
+// Both listed members of a family (e.g. Line2 and Line3) share one formula.
+std::size_t expected_lagrange_size(ElementType type, int order) {
+    // plus(k) == static_cast<std::size_t>(order + k), evaluated only in the branch that needs it.
+    const auto plus = [order](int k) { return static_cast<std::size_t>(order + k); };
+    switch (type) {
+        // A point carries a single function regardless of order.
+        case ElementType::Point1:
+            return 1u;
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return plus(1);
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return plus(1) * plus(2) / 2;  // triangular number
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return plus(1) * plus(1);  // tensor product of two lines
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return plus(1) * plus(2) * plus(3) / 6;  // tetrahedral number
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return plus(1) * plus(1) * plus(1);  // tensor product of three lines
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return plus(1) * plus(1) * plus(2) / 2;  // line times triangle
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            // Sum of the squares 1^2 + ... + (order + 1)^2.
+            return plus(1) * plus(2) * static_cast<std::size_t>(2 * order + 3) / 6;
+        // Every other element type is outside the families handled here.
+        default:
+            return 0u;
+    }
+}
+
+// Reference-space dimension of an element type: 0 for a point, 1 for lines, 2 for
+// triangles and quadrilaterals, 3 for everything else. Quad8 is listed explicitly
+// so the eight-node quadrilateral is not misreported as a volume element.
+int expected_dimension(ElementType type) {
+    switch (type) {
+        case ElementType::Point1: return 0;
+        case ElementType::Line2:
+        case ElementType::Line3: return 1;
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+        case ElementType::Quad4:
+        case ElementType::Quad8:
+        case ElementType::Quad9: return 2;
+        default: return 3;  // all remaining types are treated as volumetric
+    }
+}
+
+bool points_close(const Point& a, const Point& b, Real tol = Real(1e-12)) {  // all 3 coordinates agree within tol
+    bool within = true;
+    for (std::size_t axis = 0; within && axis < 3; ++axis) {
+        within = std::abs(a[axis] - b[axis]) <= tol;
+    }
+    return within;
+}
+
+std::vector<Point> reference_node_coords(ElementType type) {  // hard-coded reference node table per element type; empty if unlisted
+    switch (type) {
+        case ElementType::Line2:  // the two end points of [-1, 1]
+            return {
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+            };
+        case ElementType::Line3:  // end points first, then the midpoint
+            return {
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Triangle3:  // vertices of the unit right triangle
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+            };
+        case ElementType::Triangle6:  // vertices, then midpoints of edges 0-1, 1-2, 2-0
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+            };
+        case ElementType::Quad4:  // corners of [-1, 1]^2, counter-clockwise from (-1, -1)
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+            };
+        case ElementType::Quad8:  // corners, then the four edge midpoints
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+            };
+        case ElementType::Quad9:  // Quad8 nodes plus the center
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Tetra4:  // vertices of the unit tetrahedron
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+            };
+        case ElementType::Tetra10:  // vertices, then midpoints of edges 0-1, 1-2, 0-2, 0-3, 1-3, 2-3
+            return {
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+                Point{Real(0), Real(0), Real(0.5)},
+                Point{Real(0.5), Real(0), Real(0.5)},
+                Point{Real(0), Real(0.5), Real(0.5)},
+            };
+        case ElementType::Hex8:  // corners of [-1, 1]^3: the z = -1 face first, then z = +1
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+            };
+        case ElementType::Hex20:  // corners, then edge midpoints at z = -1, at z = +1, and on the vertical edges
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+                Point{Real(0), Real(-1), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(-1), Real(0), Real(-1)},
+                Point{Real(0), Real(-1), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(-1), Real(0), Real(1)},
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+            };
+        case ElementType::Hex27:  // Hex20 nodes, then six face centers (z-, z+, y-, x+, y+, x-), then the body center
+            return {
+                Point{Real(-1), Real(-1), Real(-1)},
+                Point{Real(1), Real(-1), Real(-1)},
+                Point{Real(1), Real(1), Real(-1)},
+                Point{Real(-1), Real(1), Real(-1)},
+                Point{Real(-1), Real(-1), Real(1)},
+                Point{Real(1), Real(-1), Real(1)},
+                Point{Real(1), Real(1), Real(1)},
+                Point{Real(-1), Real(1), Real(1)},
+                Point{Real(0), Real(-1), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(-1), Real(0), Real(-1)},
+                Point{Real(0), Real(-1), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(-1), Real(0), Real(1)},
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        case ElementType::Wedge6:  // unit-triangle vertices at z = -1, then the same at z = +1
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+            };
+        case ElementType::Wedge15:  // corners, then triangle-edge midpoints at z = -1 and z = +1, then vertical-edge midpoints
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(0.5), Real(0), Real(-1)},
+                Point{Real(0.5), Real(0.5), Real(-1)},
+                Point{Real(0), Real(0.5), Real(-1)},
+                Point{Real(0.5), Real(0), Real(1)},
+                Point{Real(0.5), Real(0.5), Real(1)},
+                Point{Real(0), Real(0.5), Real(1)},
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+            };
+        case ElementType::Wedge18:  // Wedge15 nodes plus the three quadrilateral-face centers at z = 0
+            return {
+                Point{Real(0), Real(0), Real(-1)},
+                Point{Real(1), Real(0), Real(-1)},
+                Point{Real(0), Real(1), Real(-1)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(1), Real(0), Real(1)},
+                Point{Real(0), Real(1), Real(1)},
+                Point{Real(0.5), Real(0), Real(-1)},
+                Point{Real(0.5), Real(0.5), Real(-1)},
+                Point{Real(0), Real(0.5), Real(-1)},
+                Point{Real(0.5), Real(0), Real(1)},
+                Point{Real(0.5), Real(0.5), Real(1)},
+                Point{Real(0), Real(0.5), Real(1)},
+                Point{Real(0), Real(0), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(0.5), Real(0), Real(0)},
+                Point{Real(0.5), Real(0.5), Real(0)},
+                Point{Real(0), Real(0.5), Real(0)},
+            };
+        case ElementType::Pyramid5:  // base corners at z = 0, then the apex at (0, 0, 1)
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+            };
+        case ElementType::Pyramid13:  // Pyramid5 nodes, then base-edge midpoints, then midpoints of the corner-to-apex edges
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(-0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(0.5), Real(0.5)},
+                Point{Real(-0.5), Real(0.5), Real(0.5)},
+            };
+        case ElementType::Pyramid14:  // Pyramid13 nodes plus the base center
+            return {
+                Point{Real(-1), Real(-1), Real(0)},
+                Point{Real(1), Real(-1), Real(0)},
+                Point{Real(1), Real(1), Real(0)},
+                Point{Real(-1), Real(1), Real(0)},
+                Point{Real(0), Real(0), Real(1)},
+                Point{Real(0), Real(-1), Real(0)},
+                Point{Real(1), Real(0), Real(0)},
+                Point{Real(0), Real(1), Real(0)},
+                Point{Real(-1), Real(0), Real(0)},
+                Point{Real(-0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(-0.5), Real(0.5)},
+                Point{Real(0.5), Real(0.5), Real(0.5)},
+                Point{Real(-0.5), Real(0.5), Real(0.5)},
+                Point{Real(0), Real(0), Real(0)},
+            };
+        default:  // unlisted element types yield an empty table
+            return {};
+    }
+}
+
+void expect_nodes_match_node_ordering(ElementType canonical_type,  // element type used to construct the basis
+                                      int order,  // order passed to the LagrangeBasis constructor
+                                      ElementType node_ordering_type) {  // type whose ReferenceNodeLayout serves as the reference
+    LagrangeBasis basis(canonical_type, order);
+    const auto& nodes = basis.nodes();
+    // The node count must agree with both the reference layout and the basis size (fatal otherwise).
+    ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(node_ordering_type));
+    ASSERT_EQ(nodes.size(), basis.size());
+
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+        const auto expected = ReferenceNodeLayout::get_node_coords(node_ordering_type, i);
+        EXPECT_NEAR(nodes[i][0], expected[0], 1e-14);  // node i of the basis must sit at layout node i
+        EXPECT_NEAR(nodes[i][1], expected[1], 1e-14);
+        EXPECT_NEAR(nodes[i][2], expected[2], 1e-14);
+        // Kronecker-delta check: at layout node i, basis function i must be 1 and all others 0.
+        std::vector<Real> vals;
+        basis.evaluate_values(expected, vals);
+        ASSERT_EQ(vals.size(), nodes.size());
+        for (std::size_t j = 0; j < vals.size(); ++j) {
+            const double expected_delta = (i == j) ? 1.0 : 0.0;
+            EXPECT_NEAR(vals[j], expected_delta, 1e-12);
+        }
+    }
+}
+
+void expect_alias_matches_canonical(ElementType alias_type,  // element type expected to behave like canonical_type
+                                    ElementType canonical_type,  // element type providing the reference behaviour
+                                    int canonical_order,  // order used to construct BOTH bases
+                                    const std::vector<Point>& points,  // evaluation points for the value/gradient/Hessian comparison
+                                    Real tol = Real(1e-12)) {  // tolerance for nodes, values and gradients; Hessians use 5 * tol
+    LagrangeBasis alias(alias_type, canonical_order);
+    LagrangeBasis canonical(canonical_type, canonical_order);
+    // Metadata must agree exactly; any mismatch is fatal and skips the remaining checks.
+    ASSERT_EQ(alias.element_type(), canonical.element_type());
+    ASSERT_EQ(alias.order(), canonical.order());
+    ASSERT_EQ(alias.size(), canonical.size());
+    ASSERT_EQ(alias.nodes().size(), canonical.nodes().size());
+
+    for (std::size_t i = 0; i < alias.nodes().size(); ++i) {
+        EXPECT_NEAR(alias.nodes()[i][0], canonical.nodes()[i][0], tol);  // node coordinates must match in the same order
+        EXPECT_NEAR(alias.nodes()[i][1], canonical.nodes()[i][1], tol);
+        EXPECT_NEAR(alias.nodes()[i][2], canonical.nodes()[i][2], tol);
+    }
+    // At every sample point, compare values, gradients and Hessians of the two bases.
+    for (const auto& xi : points) {
+        std::vector<Real> alias_values;
+        std::vector<Real> canonical_values;
+        std::vector<Gradient> alias_gradients;
+        std::vector<Gradient> canonical_gradients;
+        std::vector<Hessian> alias_hessians;
+        std::vector<Hessian> canonical_hessians;
+
+        alias.evaluate_values(xi, alias_values);
+        canonical.evaluate_values(xi, canonical_values);
+        alias.evaluate_gradients(xi, alias_gradients);
+        canonical.evaluate_gradients(xi, canonical_gradients);
+        alias.evaluate_hessians(xi, alias_hessians);
+        canonical.evaluate_hessians(xi, canonical_hessians);
+
+        ASSERT_EQ(alias_values.size(), canonical_values.size());
+        ASSERT_EQ(alias_gradients.size(), canonical_gradients.size());
+        ASSERT_EQ(alias_hessians.size(), canonical_hessians.size());
+        // Only the first canonical.dimension() gradient components and Hessian rows/columns are compared.
+        for (std::size_t i = 0; i < alias_values.size(); ++i) {
+            EXPECT_NEAR(alias_values[i], canonical_values[i], tol);
+            for (int d = 0; d < canonical.dimension(); ++d) {
+                const std::size_t sd = static_cast<std::size_t>(d);
+                EXPECT_NEAR(alias_gradients[i][sd], canonical_gradients[i][sd], tol);
+                for (int e = 0; e < canonical.dimension(); ++e) {
+                    const std::size_t se = static_cast<std::size_t>(e);
+                    EXPECT_NEAR(alias_hessians[i](sd, se), canonical_hessians[i](sd, se), Real(5) * tol);
+                }
+            }
+        }
+    }
+}
+
+std::vector<Point> sample_points_for(ElementType type) {  // three fixed sample points per element family
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return {  // only the first coordinate varies
+                Point{Real(-0.7), Real(0), Real(0)},
+                Point{Real(0.1), Real(0), Real(0)},
+                Point{Real(0.65), Real(0), Real(0)}
+            };
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return {  // x, y > 0 with x + y < 1 for every point
+                Point{Real(0.15), Real(0.2), Real(0)},
+                Point{Real(0.25), Real(0.1), Real(0)},
+                Point{Real(0.2), Real(0.3), Real(0)}
+            };
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return {  // |x|, |y| < 1 for every point
+                Point{Real(0.2), Real(-0.35), Real(0)},
+                Point{Real(-0.4), Real(0.25), Real(0)},
+                Point{Real(0.55), Real(0.1), Real(0)}
+            };
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return {  // x, y, z > 0 with x + y + z < 1 for every point
+                Point{Real(0.1), Real(0.2), Real(0.15)},
+                Point{Real(0.2), Real(0.1), Real(0.25)},
+                Point{Real(0.15), Real(0.15), Real(0.2)}
+            };
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return {  // |x|, |y|, |z| < 1 for every point
+                Point{Real(0.2), Real(-0.3), Real(0.25)},
+                Point{Real(-0.5), Real(0.4), Real(-0.2)},
+                Point{Real(0.1), Real(0.15), Real(0.6)}
+            };
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return {  // x, y > 0 with x + y < 1, and |z| < 1
+                Point{Real(0.2), Real(0.25), Real(0.0)},
+                Point{Real(0.1), Real(0.2), Real(-0.45)},
+                Point{Real(0.3), Real(0.15), Real(0.5)}
+            };
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return {  // 0 < z < 1 with small |x|, |y|
+                Point{Real(0.0), Real(0.0), Real(0.25)},
+                Point{Real(0.15), Real(-0.1), Real(0.3)},
+                Point{Real(-0.1), Real(0.2), Real(0.4)}
+            };
+        default:  // any other element type gets the origin as its only sample point
+            return {Point{Real(0), Real(0), Real(0)}};
+    }
+}
+
+// Defined further down; declared here so dense_sample_points_for can use it.
+std::vector<Point> boundary_stress_points_for(ElementType type);
+
+/// Returns the points of sample_points_for(type) followed by those of
+/// boundary_stress_points_for(type).
+///
+/// For pyramid element types two additional points with large z (0.85 and
+/// 0.95) are appended at the end.
+std::vector<Point> dense_sample_points_for(ElementType type) {
+    const auto interior = sample_points_for(type);
+    const auto boundary = boundary_stress_points_for(type);
+
+    const bool is_pyramid =
+        type == ElementType::Pyramid5 || type == ElementType::Pyramid14;
+    // Account for the pyramid-only extras up front so the vector is sized once.
+    const std::size_t pyramid_extras = is_pyramid ? 2u : 0u;
+
+    std::vector<Point> points;
+    points.reserve(interior.size() + boundary.size() + pyramid_extras);
+    points.insert(points.end(), interior.begin(), interior.end());
+    points.insert(points.end(), boundary.begin(), boundary.end());
+
+    if (is_pyramid) {
+        points.push_back(Point{Real(0.0), Real(0.0), Real(0.85)});
+        points.push_back(Point{Real(0.02), Real(-0.015), Real(0.95)});
+    }
+    return points;
+}
+
+/// Returns the fixed stress-test points for @p type; most of them sit close
+/// to (but not exactly on) the reference-coordinate limits used below.
+///
+/// Linear and quadratic variants of a shape share one set of five points;
+/// element types without a dedicated set get a single point at the origin.
+std::vector<Point> boundary_stress_points_for(ElementType type) {
+    // Lets each table row read as a plain coordinate triple.
+    const auto at = [](double x, double y, double z) {
+        return Point{Real(x), Real(y), Real(z)};
+    };
+
+    switch (type) {
+        case ElementType::Line2:
+        case ElementType::Line3:
+            return {at(-0.999, 0, 0),
+                    at(-0.75, 0, 0),
+                    at(0.0, 0, 0),
+                    at(0.8, 0, 0),
+                    at(0.999, 0, 0)};
+        case ElementType::Triangle3:
+        case ElementType::Triangle6:
+            return {at(1e-6, 1e-6, 0),
+                    at(0.98, 0.01, 0),
+                    at(0.01, 0.98, 0),
+                    at(0.25, 1e-4, 0),
+                    at(0.49, 0.49, 0)};
+        case ElementType::Quad4:
+        case ElementType::Quad9:
+            return {at(-0.99, -0.99, 0),
+                    at(0.99, -0.99, 0),
+                    at(0.99, 0.99, 0),
+                    at(-0.99, 0.99, 0),
+                    at(0.0, 0.95, 0)};
+        case ElementType::Tetra4:
+        case ElementType::Tetra10:
+            return {at(1e-6, 1e-6, 1e-6),
+                    at(0.97, 0.01, 0.01),
+                    at(0.01, 0.97, 0.01),
+                    at(0.01, 0.01, 0.97),
+                    at(0.32, 0.33, 0.01)};
+        case ElementType::Hex8:
+        case ElementType::Hex27:
+            return {at(-0.99, -0.99, -0.99),
+                    at(0.99, -0.99, 0.99),
+                    at(0.99, 0.99, -0.99),
+                    at(-0.99, 0.99, 0.99),
+                    at(0.0, 0.0, 0.95)};
+        case ElementType::Wedge6:
+        case ElementType::Wedge18:
+            return {at(1e-6, 1e-6, -0.99),
+                    at(0.98, 0.01, -0.99),
+                    at(0.01, 0.98, 0.99),
+                    at(0.49, 0.49, 0.0),
+                    at(0.25, 1e-4, 0.95)};
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid14:
+            return {at(0.0, 0.0, 0.95),
+                    at(0.01, -0.01, 0.98),
+                    at(0.6, -0.6, 0.2),
+                    at(0.79, 0.0, 0.2),
+                    at(0.0, 0.79, 0.2)};
+        default:
+            return {at(0, 0, 0)};
+    }
+}
+
+/// Evaluates the monomial xi[0]^px * xi[1]^py * xi[2]^pz.
+Real monomial_value(const Point& xi, int px, int py, int pz) {
+    const auto x_factor = std::pow(xi[0], px);
+    const auto y_factor = std::pow(xi[1], py);
+    const auto z_factor = std::pow(xi[2], pz);
+    return x_factor * y_factor * z_factor;
+}
+
+/// Compares analytic basis gradients with central finite differences.
+///
+/// For each point in @p c.points and each of the basis.dimension() reference
+/// directions, the basis values are evaluated at xi + eps and xi - eps along
+/// that direction; the resulting difference quotient must agree with the
+/// matching gradient component to within @p tol.
+void expect_gradients_match_finite_difference(const LagrangeAccuracyCase& c,
+                                              Real eps,
+                                              Real tol) {
+    LagrangeBasis basis(c.type, c.order);
+    // Width of the central-difference stencil.
+    const Real stencil_width = Real(2) * eps;
+
+    for (const auto& xi : c.points) {
+        std::vector<Gradient> gradients;
+        basis.evaluate_gradients(xi, gradients);
+        ASSERT_EQ(gradients.size(), basis.size());
+
+        for (int d = 0; d < basis.dimension(); ++d) {
+            // Forward-shifted sample.
+            Point xp = xi;
+            xp[d] += eps;
+            std::vector<Real> values_p;
+            basis.evaluate_values(xp, values_p);
+
+            // Backward-shifted sample.
+            Point xm = xi;
+            xm[d] -= eps;
+            std::vector<Real> values_m;
+            basis.evaluate_values(xm, values_m);
+
+            ASSERT_EQ(values_p.size(), basis.size());
+            ASSERT_EQ(values_m.size(), basis.size());
+            for (std::size_t i = 0; i < basis.size(); ++i) {
+                const Real fd = (values_p[i] - values_m[i]) / stencil_width;
+                EXPECT_NEAR(gradients[i][d], fd, tol)
+                    << "type=" << static_cast<int>(c.type)
+                    << ", order=" << c.order
+                    << ", dim=" << d
+                    << ", basis_i=" << i
+                    << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+            }
+        }
+    }
+}
+
+/// Checks that the basis interpolates the given monomials exactly.
+///
+/// For each exponent triple in @p exponents the monomial is sampled at the
+/// basis nodes to obtain nodal coefficients; the interpolant built from those
+/// coefficients must match the monomial at every point in @p c.points to
+/// within @p tol.
+void expect_polynomial_reproduction(const LagrangeAccuracyCase& c,
+                                    const std::vector<std::array<int, 3>>& exponents,
+                                    Real tol) {
+    LagrangeBasis basis(c.type, c.order);
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());
+
+    for (const auto& powers : exponents) {
+        const int px = powers[0];
+        const int py = powers[1];
+        const int pz = powers[2];
+
+        // Nodal coefficients: the monomial sampled at each basis node.
+        std::vector<Real> coeffs(basis.size(), Real(0));
+        for (std::size_t node = 0; node < basis.size(); ++node) {
+            coeffs[node] = monomial_value(nodes[node], px, py, pz);
+        }
+
+        for (const auto& xi : c.points) {
+            std::vector<Real> values;
+            basis.evaluate_values(xi, values);
+            ASSERT_EQ(values.size(), basis.size());
+
+            Real interpolated = Real(0);
+            for (std::size_t k = 0; k < basis.size(); ++k) {
+                interpolated += coeffs[k] * values[k];
+            }
+
+            const Real exact = monomial_value(xi, px, py, pz);
+            EXPECT_NEAR(interpolated, exact, tol)
+                << "type=" << static_cast<int>(c.type)
+                << ", order=" << c.order
+                << ", monomial=(" << px << "," << py << "," << pz << ")"
+                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+        }
+    }
+}
+
+/// Expects the first three components of every element in @p values to be
+/// finite. Elements must be indexable with operator[].
+template<typename Container>
+void expect_all_finite(const Container& values) {
+    constexpr std::size_t component_count = 3;
+    for (const auto& value : values) {
+        std::size_t d = 0;
+        while (d < component_count) {
+            EXPECT_TRUE(std::isfinite(value[d]));
+            ++d;
+        }
+    }
+}
+
+/// Expects every entry of the leading @p dimension x @p dimension block of
+/// each Hessian in @p hessians to be finite.
+void expect_hessians_finite(const std::vector<Hessian>& hessians,
+                            int dimension) {
+    if (dimension <= 0) {
+        return;  // Nothing to inspect.
+    }
+
+    for (const auto& H : hessians) {
+        for (int i = 0; i != dimension; ++i) {
+            for (int j = 0; j != dimension; ++j) {
+                EXPECT_TRUE(std::isfinite(H(static_cast<std::size_t>(i),
+                                            static_cast<std::size_t>(j))));
+            }
+        }
+    }
+}
+
+/// Verifies the partition-of-unity identities of @p basis at every point in
+/// @p points.
+///
+/// The basis values must sum to one within @p value_tol, and the summed
+/// gradients and Hessians must vanish component-wise within @p derivative_tol.
+/// Only the leading basis.dimension() components are accumulated and checked.
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
+                                            const std::vector<Point>& points,
+                                            Real value_tol,
+                                            Real derivative_tol) {
+    for (const auto& xi : points) {
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_values(xi, values);
+        basis.evaluate_gradients(xi, gradients);
+        basis.evaluate_hessians(xi, hessians);
+
+        ASSERT_EQ(values.size(), basis.size());
+        ASSERT_EQ(gradients.size(), basis.size());
+        ASSERT_EQ(hessians.size(), basis.size());
+
+        // Accumulate the sums over all basis functions.
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            const auto& grad_i = gradients[i];
+            const auto& hess_i = hessians[i];
+
+            value_sum += values[i];
+            for (int row = 0; row < basis.dimension(); ++row) {
+                const auto r = static_cast<std::size_t>(row);
+                gradient_sum[r] += grad_i[r];
+                for (int col = 0; col < basis.dimension(); ++col) {
+                    const auto k = static_cast<std::size_t>(col);
+                    hessian_sum(r, k) += hess_i(r, k);
+                }
+            }
+        }
+
+        EXPECT_NEAR(value_sum, Real(1), value_tol)
+            << "Element type " << static_cast<int>(basis.element_type())
+            << ", order " << basis.order()
+            << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+
+        for (int d = 0; d < basis.dimension(); ++d) {
+            const std::size_t sd = static_cast<std::size_t>(d);
+            EXPECT_NEAR(gradient_sum[sd], Real(0), derivative_tol)
+                << "Gradient sum mismatch for element type " << static_cast<int>(basis.element_type())
+                << ", order " << basis.order()
+                << ", dim " << d;
+            for (int e = 0; e < basis.dimension(); ++e) {
+                const std::size_t se = static_cast<std::size_t>(e);
+                EXPECT_NEAR(hessian_sum(sd, se), Real(0), derivative_tol)
+                    << "Hessian sum mismatch for element type " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order()
+                    << ", component (" << d << "," << e << ")";
+            }
+        }
+    }
+}
+
+/// Tests whether @p point satisfies the plane equation of @p face to within
+/// @p tol.
+///
+/// The base is the plane z = 0; each lateral face is the plane where x or y
+/// equals +/-(1 - z). Only the plane equation is checked, not whether the
+/// point lies within the face's extent. Unknown face values yield false.
+bool is_on_pyramid_face(const Point& point,
+                        PyramidFace face,
+                        Real tol = Real(1e-12)) {
+    const Real x = point[0];
+    const Real y = point[1];
+    const Real z = point[2];
+    const Real scale = Real(1) - z;
+
+    // Signed residual of the selected face's plane equation.
+    Real residual = Real(0);
+    switch (face) {
+        case PyramidFace::Base:
+            residual = z;
+            break;
+        case PyramidFace::South:
+            residual = y + scale;
+            break;
+        case PyramidFace::East:
+            residual = x - scale;
+            break;
+        case PyramidFace::North:
+            residual = y - scale;
+            break;
+        case PyramidFace::West:
+            residual = x + scale;
+            break;
+        default:
+            return false;
+    }
+    return std::abs(residual) <= tol;
+}
+
+/// Maps a pyramid-space @p point to 2-D coordinates on @p face.
+///
+/// The base keeps (x, y). For a lateral face the first coordinate is
+/// (scale +/- x or y) / 2 with scale = 1 - z, and the second coordinate is z.
+/// The third component of the result is always zero. Unknown face values
+/// yield a value-initialised Point.
+Point map_pyramid_face_to_reference(PyramidFace face,
+                                    const Point& point) {
+    const Real x = point[0];
+    const Real y = point[1];
+    const Real z = point[2];
+    const Real scale = Real(1) - z;
+
+    // Numerator of the first face coordinate for the lateral faces.
+    Real along = Real(0);
+    switch (face) {
+        case PyramidFace::Base:
+            return Point{x, y, Real(0)};
+        case PyramidFace::South:
+            along = scale - x;
+            break;
+        case PyramidFace::East:
+            along = scale + y;
+            break;
+        case PyramidFace::North:
+            along = scale + x;
+            break;
+        case PyramidFace::West:
+            along = scale - y;
+            break;
+        default:
+            return Point{};
+    }
+    return Point{along / Real(2), z, Real(0)};
+}
+
+/// Returns two fixed sample points lying on the given pyramid @p face
+/// (z = 0 for the base, x or y equal to +/-(1 - z) for the lateral faces).
+/// Unknown face values yield an empty vector.
+std::vector<Point> sample_points_for_pyramid_face(PyramidFace face) {
+    // Lets each table row read as a plain coordinate triple.
+    const auto at = [](double x, double y, double z) {
+        return Point{Real(x), Real(y), Real(z)};
+    };
+
+    switch (face) {
+        case PyramidFace::Base:
+            return {at(0.15, -0.2, 0), at(-0.55, 0.35, 0)};
+        case PyramidFace::South:
+            return {at(-0.2, -0.8, 0.2), at(0.05, -0.35, 0.65)};
+        case PyramidFace::East:
+            return {at(0.8, -0.25, 0.2), at(0.3, 0.08, 0.7)};
+        case PyramidFace::North:
+            return {at(0.25, 0.8, 0.2), at(-0.08, 0.35, 0.65)};
+        case PyramidFace::West:
+            return {at(-0.8, 0.2, 0.2), at(-0.3, -0.05, 0.7)};
+    }
+    return {};
+}
+
+/// Tests whether @p point satisfies both line equations of @p edge to within
+/// @p tol.
+///
+/// Base edges require z = 0 together with x or y equal to +/-1; vertical
+/// edges require both x and y to equal +/-(1 - z). Unknown edge values yield
+/// false.
+bool is_on_pyramid_edge(const Point& point,
+                        PyramidEdge edge,
+                        Real tol = Real(1e-12)) {
+    const auto vanishes = [tol](Real residual) {
+        return std::abs(residual) <= tol;
+    };
+
+    const Real x = point[0];
+    const Real y = point[1];
+    const Real z = point[2];
+    const Real scale = Real(1) - z;
+
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+            return vanishes(z) && vanishes(y + Real(1));
+        case PyramidEdge::BaseEast:
+            return vanishes(z) && vanishes(x - Real(1));
+        case PyramidEdge::BaseNorth:
+            return vanishes(z) && vanishes(y - Real(1));
+        case PyramidEdge::BaseWest:
+            return vanishes(z) && vanishes(x + Real(1));
+        case PyramidEdge::VerticalSW:
+            return vanishes(x + scale) && vanishes(y + scale);
+        case PyramidEdge::VerticalSE:
+            return vanishes(x - scale) && vanishes(y + scale);
+        case PyramidEdge::VerticalNE:
+            return vanishes(x - scale) && vanishes(y - scale);
+        case PyramidEdge::VerticalNW:
+            return vanishes(x + scale) && vanishes(y - scale);
+    }
+    return false;
+}
+
+/// Maps a pyramid-space @p point to a 1-D coordinate along @p edge.
+///
+/// South/north base edges use x, east/west base edges use y, and vertical
+/// edges use 2z - 1. The coordinate is stored in the first component of the
+/// result; the other two are zero. Unknown edge values yield a
+/// value-initialised Point.
+Point map_pyramid_edge_to_reference(PyramidEdge edge,
+                                    const Point& point) {
+    const auto on_line = [](Real s) { return Point{s, Real(0), Real(0)}; };
+
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+        case PyramidEdge::BaseNorth:
+            return on_line(point[0]);
+        case PyramidEdge::BaseEast:
+        case PyramidEdge::BaseWest:
+            return on_line(point[1]);
+        case PyramidEdge::VerticalSW:
+        case PyramidEdge::VerticalSE:
+        case PyramidEdge::VerticalNE:
+        case PyramidEdge::VerticalNW:
+            return on_line(Real(2) * point[2] - Real(1));
+    }
+    return Point{};
+}
+
+/// Returns two fixed sample points lying on the given pyramid @p edge.
+/// Unknown edge values yield an empty vector.
+std::vector<Point> sample_points_for_pyramid_edge(PyramidEdge edge) {
+    // Lets each table row read as a plain coordinate triple.
+    const auto at = [](double x, double y, double z) {
+        return Point{Real(x), Real(y), Real(z)};
+    };
+
+    switch (edge) {
+        case PyramidEdge::BaseSouth:
+            return {at(-0.65, -1, 0), at(0.35, -1, 0)};
+        case PyramidEdge::BaseEast:
+            return {at(1, -0.45, 0), at(1, 0.55, 0)};
+        case PyramidEdge::BaseNorth:
+            return {at(-0.55, 1, 0), at(0.45, 1, 0)};
+        case PyramidEdge::BaseWest:
+            return {at(-1, -0.55, 0), at(-1, 0.45, 0)};
+        case PyramidEdge::VerticalSW:
+            return {at(-0.75, -0.75, 0.25), at(-0.3, -0.3, 0.7)};
+        case PyramidEdge::VerticalSE:
+            return {at(0.75, -0.75, 0.25), at(0.3, -0.3, 0.7)};
+        case PyramidEdge::VerticalNE:
+            return {at(0.75, 0.75, 0.25), at(0.3, 0.3, 0.7)};
+        case PyramidEdge::VerticalNW:
+            return {at(-0.75, 0.75, 0.25), at(-0.3, 0.3, 0.7)};
+    }
+    return {};
+}
+
+/// Builds an index map from pyramid nodes to the nodes of a lower-dimensional
+/// (face or edge) basis.
+///
+/// @param pyramid_nodes      Nodes of the pyramid basis.
+/// @param lower_basis_nodes  Nodes of the face/edge basis in its own reference
+///                           coordinates.
+/// @param selector           Returns true for pyramid nodes lying on the trace
+///                           entity of interest.
+/// @param mapper             Maps a selected pyramid node into the lower
+///                           basis's reference coordinates.
+/// @return One entry per pyramid node: the index of the matching lower-basis
+///         node, or -1 when the node was not selected or no match was found.
+///
+/// Records non-fatal test failures when a selected node has no match, when a
+/// lower-basis node is matched by more than one pyramid node, or when the
+/// number of selected nodes differs from the size of the lower basis.
+std::vector<int> map_pyramid_nodes_to_lower_basis_nodes(
+    const std::vector<Point>& pyramid_nodes,
+    const std::vector<Point>& lower_basis_nodes,
+    const std::function<bool(const Point&)>& selector,
+    const std::function<Point(const Point&)>& mapper) {
+    std::vector<int> mapping(pyramid_nodes.size(), -1);
+    // Tracks which lower-basis nodes already have a partner. A matching count
+    // alone would not notice two trace nodes collapsing onto the same target.
+    std::vector<char> claimed(lower_basis_nodes.size(), 0);
+    std::size_t trace_count = 0;
+
+    for (std::size_t i = 0; i < pyramid_nodes.size(); ++i) {
+        if (!selector(pyramid_nodes[i])) {
+            continue;
+        }
+
+        ++trace_count;
+        const Point mapped = mapper(pyramid_nodes[i]);
+        bool found = false;
+        for (std::size_t j = 0; j < lower_basis_nodes.size(); ++j) {
+            if (!points_close(mapped, lower_basis_nodes[j])) {
+                continue;
+            }
+            EXPECT_FALSE(claimed[j] != 0)
+                << "Lower basis node " << j
+                << " matched by more than one pyramid trace node; latest at ("
+                << pyramid_nodes[i][0] << "," << pyramid_nodes[i][1] << ","
+                << pyramid_nodes[i][2] << ")";
+            claimed[j] = 1;
+            mapping[i] = static_cast<int>(j);
+            found = true;
+            break;
+        }
+        EXPECT_TRUE(found)
+            << "Failed to match pyramid trace node at (" << pyramid_nodes[i][0] << ","
+            << pyramid_nodes[i][1] << "," << pyramid_nodes[i][2] << ")";
+    }
+
+    EXPECT_EQ(trace_count, lower_basis_nodes.size());
+    return mapping;
+}
+
+/// Checks that the pyramid basis restricted to @p face reproduces the
+/// lower-dimensional Lagrange basis of the same @p order.
+///
+/// The base face is compared against a quadrilateral basis, the other faces
+/// against a triangle basis. At each face sample point, pyramid functions
+/// whose node lies on the face must equal the matching lower-basis function;
+/// all other pyramid functions must be zero, each to within @p tol.
+void expect_pyramid_face_trace_matches_lower_basis(int order,
+                                                   PyramidFace face,
+                                                   Real tol = Real(2e-10)) {
+    const ElementType lower_type = (face == PyramidFace::Base)
+        ? ElementType::Quad4
+        : ElementType::Triangle3;
+
+    LagrangeBasis pyramid(ElementType::Pyramid5, order);
+    LagrangeBasis lower(lower_type, order);
+
+    const auto on_face = [&](const Point& node) {
+        return is_on_pyramid_face(node, face);
+    };
+    const auto to_face_reference = [&](const Point& node) {
+        return map_pyramid_face_to_reference(face, node);
+    };
+    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
+        pyramid.nodes(), lower.nodes(), on_face, to_face_reference);
+
+    for (const auto& face_point : sample_points_for_pyramid_face(face)) {
+        std::vector<Real> pyramid_values;
+        pyramid.evaluate_values(face_point, pyramid_values);
+
+        std::vector<Real> lower_values;
+        lower.evaluate_values(to_face_reference(face_point), lower_values);
+
+        ASSERT_EQ(pyramid_values.size(), pyramid.size());
+        ASSERT_EQ(lower_values.size(), lower.size());
+
+        for (std::size_t i = 0; i < pyramid.size(); ++i) {
+            if (mapping[i] < 0) {
+                // Node is not on this face: its function must vanish here.
+                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
+                    << "Off-face pyramid basis should vanish on face for order " << order
+                    << ", face " << static_cast<int>(face)
+                    << ", basis " << i;
+                continue;
+            }
+            EXPECT_NEAR(pyramid_values[i], lower_values[static_cast<std::size_t>(mapping[i])], tol)
+                << "Face trace mismatch for order " << order
+                << ", face " << static_cast<int>(face)
+                << ", basis " << i;
+        }
+    }
+}
+
+/// Checks that the pyramid basis restricted to @p edge reproduces the line
+/// Lagrange basis of the same @p order.
+///
+/// At each edge sample point, pyramid functions whose node lies on the edge
+/// must equal the matching line-basis function; all other pyramid functions
+/// must be zero, each to within @p tol.
+void expect_pyramid_edge_trace_matches_line_basis(int order,
+                                                  PyramidEdge edge,
+                                                  Real tol = Real(2e-10)) {
+    LagrangeBasis pyramid(ElementType::Pyramid5, order);
+    LagrangeBasis line(ElementType::Line2, order);
+
+    const auto on_edge = [&](const Point& node) {
+        return is_on_pyramid_edge(node, edge);
+    };
+    const auto to_edge_reference = [&](const Point& node) {
+        return map_pyramid_edge_to_reference(edge, node);
+    };
+    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
+        pyramid.nodes(), line.nodes(), on_edge, to_edge_reference);
+
+    for (const auto& edge_point : sample_points_for_pyramid_edge(edge)) {
+        std::vector<Real> pyramid_values;
+        pyramid.evaluate_values(edge_point, pyramid_values);
+
+        std::vector<Real> line_values;
+        line.evaluate_values(to_edge_reference(edge_point), line_values);
+
+        ASSERT_EQ(pyramid_values.size(), pyramid.size());
+        ASSERT_EQ(line_values.size(), line.size());
+
+        for (std::size_t i = 0; i < pyramid.size(); ++i) {
+            if (mapping[i] < 0) {
+                // Node is not on this edge: its function must vanish here.
+                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
+                    << "Off-edge pyramid basis should vanish on edge for order " << order
+                    << ", edge " << static_cast<int>(edge)
+                    << ", basis " << i;
+                continue;
+            }
+            EXPECT_NEAR(pyramid_values[i], line_values[static_cast<std::size_t>(mapping[i])], tol)
+                << "Edge trace mismatch for order " << order
+                << ", edge " << static_cast<int>(edge)
+                << ", basis " << i;
+        }
+    }
+}
+
+struct StridedOutputRequest {  // Selects which outputs expect_strided_matches_pointwise requests and checks.
+    bool values;     // true: allocate, pass and verify the basis-value buffer
+    bool gradients;  // true: allocate, pass and verify the gradient buffer
+    bool hessians;   // true: allocate, pass and verify the Hessian buffer
+};
+
+/// Checks evaluate_at_quadrature_points_strided against point-by-point
+/// evaluation for the outputs selected in @p request.
+///
+/// Output buffers are laid out row-major with a row length of @p stride =
+/// number of points + 3: one row per dof for values, 3 rows per dof for
+/// gradients and 9 rows per dof for Hessians. Buffers are pre-filled with a
+/// sentinel so the test can also verify that the padding columns beyond the
+/// last point are left untouched. Outputs that are not requested are passed
+/// as nullptr.
+void expect_strided_matches_pointwise(ElementType type,
+                                      int order,
+                                      const StridedOutputRequest& request) {
+    constexpr Real sentinel = Real(-12345.25);
+
+    LagrangeBasis basis(type, order);
+    const auto points = dense_sample_points_for(type);
+    const std::size_t stride = points.size() + 3u;
+
+    // Length of an output buffer, or zero when that output is not requested.
+    const auto buffer_length = [&](bool wanted, std::size_t rows_per_dof) -> std::size_t {
+        return wanted ? basis.size() * rows_per_dof * stride : 0u;
+    };
+
+    std::vector<Real> values(buffer_length(request.values, 1u), sentinel);
+    std::vector<Real> gradients(buffer_length(request.gradients, 3u), sentinel);
+    std::vector<Real> hessians(buffer_length(request.hessians, 9u), sentinel);
+
+    Real* const values_out = request.values ? values.data() : nullptr;
+    Real* const gradients_out = request.gradients ? gradients.data() : nullptr;
+    Real* const hessians_out = request.hessians ? hessians.data() : nullptr;
+    basis.evaluate_at_quadrature_points_strided(
+        points, stride, values_out, gradients_out, hessians_out);
+
+    // Pyramid results are compared with a looser tolerance.
+    const bool is_pyramid =
+        type == ElementType::Pyramid5 || type == ElementType::Pyramid14;
+    const Real tol = is_pyramid ? Real(5e-10) : Real(1e-12);
+
+    for (std::size_t q = 0; q < points.size(); ++q) {
+        if (request.values) {
+            std::vector<Real> expected;
+            basis.evaluate_values(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                EXPECT_NEAR(values[d * stride + q], expected[d], tol)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", dof=" << d
+                    << ", q=" << q;
+            }
+        }
+
+        if (request.gradients) {
+            std::vector<Gradient> expected;
+            basis.evaluate_gradients(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_NEAR(gradients[(d * 3u + c) * stride + q], expected[d][c], tol)
+                        << "type=" << static_cast<int>(type)
+                        << ", order=" << order
+                        << ", dof=" << d
+                        << ", component=" << c
+                        << ", q=" << q;
+                }
+            }
+        }
+
+        if (request.hessians) {
+            std::vector<Hessian> expected;
+            basis.evaluate_hessians(points[q], expected);
+            ASSERT_EQ(expected.size(), basis.size());
+            for (std::size_t d = 0; d < basis.size(); ++d) {
+                for (std::size_t r = 0; r < 3u; ++r) {
+                    for (std::size_t c = 0; c < 3u; ++c) {
+                        EXPECT_NEAR(hessians[(d * 9u + r * 3u + c) * stride + q],
+                                    expected[d](r, c),
+                                    Real(4) * tol)
+                            << "type=" << static_cast<int>(type)
+                            << ", order=" << order
+                            << ", dof=" << d
+                            << ", hessian=(" << r << "," << c << ")"
+                            << ", q=" << q;
+                    }
+                }
+            }
+        }
+    }
+
+    // Columns [points.size(), stride) of every row must still hold the sentinel.
+    const auto expect_padding_untouched = [&](const std::vector<Real>& buffer,
+                                              std::size_t rows) {
+        for (std::size_t row = 0; row < rows; ++row) {
+            for (std::size_t q = points.size(); q < stride; ++q) {
+                EXPECT_EQ(buffer[row * stride + q], sentinel)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", row=" << row
+                    << ", padding q=" << q;
+            }
+        }
+    };
+
+    if (request.values) {
+        expect_padding_untouched(values, basis.size());
+    }
+    if (request.gradients) {
+        expect_padding_untouched(gradients, basis.size() * 3u);
+    }
+    if (request.hessians) {
+        expect_padding_untouched(hessians, basis.size() * 9u);
+    }
+}
+
+/// Checks that the raw-pointer evaluators (evaluate_*_to) write the same
+/// numbers as the vector-based evaluate_all at every sample point of @p type.
+///
+/// Raw gradients are read as 3 consecutive entries per dof and raw Hessians
+/// as 9 consecutive entries per dof in row-major order. Hessians are compared
+/// with four times the base tolerance.
+void expect_raw_to_matches_vector_evaluation(ElementType type, int order) {
+    LagrangeBasis basis(type, order);
+
+    // Pyramid results are compared with a looser tolerance.
+    const bool is_pyramid =
+        type == ElementType::Pyramid5 || type == ElementType::Pyramid14;
+    const Real tol = is_pyramid ? Real(5e-10) : Real(1e-12);
+
+    for (const auto& point : sample_points_for(type)) {
+        // Reference results from the vector-based interface.
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(point, values, gradients, hessians);
+
+        // Results written through the raw-pointer interface.
+        std::vector<Real> raw_values(basis.size());
+        basis.evaluate_values_to(point, raw_values.data());
+        std::vector<Real> raw_gradients(basis.size() * 3u);
+        basis.evaluate_gradients_to(point, raw_gradients.data());
+        std::vector<Real> raw_hessians(basis.size() * 9u);
+        basis.evaluate_hessians_to(point, raw_hessians.data());
+
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            EXPECT_NEAR(raw_values[i], values[i], tol)
+                << "type=" << static_cast<int>(type) << ", order=" << order << ", dof=" << i;
+
+            for (std::size_t c = 0; c < 3u; ++c) {
+                EXPECT_NEAR(raw_gradients[i * 3u + c], gradients[i][c], tol)
+                    << "type=" << static_cast<int>(type)
+                    << ", order=" << order
+                    << ", dof=" << i
+                    << ", gradient component=" << c;
+            }
+
+            for (std::size_t r = 0; r < 3u; ++r) {
+                for (std::size_t c = 0; c < 3u; ++c) {
+                    EXPECT_NEAR(raw_hessians[i * 9u + r * 3u + c], hessians[i](r, c), Real(4) * tol)
+                        << "type=" << static_cast<int>(type)
+                        << ", order=" << order
+                        << ", dof=" << i
+                        << ", hessian=(" << r << "," << c << ")";
+                }
+            }
+        }
+    }
+}
+
+} // namespace
+
+// For every adapter case: evaluates shape values/gradients through the legacy
+// tables and through nn::get_gnn (both the explicit-argument and the mesh
+// overload), checks the partition identities of the adapter output at each
+// Gauss point, and requires all three results to agree.
+TEST(SolverBasisAdapter, ShapeValuesGradientsAndMeshOverloadMatchLegacy) {
+    constexpr double tol = 2e-12;
+
+    for (const auto& c : solver_basis_adapter_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+
+        Array<double> legacy_N(c.eNoN, c.nG);
+        Array3<double> legacy_Nx(c.insd, c.eNoN, c.nG);
+        Array<double> adapter_N(c.eNoN, c.nG);
+        Array3<double> adapter_Nx(c.insd, c.eNoN, c.nG);
+
+        auto& shape_table = legacy_solver_nn::get_element_shape_data;
+        auto legacy_shape = shape_table.find(c.type);
+        // Elements missing from the element table are evaluated through the
+        // legacy face table instead; only QUD8 is expected to take this path.
+        const bool via_face_table = legacy_shape == shape_table.end();
+
+        faceType legacy_face;
+        if (via_face_table) {
+            ASSERT_EQ(c.type, consts::ElementType::QUD8);
+            legacy_face.eType = c.type;
+            legacy_face.eNoN = c.eNoN;
+            legacy_face.nG = c.nG;
+            legacy_face.xi = xi;
+            legacy_face.N.resize(c.eNoN, c.nG);
+            legacy_face.Nx.resize(c.insd, c.eNoN, c.nG);
+        }
+
+        for (int g = 0; g < c.nG; ++g) {
+            if (via_face_table) {
+                legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
+            } else {
+                legacy_shape->second(c.insd, c.eNoN, g, xi, legacy_N, legacy_Nx);
+            }
+            nn::get_gnn(c.insd, c.type, c.eNoN, g, xi, adapter_N, adapter_Nx);
+
+            // Partition identities of the adapter output at this Gauss point.
+            double partition = 0.0;
+            std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
+            for (int a = 0; a < c.eNoN; ++a) {
+                partition += adapter_N(a, g);
+                for (int d = 0; d < c.insd; ++d) {
+                    gradient_sum[static_cast<std::size_t>(d)] += adapter_Nx(d, a, g);
+                }
+            }
+
+            EXPECT_NEAR(partition, 1.0, tol)
+                << "element=" << static_cast<int>(c.type) << ", g=" << g;
+            for (int d = 0; d < c.insd; ++d) {
+                EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
+                    << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
+            }
+        }
+
+        if (via_face_table) {
+            legacy_N = legacy_face.N;
+            legacy_Nx = legacy_face.Nx;
+        }
+
+        expect_arrays_near(adapter_N, legacy_N, tol);
+        expect_array3_near(adapter_Nx, legacy_Nx, tol);
+
+        // Same comparison through the mesh overload of get_gnn.
+        mshType mesh;
+        mesh.eType = c.type;
+        mesh.eNoN = c.eNoN;
+        mesh.nG = c.nG;
+        mesh.xi = xi;
+        mesh.N.resize(c.eNoN, c.nG);
+        mesh.Nx.resize(c.insd, c.eNoN, c.nG);
+        for (int g = 0; g < c.nG; ++g) {
+            nn::get_gnn(g, mesh);
+        }
+
+        expect_arrays_near(mesh.N, legacy_N, tol);
+        expect_array3_near(mesh.Nx, legacy_Nx, tol);
+    }
+}
+
+// For every face adapter case: fills one face through the legacy face table
+// and another through nn::get_gnn, checks the partition identities of the
+// latter at each Gauss point, and requires both faces to hold matching shape
+// data. The number of cases exercised is pinned to seven.
+TEST(SolverFaceBasisAdapter, ShapeValuesGradientsAndDispatchMatchLegacyFaceTable) {
+    constexpr double tolerance = 2e-12;
+
+    int covered = 0;
+    for (const auto& c : solver_face_basis_adapter_cases()) {
+        SCOPED_TRACE("face element=" + std::to_string(static_cast<int>(c.type)));
+
+        faceType reference_face = initialized_face_for_case(c);
+        faceType adapter_face = initialized_face_for_case(c);
+
+        for (int g = 0; g < c.nG; ++g) {
+            legacy_solver_nn::set_face_shape_data.at(c.type)(g, reference_face);
+            nn::get_gnn(nullptr, g, adapter_face);
+            expect_face_partition_identities(c, adapter_face, g, tolerance);
+        }
+
+        expect_arrays_near(adapter_face.N, reference_face.N, tolerance);
+        expect_array3_near(adapter_face.Nx, reference_face.Nx, tolerance);
+        ++covered;
+    }
+
+    EXPECT_EQ(covered, 7);
+}
+
+// A face built from an inconsistent case must make nn::get_gnn throw a
+// BasisEvaluationException whose message states that the legacy fallback was
+// not attempted.
+TEST(SolverFaceBasisAdapter, MappedFacesFailClosedWithoutLegacyFallback) {
+    using consts::ElementType;
+
+    const SolverBasisAdapterCase mismatched_case{ElementType::LIN1, ElementType::LIN1, 1, 3, 2};
+    faceType face = initialized_face_for_case(mismatched_case);
+
+    try {
+        nn::get_gnn(nullptr, 0, face);
+        FAIL() << "Expected mapped face dispatch to reject mismatched eNoN";
+    } catch (const svmp::FE::basis::BasisEvaluationException& error) {
+        const std::string message = error.what();
+        EXPECT_NE(message.find("legacy fallback was not attempted"), std::string::npos)
+            << message;
+    }
+}
+
+// For a PNT face, nn::get_gnn must set the single shape value to 1 and leave
+// the gradient entry as it was.
+TEST(SolverFaceBasisAdapter, PointFaceRemainsLegacyValuePath) {
+    // Marker values that make it visible which entries get overwritten.
+    constexpr double stale_value = -7.0;
+    constexpr double gradient_marker = 42.0;
+
+    faceType face;
+    face.eType = consts::ElementType::PNT;
+    face.eNoN = 1;
+    face.nG = 1;
+    face.N.resize(1, 1);
+    face.Nx.resize(1, 1, 1);
+    face.N(0, 0) = stale_value;
+    face.Nx(0, 0, 0) = gradient_marker;
+
+    nn::get_gnn(nullptr, 0, face);
+
+    EXPECT_DOUBLE_EQ(face.N(0, 0), 1.0);
+    EXPECT_DOUBLE_EQ(face.Nx(0, 0, 0), 42.0);
+}
+
+// nn::get_gnn must reject an NRB face with NotImplementedException and an NA
+// face with InvalidElementException.
+TEST(SolverFaceBasisAdapter, UnsupportedFacesThrowClearErrors) {
+    // Sets up a one-node, one-Gauss-point face of the given element type.
+    const auto init_unit_face = [](faceType& face, consts::ElementType type) {
+        face.eType = type;
+        face.eNoN = 1;
+        face.nG = 1;
+        face.N.resize(1, 1);
+        face.Nx.resize(1, 1, 1);
+    };
+
+    faceType nrb_face;
+    init_unit_face(nrb_face, consts::ElementType::NRB);
+    EXPECT_THROW(nn::get_gnn(nullptr, 0, nrb_face), svmp::FE::NotImplementedException);
+
+    faceType unknown_face;
+    init_unit_face(unknown_face, consts::ElementType::NA);
+    EXPECT_THROW(nn::get_gnn(nullptr, 0, unknown_face), svmp::FE::InvalidElementException);
+}
+
+TEST(SolverBasisAdapter, QuadraturePathsRemainLegacyCompatible) {  // nn::get_gip must reproduce the legacy quadrature tables
+    constexpr double tol = 0.0;  // zero tolerance: weights and points must match exactly
+
+    for (const auto& c : solver_basis_adapter_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));  // name the element in any failure below
+        auto mesh_it = legacy_solver_nn::set_element_gauss_int_data.find(c.type);
+        if (mesh_it != legacy_solver_nn::set_element_gauss_int_data.end()) {  // mesh-based overload, only where a legacy entry exists
+            mshType legacy_mesh;
+            legacy_mesh.eType = c.type;
+            legacy_mesh.eNoN = c.eNoN;
+            legacy_mesh.nG = c.nG;
+            legacy_mesh.w.resize(c.nG);
+            legacy_mesh.xi.resize(c.insd, c.nG);
+            mesh_it->second(legacy_mesh);
+            mshType adapter_mesh;  // same setup, filled through the adapter instead
+            adapter_mesh.eType = c.type;
+            adapter_mesh.eNoN = c.eNoN;
+            adapter_mesh.nG = c.nG;
+            adapter_mesh.w.resize(c.nG);
+            adapter_mesh.xi.resize(c.insd, c.nG);
+            nn::get_gip(adapter_mesh);
+
+            expect_vectors_near(adapter_mesh.w, legacy_mesh.w, tol);
+            expect_arrays_near(adapter_mesh.xi, legacy_mesh.xi, tol);
+        }
+
+        auto scalar_it = legacy_solver_nn::get_element_gauss_int_data.find(c.type);
+        if (scalar_it != legacy_solver_nn::get_element_gauss_int_data.end()) {  // argument-based overload
+            Vector<double> legacy_w(c.nG);
+            Vector<double> adapter_w(c.nG);
+            Array<double> legacy_xi(c.insd, c.nG);
+            Array<double> adapter_xi(c.insd, c.nG);
+
+            scalar_it->second(c.insd, c.nG, legacy_w, legacy_xi);
+            nn::get_gip(c.insd, c.type, c.nG, adapter_w, adapter_xi);
+
+            expect_vectors_near(adapter_w, legacy_w, tol);
+            expect_arrays_near(adapter_xi, legacy_xi, tol);
+        }
+    }
+
+    mshType legacy_tet;  // extra TET4 comparison with qmTET4 set explicitly on both meshes
+    legacy_tet.eType = consts::ElementType::TET4;
+    legacy_tet.eNoN = 4;
+    legacy_tet.nG = 4;
+    legacy_tet.qmTET4 = 0.25;
+    legacy_tet.w.resize(4);
+    legacy_tet.xi.resize(3, 4);
+    legacy_solver_nn::set_element_gauss_int_data.at(consts::ElementType::TET4)(legacy_tet);
+
+    mshType adapter_tet;
+    adapter_tet.eType = consts::ElementType::TET4;
+    adapter_tet.eNoN = 4;
+    adapter_tet.nG = 4;
+    adapter_tet.qmTET4 = 0.25;
+    adapter_tet.w.resize(4);
+    adapter_tet.xi.resize(3, 4);
+    nn::get_gip(adapter_tet);
+
+    expect_vectors_near(adapter_tet.w, legacy_tet.w, tol);
+    expect_arrays_near(adapter_tet.xi, legacy_tet.xi, tol);
+}
+
+TEST(SolverBasisAdapter, HessiansCoverEveryMappedScalarVolumeElement) {  // checks nn::get_gn_nxx output for every case in solver_hessian_adapter_cases()
+    constexpr double partition_tol = 2e-10;
+    constexpr double finite_difference_tol = 2e-5;
+    constexpr double zero_tol = 2e-12;
+
+    int covered = 0;  // number of element cases visited, checked at the end
+    for (const auto& c : solver_hessian_adapter_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+
+        const int ind2 = packed_hessian_components(c.insd);
+        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+        fill_array3(adapter_Nxx, std::numeric_limits<double>::quiet_NaN());  // NaN pre-fill; presumably so unwritten entries are caught by the checks below
+
+        for (int g = 0; g < c.nG; ++g) {
+            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+            expect_partition_hessian_identity(c, adapter_Nxx, g, partition_tol);
+            expect_solver_hessian_matches_gradient_finite_difference(
+                c, xi, g, adapter_Nxx, finite_difference_tol);
+
+            if (c.type == consts::ElementType::LIN1 ||  // these three types are expected to have all-zero Hessians
+                c.type == consts::ElementType::TRI3 ||
+                c.type == consts::ElementType::TET4) {
+                expect_all_hessians_zero(c, adapter_Nxx, g, zero_tol);
+            }
+        }
+
+        if (c.type == consts::ElementType::QUD4) {  // selected packed rows must be nonzero for these types
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 2, zero_tol));
+        } else if (c.type == consts::ElementType::HEX8) {
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 3, zero_tol));
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 4, zero_tol));
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
+        } else if (c.type == consts::ElementType::WDG) {
+            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
+        }
+        ++covered;
+    }
+
+    EXPECT_EQ(covered, 13);  // fails if the case list changes size without this test being updated
+}
+
+TEST(SolverBasisAdapter, HessianPackingMatchesLegacyWhereLegacyIsApproved) {  // adapter Hessians must equal the legacy ones for the parity cases
+    constexpr double tol = 2e-12;
+
+    for (const auto& c : solver_legacy_hessian_parity_cases()) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));  // name the element in any failure below
+        Vector<double> weights;
+        Array<double> xi;
+        fill_legacy_quadrature(c, weights, xi);
+        const int ind2 = packed_hessian_components(c.insd);
+        Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
+        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+
+        for (int g = 0; g < c.nG; ++g) {  // fill both arrays one quadrature point at a time
+            legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
+                c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
+            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+        }
+
+        expect_array3_near(adapter_Nxx, legacy_Nxx, tol);
+    }
+}
+
+TEST(SolverBasisAdapter, Qud8HessiansDoNotUseLegacyFallback) {  // for QUD8 the adapter result must differ from the legacy result
+    using consts::ElementType;
+    SolverBasisAdapterCase c{ElementType::QUD8, ElementType::QUD9, 2, 8, 9};
+
+    Vector<double> weights;
+    Array<double> xi;
+    fill_legacy_quadrature(c, weights, xi);
+
+    const int ind2 = packed_hessian_components(c.insd);
+    Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
+    Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
+    fill_array3(legacy_Nxx, 0.0);  // identical starting contents so any difference comes from the two calls
+    fill_array3(adapter_Nxx, 0.0);
+
+    for (int g = 0; g < c.nG; ++g) {
+        legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
+            c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
+        nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
+    }
+
+    double max_abs_difference = 0.0;  // largest entrywise |adapter - legacy|
+    for (int g = 0; g < c.nG; ++g) {
+        for (int a = 0; a < c.eNoN; ++a) {
+            for (int row = 0; row < ind2; ++row) {
+                max_abs_difference = std::max(
+                    max_abs_difference,
+                    std::abs(adapter_Nxx(row, a, g) - legacy_Nxx(row, a, g)));
+            }
+        }
+    }
+
+    EXPECT_GT(max_abs_difference, 1e-8);  // a difference this large shows the two paths are not the same computation
+}
+
+TEST(SolverBasisAdapter, UnsupportedHessianFamiliesRemainNoOp) {  // get_gn_nxx must leave Nxx unchanged for NRB and PNT
+    Array<double> xi(1, 1);
+    xi(0, 0) = 0.0;
+    Array3<double> Nxx(1, 1, 1);
+
+    for (const auto unsupported : {consts::ElementType::NRB, consts::ElementType::PNT}) {
+        fill_array3(Nxx, 42.0);  // sentinel, reset for each element type
+        nn::get_gn_nxx(1, 1, unsupported, 1, 0, xi, Nxx);
+        EXPECT_DOUBLE_EQ(Nxx(0, 0, 0), 42.0)  // sentinel intact means nothing was written
+            << "element=" << static_cast<int>(unsupported);
+    }
+}
+
+TEST(SolverBasisAdapter, InitFsMshPopulatesMappedHessiansWithoutLShpFGate) {  // fs::init_fs_msh must fill mesh.fs[0].Nxx for these element types
+    using consts::ElementType;
+    const SolverBasisAdapterCase cases[] = {
+        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
+        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
+        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
+        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
+        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
+    };
+
+    for (const auto& c : cases) {
+        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
+        ComMod com_mod;
+        com_mod.nsd = c.insd;
+        mshType mesh = initialized_mesh_for_case(c, true);  // NOTE(review): meaning of the `true` flag is not visible here -- confirm against the helper
+
+        fs::init_fs_msh(com_mod, mesh);
+
+        ASSERT_EQ(mesh.fs.size(), 1u);
+        ASSERT_EQ(mesh.fs[0].Nxx.nrows(), packed_hessian_components(c.insd));
+        if (c.type == ElementType::QUD4) {  // same packed rows as checked in HessiansCoverEveryMappedScalarVolumeElement
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 2, 2e-12));
+        } else if (c.type == ElementType::HEX8) {
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 3, 2e-12));
+        } else if (c.type == ElementType::WDG) {
+            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 5, 2e-12));
+        } else {  // HEX20 / HEX27: any nonzero row is enough
+            bool has_nonzero = false;
+            for (int row = 0; row < mesh.fs[0].Nxx.nrows(); ++row) {
+                has_nonzero = has_nonzero ||
+                    array3_has_nonzero_component(mesh.fs[0].Nxx, row, 2e-12);
+            }
+            EXPECT_TRUE(has_nonzero);
+        }
+    }
+}
+
+TEST(LagrangeBasis, QuadPartitionOfUnity) {
+    // Order-1 Quad4 basis values must sum to one at the sample point.
+    const svmp::FE::math::Vector<Real, 3> sample{0.2, -0.3, 0.0};
+    LagrangeBasis quad_basis(ElementType::Quad4, 1);
+    std::vector<Real> shape_values;
+    quad_basis.evaluate_values(sample, shape_values);
+    double total = 0.0;
+    for (const Real v : shape_values) total += v;
+    EXPECT_NEAR(total, 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, LineGradientLinear) {  // order-1 Line2 basis: the two gradients must be -0.5 and +0.5
+    LagrangeBasis basis(ElementType::Line2, 1);
+    svmp::FE::math::Vector<Real, 3> xi{0.0, 0.0, 0.0};
+    std::vector<Gradient> grad;
+    basis.evaluate_gradients(xi, grad);
+
+    ASSERT_EQ(grad.size(), 2u);  // stop here if the size is wrong, before indexing grad
+    EXPECT_NEAR(grad[0][0], -0.5, 1e-12);  // only component 0 is checked
+    EXPECT_NEAR(grad[1][0], 0.5, 1e-12);
+}
+
+TEST(LagrangeBasis, TrianglePartitionOfUnity) {
+    // Order-1 Triangle3 basis values must sum to one at the sample point.
+    const svmp::FE::math::Vector<Real, 3> sample{0.2, 0.3, 0.0};
+    LagrangeBasis tri_basis(ElementType::Triangle3, 1);
+    std::vector<Real> shape_values;
+    tri_basis.evaluate_values(sample, shape_values);
+
+    EXPECT_NEAR(std::accumulate(shape_values.begin(), shape_values.end(), 0.0), 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, SizeFormulasPerElement) {
+    // Compares LagrangeBasis::size() with closed-form node counts for
+    // orders 0 through 3 on Line2, Quad4, Hex8, Triangle3 and Tetra4.
+    // All counts are computed in std::size_t, as size() is compared to it.
+    for (int order = 0; order <= 3; ++order) {
+        const auto p = static_cast<std::size_t>(order);
+        const std::size_t per_axis = p + 1;
+
+        // Line, quad and hex: (order + 1) nodes per axis,
+        // raised to the element dimension.
+        const LagrangeBasis line_basis(ElementType::Line2, order);
+        const std::size_t line_count = per_axis;
+        EXPECT_EQ(line_basis.size(), line_count);
+
+        const LagrangeBasis quad_basis(ElementType::Quad4, order);
+        const std::size_t quad_count = per_axis * per_axis;
+        EXPECT_EQ(quad_basis.size(), quad_count);
+
+        const LagrangeBasis hex_basis(ElementType::Hex8, order);
+        const std::size_t hex_count = per_axis * per_axis * per_axis;
+        EXPECT_EQ(hex_basis.size(), hex_count);
+
+        // Triangle: (order + 1)(order + 2) / 2.
+        const LagrangeBasis tri_basis(ElementType::Triangle3, order);
+        const std::size_t tri_count = (p + 1) * (p + 2) / 2;
+        EXPECT_EQ(tri_basis.size(), tri_count);
+
+        // Tetrahedron: (order + 1)(order + 2)(order + 3) / 6.
+        const LagrangeBasis tet_basis(ElementType::Tetra4, order);
+        const std::size_t tet_count = (p + 1) * (p + 2) * (p + 3) / 6;
+        EXPECT_EQ(tet_basis.size(), tet_count);
+    }
+}
+
+TEST(LagrangeBasis, KroneckerDeltaAtNodes) {  // basis j evaluated at node i must be 1 when i == j and 0 otherwise
+    const std::vector<std::pair<ElementType, int>> cases = {  // (element type, order) pairs
+        {ElementType::Line2, 1},
+        {ElementType::Quad4, 1},
+        {ElementType::Triangle3, 1},
+        {ElementType::Tetra4, 1},
+        {ElementType::Hex8, 1},
+        {ElementType::Triangle3, 2},
+        {ElementType::Tetra4, 2},
+        {ElementType::Quad4, 2},
+        {ElementType::Hex8, 2},
+        {ElementType::Wedge6, 2}
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.first, c.second);
+        const auto& nodes = basis.nodes();
+        ASSERT_EQ(nodes.size(), basis.size());  // one node per basis function
+
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            std::vector<Real> vals;
+            basis.evaluate_values(nodes[i], vals);
+            ASSERT_EQ(vals.size(), nodes.size());  // guards the indexing in the inner loop
+            for (std::size_t j = 0; j < nodes.size(); ++j) {
+                if (i == j) {
+                    EXPECT_NEAR(vals[j], 1.0, 1e-12);
+                } else {
+                    EXPECT_NEAR(vals[j], 0.0, 1e-12);
+                }
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, MatchesNodeOrderingConventionsForLinearAndQuadratic) {  // each call passes (basis type, order, node-layout type) to the helper
+    // Tensor-product elements
+    expect_nodes_match_node_ordering(ElementType::Line2, 1, ElementType::Line2);
+    expect_nodes_match_node_ordering(ElementType::Line2, 2, ElementType::Line3);
+    expect_nodes_match_node_ordering(ElementType::Quad4, 1, ElementType::Quad4);
+    expect_nodes_match_node_ordering(ElementType::Quad4, 2, ElementType::Quad9);
+    expect_nodes_match_node_ordering(ElementType::Hex8, 1, ElementType::Hex8);
+    expect_nodes_match_node_ordering(ElementType::Hex8, 2, ElementType::Hex27);
+
+    // Simplex elements
+    expect_nodes_match_node_ordering(ElementType::Triangle3, 1, ElementType::Triangle3);
+    expect_nodes_match_node_ordering(ElementType::Triangle3, 2, ElementType::Triangle6);
+    expect_nodes_match_node_ordering(ElementType::Tetra4, 1, ElementType::Tetra4);
+    expect_nodes_match_node_ordering(ElementType::Tetra4, 2, ElementType::Tetra10);
+
+    // Mixed topology
+    expect_nodes_match_node_ordering(ElementType::Wedge6, 1, ElementType::Wedge6);
+    expect_nodes_match_node_ordering(ElementType::Wedge6, 2, ElementType::Wedge18);
+
+    // Pyramid
+    expect_nodes_match_node_ordering(ElementType::Pyramid5, 1, ElementType::Pyramid5);
+    expect_nodes_match_node_ordering(ElementType::Pyramid14, 2, ElementType::Pyramid14);  // NOTE(review): every other order-2 row passes the linear type first (e.g. Wedge6 -> Wedge18); Pyramid5 may have been intended -- confirm
+}
+
+TEST(LagrangeBasis, WedgeAndPyramidPartitionOfUnity) {  // basis values must sum to one for Wedge6, Wedge18 and Pyramid5
+    {
+        LagrangeBasis wedge(ElementType::Wedge6, 1);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(0.3)};
+        std::vector<Real> vals;
+        wedge.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+    }
+
+    {
+        LagrangeBasis wedge_q(ElementType::Wedge18, 2);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(-0.25)};
+        std::vector<Real> vals;
+        wedge_q.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+
+        // Wedge18 should report 18 nodes in ReferenceNodeLayout
+        EXPECT_EQ(ReferenceNodeLayout::num_nodes(ElementType::Wedge18), 18u);
+        // Corner nodes should match Wedge6 vertices
+        auto v0 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 0);
+        auto v1 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 1);
+        auto v2 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 2);
+        EXPECT_NEAR(v0[0], Real(0), 1e-14);  // node 0 expected at (0, 0, -1)
+        EXPECT_NEAR(v0[1], Real(0), 1e-14);
+        EXPECT_NEAR(v0[2], Real(-1), 1e-14);
+        EXPECT_NEAR(v1[0], Real(1), 1e-14);  // node 1 expected at (1, 0, -1)
+        EXPECT_NEAR(v1[1], Real(0), 1e-14);
+        EXPECT_NEAR(v1[2], Real(-1), 1e-14);
+        EXPECT_NEAR(v2[0], Real(0), 1e-14);  // node 2 expected at (0, 1, -1)
+        EXPECT_NEAR(v2[1], Real(1), 1e-14);
+        EXPECT_NEAR(v2[2], Real(-1), 1e-14);
+    }
+
+    {
+        LagrangeBasis pyr(ElementType::Pyramid5, 1);
+        svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.4)};
+        std::vector<Real> vals;
+        pyr.evaluate_values(xi, vals);
+        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+    }
+}
+
+TEST(LagrangeBasis, NonTensorStridedEvaluationMatchesPointwise) {  // runs the strided-vs-pointwise helper for every case and output combination
+    const std::vector<std::pair<ElementType, int>> cases = {  // (element type, order) pairs, all order 3
+        {ElementType::Triangle3, 3},
+        {ElementType::Tetra4, 3},
+        {ElementType::Wedge6, 3},
+        {ElementType::Pyramid5, 3},
+    };
+    const std::vector<StridedOutputRequest> requests = {  // all seven non-empty combinations of the three flags read below
+        {true, false, false},
+        {false, true, false},
+        {false, false, true},
+        {true, true, false},
+        {true, false, true},
+        {false, true, true},
+        {true, true, true},
+    };
+
+    for (const auto& [type, order] : cases) {
+        for (const auto& request : requests) {
+            SCOPED_TRACE(static_cast<int>(type));  // the traces record which case and flags were active on failure
+            SCOPED_TRACE(order);
+            SCOPED_TRACE(request.values ? "values" : "no values");
+            SCOPED_TRACE(request.gradients ? "gradients" : "no gradients");
+            SCOPED_TRACE(request.hessians ? "hessians" : "no hessians");
+            expect_strided_matches_pointwise(type, order, request);
+        }
+    }
+}
+
+TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {  // runs expect_raw_to_matches_vector_evaluation for each listed case
+    const std::vector<std::pair<ElementType, int>> cases = {  // (element type, order) pairs
+        {ElementType::Line2, 4},
+        {ElementType::Quad4, 3},
+        {ElementType::Hex8, 3},
+        {ElementType::Triangle3, 4},
+        {ElementType::Tetra4, 3},
+        {ElementType::Wedge6, 3},
+        {ElementType::Pyramid5, 3},
+    };
+
+    for (const auto& [type, order] : cases) {
+        SCOPED_TRACE(static_cast<int>(type));  // record the element type and order for any failure in the helper
+        SCOPED_TRACE(order);
+        expect_raw_to_matches_vector_evaluation(type, order);
+    }
+}
+
+TEST(LagrangeBasis, CanonicalConstructorsSupportArbitraryOrders) {  // constructs every order from 0 to max_order and checks the basic accessors
+    const struct Case {
+        ElementType type;
+        int max_order;  // highest order constructed for this type (inclusive)
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            EXPECT_EQ(basis.element_type(), c.type);
+            EXPECT_EQ(basis.order(), order);
+            EXPECT_EQ(basis.dimension(), expected_dimension(c.type));
+            EXPECT_EQ(basis.size(), expected_lagrange_size(c.type, order));
+            EXPECT_EQ(basis.nodes().size(), basis.size());  // one node per basis function
+        }
+    }
+}
+
+TEST(LagrangeBasis, AliasVariantsNormalizeToCanonicalPaths) {  // each call pairs a quadratic alias type with its linear type at order 2
+    expect_alias_matches_canonical(
+        ElementType::Line3, ElementType::Line2, 2, sample_points_for(ElementType::Line2));
+    expect_alias_matches_canonical(
+        ElementType::Triangle6, ElementType::Triangle3, 2, sample_points_for(ElementType::Triangle3));
+    expect_alias_matches_canonical(
+        ElementType::Quad9, ElementType::Quad4, 2, sample_points_for(ElementType::Quad4));
+    expect_alias_matches_canonical(
+        ElementType::Tetra10, ElementType::Tetra4, 2, sample_points_for(ElementType::Tetra4));
+    expect_alias_matches_canonical(
+        ElementType::Hex27, ElementType::Hex8, 2, sample_points_for(ElementType::Hex8));
+    expect_alias_matches_canonical(
+        ElementType::Wedge18, ElementType::Wedge6, 2, sample_points_for(ElementType::Wedge6));
+    expect_alias_matches_canonical(
+        ElementType::Pyramid14, ElementType::Pyramid5, 2, sample_points_for(ElementType::Pyramid5),
+        Real(2e-10));  // only the pyramid call passes an explicit fifth argument; presumably a tolerance -- confirm
+}
+
+TEST(LagrangeBasis, SerendipityVariantsRemainRejected) {  // constructing LagrangeBasis with these four types must throw FEException
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Quad8, 2), svmp::FE::FEException);  // (void) discards the temporary explicitly
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Hex20, 2), svmp::FE::FEException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Wedge15, 2), svmp::FE::FEException);
+    EXPECT_THROW((void)LagrangeBasis(ElementType::Pyramid13, 2), svmp::FE::FEException);
+}
+
+TEST(LagrangeBasis, GeneratedNodeOrderingIsDeterministicAcrossOrders) {  // two calls with the same arguments must return matching node lists
+    const struct Case {
+        ElementType type;
+        int max_order;  // highest order checked for this type (inclusive)
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            const auto generated_a = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
+            const auto generated_b = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
+            ASSERT_EQ(generated_a.size(), expected_lagrange_size(c.type, order));
+            ASSERT_EQ(generated_a.size(), generated_b.size());  // guards the paired indexing below
+            for (std::size_t i = 0; i < generated_a.size(); ++i) {
+                EXPECT_TRUE(points_close(generated_a[i], generated_b[i]));
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, NodeOrderingMatchesReferenceCoordinateOracles) {  // ReferenceNodeLayout must agree node-by-node with reference_node_coords()
+    const std::array<ElementType, 18> cases = {  // includes the Quad8/Hex20/Wedge15/Pyramid13 types
+        ElementType::Line2, ElementType::Line3,
+        ElementType::Triangle3, ElementType::Triangle6,
+        ElementType::Quad4, ElementType::Quad8, ElementType::Quad9,
+        ElementType::Tetra4, ElementType::Tetra10,
+        ElementType::Hex8, ElementType::Hex20, ElementType::Hex27,
+        ElementType::Wedge6, ElementType::Wedge15, ElementType::Wedge18,
+        ElementType::Pyramid5, ElementType::Pyramid13, ElementType::Pyramid14,
+    };
+
+    for (ElementType type : cases) {
+        const auto expected = reference_node_coords(type);
+        ASSERT_FALSE(expected.empty());  // an empty oracle would make the loop below pass vacuously
+        ASSERT_EQ(ReferenceNodeLayout::num_nodes(type), expected.size());
+        for (std::size_t i = 0; i < expected.size(); ++i) {
+            const auto actual = ReferenceNodeLayout::get_node_coords(type, i);
+            EXPECT_TRUE(points_close(actual, expected[i]))
+                << "Element type " << static_cast<int>(type)
+                << ", node " << i;
+        }
+    }
+}
+
+TEST(LagrangeBasis, GeneratedLowOrderOrderingMatchesPublicAliasPaths) {  // generated order-1/2 nodes must match the fixed layout of the named type
+    const struct Case {
+        ElementType type;  // type passed to get_lagrange_node_coords
+        int order;
+        ElementType public_alias;  // type whose get_node_coords layout is the expected result
+    } cases[] = {
+        {ElementType::Line2, 1, ElementType::Line2},
+        {ElementType::Line2, 2, ElementType::Line3},
+        {ElementType::Triangle3, 1, ElementType::Triangle3},
+        {ElementType::Triangle3, 2, ElementType::Triangle6},
+        {ElementType::Quad4, 1, ElementType::Quad4},
+        {ElementType::Quad4, 2, ElementType::Quad9},
+        {ElementType::Tetra4, 1, ElementType::Tetra4},
+        {ElementType::Tetra4, 2, ElementType::Tetra10},
+        {ElementType::Hex8, 1, ElementType::Hex8},
+        {ElementType::Hex8, 2, ElementType::Hex27},
+        {ElementType::Wedge6, 1, ElementType::Wedge6},
+        {ElementType::Wedge6, 2, ElementType::Wedge18},
+        {ElementType::Pyramid5, 1, ElementType::Pyramid5},
+        {ElementType::Pyramid5, 2, ElementType::Pyramid14},
+    };
+
+    for (const auto& c : cases) {
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(c.type, c.order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(c.public_alias));  // guards the paired indexing below
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_alias = ReferenceNodeLayout::get_node_coords(c.public_alias, i);
+            EXPECT_TRUE(points_close(generated[i], public_alias));
+        }
+    }
+}
+
+TEST(LagrangeBasis, KroneckerDeltaAcrossCanonicalTopologiesAndOrders) {  // nodal delta property for every order from 0 to max_order
+    const struct Case {
+        ElementType type;
+        int max_order;  // highest order checked for this type (inclusive)
+    } cases[] = {
+        {ElementType::Line2, 8},
+        {ElementType::Triangle3, 6},
+        {ElementType::Quad4, 6},
+        {ElementType::Tetra4, 5},
+        {ElementType::Hex8, 5},
+        {ElementType::Wedge6, 5},
+        {ElementType::Pyramid5, 5},
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            ASSERT_EQ(basis.size(), expected_lagrange_size(c.type, order));
+
+            std::vector<Real> values;  // reused across nodes
+            for (std::size_t node_i = 0; node_i < basis.size(); ++node_i) {
+                basis.evaluate_values(basis.nodes()[node_i], values);
+                ASSERT_EQ(values.size(), basis.size());  // guards the indexing in the inner loop
+                for (std::size_t basis_i = 0; basis_i < basis.size(); ++basis_i) {
+                    EXPECT_NEAR(values[basis_i], basis_i == node_i ? Real(1) : Real(0), Real(2e-10))  // looser than the 1e-12 used in KroneckerDeltaAtNodes
+                        << "Element type " << static_cast<int>(c.type)
+                        << ", order " << order
+                        << ", node " << node_i
+                        << ", basis " << basis_i;
+                }
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, PartitionGradientAndHessianSumsAcrossCanonicalTopologiesAndOrders) {  // runs the sum-check helper over dense sample points per type and order
+    const struct Case {
+        ElementType type;
+        int max_order;  // highest order checked for this type (inclusive)
+        Real tol;  // passed twice to the helper below
+    } cases[] = {
+        {ElementType::Line2, 8, Real(1e-11)},
+        {ElementType::Triangle3, 6, Real(1e-10)},
+        {ElementType::Quad4, 6, Real(1e-10)},
+        {ElementType::Tetra4, 5, Real(2e-10)},
+        {ElementType::Hex8, 5, Real(2e-10)},
+        {ElementType::Wedge6, 5, Real(5e-10)},
+        {ElementType::Pyramid5, 5, Real(5e-7)},  // the pyramid tolerance is orders of magnitude looser than the others
+    };
+
+    for (const auto& c : cases) {
+        for (int order = 0; order <= c.max_order; ++order) {
+            LagrangeBasis basis(c.type, order);
+            expect_partition_gradient_hessian_sums(basis, dense_sample_points_for(c.type), c.tol, c.tol);
+        }
+    }
+}
+
+TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {  // order-13 triangle and tetrahedron: value, gradient and Hessian sums
+    const struct Case {  // NOTE(review): the test name implies order 13 exceeds a fixed scratch size; that threshold is not visible here -- confirm
+        ElementType type;
+        int order;
+        Point point;  // evaluation point
+        Real tolerance;  // used as-is for values and gradients, and times 10 for Hessians
+    } cases[] = {
+        {ElementType::Triangle3, 13, Point{Real(0.19), Real(0.31), Real(0)}, Real(1e-8)},
+        {ElementType::Tetra4, 13, Point{Real(0.13), Real(0.17), Real(0.19)}, Real(1e-7)},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        std::vector<Hessian> hessians;
+        basis.evaluate_all(c.point, values, gradients, hessians);
+
+        ASSERT_EQ(values.size(), basis.size());  // all three outputs are indexed up to basis.size() below
+        ASSERT_EQ(gradients.size(), basis.size());
+        ASSERT_EQ(hessians.size(), basis.size());
+
+        Real value_sum = Real(0);
+        Gradient gradient_sum{};
+        Hessian hessian_sum{};
+        for (std::size_t i = 0; i < basis.size(); ++i) {
+            value_sum += values[i];
+            for (std::size_t d = 0; d < 3u; ++d) {
+                gradient_sum[d] += gradients[i][d];
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    hessian_sum(d, e) += hessians[i](d, e);
+                }
+            }
+        }
+
+        EXPECT_NEAR(value_sum, Real(1), c.tolerance);  // values sum to one
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_NEAR(gradient_sum[d], Real(0), c.tolerance);  // every derivative sum is expected to be zero
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_NEAR(hessian_sum(d, e), Real(0), Real(10) * c.tolerance);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, HighOrderAxisNearNodeMaintainsPartitionAndDerivativeSums) {  // order-16 Line2 basis evaluated 1e-7 away from `node`
+    const int order = 16;
+    const LagrangeBasis basis(ElementType::Line2, order);
+    const Real node = Real(-1) + Real(2 * 5) / static_cast<Real>(order);  // NOTE(review): presumably a basis node, if nodes are equispaced on [-1, 1] -- confirm
+    const Point point{node + Real(1e-7), Real(0), Real(0)};
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(point, values, gradients, hessians);
+    ASSERT_EQ(values.size(), basis.size());  // all three outputs are indexed up to basis.size() below,
+    ASSERT_EQ(gradients.size(), basis.size());  // so each size is checked before the loop runs
+    ASSERT_EQ(hessians.size(), basis.size());
+
+    Real value_sum = Real(0);
+    Real gradient_sum = Real(0);
+    Real hessian_sum = Real(0);
+    for (std::size_t i = 0; i < basis.size(); ++i) {
+        value_sum += values[i];
+        gradient_sum += gradients[i][0];  // only the first component is summed
+        hessian_sum += hessians[i](0, 0);
+    }
+    EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
+    EXPECT_NEAR(gradient_sum, Real(0), Real(1e-8));  // tolerances loosen with derivative order
+    EXPECT_NEAR(hessian_sum, Real(0), Real(1e-5));
+}
+
+TEST(LagrangeBasis, PyramidFaceTracesMatchLowerDimensionalLagrangeBases) {  // runs the face-trace helper for all five faces at orders 1 to 5
+    const PyramidFace faces[] = {
+        PyramidFace::Base,
+        PyramidFace::South,
+        PyramidFace::East,
+        PyramidFace::North,
+        PyramidFace::West,
+    };
+
+    for (int order = 1; order <= 5; ++order) {
+        for (const auto face : faces) {
+            expect_pyramid_face_trace_matches_lower_basis(
+                order, face, face == PyramidFace::Base ? Real(2e-10) : Real(5e-10));  // Base gets 2e-10, the other four faces 5e-10
+        }
+    }
+}
+
+TEST(LagrangeBasis, PyramidEdgeTracesMatchLineLagrangeBasis) {  // runs the edge-trace helper for all eight edges at orders 1 to 5
+    const PyramidEdge edges[] = {  // four Base* edges and four Vertical* edges
+        PyramidEdge::BaseSouth,
+        PyramidEdge::BaseEast,
+        PyramidEdge::BaseNorth,
+        PyramidEdge::BaseWest,
+        PyramidEdge::VerticalSW,
+        PyramidEdge::VerticalSE,
+        PyramidEdge::VerticalNE,
+        PyramidEdge::VerticalNW,
+    };
+
+    for (int order = 1; order <= 5; ++order) {
+        for (const auto edge : edges) {
+            expect_pyramid_edge_trace_matches_line_basis(order, edge, Real(5e-10));
+        }
+    }
+}
+
+TEST(LagrangeBasis, Pyramid14RationalNodalAndPartition) {  // Pyramid14 at order 2: size, nodal delta property and partition of unity
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    EXPECT_EQ(basis.dimension(), 3);
+    EXPECT_EQ(basis.size(), 14u);
+
+    // Kronecker nodal property at all Pyramid14 nodes
+    for (std::size_t i = 0; i < basis.size(); ++i) {
+        auto xi = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);  // nodes come from the layout table, not from basis.nodes()
+        std::vector<Real> vals;
+        basis.evaluate_values(xi, vals);
+        ASSERT_EQ(vals.size(), basis.size());  // guards the indexing in the inner loop
+        for (std::size_t j = 0; j < basis.size(); ++j) {
+            const double expected = (i == j) ? 1.0 : 0.0;
+            EXPECT_NEAR(vals[j], expected, 1e-12);
+        }
+    }
+
+    // Partition of unity at an interior point
+    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.3)};
+    std::vector<Real> vals;
+    basis.evaluate_values(xi, vals);
+    const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+}
+
+TEST(LagrangeBasis, Pyramid14GradientSumZero) {
+    // The Pyramid14 gradients must sum to zero in each of the three components.
+    const svmp::FE::math::Vector<Real, 3> sample{Real(0.15), Real(-0.1), Real(0.3)};
+    LagrangeBasis pyramid(ElementType::Pyramid14, 2);
+    std::vector<Gradient> gradients;
+    pyramid.evaluate_gradients(sample, gradients);
+    ASSERT_EQ(gradients.size(), pyramid.size());
+
+    Gradient total{};
+    for (std::size_t i = 0; i < gradients.size(); ++i) {
+        for (std::size_t axis = 0; axis < 3u; ++axis) {
+            total[axis] += gradients[i][axis];
+        }
+    }
+    for (std::size_t axis = 0; axis < 3u; ++axis) {
+        EXPECT_NEAR(total[axis], 0.0, 1e-8);
+    }
+}
+
+TEST(LagrangeBasis, HigherOrderP4KroneckerAndPartition) {  // order-4 bases: partition of unity at one point, delta property at every node
+    struct Case {
+        ElementType type;
+        int order;
+        svmp::FE::math::Vector<Real, 3> xi;  // point used for the partition-of-unity check
+    };
+
+    const std::vector<Case> cases = {
+        {ElementType::Line2, 4, {Real(0.11), Real(0), Real(0)}},
+        {ElementType::Quad4, 4, {Real(0.2), Real(-0.3), Real(0)}},
+        {ElementType::Triangle3, 4, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Hex8, 4, {Real(0.2), Real(-0.3), Real(0.4)}},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+
+        // Partition of unity at an interior point
+        std::vector<Real> values;  // reused for the per-node evaluations below
+        basis.evaluate_values(c.xi, values);
+        const double sum = std::accumulate(values.begin(), values.end(), 0.0);
+        EXPECT_NEAR(sum, 1.0, 1e-12);
+
+        // Kronecker delta property at all nodes
+        const auto& nodes = basis.nodes();
+        ASSERT_EQ(nodes.size(), basis.size());
+        for (std::size_t i = 0; i < nodes.size(); ++i) {
+            basis.evaluate_values(nodes[i], values);
+            ASSERT_EQ(values.size(), nodes.size());  // guards the indexing in the inner loop
+            for (std::size_t j = 0; j < nodes.size(); ++j) {
+                const double expected = (i == j) ? 1.0 : 0.0;
+                EXPECT_NEAR(values[j], expected, 1e-12);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, Pyramid14InterpolatesQuadraticPolynomials) {  // nodal interpolation must reproduce each monomial listed at the bottom
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    const std::size_t n = basis.size();
+
+    // Precompute nodal coordinates
+    std::vector<svmp::FE::math::Vector<Real,3>> nodes;
+    nodes.reserve(n);
+    for (std::size_t i = 0; i < n; ++i) {
+        nodes.push_back(ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i));
+    }
+
+    auto interpolate_and_check = [&](auto f, Real tol) {  // f(x, y, z) is sampled at the nodes, then compared at the test points within tol
+        // Nodal coefficients
+        std::vector<Real> coeffs(n);
+        for (std::size_t i = 0; i < n; ++i) {
+            const auto& x = nodes[i];
+            coeffs[i] = f(x[0], x[1], x[2]);
+        }
+
+        // Test at a few interior points
+        const svmp::FE::math::Vector<Real,3> test_pts[] = {
+            {Real(0.1), Real(-0.2), Real(0.2)},
+            {Real(-0.2), Real(0.15), Real(0.4)},
+            {Real(0.05), Real(0.05), Real(0.3)}
+        };
+
+        for (const auto& xi : test_pts) {
+            std::vector<Real> vals;
+            basis.evaluate_values(xi, vals);
+            ASSERT_EQ(vals.size(), n);  // on failure this returns from the lambda only, not from the whole test
+
+            Real u_interp = Real(0);  // sum over i of coeffs[i] * vals[i]
+            for (std::size_t i = 0; i < n; ++i) {
+                u_interp += coeffs[i] * vals[i];
+            }
+
+            const Real u_exact = f(xi[0], xi[1], xi[2]);
+            EXPECT_NEAR(u_interp, u_exact, tol);
+        }
+    };
+
+    // Constant, linear and quadratic monomials
+    interpolate_and_check([](Real, Real, Real) { return Real(1); }, Real(1e-12));
+    interpolate_and_check([](Real x, Real, Real) { return x; }, Real(1e-11));
+    interpolate_and_check([](Real, Real y, Real) { return y; }, Real(1e-11));
+    interpolate_and_check([](Real, Real, Real z) { return z; }, Real(1e-11));
+    interpolate_and_check([](Real x, Real y, Real) { return x * y; }, Real(1e-10));
+    interpolate_and_check([](Real x, Real, Real z) { return x * z; }, Real(1e-10));
+    interpolate_and_check([](Real, Real y, Real z) { return y * z; }, Real(1e-10));
+    interpolate_and_check([](Real x, Real, Real) { return x * x; }, Real(1e-10));
+    interpolate_and_check([](Real, Real y, Real) { return y * y; }, Real(1e-10));
+    interpolate_and_check([](Real, Real, Real z) { return z * z; }, Real(1e-10));
+}
+
+TEST(LagrangeBasis, Pyramid14GradientMatchesLinearFunctionGradient) {  // gradient of the interpolant of a linear f must equal grad f = (a, b, c)
+    using svmp::FE::basis::ReferenceNodeLayout;
+
+    LagrangeBasis basis(ElementType::Pyramid14, 2);
+    const std::size_t n = basis.size();
+
+    // Coefficients of the linear test function f(x,y,z) = ax + by + cz
+    const Real a = Real(1.2);
+    const Real b = Real(-0.7);
+    const Real c = Real(0.5);
+
+    std::vector<Real> coeffs(n);  // nodal samples of f
+    for (std::size_t i = 0; i < n; ++i) {
+        const auto x = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
+        coeffs[i] = a * x[0] + b * x[1] + c * x[2];
+    }
+
+    const svmp::FE::math::Vector<Real,3> xi{Real(0.1), Real(-0.15), Real(0.35)};  // single evaluation point
+
+    std::vector<Gradient> grads;
+    basis.evaluate_gradients(xi, grads);
+    ASSERT_EQ(grads.size(), n);
+
+    Gradient g_interp{};  // accumulator; relies on Gradient{} starting at zero
+    for (std::size_t i = 0; i < n; ++i) {
+        g_interp[0] += coeffs[i] * grads[i][0];
+        g_interp[1] += coeffs[i] * grads[i][1];
+        g_interp[2] += coeffs[i] * grads[i][2];
+    }
+
+    EXPECT_NEAR(g_interp[0], a, 1e-6);  // looser than the 1e-10..1e-12 used for values in the interpolation test
+    EXPECT_NEAR(g_interp[1], b, 1e-6);
+    EXPECT_NEAR(g_interp[2], c, 1e-6);
+}
+
+TEST(LagrangeBasis, PyramidApexValuesRemainExactAcrossRepresentativeOrders) {  // at the apex (0,0,1) only the apex basis function is 1, all others 0
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},  // NOTE(review): Pyramid5 with order 4 - presumably the type only selects the topology; confirm against the LagrangeBasis constructor
+    };
+
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> values;
+        basis.evaluate_values(apex, values);
+        ASSERT_EQ(values.size(), basis.size());
+
+        const auto& nodes = basis.nodes();
+        auto apex_it = std::find_if(  // locate the node sitting at (0,0,1) instead of assuming its index
+            nodes.begin(), nodes.end(),
+            [](const auto& node) {
+                return std::abs(node[0]) <= Real(1e-14) &&
+                       std::abs(node[1]) <= Real(1e-14) &&
+                       std::abs(node[2] - Real(1)) <= Real(1e-14);
+            });
+        ASSERT_NE(apex_it, nodes.end());
+        const std::size_t apex_index = static_cast<std::size_t>(
+            std::distance(nodes.begin(), apex_it));
+
+        Real sum = Real(0);  // running partition-of-unity sum
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_TRUE(std::isfinite(static_cast<double>(values[i])));  // no NaN/Inf at the apex
+            sum += values[i];
+            const Real expected = (i == apex_index) ? Real(1) : Real(0);
+            EXPECT_NEAR(values[i], expected, 1e-12)
+                << "order " << c.order << ", basis " << i;
+        }
+        EXPECT_NEAR(sum, Real(1), 1e-12);
+    }
+}
+
+TEST(LagrangeBasis, PyramidGradientAtExactApexThrowsWhenLimitIsNotUnique) {  // evaluate_gradients at exactly (0,0,1) must throw BasisEvaluationException
+    const struct Case {
+        ElementType type;
+        int order;
+    } cases[] = {
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+        {ElementType::Pyramid5, 4},  // same three (type, order) pairs as the apex-value test
+    };
+
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Gradient> gradients;
+        EXPECT_THROW(basis.evaluate_gradients(apex, gradients), svmp::FE::basis::BasisEvaluationException)
+            << "order " << c.order;
+    }
+}
+
+TEST(LagrangeBasis, PyramidApexValuesMatchDirectionalNearApexLimits) {  // values a distance t below the apex must agree with the values at the apex itself
+    const struct Case {
+        ElementType type;
+        int order;
+        Real tol;  // absolute tolerance for |value(xi) - value(apex)|
+    } cases[] = {
+        {ElementType::Pyramid5, 1, Real(3e-6)},
+        {ElementType::Pyramid14, 2, Real(4e-6)},
+        {ElementType::Pyramid5, 4, Real(1e-5)},
+    };
+
+    const std::array<std::array<Real, 2>, 4> directions = {{  // (x, y) offsets, scaled by t below
+        {Real(0), Real(0)},
+        {Real(0.35), Real(-0.25)},
+        {Real(-0.50), Real(0.45)},
+        {Real(0.20), Real(0.60)},
+    }};
+    const Real t = Real(1e-6);  // distance from the apex along z; tolerances above are a few multiples of t
+    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        std::vector<Real> apex_values;  // reference values evaluated exactly at the apex
+        basis.evaluate_values(apex, apex_values);
+
+        for (const auto& direction : directions) {
+            const svmp::FE::math::Vector<Real, 3> xi{  // point (t*dx, t*dy, 1 - t) approaching the apex
+                t * direction[0],
+                t * direction[1],
+                Real(1) - t
+            };
+
+            std::vector<Real> values;
+            basis.evaluate_values(xi, values);
+            ASSERT_EQ(values.size(), apex_values.size());
+
+            for (std::size_t i = 0; i < values.size(); ++i) {
+                EXPECT_NEAR(values[i], apex_values[i], c.tol)
+                    << "order " << c.order
+                    << ", basis " << i
+                    << ", direction (" << direction[0] << ", " << direction[1] << ")";
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, PyramidNearApexGradientShowsDirectionalSpread) {  // gradients just below the apex must differ measurably between approach directions
+    const struct Case {
+        ElementType type;
+        int order;
+        Real min_spread;  // lower bound required for the largest (max - min) gradient component
+    } cases[] = {
+        {ElementType::Pyramid5, 1, Real(5e-2)},
+        {ElementType::Pyramid14, 2, Real(5e-2)},
+    };
+
+    const std::array<std::array<Real, 2>, 4> directions = {{  // (x, y) offsets, scaled by t below
+        {Real(0), Real(0)},
+        {Real(0.45), Real(-0.30)},
+        {Real(-0.35), Real(0.40)},
+        {Real(0.25), Real(0.55)},
+    }};
+    const Real t = Real(1e-6);  // distance from the apex along z
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        double max_spread = 0.0;  // largest spread over all basis functions and components
+
+        std::vector<std::vector<Gradient>> directional_gradients;  // one gradient set per direction
+        directional_gradients.reserve(directions.size());
+        for (const auto& direction : directions) {
+            const svmp::FE::math::Vector<Real, 3> xi{  // point (t*dx, t*dy, 1 - t)
+                t * direction[0],
+                t * direction[1],
+                Real(1) - t
+            };
+
+            std::vector<Gradient> gradients;
+            basis.evaluate_gradients(xi, gradients);
+            directional_gradients.push_back(std::move(gradients));
+        }
+
+        for (std::size_t i = 0; i < basis.size(); ++i) {  // every basis function
+            for (int d = 0; d < 3; ++d) {  // every gradient component
+                double min_value = std::numeric_limits<double>::infinity();
+                double max_value = -std::numeric_limits<double>::infinity();
+                for (const auto& gradients : directional_gradients) {
+                    const double value = static_cast<double>(gradients[i][static_cast<std::size_t>(d)]);
+                    min_value = std::min(min_value, value);
+                    max_value = std::max(max_value, value);
+                }
+                max_spread = std::max(max_spread, max_value - min_value);
+            }
+        }
+
+        EXPECT_GT(max_spread, static_cast<double>(c.min_spread))
+            << "order " << c.order;
+    }
+}
+
+TEST(LagrangeBasis, GradientSumZeroQuadAndTet) {  // order-1 gradients must sum to zero component-wise at one point per element
+    const std::vector<std::pair<ElementType, svmp::FE::math::Vector<Real, 3>>> cases = {  // (element type, evaluation point)
+        {ElementType::Quad4, svmp::FE::math::Vector<Real, 3>{Real(0.2), Real(-0.1), Real(0)}},
+        {ElementType::Tetra4, svmp::FE::math::Vector<Real, 3>{Real(0.1), Real(0.2), Real(0.1)}}
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.first, 1);
+        std::vector<Gradient> grads;
+        basis.evaluate_gradients(c.second, grads);
+
+        ASSERT_EQ(grads.size(), basis.size());
+        Gradient sum{};  // accumulator; relies on Gradient{} starting at zero
+        for (const auto& g : grads) {
+            sum[0] += g[0];
+            sum[1] += g[1];
+            sum[2] += g[2];
+        }
+        EXPECT_NEAR(sum[0], 0.0, 1e-12);
+        EXPECT_NEAR(sum[1], 0.0, 1e-12);
+        EXPECT_NEAR(sum[2], 0.0, 1e-12);  // checked for Quad4 as well, so its third gradient component must sum to zero too
+    }
+}
+
+TEST(LagrangeBasis, HexPartitionAndGradientSumZeroOrderThree) {  // order-3 hex basis: values sum to 1, gradients sum to 0
+    LagrangeBasis basis(ElementType::Hex8, 3);
+    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.25)};  // single evaluation point
+
+    std::vector<Real> values;
+    basis.evaluate_values(xi, values);
+    const double sum = std::accumulate(values.begin(), values.end(), 0.0);  // 0.0 seed makes the accumulation run in double
+    EXPECT_NEAR(sum, 1.0, 1e-12);
+
+    std::vector<Gradient> grads;
+    basis.evaluate_gradients(xi, grads);
+    Gradient gsum{};  // accumulator; relies on Gradient{} starting at zero
+    for (const auto& g : grads) {
+        gsum[0] += g[0];
+        gsum[1] += g[1];
+        gsum[2] += g[2];
+    }
+    EXPECT_NEAR(gsum[0], 0.0, 1e-10);  // gradient sums use a looser tolerance than the value sum
+    EXPECT_NEAR(gsum[1], 0.0, 1e-10);
+    EXPECT_NEAR(gsum[2], 0.0, 1e-10);
+}
+
+TEST(LagrangeBasis, OracleLine3ValuesGradientsAndHessians) {  // compares Line3 output at xi = 0.2 against hand-computed exact fractions
+    LagrangeBasis basis(ElementType::Line3, 2);
+    const Point xi{Real(0.2), Real(0), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 3u);
+    ASSERT_EQ(gradients.size(), 3u);
+    ASSERT_EQ(hessians.size(), 3u);
+
+    const Real expected_values[] = {Real(-2) / Real(25), Real(3) / Real(25), Real(24) / Real(25)};  // x(x-1)/2, x(x+1)/2, 1-x^2 at x = 0.2
+    const Real expected_gradients[] = {Real(-3) / Real(10), Real(7) / Real(10), Real(-2) / Real(5)};  // x-1/2, x+1/2, -2x at x = 0.2
+    const Real expected_hessians[] = {Real(1), Real(1), Real(-2)};  // constant second derivatives of the three quadratics
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i], 1e-14);  // only the first gradient component is checked
+        EXPECT_NEAR(hessians[i](0, 0), expected_hessians[i], 1e-14);  // only the (0,0) Hessian entry is checked
+    }
+}
+
+TEST(LagrangeBasis, OracleTriangle3ValuesGradientsAndHessians) {  // compares Triangle3 output at (0.2, 0.3) against exact values
+    LagrangeBasis basis(ElementType::Triangle3, 1);
+    const Point xi{Real(0.2), Real(0.3), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 3u);  // NOTE(review): gradients/hessians sizes are not asserted here, unlike the Line3 oracle
+    const Point expected_gradients[] = {  // constant gradients of 1-x-y, x, y
+        Point{Real(-1), Real(-1), Real(0)},
+        Point{Real(1), Real(0), Real(0)},
+        Point{Real(0), Real(1), Real(0)}
+    };
+    const Real expected_values[] = {Real(0.5), Real(0.2), Real(0.3)};  // 1-x-y, x, y at (0.2, 0.3)
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        for (int a = 0; a < 2; ++a) {  // the 2x2 in-plane Hessian block is expected to be zero
+            for (int b = 0; b < 2; ++b) {
+                EXPECT_NEAR(hessians[i](static_cast<std::size_t>(a), static_cast<std::size_t>(b)),
+                            Real(0), 1e-14);
+            }
+        }
+    }
+}
+
+TEST(LagrangeBasis, OracleQuad4ValuesGradientsAndHessians) {  // compares Quad4 output at (0.2, -0.4) against exact fractions
+    LagrangeBasis basis(ElementType::Quad4, 1);
+    const Point xi{Real(0.2), Real(-0.4), Real(0)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 4u);  // NOTE(review): gradients/hessians sizes are not asserted here, unlike the Line3 oracle
+    const Real expected_values[] = {Real(7) / Real(25), Real(21) / Real(50),  // (1-x)(1-y)/4, (1+x)(1-y)/4,
+                                    Real(9) / Real(50), Real(3) / Real(25)};  // (1+x)(1+y)/4, (1-x)(1+y)/4
+    const Point expected_gradients[] = {  // (d/dx, d/dy, 0) of the four bilinear functions
+        Point{Real(-7) / Real(20), Real(-1) / Real(5), Real(0)},
+        Point{Real(7) / Real(20), Real(-3) / Real(10), Real(0)},
+        Point{Real(3) / Real(20), Real(3) / Real(10), Real(0)},
+        Point{Real(-3) / Real(20), Real(1) / Real(5), Real(0)}
+    };
+    const Real expected_hxy[] = {Real(1) / Real(4), Real(-1) / Real(4),  // mixed derivative d2/dxdy, the only non-zero Hessian entry
+                                 Real(1) / Real(4), Real(-1) / Real(4)};
+
+    for (std::size_t i = 0; i < 4; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 1), expected_hxy[i], 1e-14);  // both off-diagonal entries are checked, i.e. symmetry
+        EXPECT_NEAR(hessians[i](1, 0), expected_hxy[i], 1e-14);
+    }
+}
+
+TEST(LagrangeBasis, OracleWedge6ValuesGradientsAndHessians) {  // compares Wedge6 output at (0.2, 0.25, -0.3) against exact fractions
+    LagrangeBasis basis(ElementType::Wedge6, 1);
+    const Point xi{Real(0.2), Real(0.25), Real(-0.3)};
+
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    basis.evaluate_hessians(xi, hessians);
+
+    ASSERT_EQ(values.size(), 6u);  // NOTE(review): gradients/hessians sizes are not asserted here, unlike the Line3 oracle
+    const Real expected_values[] = {  // triangle factor (1-x-y, x, y) times (1-z)/2 for the first three, (1+z)/2 for the last three
+        Real(143) / Real(400), Real(13) / Real(100), Real(13) / Real(80),
+        Real(77) / Real(400), Real(7) / Real(100), Real(7) / Real(80)
+    };
+    const Point expected_gradients[] = {  // (d/dx, d/dy, d/dz) per basis function
+        Point{Real(-13) / Real(20), Real(-13) / Real(20), Real(-11) / Real(40)},
+        Point{Real(13) / Real(20), Real(0), Real(-1) / Real(10)},
+        Point{Real(0), Real(13) / Real(20), Real(-1) / Real(8)},
+        Point{Real(-7) / Real(20), Real(-7) / Real(20), Real(11) / Real(40)},
+        Point{Real(7) / Real(20), Real(0), Real(1) / Real(10)},
+        Point{Real(0), Real(7) / Real(20), Real(1) / Real(8)}
+    };
+    const Point expected_hxz[] = {  // component 0 = d2/dxdz, component 1 = d2/dydz, component 2 unused
+        Point{Real(1) / Real(2), Real(1) / Real(2), Real(0)},
+        Point{Real(-1) / Real(2), Real(0), Real(0)},
+        Point{Real(0), Real(-1) / Real(2), Real(0)},
+        Point{Real(-1) / Real(2), Real(-1) / Real(2), Real(0)},
+        Point{Real(1) / Real(2), Real(0), Real(0)},
+        Point{Real(0), Real(1) / Real(2), Real(0)}
+    };
+
+    for (std::size_t i = 0; i < 6; ++i) {
+        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
+        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
+        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
+        EXPECT_NEAR(gradients[i][2], expected_gradients[i][2], 1e-14);
+        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);  // pure second derivatives are expected to vanish
+        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](2, 2), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 1), Real(0), 1e-14);  // in-plane mixed derivative is expected to vanish
+        EXPECT_NEAR(hessians[i](1, 0), Real(0), 1e-14);
+        EXPECT_NEAR(hessians[i](0, 2), expected_hxz[i][0], 1e-14);  // xz entry, checked on both sides of the diagonal
+        EXPECT_NEAR(hessians[i](2, 0), expected_hxz[i][0], 1e-14);
+        EXPECT_NEAR(hessians[i](1, 2), expected_hxz[i][1], 1e-14);  // yz entry, checked on both sides of the diagonal
+        EXPECT_NEAR(hessians[i](2, 1), expected_hxz[i][1], 1e-14);
+    }
+}
+
+TEST(LagrangeBasis, DeterministicBoundarySweepMaintainsPartitionAndFiniteDerivatives) {  // at fixed stress points: finite values/derivatives and values summing to 1
+    const std::vector<std::pair<ElementType, int>> cases = {  // (element type, polynomial order): linear and quadratic variant of each shape
+        {ElementType::Line2, 1},
+        {ElementType::Line3, 2},
+        {ElementType::Triangle3, 1},
+        {ElementType::Triangle6, 2},
+        {ElementType::Quad4, 1},
+        {ElementType::Quad9, 2},
+        {ElementType::Tetra4, 1},
+        {ElementType::Tetra10, 2},
+        {ElementType::Hex8, 1},
+        {ElementType::Hex27, 2},
+        {ElementType::Wedge6, 1},
+        {ElementType::Wedge18, 2},
+        {ElementType::Pyramid5, 1},
+        {ElementType::Pyramid14, 2},
+    };
+
+    for (const auto& [type, order] : cases) {
+        LagrangeBasis basis(type, order);
+        for (const auto& xi : boundary_stress_points_for(type)) {  // helper defined elsewhere in this file
+            std::vector<Real> values;
+            std::vector<Gradient> gradients;
+            std::vector<Hessian> hessians;
+            basis.evaluate_values(xi, values);
+            basis.evaluate_gradients(xi, gradients);
+            basis.evaluate_hessians(xi, hessians);
+
+            ASSERT_EQ(values.size(), basis.size());
+            ASSERT_EQ(gradients.size(), basis.size());
+            ASSERT_EQ(hessians.size(), basis.size());
+
+            Real sum = Real(0);  // partition-of-unity accumulator
+            for (Real value : values) {
+                EXPECT_TRUE(std::isfinite(value));
+                sum += value;
+            }
+            expect_all_finite(gradients);  // helper defined elsewhere in this file
+            expect_hessians_finite(hessians, basis.dimension());  // helper defined elsewhere in this file
+            EXPECT_NEAR(sum, Real(1), type == ElementType::Pyramid5 || type == ElementType::Pyramid14  // pyramids get a looser tolerance
+                                       ? Real(1e-8)
+                                       : Real(1e-12))
+                << "type=" << static_cast<int>(type)
+                << ", order=" << order
+                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
+        }
+    }
+}
+
+TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedLinearShapes) {  // runs the finite-difference gradient check on every order-1 shape
+    const std::vector<LagrangeAccuracyCase> cases = {  // {type, order, sample points}; sample_points_for is defined elsewhere in this file
+        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
+        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
+        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
+        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
+        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
+        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
+        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
+    };
+
+    for (const auto& c : cases) {
+        expect_gradients_match_finite_difference(c, Real(1e-6), Real(1e-6));  // NOTE(review): presumably (case, FD step, tolerance) - confirm against the helper
+    }
+}
+
+TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedQuadraticShapes) {  // runs the finite-difference gradient check on every order-2 shape
+    const std::vector<LagrangeAccuracyCase> cases = {  // {type, order, sample points}; sample_points_for is defined elsewhere in this file
+        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
+        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
+        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
+        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
+        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
+        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
+        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
+    };
+
+    for (const auto& c : cases) {
+        expect_gradients_match_finite_difference(c, Real(1e-6), Real(2e-6));  // last argument is 2e-6 here versus 1e-6 in the linear-shape test
+    }
+}
+
+TEST(LagrangeBasis, LinearPolynomialReproductionAcrossSupportedLinearShapes) {  // order-1 bases must reproduce 1, x, y, z (restricted to the element's dimension)
+    const std::vector<LagrangeAccuracyCase> cases = {  // {type, order, sample points}
+        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
+        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
+        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
+        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
+        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
+        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
+        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
+    };
+
+    const std::vector<std::array<int, 3>> exponents = {  // monomial exponents (px, py, pz), ordered so a prefix covers 1D, 2D, 3D
+        {0, 0, 0},
+        {1, 0, 0},
+        {0, 1, 0},
+        {0, 0, 1},
+    };
+
+    for (const auto& c : cases) {
+        const std::vector<std::array<int, 3>> relevant(  // prefix of `exponents`: 2 entries for Line2, 3 for Triangle3/Quad4, else all 4
+            exponents.begin(),
+            exponents.begin() + static_cast<std::ptrdiff_t>(c.type == ElementType::Line2 ? 2 :
+                                                            (c.type == ElementType::Triangle3 ||
+                                                             c.type == ElementType::Quad4) ? 3 : 4));
+        expect_polynomial_reproduction(c, relevant, Real(1e-12));  // helper defined elsewhere in this file
+    }
+}
+
+TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossSupportedQuadraticShapes) {  // order-2 bases must reproduce all monomials of total degree <= 2
+    const std::vector<LagrangeAccuracyCase> cases = {  // {type, order, sample points}
+        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
+        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
+        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
+        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
+        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
+        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
+        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
+    };
+
+    const std::vector<std::array<int, 3>> line_exponents = {  // 1, x, x^2
+        {0, 0, 0}, {1, 0, 0}, {2, 0, 0}
+    };
+    const std::vector<std::array<int, 3>> surface_exponents = {  // 1, x, y, x^2, xy, y^2
+        {0, 0, 0}, {1, 0, 0}, {0, 1, 0},
+        {2, 0, 0}, {1, 1, 0}, {0, 2, 0}
+    };
+    const std::vector<std::array<int, 3>> volume_exponents = {  // all ten monomials of total degree <= 2 in x, y, z
+        {0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
+        {2, 0, 0}, {1, 1, 0}, {0, 2, 0},
+        {1, 0, 1}, {0, 1, 1}, {0, 0, 2}
+    };
+
+    for (const auto& c : cases) {  // pick the exponent set by element dimension; tolerance loosens with dimension
+        if (c.type == ElementType::Line3) {
+            expect_polynomial_reproduction(c, line_exponents, Real(1e-12));
+        } else if (c.type == ElementType::Triangle6 || c.type == ElementType::Quad9) {
+            expect_polynomial_reproduction(c, surface_exponents, Real(1e-11));
+        } else {
+            expect_polynomial_reproduction(c, volume_exponents, Real(2e-10));  // every remaining case is a 3D element
+        }
+    }
+}
+
+TEST(LagrangeBasis, HighOrderTensorLagrangeMaintainsPartitionAndDerivativeSums) {  // orders 8/7/6 on line/quad/hex: value and derivative sums stay consistent
+    const std::vector<LagrangeAccuracyCase> cases = {  // each point list includes points with coordinates at exactly +/-1
+        {ElementType::Line2, 8, {Point{-0.875, 0, 0}, Point{0.125, 0, 0}, Point{1, 0, 0}}},
+        {ElementType::Quad4, 7, {Point{0.2, -0.35, 0}, Point{-1, 0.5, 0}, Point{0.5, 1, 0}}},
+        {ElementType::Hex8, 6, {Point{0.1, -0.2, 0.3}, Point{-1, 0.5, 1}, Point{0.75, -1, -0.5}}},
+    };
+
+    for (const auto& c : cases) {
+        LagrangeBasis basis(c.type, c.order);
+        expect_partition_gradient_hessian_sums(basis, c.points, Real(2e-12), Real(2e-8));  // NOTE(review): presumably (value tolerance, derivative tolerance) - confirm against the helper
+    }
+}
+
+TEST(LagrangeBasis, HighOrderTensorLagrangeReproducesTensorPolynomials) {  // high-order line/quad/hex bases must reproduce monomials up to their order per axis
+    const LagrangeAccuracyCase line{ElementType::Line2,
+                                    8,
+                                    {Point{-0.73, 0, 0}, Point{-0.1, 0, 0}, Point{0.64, 0, 0}}};
+    expect_polynomial_reproduction(line,
+                                   {{0, 0, 0}, {1, 0, 0}, {4, 0, 0}, {8, 0, 0}},  // 1, x, x^4, x^8
+                                   Real(1e-11));
+
+    const LagrangeAccuracyCase quad{ElementType::Quad4,
+                                    7,
+                                    {Point{-0.6, -0.2, 0}, Point{0.15, 0.45, 0}, Point{0.8, -0.55, 0}}};
+    expect_polynomial_reproduction(quad,
+                                   {{0, 0, 0}, {7, 0, 0}, {0, 7, 0}, {4, 3, 0}},  // 1, x^7, y^7, x^4 y^3
+                                   Real(5e-10));
+
+    const LagrangeAccuracyCase hex{ElementType::Hex8,
+                                   6,
+                                   {Point{-0.4, 0.2, -0.3}, Point{0.35, -0.55, 0.25}, Point{0.75, 0.4, -0.65}}};
+    expect_polynomial_reproduction(hex,
+                                   {{0, 0, 0}, {6, 0, 0}, {0, 6, 0}, {0, 0, 6}, {3, 2, 4}},  // total degree 9, but each exponent is <= 6
+                                   Real(2e-9));
+}
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
new file mode 100644
index 000000000..9f2bf8be5
--- /dev/null
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -0,0 +1,116 @@
+/**
+ * @file test_SerendipityTensorModal.cpp
+ * @brief Tests for the migrated Serendipity basis subset.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Basis/NodeOrderingConventions.h"
+#include "FE/Basis/SerendipityBasis.h"
+
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
+
+namespace {
+
+void expect_partition_of_unity(const SerendipityBasis& basis,  // checks sum_i N_i(xi) == 1 and sum_i grad N_i(xi) == 0 within `tolerance`
+                               const math::Vector<Real, 3>& xi,
+                               Real tolerance = Real(1e-10))
+{
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    basis.evaluate_values(xi, values);
+    basis.evaluate_gradients(xi, gradients);
+    ASSERT_EQ(gradients.size(), values.size());  // the loop below indexes gradients[i] for every value; a size mismatch would read out of bounds
+    Real value_sum = Real(0);
+    Gradient gradient_sum{};  // accumulator; relies on Gradient{} starting at zero
+    for (std::size_t i = 0; i < values.size(); ++i) {
+        value_sum += values[i];
+        for (std::size_t component = 0; component < 3u; ++component) {  // all three components are accumulated regardless of dimension
+            gradient_sum[component] += gradients[i][component];
+        }
+    }
+
+    EXPECT_NEAR(value_sum, Real(1), tolerance);
+    for (int component = 0; component < basis.dimension(); ++component) {  // only the first dimension() components are asserted
+        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(component)],
+                    Real(0),
+                    tolerance);
+    }
+}
+
+void expect_nodal_delta(const SerendipityBasis& basis,  // checks the Kronecker-delta property: N_dof(nodes[node]) == (dof == node)
+                        const std::vector<math::Vector<Real, 3>>& nodes,
+                        Real tolerance)
+{
+    ASSERT_EQ(nodes.size(), basis.size());  // one node per basis function is required for the comparison below
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        std::vector<Real> values;
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());
+        for (std::size_t dof = 0; dof < values.size(); ++dof) {
+            EXPECT_NEAR(values[dof], dof == node ? Real(1) : Real(0), tolerance)
+                << "node=" << node << " dof=" << dof;
+        }
+    }
+}
+
+std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,  // returns the first `count` reference node coordinates of `type`
+                                                   std::size_t count)
+{
+    std::vector<math::Vector<Real, 3>> coords;
+    coords.reserve(count);  // single allocation for all requested nodes
+    for (std::size_t index = 0; index != count; ++index) {
+        coords.push_back(ReferenceNodeLayout::get_node_coords(type, index));
+    }
+    return coords;
+}
+
+} // namespace
+
+TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {  // 8-function serendipity quad: nodal delta property plus partition of unity
+    SerendipityBasis basis(ElementType::Quad8, 2);
+
+    EXPECT_EQ(basis.size(), 8u);
+    expect_nodal_delta(basis, basis.nodes(), Real(1e-10));  // uses the basis' own node list; the 3D tests below use ReferenceNodeLayout instead
+    expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)});
+}
+
+TEST(SerendipityBasis, Hex20IsNodalAndPartitionsUnity) {  // 20-function serendipity hex: nodal delta property plus partition of unity
+    SerendipityBasis basis(ElementType::Hex20, 2);
+
+    EXPECT_EQ(basis.size(), 20u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Hex20, basis.size()),  // nodes taken from ReferenceNodeLayout, so the ordering convention is tested too
+                       Real(1e-10));
+    expect_partition_of_unity(basis, {Real(0.2), Real(-0.1), Real(0.3)});
+}
+
+TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {  // 15-function serendipity wedge: nodal delta property plus partition of unity
+    SerendipityBasis basis(ElementType::Wedge15, 2);
+
+    EXPECT_EQ(basis.size(), 15u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Wedge15, basis.size()),
+                       Real(1e-9));  // looser than the 1e-10 used for Quad8/Hex20
+    expect_partition_of_unity(basis, {Real(0.2), Real(0.3), Real(0.1)});
+}
+
+TEST(SerendipityBasis, Pyramid13IsNodalAndPartitionsUnity) {  // 13-function serendipity pyramid: nodal delta property plus partition of unity
+    SerendipityBasis basis(ElementType::Pyramid13, 2);
+
+    EXPECT_EQ(basis.size(), 13u);
+    expect_nodal_delta(basis,
+                       reference_nodes(ElementType::Pyramid13, basis.size()),
+                       Real(1e-8));  // loosest nodal tolerance of the four serendipity tests
+    expect_partition_of_unity(basis, {Real(0.1), Real(-0.2), Real(0.4)});
+}
+
+TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {  // construction must throw FEException for unsupported (type, order) pairs
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), FEException);  // Quad9 is rejected by SerendipityBasis
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);  // Pyramid14 is rejected by SerendipityBasis
+    EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);  // Quad8 is accepted at order 2 above but rejected at order 3
+}
+
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
new file mode 100644
index 000000000..2b44ad2bf
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -0,0 +1,265 @@
+/**
+ * @file test_DenseLinearAlgebra.cpp
+ * @brief Tests for shared dense linear algebra utilities.
+ */
+
+#include <gtest/gtest.h>
+
+#include "FE/Common/FEException.h"
+#include "FE/Math/DenseLinearAlgebra.h"
+
+#include <cmath>
+#include <span>
+#include <vector>
+
+using namespace svmp::FE;
+using namespace svmp::FE::math;
+
+namespace {
+
+// Entry (row, col) of the product A * B; both are indexed as n-by-n, row-major.
+Real multiply_entry(const std::vector<Real>& A, const std::vector<Real>& B,
+                    std::size_t n, std::size_t row, std::size_t col) {
+    Real acc = Real(0);
+    std::size_t a_pos = row * n;  // walks along row `row` of A
+    std::size_t b_pos = col;      // walks down column `col` of B
+    for (std::size_t k = 0; k < n; ++k, ++a_pos, b_pos += n) {
+        acc += A[a_pos] * B[b_pos];
+    }
+    return acc;
+}
+
+} // namespace
+
+TEST(DenseLinearAlgebra, InvertsScaledMatrix) {
+    // Entries range from 4 to 1e9; A * inverse must still match the identity.
+    const std::vector<Real> A{
+        Real(1.0e9), Real(2.0e6),
+        Real(3.0e3), Real(4.0)
+    };
+    const auto inverse = invert_dense_matrix(A, 2u, "scaled 2x2");
+    for (std::size_t flat = 0; flat < 4u; ++flat) {
+        const std::size_t row = flat / 2u;
+        const std::size_t col = flat % 2u;
+        const Real identity_entry = (row == col) ? Real(1) : Real(0);
+        EXPECT_NEAR(multiply_entry(A, inverse, 2u, row, col), identity_entry, Real(1.0e-10));
+    }
+}
+
+TEST(DenseLinearAlgebra, FactorizationSolvesMultipleRightHandSides) {
+    const std::vector<Real> A{
+        Real(4), Real(2), Real(0),
+        Real(2), Real(5), Real(1),
+        Real(0), Real(1), Real(3)
+    };
+
+    // Non-fatal check that A * solution reproduces `expected`, row by row.
+    const auto expect_solves = [&A](const auto& solution, const std::vector<Real>& expected) {
+        for (std::size_t row = 0; row < 3u; ++row) {
+            Real ax = Real(0);
+            for (std::size_t col = 0; col < 3u; ++col) {
+                ax += A[row * 3u + col] * solution[col];
+            }
+            EXPECT_NEAR(ax, expected[row], Real(1.0e-12));
+        }
+    };
+
+    const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3");
+    EXPECT_EQ(solver.diagnostics.rank, 3u);
+
+    // The same factorization is used first through solve() ...
+    const std::vector<Real> rhs{Real(2), Real(4), Real(6)};
+    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    ASSERT_EQ(x.size(), 3u);
+    expect_solves(x, rhs);
+
+    // ... and then through solve_in_place(), whose argument holds the solution afterwards.
+    std::vector<Real> second_rhs{Real(1), Real(-2), Real(0.5)};
+    const auto original_second_rhs = second_rhs;
+    solver.solve_in_place(std::span<Real>(second_rhs.data(), second_rhs.size()));
+    expect_solves(second_rhs, original_second_rhs);
+}
+
+TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
+    // Two right-hand sides are stored interleaved (3 rows x 2 columns) and solved in one call.
+    const std::vector<Real> A{
+        Real(4), Real(2), Real(0),
+        Real(2), Real(5), Real(1),
+        Real(0), Real(1), Real(3)
+    };
+    const auto solver = factor_dense_matrix(A, 3u, "symmetric 3x3 block");
+
+    std::vector<Real> block{
+        Real(2), Real(1),
+        Real(4), Real(-2),
+        Real(6), Real(0.5)
+    };
+    const auto block_before_solve = block;
+    solver.solve_in_place(std::span<Real>(block.data(), block.size()), 2u);
+
+    for (std::size_t j = 0; j < 2u; ++j) {      // right-hand-side column
+        for (std::size_t i = 0; i < 3u; ++i) {  // matrix row
+            Real row_dot = Real(0);
+            for (std::size_t k = 0; k < 3u; ++k) {
+                row_dot += A[i * 3u + k] * block[k * 2u + j];
+            }
+            EXPECT_NEAR(row_dot, block_before_solve[i * 2u + j], Real(1.0e-12));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
+    // diag(1, 1e-13): full rank, but the diagonal entries differ by 13 orders of magnitude.
+    const std::vector<Real> ill_conditioned{
+        Real(1), Real(0),
+        Real(0), Real(1.0e-13)
+    };
+    const auto outcome =
+        invert_dense_matrix_with_diagnostics(ill_conditioned, 2u, "high-condition diagonal");
+    EXPECT_EQ(outcome.diagnostics.rank, 2u);
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    EXPECT_GT(outcome.diagnostics.condition_estimate,
+              dense_matrix_condition_fallback_threshold());
+    EXPECT_TRUE(outcome.used_svd_fallback);
+#else
+    EXPECT_FALSE(outcome.used_svd_fallback);
+#endif
+
+    for (std::size_t flat = 0; flat < 4u; ++flat) {
+        const std::size_t row = flat / 2u;
+        const std::size_t col = flat % 2u;
+        const Real identity_entry = (row == col) ? Real(1) : Real(0);
+        EXPECT_NEAR(multiply_entry(ill_conditioned, outcome.inverse, 2u, row, col),
+                    identity_entry,
+                    Real(1.0e-12));
+    }
+}
+
+TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "condition rejection requires FE_ENABLE_EIGEN diagnostics";
+#endif
+    // The result is filled in by hand; no matrix is inverted in this test.
+    DenseInverseResult fabricated;
+    fabricated.diagnostics.rank = 2u;
+    fabricated.diagnostics.condition_estimate =
+        dense_matrix_condition_error_threshold() * Real(10);
+    EXPECT_GT(fabricated.diagnostics.condition_estimate,
+              dense_matrix_condition_error_threshold());
+    EXPECT_THROW(validate_dense_inverse_diagnostics(
+                     fabricated, 2u, "excessive-condition diagonal"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, ThrowsForScaleAwareSingularPivot) {
+    // Row 1 is exactly half of row 0, so the matrix is singular even at 1e12 scale.
+    const std::vector<Real> halved_row{
+        Real(1.0e12), Real(2.0e12),
+        Real(0.5e12), Real(1.0e12)
+    };
+    EXPECT_THROW((void)invert_dense_matrix(halved_row, 2u, "singular 2x2"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, FactorizationThrowsForRankDeficientMatrix) {
+    // Row 1 is twice row 0; factoring must throw instead of returning a solver.
+    const std::vector<Real> doubled_row{
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+    EXPECT_THROW((void)factor_dense_matrix(doubled_row, 2u, "rank-one 2x2"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, RankUsesScaleAwareTolerance) {
+    const std::vector<Real> proportional{  // row 1 == 3 * row 0
+        Real(1.0e8), Real(2.0e8),
+        Real(3.0e8), Real(6.0e8)
+    };
+    EXPECT_EQ(dense_matrix_rank(proportional, 2u, 2u), 1u);
+
+    const std::vector<Real> perturbed{  // last entry moved from 6.0e8 to 6.1e8
+        Real(1.0e8), Real(2.0e8),
+        Real(3.0e8), Real(6.1e8)
+    };
+    EXPECT_EQ(dense_matrix_rank(perturbed, 2u, 2u), 2u);
+}
+
+TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
+    // diag(4, 0.5): the Eigen build must report singular values 4 and 0.5 and their ratio 8.
+    const std::vector<Real> well_posed{
+        Real(4), Real(0),
+        Real(0), Real(0.5)
+    };
+    const auto healthy = dense_matrix_diagnostics(well_posed, 2u, 2u, "diagonal 2x2");
+    EXPECT_EQ(healthy.rank, 2u);
+#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
+    EXPECT_NEAR(healthy.largest_singular_value, Real(4), Real(1.0e-14));
+    EXPECT_NEAR(healthy.smallest_retained_singular_value, Real(0.5), Real(1.0e-14));
+    EXPECT_NEAR(healthy.condition_estimate, Real(8), Real(1.0e-14));
+#else
+    EXPECT_TRUE(std::isinf(healthy.condition_estimate));
+#endif
+
+    const std::vector<Real> collapsed{  // row 1 == 2 * row 0
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+    const auto degenerate = dense_matrix_diagnostics(collapsed, 2u, 2u, "rank-one 2x2");
+    // The rank-deficient input must report an infinite condition estimate in either build.
+    EXPECT_EQ(degenerate.rank, 1u);
+    EXPECT_TRUE(std::isinf(degenerate.condition_estimate));
+}
+
+TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquations) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
+#endif
+    const std::vector<Real> outer_product{  // v v^T with v = (1, 2)
+        Real(1), Real(2),
+        Real(2), Real(4)
+    };
+    const auto pinv = rank_revealing_pseudo_inverse(outer_product, 2u, 2u, "rank-one 2x2");
+    EXPECT_EQ(pinv.rank, 1u);
+
+    // For A = v v^T with v = (1, 2), the pseudo-inverse is A / 25.
+    const Real expected_pinv[4] = {Real(0.04), Real(0.08), Real(0.08), Real(0.16)};
+    for (std::size_t entry = 0; entry < 4u; ++entry) {
+        EXPECT_NEAR(pinv.inverse[entry], expected_pinv[entry], Real(1.0e-13));
+    }
+
+    // A * A^+ * A must reproduce A entry by entry.
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            Real reconstructed = Real(0);
+            for (std::size_t a = 0; a < 2u; ++a) {
+                for (std::size_t b = 0; b < 2u; ++b) {
+                    reconstructed += outer_product[row * 2u + a] * pinv.inverse[a * 2u + b] *
+                                     outer_product[b * 2u + col];
+                }
+            }
+            EXPECT_NEAR(reconstructed,
+                        outer_product[row * 2u + col],
+                        Real(1.0e-12));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, PseudoInverseDropsNearZeroSingularValues) {
+#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
+    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
+#endif
+    const std::vector<Real> nearly_rank_one{
+        Real(1), Real(0),
+        Real(0), Real(1.0e-18)
+    };
+    const auto pinv =
+        rank_revealing_pseudo_inverse(nearly_rank_one, 2u, 2u, "near-singular 2x2");
+    EXPECT_EQ(pinv.rank, 1u);
+    EXPECT_GT(pinv.tolerance, Real(1.0e-18));
+    // The 1e-18 entry must be dropped, not inverted to 1e18: only inverse[0] is non-zero.
+    const Real expected_pinv[4] = {Real(1), Real(0), Real(0), Real(0)};
+    for (std::size_t entry = 0; entry < 4u; ++entry) {
+        EXPECT_NEAR(pinv.inverse[entry], expected_pinv[entry], Real(1.0e-14));
+    }
+}
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
new file mode 100644
index 000000000..307b308a1
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
@@ -0,0 +1,509 @@
+/**
+ * @file test_ExpressionOps.cpp
+ * @brief Unit tests for ExpressionOps.h - expression template operators
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/ExpressionOps.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/Matrix.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+using namespace svmp::FE::math::detail::ops;
+
+// Test fixture for ExpressionOps tests
+class ExpressionOpsTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;  // default absolute tolerance for approx_equal
+    // SetUp/TearDown are empty: the fixture holds no per-test state.
+    void SetUp() override {}
+    void TearDown() override {}
+    // True when |a - b| <= tol. NOTE(review): no test in this file calls this helper.
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+// =============================================================================
+// Binary Operation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, AddOperator) {
+    Add adder;
+
+    // Signed integers with zero, one and two negative operands
+    EXPECT_EQ(adder(5, 3), 8);
+    EXPECT_EQ(adder(-5, 3), -2);
+    EXPECT_EQ(adder(-5, -3), -8);
+
+    // Doubles
+    EXPECT_DOUBLE_EQ(adder(3.14, 2.86), 6.0);
+    EXPECT_DOUBLE_EQ(adder(-1.5, 2.5), 1.0);
+
+    // int + double must come back as double, not int
+    auto promoted = adder(3, 2.5);
+    EXPECT_TRUE((std::is_same_v<decltype(promoted), double>));
+    EXPECT_DOUBLE_EQ(promoted, 5.5);
+}
+
+TEST_F(ExpressionOpsTest, SubOperator) {
+    Sub subtractor;
+
+    // Signed integers, including a result that crosses zero
+    EXPECT_EQ(subtractor(5, 3), 2);
+    EXPECT_EQ(subtractor(3, 5), -2);
+    EXPECT_EQ(subtractor(-5, -3), -2);
+
+    // Doubles, in both operand orders
+    EXPECT_DOUBLE_EQ(subtractor(5.5, 2.5), 3.0);
+    EXPECT_DOUBLE_EQ(subtractor(2.5, 5.5), -3.0);
+
+    // double - int must come back as double
+    auto promoted = subtractor(5.5, 2);
+    EXPECT_TRUE((std::is_same_v<decltype(promoted), double>));
+    EXPECT_DOUBLE_EQ(promoted, 3.5);
+}
+
+TEST_F(ExpressionOpsTest, MulOperator) {
+    Mul multiplier;
+
+    // Signed integers: the sign of the product follows the operand signs
+    EXPECT_EQ(multiplier(5, 3), 15);
+    EXPECT_EQ(multiplier(-5, 3), -15);
+    EXPECT_EQ(multiplier(-5, -3), 15);
+
+    // Doubles
+    EXPECT_DOUBLE_EQ(multiplier(2.5, 4.0), 10.0);
+    EXPECT_DOUBLE_EQ(multiplier(-2.5, 4.0), -10.0);
+
+    // A zero operand gives a zero product
+    EXPECT_EQ(multiplier(0, 100), 0);
+    EXPECT_DOUBLE_EQ(multiplier(0.0, 3.14), 0.0);
+
+    // int * double must come back as double
+    auto promoted = multiplier(3, 2.5);
+    EXPECT_TRUE((std::is_same_v<decltype(promoted), double>));
+    EXPECT_DOUBLE_EQ(promoted, 7.5);
+}
+
+TEST_F(ExpressionOpsTest, DivOperator) {
+    Div divider;
+
+    // Integer operands divide as integers
+    EXPECT_EQ(divider(10, 2), 5);
+    EXPECT_EQ(divider(10, 3), 3);  // the fractional part is discarded
+    EXPECT_EQ(divider(-10, 2), -5);
+
+    // Doubles
+    EXPECT_DOUBLE_EQ(divider(10.0, 2.0), 5.0);
+    EXPECT_DOUBLE_EQ(divider(10.0, 3.0), 10.0/3.0);
+    EXPECT_DOUBLE_EQ(divider(-10.0, 2.0), -5.0);
+
+    // double / int must come back as double and keep the fraction
+    auto promoted = divider(10.0, 3);
+    EXPECT_TRUE((std::is_same_v<decltype(promoted), double>));
+    EXPECT_DOUBLE_EQ(promoted, 10.0/3.0);
+}
+
+// =============================================================================
+// Unary Operation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, NegateOperator) {
+    Negate negate;
+
+    // Integers: positive, negative and zero
+    EXPECT_EQ(negate(5), -5);
+    EXPECT_EQ(negate(-5), 5);
+    EXPECT_EQ(negate(0), 0);
+
+    // Doubles: positive, negative and zero
+    EXPECT_DOUBLE_EQ(negate(3.14), -3.14);
+    EXPECT_DOUBLE_EQ(negate(-2.71), 2.71);
+    EXPECT_DOUBLE_EQ(negate(0.0), 0.0);
+
+    // The result keeps the operand's type
+    auto from_int = negate(5);
+    EXPECT_TRUE((std::is_same_v<decltype(from_int), int>));
+
+    auto from_double = negate(5.0);
+    EXPECT_TRUE((std::is_same_v<decltype(from_double), double>));
+}
+
+TEST_F(ExpressionOpsTest, AbsOperator) {
+    Abs magnitude;
+
+    // Integers: positive, negative and zero
+    EXPECT_EQ(magnitude(5), 5);
+    EXPECT_EQ(magnitude(-5), 5);
+    EXPECT_EQ(magnitude(0), 0);
+
+    // Doubles: positive, negative and zero
+    EXPECT_DOUBLE_EQ(magnitude(3.14), 3.14);
+    EXPECT_DOUBLE_EQ(magnitude(-3.14), 3.14);
+    EXPECT_DOUBLE_EQ(magnitude(0.0), 0.0);
+
+    // Negative zero compares equal to zero
+    EXPECT_DOUBLE_EQ(magnitude(-0.0), 0.0);
+
+    // The result keeps the operand's type
+    auto from_int = magnitude(-5);
+    EXPECT_TRUE((std::is_same_v<decltype(from_int), int>));
+
+    auto from_double = magnitude(-5.0);
+    EXPECT_TRUE((std::is_same_v<decltype(from_double), double>));
+}
+
+TEST_F(ExpressionOpsTest, SqrtOperator) {
+    Sqrt root;
+
+    // Inputs whose root is a whole number
+    EXPECT_DOUBLE_EQ(root(4.0), 2.0);
+    EXPECT_DOUBLE_EQ(root(9.0), 3.0);
+    EXPECT_DOUBLE_EQ(root(16.0), 4.0);
+    EXPECT_DOUBLE_EQ(root(25.0), 5.0);
+
+    // Irrational roots are compared against std::sqrt
+    EXPECT_DOUBLE_EQ(root(2.0), std::sqrt(2.0));
+    EXPECT_DOUBLE_EQ(root(3.0), std::sqrt(3.0));
+
+    // Fixed points of the square root
+    EXPECT_DOUBLE_EQ(root(0.0), 0.0);
+    EXPECT_DOUBLE_EQ(root(1.0), 1.0);
+
+    // An int argument is accepted as well
+    auto from_int = root(4);
+    EXPECT_DOUBLE_EQ(from_int, 2.0);
+}
+
+// =============================================================================
+// Constexpr Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ConstexprOperators) {
+    // constexpr functor objects: their call operators must work in constant expressions
+    constexpr Add add_c;
+    constexpr Sub sub_c;
+    constexpr Mul mul_c;
+    constexpr Div div_c;
+    constexpr Negate neg_c;
+
+    // Results bound to constexpr variables
+    constexpr auto seven = add_c(3, 4);
+    constexpr auto four_by_sub = sub_c(7, 3);
+    constexpr auto twelve = mul_c(3, 4);
+    constexpr auto four_by_div = div_c(12, 3);
+    constexpr auto minus_five = neg_c(5);
+
+    EXPECT_EQ(seven, 7);
+    EXPECT_EQ(four_by_sub, 4);
+    EXPECT_EQ(twelve, 12);
+    EXPECT_EQ(four_by_div, 4);
+    EXPECT_EQ(minus_five, -5);
+
+    // The same calls used directly inside static_assert
+    static_assert(add_c(2, 3) == 5);
+    static_assert(sub_c(5, 2) == 3);
+    static_assert(mul_c(3, 4) == 12);
+    static_assert(div_c(10, 2) == 5);
+    static_assert(neg_c(3) == -3);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, TypeDeduction) {
+    // Each result type is read via decltype on a by-value `auto` copy of the call result.
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+
+    auto int_result = add_op(3, 4);  // int + int -> int
+    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
+    auto double_result = add_op(3.0, 4.0);  // double + double -> double
+    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
+    auto mixed_result1 = add_op(3, 4.0);  // int + double -> double
+    EXPECT_TRUE((std::is_same_v<decltype(mixed_result1), double>));
+    auto mixed_result2 = add_op(3.0, 4);  // double + int -> double
+    EXPECT_TRUE((std::is_same_v<decltype(mixed_result2), double>));
+    auto float_double_result = add_op(3.0f, 4.0);  // float + double -> double
+    EXPECT_TRUE((std::is_same_v<decltype(float_double_result), double>));
+
+    // Mixed operands must promote the same way for the remaining binary functors.
+    auto sub_result = sub_op(5.5, 2);  // double - int -> double
+    EXPECT_TRUE((std::is_same_v<decltype(sub_result), double>));
+    auto mul_result = mul_op(3, 2.5);  // int * double -> double
+    EXPECT_TRUE((std::is_same_v<decltype(mul_result), double>));
+    auto div_result = div_op(10.0, 3);  // double / int -> double
+    EXPECT_TRUE((std::is_same_v<decltype(div_result), double>));
+}
+
+// =============================================================================
+// Complex Expression Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ChainedOperations) {
+    // Evaluates -(a + b) * c / d one functor at a time, feeding each result
+    // into the next call; only the functors this expression needs are declared.
+    Add add_op;
+    Mul mul_op;
+    Div div_op;
+    Negate neg_op;
+
+    double a = 2.0, b = 3.0, c = 4.0, d = 2.0;
+
+    auto sum = add_op(a, b);            // 5.0
+    auto negated = neg_op(sum);         // -5.0
+    auto product = mul_op(negated, c);  // -20.0
+    auto result = div_op(product, d);   // -10.0
+
+    EXPECT_DOUBLE_EQ(result, -10.0);
+}
+
+TEST_F(ExpressionOpsTest, MixedPrecisionChain) {
+    Add adder;
+    Mul multiplier;
+
+    // Operands of three different arithmetic types
+    int two = 2;
+    float three_and_half = 3.5f;
+    double one_and_half = 1.5;
+
+    auto first_step = adder(two, three_and_half);             // int + float (5.5)
+    auto second_step = multiplier(first_step, one_and_half);  // previous result * double (8.25)
+
+    EXPECT_TRUE((std::is_same_v<decltype(second_step), double>));
+    EXPECT_DOUBLE_EQ(second_step, 8.25);
+}
+
+// =============================================================================
+// Operator Integration with Vector/Matrix Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, VectorIntegration) {
+    Vector<double, 3> lhs{1.0, 2.0, 3.0};
+    Vector<double, 3> rhs{4.0, 5.0, 6.0};
+
+    // Each expression is assigned to a concrete Vector before its elements are read
+    Vector<double, 3> added = lhs + rhs;
+    Vector<double, 3> subtracted = lhs - rhs;
+    Vector<double, 3> negated = -lhs;
+    Vector<double, 3> doubled = lhs * 2.0;
+
+    EXPECT_DOUBLE_EQ(added[0], 5.0);  // lhs + rhs
+    EXPECT_DOUBLE_EQ(added[1], 7.0);
+    EXPECT_DOUBLE_EQ(added[2], 9.0);
+
+    EXPECT_DOUBLE_EQ(subtracted[0], -3.0);  // lhs - rhs
+    EXPECT_DOUBLE_EQ(subtracted[1], -3.0);
+    EXPECT_DOUBLE_EQ(subtracted[2], -3.0);
+
+    EXPECT_DOUBLE_EQ(negated[0], -1.0);  // -lhs
+    EXPECT_DOUBLE_EQ(negated[1], -2.0);
+    EXPECT_DOUBLE_EQ(negated[2], -3.0);
+
+    EXPECT_DOUBLE_EQ(doubled[0], 2.0);  // lhs * 2.0
+    EXPECT_DOUBLE_EQ(doubled[1], 4.0);
+    EXPECT_DOUBLE_EQ(doubled[2], 6.0);
+}
+
+TEST_F(ExpressionOpsTest, MatrixIntegration) {
+    Matrix<double, 2, 2> lhs{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> rhs{{5.0, 6.0}, {7.0, 8.0}};
+
+    // Each expression is assigned to a concrete Matrix before its entries are read
+    Matrix<double, 2, 2> added = lhs + rhs;
+    Matrix<double, 2, 2> subtracted = lhs - rhs;
+    Matrix<double, 2, 2> negated = -lhs;
+    Matrix<double, 2, 2> doubled = lhs * 2.0;
+
+    EXPECT_DOUBLE_EQ(added(0, 0), 6.0);  // lhs + rhs
+    EXPECT_DOUBLE_EQ(added(0, 1), 8.0);
+    EXPECT_DOUBLE_EQ(added(1, 0), 10.0);
+    EXPECT_DOUBLE_EQ(added(1, 1), 12.0);
+
+    EXPECT_DOUBLE_EQ(subtracted(0, 0), -4.0);  // lhs - rhs
+    EXPECT_DOUBLE_EQ(subtracted(0, 1), -4.0);
+    EXPECT_DOUBLE_EQ(subtracted(1, 0), -4.0);
+    EXPECT_DOUBLE_EQ(subtracted(1, 1), -4.0);
+
+    EXPECT_DOUBLE_EQ(negated(0, 0), -1.0);  // -lhs
+    EXPECT_DOUBLE_EQ(negated(0, 1), -2.0);
+    EXPECT_DOUBLE_EQ(negated(1, 0), -3.0);
+    EXPECT_DOUBLE_EQ(negated(1, 1), -4.0);
+
+    EXPECT_DOUBLE_EQ(doubled(0, 0), 2.0);  // lhs * 2.0
+    EXPECT_DOUBLE_EQ(doubled(0, 1), 4.0);
+    EXPECT_DOUBLE_EQ(doubled(1, 0), 6.0);
+    EXPECT_DOUBLE_EQ(doubled(1, 1), 8.0);
+}
+
+// =============================================================================
+// Edge Cases and Special Values Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, SpecialFloatingPointValues) {
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Abs abs_op;
+    Negate neg_op;
+
+    // Positive infinity passes through every functor; negation flips its sign
+    const double pos_inf = std::numeric_limits<double>::infinity();
+    EXPECT_DOUBLE_EQ(add_op(pos_inf, 1.0), pos_inf);
+    EXPECT_DOUBLE_EQ(sub_op(pos_inf, 1.0), pos_inf);
+    EXPECT_DOUBLE_EQ(mul_op(pos_inf, 2.0), pos_inf);
+    EXPECT_DOUBLE_EQ(div_op(pos_inf, 2.0), pos_inf);
+    EXPECT_DOUBLE_EQ(abs_op(pos_inf), pos_inf);
+    EXPECT_DOUBLE_EQ(neg_op(pos_inf), -pos_inf);
+
+    // A NaN operand must produce a NaN result
+    const double quiet_nan = std::numeric_limits<double>::quiet_NaN();
+    EXPECT_TRUE(std::isnan(add_op(quiet_nan, 1.0)));
+    EXPECT_TRUE(std::isnan(sub_op(quiet_nan, 1.0)));
+    EXPECT_TRUE(std::isnan(mul_op(quiet_nan, 2.0)));
+    EXPECT_TRUE(std::isnan(div_op(quiet_nan, 2.0)));
+    EXPECT_TRUE(std::isnan(abs_op(quiet_nan)));
+    EXPECT_TRUE(std::isnan(neg_op(quiet_nan)));
+
+    // Floating-point division by zero: signed infinity, or NaN for 0/0
+    EXPECT_DOUBLE_EQ(div_op(1.0, 0.0), pos_inf);
+    EXPECT_DOUBLE_EQ(div_op(-1.0, 0.0), -pos_inf);
+    EXPECT_TRUE(std::isnan(div_op(0.0, 0.0)));
+}
+
+TEST_F(ExpressionOpsTest, LargeAndSmallValues) {
+    Add adder;
+    Mul multiplier;
+
+    // 1e308 + 1e308 exceeds the largest finite double
+    const double huge = 1e308;
+    const double overflowed = adder(huge, huge);
+    EXPECT_TRUE(std::isinf(overflowed));  // the sum becomes infinity
+
+    // Halving the smallest normal double
+    const double smallest_normal = std::numeric_limits<double>::min();
+    const double halved = multiplier(smallest_normal, 0.5);
+    EXPECT_GT(halved, 0.0);              // still strictly positive
+    EXPECT_LT(halved, smallest_normal);  // and below the normal range
+
+    // Doubling the smallest subnormal double
+    const double smallest_subnormal = std::numeric_limits<double>::denorm_min();
+    const double doubled = adder(smallest_subnormal, smallest_subnormal);
+    EXPECT_EQ(doubled, 2.0 * smallest_subnormal);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Constraint Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, SFINAECompatibility) {
+    // Applies the Add functor to operands of each built-in arithmetic type in turn
+    Add add_op;
+
+    // Signed integer types of every width
+    EXPECT_EQ(add_op(int8_t(3), int8_t(4)), 7);
+    EXPECT_EQ(add_op(int16_t(100), int16_t(200)), 300);
+    EXPECT_EQ(add_op(int32_t(1000), int32_t(2000)), 3000);
+    EXPECT_EQ(add_op(int64_t(10000), int64_t(20000)), 30000);
+
+    // Unsigned integer types
+    EXPECT_EQ(add_op(uint8_t(3), uint8_t(4)), 7u);
+    EXPECT_EQ(add_op(uint16_t(100), uint16_t(200)), 300u);
+    EXPECT_EQ(add_op(uint32_t(1000), uint32_t(2000)), 3000u);
+
+    // float and double
+    EXPECT_FLOAT_EQ(add_op(3.0f, 4.0f), 7.0f);
+    EXPECT_DOUBLE_EQ(add_op(3.0, 4.0), 7.0);
+
+    // long double: EXPECT_DOUBLE_EQ would narrow both sides to double; 3 + 4 is exact, so use EXPECT_EQ
+    long double ld1 = 3.0L;
+    long double ld2 = 4.0L;
+    EXPECT_EQ(add_op(ld1, ld2), 7.0L);
+}
+
+// =============================================================================
+// Template Instantiation Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, TemplateInstantiations) {
+    // The functors must accept a user-defined value type that provides the
+    // operator each of them applies. Abs and Sqrt are not declared here:
+    // CustomNumber defines no abs or sqrt, so they cannot be applied to it.
+    Add add_op;
+    Sub sub_op;
+    Mul mul_op;
+    Div div_op;
+    Negate neg_op;
+
+    // Minimal wrapper around a double with the four binary operators and unary minus
+    struct CustomNumber {
+        double value;
+        CustomNumber(double v) : value(v) {}
+        CustomNumber operator+(const CustomNumber& other) const { return CustomNumber(value + other.value); }
+        CustomNumber operator-(const CustomNumber& other) const { return CustomNumber(value - other.value); }
+        CustomNumber operator*(const CustomNumber& other) const { return CustomNumber(value * other.value); }
+        CustomNumber operator/(const CustomNumber& other) const { return CustomNumber(value / other.value); }
+        CustomNumber operator-() const { return CustomNumber(-value); }
+        bool operator==(const CustomNumber& other) const { return value == other.value; }
+    };
+
+    CustomNumber cn1(3.0);
+    CustomNumber cn2(4.0);
+
+    auto cn_sum = add_op(cn1, cn2);
+    EXPECT_EQ(cn_sum.value, 7.0);
+
+    auto cn_diff = sub_op(cn1, cn2);
+    EXPECT_EQ(cn_diff.value, -1.0);
+
+    auto cn_prod = mul_op(cn1, cn2);
+    EXPECT_EQ(cn_prod.value, 12.0);
+
+    auto cn_quot = div_op(cn1, cn2);
+    EXPECT_EQ(cn_quot.value, 0.75);
+
+    auto cn_neg = neg_op(cn1);
+    EXPECT_EQ(cn_neg.value, -3.0);
+}
+
+// =============================================================================
+// Complex Number Support Tests
+// =============================================================================
+
+TEST_F(ExpressionOpsTest, ComplexNumberSupport) {
+    // Each functor is applied as a temporary to std::complex<double> operands.
+    std::complex<double> c1(3.0, 4.0);
+    std::complex<double> c2(1.0, 2.0);
+
+    auto c_sum = Add{}(c1, c2);
+    EXPECT_DOUBLE_EQ(c_sum.real(), 4.0);
+    EXPECT_DOUBLE_EQ(c_sum.imag(), 6.0);
+
+    auto c_diff = Sub{}(c1, c2);
+    EXPECT_DOUBLE_EQ(c_diff.real(), 2.0);
+    EXPECT_DOUBLE_EQ(c_diff.imag(), 2.0);
+
+    auto c_prod = Mul{}(c1, c2);
+    EXPECT_DOUBLE_EQ(c_prod.real(), -5.0);  // (3+4i)(1+2i) = 3+6i+4i+8i² = 3+10i-8 = -5+10i
+    EXPECT_DOUBLE_EQ(c_prod.imag(), 10.0);
+
+    // (3+4i)/(1+2i) = (3+4i)(1-2i)/5 = (11-2i)/5; 2.2 and 0.4 are not exact in binary.
+    auto c_quot = Div{}(c1, c2);
+    EXPECT_NEAR(c_quot.real(), 2.2, tolerance);
+    EXPECT_NEAR(c_quot.imag(), -0.4, tolerance);
+
+    auto c_neg = Negate{}(c1);
+    EXPECT_DOUBLE_EQ(c_neg.real(), -3.0);
+    EXPECT_DOUBLE_EQ(c_neg.imag(), -4.0);
+}
diff --git a/tests/unitTests/FE/Math/test_MathConstants.cpp b/tests/unitTests/FE/Math/test_MathConstants.cpp
new file mode 100644
index 000000000..5619690ed
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_MathConstants.cpp
@@ -0,0 +1,341 @@
+/**
+ * @file test_MathConstants.cpp
+ * @brief Unit tests for MathConstants.h - mathematical constants and tolerances
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/MathConstants.h"
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Test fixture for MathConstants tests
+class MathConstantsTest : public ::testing::Test {
+protected:
+    void SetUp() override {}     // empty: the fixture holds no per-test state
+    void TearDown() override {}  // empty: nothing to release after a test
+};
+
+// =============================================================================
+// Mathematical Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, PiConstants) {
+    const double abs_tol = 1e-15;  // one absolute tolerance for every check below
+
+    // PI against its decimal expansion
+    EXPECT_NEAR(constants::PI, 3.14159265358979323846, abs_tol);
+
+    // Fractions of PI
+    EXPECT_NEAR(constants::PI_2, constants::PI / 2.0, abs_tol);
+    EXPECT_NEAR(constants::PI_4, constants::PI / 4.0, abs_tol);
+
+    // 2 * PI
+    EXPECT_NEAR(constants::TWO_PI, 2.0 * constants::PI, abs_tol);
+
+    // 1 / PI
+    EXPECT_NEAR(constants::INV_PI, 1.0 / constants::PI, abs_tol);
+
+    // Square root of PI
+    EXPECT_NEAR(constants::SQRT_PI, std::sqrt(constants::PI), abs_tol);
+}
+
+TEST_F(MathConstantsTest, EulerConstant) {
+    const double abs_tol = 1e-15;  // one absolute tolerance for every check below
+
+    // e against exp(1)
+    EXPECT_NEAR(constants::E, std::exp(1.0), abs_tol);
+
+    // Natural logarithms of 2 and 10
+    EXPECT_NEAR(constants::LN_2, std::log(2.0), abs_tol);
+    EXPECT_NEAR(constants::LN_10, std::log(10.0), abs_tol);
+
+    // Base-10 logarithm of e, computed from constants::E
+    EXPECT_NEAR(constants::LOG10_E, std::log10(constants::E), abs_tol);
+
+    // Base-2 logarithm of e, computed from constants::E
+    EXPECT_NEAR(constants::LOG2_E, std::log2(constants::E), abs_tol);
+}
+
+TEST_F(MathConstantsTest, SquareRootConstants) {
+    const double abs_tol = 1e-15;  // one absolute tolerance for every check below
+
+    // Square root of 2
+    EXPECT_NEAR(constants::SQRT_2, std::sqrt(2.0), abs_tol);
+
+    // Square root of 3
+    EXPECT_NEAR(constants::SQRT_3, std::sqrt(3.0), abs_tol);
+
+    // Square root of 5
+    EXPECT_NEAR(constants::SQRT_5, std::sqrt(5.0), abs_tol);
+
+    // Reciprocal square roots of 2 and 3
+    EXPECT_NEAR(constants::INV_SQRT_2, 1.0 / std::sqrt(2.0), abs_tol);
+    EXPECT_NEAR(constants::INV_SQRT_3, 1.0 / std::sqrt(3.0), abs_tol);
+}
+
+TEST_F(MathConstantsTest, GoldenRatio) {
+    const auto phi = constants::PHI;
+    EXPECT_NEAR(phi, (1.0 + std::sqrt(5.0)) / 2.0, 1e-15);  // closed form (1 + sqrt(5)) / 2
+
+    // Squaring phi is the same as adding one
+    EXPECT_NEAR(phi * phi, phi + 1.0, 1e-14);
+
+    // Inverting phi is the same as subtracting one
+    EXPECT_NEAR(1.0 / phi, phi - 1.0, 1e-14);
+}
+
+// =============================================================================
+// Angle Conversion Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, DegreesToRadians) {
+    // A fixed 1e-15 is barely one ulp at 2*pi (ulp = 2^-50), so the bound is scaled to 4 ulps there.
+    const double tol = 4.0 * std::numeric_limits<double>::epsilon() * constants::TWO_PI;
+
+    EXPECT_NEAR(constants::deg_to_rad(0.0), 0.0, tol);
+    EXPECT_NEAR(constants::deg_to_rad(90.0), constants::PI_2, tol);
+    EXPECT_NEAR(constants::deg_to_rad(180.0), constants::PI, tol);
+    EXPECT_NEAR(constants::deg_to_rad(270.0), 3.0 * constants::PI_2, tol);
+    EXPECT_NEAR(constants::deg_to_rad(360.0), constants::TWO_PI, tol);
+    // Negative angles
+    EXPECT_NEAR(constants::deg_to_rad(-90.0), -constants::PI_2, tol);
+    EXPECT_NEAR(constants::deg_to_rad(-180.0), -constants::PI, tol);
+    // Angles below a quarter turn
+    EXPECT_NEAR(constants::deg_to_rad(45.0), constants::PI_4, tol);
+    EXPECT_NEAR(constants::deg_to_rad(30.0), constants::PI / 6.0, tol);
+    EXPECT_NEAR(constants::deg_to_rad(60.0), constants::PI / 3.0, tol);
+}
+
+TEST_F(MathConstantsTest, RadiansToDegrees) {
+    // A fixed 1e-13 is under two ulps at 360 (ulp = 2^-44), so the bound is scaled to 4 ulps there.
+    const double tol = 4.0 * std::numeric_limits<double>::epsilon() * 360.0;
+
+    EXPECT_NEAR(constants::rad_to_deg(0.0), 0.0, tol);
+    EXPECT_NEAR(constants::rad_to_deg(constants::PI_2), 90.0, tol);
+    EXPECT_NEAR(constants::rad_to_deg(constants::PI), 180.0, tol);
+    EXPECT_NEAR(constants::rad_to_deg(constants::TWO_PI), 360.0, tol);
+    // Negative angle
+    EXPECT_NEAR(constants::rad_to_deg(-constants::PI), -180.0, tol);
+
+    // Degrees -> radians -> degrees must return to the starting angle
+    const double angle_deg = 123.456;
+    const double back_to_deg = constants::rad_to_deg(constants::deg_to_rad(angle_deg));
+    EXPECT_NEAR(back_to_deg, angle_deg, tol);
+}
+
+// =============================================================================
+// Machine Precision Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, MachineEpsilon) {
+    // The library constants must equal the standard library's epsilons exactly.
+    EXPECT_EQ(constants::EPSILON, std::numeric_limits<double>::epsilon());
+    // Single precision
+    EXPECT_EQ(constants::EPSILON_F, std::numeric_limits<float>::epsilon());
+
+    // Adding epsilon to 1.0 must change it, while adding half of epsilon
+    // must round back to exactly 1.0.
+    const double bumped = 1.0 + constants::EPSILON;
+    const double unchanged = 1.0 + constants::EPSILON / 2.0;
+
+    EXPECT_NE(bumped, 1.0);
+    EXPECT_EQ(unchanged, 1.0);
+}
+
+TEST_F(MathConstantsTest, NumericalLimits) {
+    using DoubleLimits = std::numeric_limits<double>;
+
+    // INF_VALUE is infinite and above the largest finite double
+    EXPECT_TRUE(std::isinf(constants::INF_VALUE));
+    EXPECT_GT(constants::INF_VALUE, DoubleLimits::max());
+    // NOT_A_NUMBER is a NaN, so it compares unequal even to itself
+    EXPECT_TRUE(std::isnan(constants::NOT_A_NUMBER));
+    EXPECT_NE(constants::NOT_A_NUMBER, constants::NOT_A_NUMBER);
+    // Range constants mirror std::numeric_limits<double>
+    EXPECT_EQ(constants::MAX_DOUBLE, DoubleLimits::max());
+    EXPECT_EQ(constants::MIN_DOUBLE, DoubleLimits::min());
+    EXPECT_EQ(constants::LOWEST_DOUBLE, DoubleLimits::lowest());
+}
+
+// =============================================================================
+// Tolerance Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, DefaultTolerances) {
+    const auto baseline = constants::DEFAULT_TOLERANCE;  // reference for the ordering checks
+
+    // Absolute and relative defaults: strictly positive and below 1e-10
+    EXPECT_GT(baseline, 0.0);
+    EXPECT_LT(baseline, 1e-10);
+    EXPECT_GT(constants::DEFAULT_REL_TOLERANCE, 0.0);
+    EXPECT_LT(constants::DEFAULT_REL_TOLERANCE, 1e-10);
+
+    // Solver tolerance: positive and no larger than the default
+    EXPECT_GT(constants::SOLVER_TOLERANCE, 0.0);
+    EXPECT_LE(constants::SOLVER_TOLERANCE, baseline);
+
+    // Geometry tolerance: positive and no smaller than the default
+    EXPECT_GT(constants::GEOMETRY_TOLERANCE, 0.0);
+    EXPECT_GE(constants::GEOMETRY_TOLERANCE, baseline);
+}
+
+TEST_F(MathConstantsTest, ToleranceComparison) {
+    const double base = 1.0;
+    const double inside = 1.0 + constants::DEFAULT_TOLERANCE / 2.0;
+    const double outside = 1.0 + constants::DEFAULT_TOLERANCE * 2.0;
+
+    // Half a tolerance away counts as near
+    EXPECT_TRUE(constants::near(base, inside, constants::DEFAULT_TOLERANCE));
+
+    // Two tolerances away does not
+    EXPECT_FALSE(constants::near(base, outside, constants::DEFAULT_TOLERANCE));
+
+    // The same pair of offsets, applied as relative factors to a large value
+    const double big = 1e10;
+    const double big_inside = big * (1.0 + constants::DEFAULT_REL_TOLERANCE / 2.0);
+    const double big_outside = big * (1.0 + constants::DEFAULT_REL_TOLERANCE * 2.0);
+
+    EXPECT_TRUE(constants::near_relative(big, big_inside, constants::DEFAULT_REL_TOLERANCE));
+    EXPECT_FALSE(constants::near_relative(big, big_outside, constants::DEFAULT_REL_TOLERANCE));
+}
+
+TEST_F(MathConstantsTest, ZeroComparison) {
+    const auto tol = constants::DEFAULT_TOLERANCE;
+    // Exact zero and magnitudes of half the default tolerance (either sign) count as zero
+    EXPECT_TRUE(constants::is_zero(0.0));
+    EXPECT_TRUE(constants::is_zero(tol / 2.0));
+    EXPECT_TRUE(constants::is_zero(-tol / 2.0));
+    // Magnitudes of twice the default tolerance (either sign) do not
+    EXPECT_FALSE(constants::is_zero(tol * 2.0));
+    EXPECT_FALSE(constants::is_zero(-tol * 2.0));
+}
+
+// =============================================================================
+// Physical Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, PhysicalConstants) {
+    // Standard gravity, in m/s^2
+    EXPECT_NEAR(constants::STANDARD_GRAVITY, 9.80665, 1e-10);
+
+    // Gravitational constant, in m^3/kg/s^2
+    EXPECT_NEAR(constants::GRAVITATIONAL_CONSTANT, 6.67430e-11, 1e-16);
+
+    // Speed of light, in m/s
+    EXPECT_NEAR(constants::SPEED_OF_LIGHT, 299792458.0, 1.0);
+
+    // Planck constant, in J*s
+    EXPECT_NEAR(constants::PLANCK_CONSTANT, 6.62607015e-34, 1e-42);
+
+    // Boltzmann constant, in J/K
+    EXPECT_NEAR(constants::BOLTZMANN_CONSTANT, 1.380649e-23, 1e-29);
+
+    // Avogadro constant, in 1/mol
+    EXPECT_NEAR(constants::AVOGADRO_NUMBER, 6.02214076e23, 1e15);
+}
+
+// =============================================================================
+// Compile-Time Constants Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, CompileTimeConstants) {
+    // Binding to constexpr locals only compiles if the constants are constant expressions
+    constexpr double pi_value = constants::PI;
+    constexpr double e_value = constants::E;
+    constexpr double sqrt2_value = constants::SQRT_2;
+
+    EXPECT_EQ(pi_value, constants::PI);
+    EXPECT_EQ(e_value, constants::E);
+    EXPECT_EQ(sqrt2_value, constants::SQRT_2);
+
+    // The angle converters must likewise be usable in constant expressions
+    constexpr double quarter_turn_rad = constants::deg_to_rad(90.0);
+    constexpr double half_turn_deg = constants::rad_to_deg(constants::PI);
+
+    EXPECT_NEAR(quarter_turn_rad, constants::PI_2, 1e-15);
+    EXPECT_NEAR(half_turn_deg, 180.0, 1e-13);
+}
+
+// =============================================================================
+// Type Traits Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, TypedConstants) {
+    // Float variants must agree with the double constants rounded to float
+    EXPECT_NEAR(constants::PI_F, static_cast<float>(constants::PI), 1e-7f);
+    EXPECT_NEAR(constants::E_F, static_cast<float>(constants::E), 1e-7f);
+    EXPECT_NEAR(constants::SQRT_2_F, static_cast<float>(constants::SQRT_2), 1e-7f);
+
+    // EXPECT_NEAR compares in double, so narrow the long double variants explicitly and require equality
+    EXPECT_EQ(static_cast<double>(constants::PI_L), constants::PI);
+    EXPECT_EQ(static_cast<double>(constants::E_L), constants::E);
+}
+
+// =============================================================================
+// Special Functions Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, SignFunction) {
+    // Zero maps to 0
+    EXPECT_EQ(constants::sign(0.0), 0);
+
+    // Positive inputs map to +1: ordinary, tiny, and infinite
+    EXPECT_EQ(constants::sign(5.0), 1);
+    EXPECT_EQ(constants::sign(constants::EPSILON), 1);
+    EXPECT_EQ(constants::sign(constants::INF_VALUE), 1);
+
+    // Negative inputs map to -1: ordinary, tiny, and infinite
+    EXPECT_EQ(constants::sign(-5.0), -1);
+    EXPECT_EQ(constants::sign(-constants::EPSILON), -1);
+    EXPECT_EQ(constants::sign(-constants::INF_VALUE), -1);
+}
+
+TEST_F(MathConstantsTest, SafeDivision) {
+    // Ordinary denominators: the plain quotient comes back
+    EXPECT_NEAR(constants::safe_divide(10.0, 2.0), 5.0, 1e-15);
+    EXPECT_NEAR(constants::safe_divide(1.0, 3.0), 1.0/3.0, 1e-15);
+
+    // Zero or half-epsilon denominators: the caller-supplied fallback comes back
+    EXPECT_EQ(constants::safe_divide(1.0, 0.0, 999.0), 999.0);
+    EXPECT_EQ(constants::safe_divide(1.0, constants::EPSILON/2.0, -1.0), -1.0);
+
+    // A denominator one tenth of the default tolerance also yields the fallback
+    const double below_tolerance = constants::DEFAULT_TOLERANCE / 10.0;
+    EXPECT_EQ(constants::safe_divide(1.0, below_tolerance, 0.0), 0.0);
+}
+
+// =============================================================================
+// Utility Functions Tests
+// =============================================================================
+
+TEST_F(MathConstantsTest, ClampFunction) {
+    const double lo = 0.0;
+    const double hi = 10.0;
+    // In-range values pass through; out-of-range values snap to the nearer bound
+    EXPECT_EQ(constants::clamp(5.0, lo, hi), 5.0);
+    EXPECT_EQ(constants::clamp(-5.0, lo, hi), lo);
+    EXPECT_EQ(constants::clamp(15.0, lo, hi), hi);
+    // Infinite inputs snap to the bound on their side
+    EXPECT_EQ(constants::clamp(constants::INF_VALUE, lo, hi), hi);
+    EXPECT_EQ(constants::clamp(-constants::INF_VALUE, lo, hi), lo);
+    // A degenerate interval (min == max) yields that single value
+    EXPECT_EQ(constants::clamp(5.0, 3.0, 3.0), 3.0);
+}
+
+TEST_F(MathConstantsTest, LerpFunction) {
+    constexpr double tol = 1e-15;
+    // Interpolation across [0, 10] at the endpoints and at interior parameters
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.0), 0.0, tol);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.25), 2.5, tol);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.5), 5.0, tol);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.0), 10.0, tol);
+
+    // Parameters outside [0, 1] extrapolate along the same line
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, -0.5), -5.0, tol);
+    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.5), 15.0, tol);
+    // Both endpoints negative
+    EXPECT_NEAR(constants::lerp(-10.0, -5.0, 0.5), -7.5, tol);
+}
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
new file mode 100644
index 000000000..c186c26ee
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_Matrix.cpp
@@ -0,0 +1,594 @@
+/**
+ * @file test_Matrix.cpp
+ * @brief Unit tests for Matrix.h - fixed-size matrices with expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Matrix.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/MatrixExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <thread>
+#include <vector>
+
+using namespace svmp::FE::math;
+
+// Test fixture for Matrix tests
+class MatrixTest : public ::testing::Test {
+protected:
+    // Absolute tolerance used by the floating-point comparisons in this suite
+    static constexpr double tolerance = 1e-14;
+    void SetUp() override {}
+    void TearDown() override {}
+    // True when |lhs - rhs| does not exceed max_abs_diff (defaults to `tolerance`)
+    template<typename T>
+    bool approx_equal(T lhs, T rhs, T max_abs_diff = tolerance) {
+        const auto abs_diff = std::abs(lhs - rhs);
+        return abs_diff <= max_abs_diff;
+    }
+};
+
+// =============================================================================
+// Construction and Initialization Tests
+// =============================================================================
+
+TEST_F(MatrixTest, DefaultConstruction) {
+    Matrix<double, 3, 3> fresh;
+
+    // Walk all nine entries in row-major order; each is expected to be zero
+    for (size_t k = 0; k < 9; ++k) {
+        EXPECT_EQ(fresh(k / 3, k % 3), 0.0);
+    }
+}
+
+TEST_F(MatrixTest, FillConstruction) {
+    Matrix<double, 2, 3> filled(5.0);
+
+    // Walk all six entries in row-major order; each must equal the fill value
+    for (size_t k = 0; k < 6; ++k) {
+        EXPECT_EQ(filled(k / 3, k % 3), 5.0);
+    }
+}
+
+TEST_F(MatrixTest, InitializerListConstruction) {
+    const double expected[2][3] = {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0}};
+    // Each inner brace list is one row: entry (r, c) is the c-th value of the r-th list
+    for (size_t r = 0; r < 2; ++r) {
+        for (size_t c = 0; c < 3; ++c) {
+            EXPECT_EQ(m(r, c), expected[r][c]);
+        }
+    }
+}
+
+TEST_F(MatrixTest, CopyConstruction) {
+    Matrix<double, 2, 2> source{{1.0, 2.0},
+                                {3.0, 4.0}};
+    Matrix<double, 2, 2> duplicate(source);
+    // The copy starts out with the same entries as the source
+    EXPECT_EQ(duplicate(0, 0), 1.0);
+    EXPECT_EQ(duplicate(0, 1), 2.0);
+    EXPECT_EQ(duplicate(1, 0), 3.0);
+    EXPECT_EQ(duplicate(1, 1), 4.0);
+
+    // Writing through the copy must leave the source untouched
+    duplicate(0, 0) = 10.0;
+    EXPECT_EQ(source(0, 0), 1.0);
+    EXPECT_EQ(duplicate(0, 0), 10.0);
+}
+
+TEST_F(MatrixTest, MoveConstruction) {
+    Matrix<double, 2, 2> donor{{1.0, 2.0},
+                               {3.0, 4.0}};
+    Matrix<double, 2, 2> receiver(std::move(donor));
+    // Only the receiver is inspected; the moved-from donor is not read again
+    EXPECT_EQ(receiver(0, 0), 1.0);
+    EXPECT_EQ(receiver(0, 1), 2.0);
+    EXPECT_EQ(receiver(1, 0), 3.0);
+    EXPECT_EQ(receiver(1, 1), 4.0);
+}
+
+// =============================================================================
+// Element Access Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ElementAccess) {
+    Matrix<double, 2, 3> mutable_m{{1.0, 2.0, 3.0},
+                                   {4.0, 5.0, 6.0}};
+
+    // Reads through operator() on a non-const matrix
+    EXPECT_EQ(mutable_m(0, 0), 1.0);
+    EXPECT_EQ(mutable_m(0, 2), 3.0);
+    EXPECT_EQ(mutable_m(1, 1), 5.0);
+
+    // A write through operator() is visible to the next read
+    mutable_m(1, 2) = 7.0;
+    EXPECT_EQ(mutable_m(1, 2), 7.0);
+
+    // Reads through operator() on a separate const matrix
+    const Matrix<double, 2, 3> const_m{{1.0, 2.0, 3.0},
+                                       {4.0, 5.0, 6.0}};
+    EXPECT_EQ(const_m(0, 1), 2.0);
+    EXPECT_EQ(const_m(1, 0), 4.0);
+}
+
+TEST_F(MatrixTest, ElementAccessBounds) {
+    Matrix<double, 2, 3> grid{{1.0, 2.0, 3.0},
+                              {4.0, 5.0, 6.0}};
+
+    // In-range indices: at() returns the stored entry (first and last corners)
+    EXPECT_EQ(grid.at(0, 0), 1.0);
+    EXPECT_EQ(grid.at(1, 2), 6.0);
+
+    // Row out of range, column out of range, then both: each must throw std::out_of_range
+    EXPECT_THROW(grid.at(2, 0), std::out_of_range);
+    EXPECT_THROW(grid.at(0, 3), std::out_of_range);
+    EXPECT_THROW(grid.at(10, 10), std::out_of_range);
+}
+
+TEST_F(MatrixTest, RowColumnAccess) {
+    Matrix<double, 3, 3> grid{{1.0, 2.0, 3.0},
+                              {4.0, 5.0, 6.0},
+                              {7.0, 8.0, 9.0}};
+
+    // row(1) returns the middle row
+    auto middle_row = grid.row(1);
+    EXPECT_EQ(middle_row[0], 4.0);
+    EXPECT_EQ(middle_row[1], 5.0);
+    EXPECT_EQ(middle_row[2], 6.0);
+
+    // col(2) returns the last column
+    auto last_col = grid.col(2);
+    EXPECT_EQ(last_col[0], 3.0);
+    EXPECT_EQ(last_col[1], 6.0);
+    EXPECT_EQ(last_col[2], 9.0);
+
+    // set_row(0, ...) overwrites the first row
+    Vector<double, 3> replacement_row{10.0, 11.0, 12.0};
+    grid.set_row(0, replacement_row);
+    EXPECT_EQ(grid(0, 0), 10.0);
+    EXPECT_EQ(grid(0, 1), 11.0);
+    EXPECT_EQ(grid(0, 2), 12.0);
+
+    // set_col(1, ...) overwrites the middle column, including the (0, 1) entry written above
+    Vector<double, 3> replacement_col{20.0, 21.0, 22.0};
+    grid.set_col(1, replacement_col);
+    EXPECT_EQ(grid(0, 1), 20.0);
+    EXPECT_EQ(grid(1, 1), 21.0);
+    EXPECT_EQ(grid(2, 1), 22.0);
+}
+
+// =============================================================================
+// Arithmetic Operations Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Addition) {
+    Matrix<double, 2, 3> lhs{{1.0, 2.0, 3.0},
+                             {4.0, 5.0, 6.0}};
+    Matrix<double, 2, 3> rhs{{7.0, 8.0, 9.0},
+                             {10.0, 11.0, 12.0}};
+    const double expected[2][3] = {{8.0, 10.0, 12.0}, {14.0, 16.0, 18.0}};
+
+    Matrix<double, 2, 3> sum = lhs + rhs;
+    for (size_t r = 0; r < 2; ++r) {
+        for (size_t c = 0; c < 3; ++c) {
+            EXPECT_EQ(sum(r, c), expected[r][c]);  // entry-wise sum
+        }
+    }
+}
+
+TEST_F(MatrixTest, Subtraction) {
+    Matrix<double, 2, 3> minuend{{8.0, 10.0, 12.0},
+                                 {14.0, 16.0, 18.0}};
+    Matrix<double, 2, 3> subtrahend{{7.0, 8.0, 9.0},
+                                    {10.0, 11.0, 12.0}};
+    const double expected[2][3] = {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+
+    Matrix<double, 2, 3> difference = minuend - subtrahend;
+    for (size_t r = 0; r < 2; ++r) {
+        for (size_t c = 0; c < 3; ++c) {
+            EXPECT_EQ(difference(r, c), expected[r][c]);  // entry-wise difference
+        }
+    }
+}
+
+TEST_F(MatrixTest, ScalarMultiplication) {
+    Matrix<double, 2, 2> base{{1.0, 2.0},
+                              {3.0, 4.0}};
+    // Scalar on the left-hand side
+    Matrix<double, 2, 2> doubled = 2.0 * base;
+    EXPECT_EQ(doubled(0, 0), 2.0);
+    EXPECT_EQ(doubled(0, 1), 4.0);
+    EXPECT_EQ(doubled(1, 0), 6.0);
+    EXPECT_EQ(doubled(1, 1), 8.0);
+    // Scalar on the right-hand side
+    Matrix<double, 2, 2> tripled = base * 3.0;
+    EXPECT_EQ(tripled(0, 0), 3.0);
+    EXPECT_EQ(tripled(0, 1), 6.0);
+    EXPECT_EQ(tripled(1, 0), 9.0);
+    EXPECT_EQ(tripled(1, 1), 12.0);
+}
+
+TEST_F(MatrixTest, ScalarDivision) {
+    Matrix<double, 2, 2> numerator{{2.0, 4.0},
+                                   {6.0, 8.0}};
+    // Dividing by a scalar divides every entry
+    Matrix<double, 2, 2> halved = numerator / 2.0;
+    EXPECT_EQ(halved(0, 0), 1.0);
+    EXPECT_EQ(halved(0, 1), 2.0);
+    EXPECT_EQ(halved(1, 0), 3.0);
+    EXPECT_EQ(halved(1, 1), 4.0);
+}
+
+TEST_F(MatrixTest, MatrixMultiplication) {
+    Matrix<double, 2, 3> lhs{{1.0, 2.0, 3.0},
+                             {4.0, 5.0, 6.0}};
+    Matrix<double, 3, 2> rhs{{7.0, 8.0},
+                             {9.0, 10.0},
+                             {11.0, 12.0}};
+    // (2x3) * (3x2) -> 2x2; each entry is a row of lhs dotted with a column of rhs
+    Matrix<double, 2, 2> product = lhs * rhs;
+    EXPECT_EQ(product(0, 0), 58.0);   // 7 + 18 + 33
+    EXPECT_EQ(product(0, 1), 64.0);   // 8 + 20 + 36
+    EXPECT_EQ(product(1, 0), 139.0);  // 28 + 45 + 66
+    EXPECT_EQ(product(1, 1), 154.0);  // 32 + 50 + 72
+}
+
+TEST_F(MatrixTest, MatrixVectorMultiplication) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+    Vector<double, 3> x{1.0, 2.0, 3.0};
+    // Component i of the product is row i of m dotted with x
+    Vector<double, 3> y = m * x;
+    EXPECT_EQ(y[0], 14.0);  // 1 + 4 + 9
+    EXPECT_EQ(y[1], 32.0);  // 4 + 10 + 18
+    EXPECT_EQ(y[2], 50.0);  // 7 + 16 + 27
+}
+
+// =============================================================================
+// Special Matrix Operations Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Transpose) {
+    Matrix<double, 2, 3> original{{1.0, 2.0, 3.0},
+                                  {4.0, 5.0, 6.0}};
+    const double expected[3][2] = {{1.0, 4.0}, {2.0, 5.0}, {3.0, 6.0}};
+    // Rows of the original become columns of the 3x2 result
+    Matrix<double, 3, 2> flipped = original.transpose();
+    for (size_t r = 0; r < 3; ++r) {
+        for (size_t c = 0; c < 2; ++c) {
+            EXPECT_EQ(flipped(r, c), expected[r][c]);
+        }
+    }
+}
+
+TEST_F(MatrixTest, Determinant2x2) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+
+    // ad - bc = 1*4 - 2*3 = -2
+    EXPECT_EQ(m.determinant(), -2.0);
+}
+
+TEST_F(MatrixTest, Determinant3x3) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {0.0, 1.0, 4.0},
+                           {5.0, 6.0, 0.0}};
+
+    // Expanding along row 0: 1*(0 - 24) - 2*(0 - 20) + 3*(0 - 5) = 1
+    EXPECT_EQ(m.determinant(), 1.0);
+}
+
+TEST_F(MatrixTest, Determinant4x4) {
+    Matrix<double, 4, 4> diag{{1, 0, 0, 0},
+                              {0, 2, 0, 0},
+                              {0, 0, 3, 0},
+                              {0, 0, 0, 4}};
+
+    // The determinant of a diagonal matrix is the product of its diagonal: 1*2*3*4
+    EXPECT_EQ(diag.determinant(), 24.0);
+}
+
+TEST_F(MatrixTest, Inverse2x2) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+    Matrix<double, 2, 2> inv = m.inverse();
+
+    // Closed form: (1/det) * [[d, -b], [-c, a]] with det = -2
+    EXPECT_NEAR(inv(0, 0), -2.0, tolerance);
+    EXPECT_NEAR(inv(0, 1), 1.0, tolerance);
+    EXPECT_NEAR(inv(1, 0), 1.5, tolerance);
+    EXPECT_NEAR(inv(1, 1), -0.5, tolerance);
+
+    // Multiplying back must reproduce the identity within tolerance
+    Matrix<double, 2, 2> product = m * inv;
+    for (size_t i = 0; i < 2; ++i) {
+        for (size_t j = 0; j < 2; ++j) {
+            EXPECT_NEAR(product(i, j), (i == j) ? 1.0 : 0.0, tolerance);
+        }
+    }
+}
+
+TEST_F(MatrixTest, Inverse3x3) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {0.0, 1.0, 4.0},
+                           {5.0, 6.0, 0.0}};
+    Matrix<double, 3, 3> inv = m.inverse();
+
+    // m * inv must match the identity entry by entry (Kronecker delta)
+    Matrix<double, 3, 3> product = m * inv;
+    for (size_t k = 0; k < 9; ++k) {
+        const size_t row = k / 3;
+        const size_t col = k % 3;
+        const double delta = (row == col) ? 1.0 : 0.0;
+
+        EXPECT_NEAR(product(row, col), delta, tolerance);
+    }
+}
+
+TEST_F(MatrixTest, Trace) {
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+
+    // Sum of the main diagonal: 1 + 5 + 9
+    EXPECT_EQ(m.trace(), 15.0);
+}
+
+// =============================================================================
+// Special Matrix Types Tests
+// =============================================================================
+
+TEST_F(MatrixTest, IdentityMatrix) {
+    Matrix<double, 3, 3> eye = Matrix<double, 3, 3>::identity();
+
+    // Ones on the diagonal, zeros elsewhere
+    for (size_t k = 0; k < 9; ++k) {
+        const size_t row = k / 3;
+        const size_t col = k % 3;
+        EXPECT_EQ(eye(row, col), (row == col) ? 1.0 : 0.0);
+    }
+
+    // Right-multiplying by the identity must leave a matrix unchanged
+    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
+                           {4.0, 5.0, 6.0},
+                           {7.0, 8.0, 9.0}};
+    Matrix<double, 3, 3> unchanged = m * eye;
+
+    for (size_t k = 0; k < 9; ++k) {
+        const size_t row = k / 3;
+        const size_t col = k % 3;
+        EXPECT_EQ(unchanged(row, col), m(row, col));
+    }
+}
+
+TEST_F(MatrixTest, ZeroMatrix) {
+    Matrix<double, 2, 3> zeros = Matrix<double, 2, 3>::zero();
+
+    // Row-major sweep over all six entries of the zero() factory result
+    for (size_t k = 0; k < 6; ++k) {
+        const size_t row = k / 3;
+        EXPECT_EQ(zeros(row, k % 3), 0.0);
+    }
+}
+
+TEST_F(MatrixTest, DiagonalMatrix) {
+    Vector<double, 3> entries{1.0, 2.0, 3.0};
+    Matrix<double, 3, 3> D = Matrix<double, 3, 3>::diagonal(entries);
+    // The vector lands on the main diagonal, in order
+    EXPECT_EQ(D(0, 0), 1.0);
+    EXPECT_EQ(D(1, 1), 2.0);
+    EXPECT_EQ(D(2, 2), 3.0);
+    // All six off-diagonal entries are zero
+    for (size_t row = 0; row < 3; ++row) {
+        for (size_t col = 0; col < 3; ++col) {
+            if (row != col) {
+                EXPECT_EQ(D(row, col), 0.0);
+            }
+        }
+    }
+}
+
+// =============================================================================
+// Expression Template Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ExpressionTemplatesNoTemporaries) {
+    Matrix<double, 2, 2> first{{1, 2}, {3, 4}};
+    Matrix<double, 2, 2> second{{5, 6}, {7, 8}};
+    Matrix<double, 2, 2> third{{9, 10}, {11, 12}};
+
+    // Only the values of the chained expression are checked here, not allocation behaviour
+    Matrix<double, 2, 2> combined = first + second - third;
+
+    EXPECT_EQ(combined(0, 0), -3.0);   // 1 + 5 - 9
+    EXPECT_EQ(combined(0, 1), -2.0);   // 2 + 6 - 10
+    EXPECT_EQ(combined(1, 0), -1.0);   // 3 + 7 - 11
+    EXPECT_EQ(combined(1, 1), 0.0);    // 4 + 8 - 12
+}
+
+TEST_F(MatrixTest, LazyEvaluation) {
+    Matrix<double, 2, 2> lhs{{1, 2}, {3, 4}};
+    Matrix<double, 2, 2> rhs{{5, 6}, {7, 8}};
+
+    // Hold the sum as whatever type operator+ returns, then convert it to a Matrix
+    auto pending = lhs + rhs;
+    Matrix<double, 2, 2> total = pending;
+    // First row of the evaluated sum: 1 + 5 and 2 + 6
+    EXPECT_EQ(total(0, 0), 6.0);
+    EXPECT_EQ(total(0, 1), 8.0);
+}
+
+// =============================================================================
+// Edge Cases and Error Handling Tests
+// =============================================================================
+
+TEST_F(MatrixTest, SingularMatrixInverse) {
+    // Second row is twice the first, so the determinant is exactly zero
+    Matrix<double, 2, 2> rank_deficient{{1.0, 2.0},
+                                        {2.0, 4.0}};
+    EXPECT_THROW(rank_deficient.inverse(), std::runtime_error);
+}
+
+TEST_F(MatrixTest, DivisionByZero) {
+    Matrix<double, 2, 2> m{{1.0, 2.0},
+                           {3.0, 4.0}};
+    // Dividing positive entries by 0.0 is expected to produce infinities rather than throw
+    Matrix<double, 2, 2> quotient = m / 0.0;
+    EXPECT_TRUE(std::isinf(quotient(0, 0)));
+    EXPECT_TRUE(std::isinf(quotient(0, 1)));
+}
+
+TEST_F(MatrixTest, ExtremeLargeValues) {
+    // 1e308 is close to the largest finite double; halving it must stay finite
+    const double huge = 1e308;
+    Matrix<double, 2, 2> m{{huge, 0}, {0, huge}};
+    Matrix<double, 2, 2> halved = m / 2.0;
+
+    EXPECT_FALSE(std::isinf(halved(0, 0)));
+    EXPECT_EQ(halved(0, 0), huge / 2.0);
+}
+
+// =============================================================================
+// Numerical Precision Tests
+// =============================================================================
+
+TEST_F(MatrixTest, NumericalStability) {
+    // Near-singular matrix: the exact determinant equals the perturbation eps
+    const double eps = 1e-15;
+    Matrix<double, 2, 2> m{{1.0, 1.0},
+                           {1.0, 1.0 + eps}};
+
+    const double det = m.determinant();
+    // Rounding 1.0 + eps costs up to ~1.1e-16; a bound larger than eps would also accept det == 0
+    EXPECT_NEAR(det, eps, 0.5 * eps);
+}
+
+TEST_F(MatrixTest, OrthogonalMatrixProperties) {
+    // 45-degree rotation (orthogonal); atan(1) == pi/4 avoids the non-standard M_PI macro
+    const double angle = std::atan(1.0);
+    Matrix<double, 2, 2> R{{std::cos(angle), -std::sin(angle)},
+                           {std::sin(angle), std::cos(angle)}};
+
+    // Check orthogonality: R * R^T = I
+    Matrix<double, 2, 2> RRt = R * R.transpose();
+    EXPECT_NEAR(RRt(0, 0), 1.0, tolerance);
+    EXPECT_NEAR(RRt(0, 1), 0.0, tolerance);
+    EXPECT_NEAR(RRt(1, 0), 0.0, tolerance);
+    EXPECT_NEAR(RRt(1, 1), 1.0, tolerance);
+
+    // Check determinant = ±1
+    EXPECT_NEAR(std::abs(R.determinant()), 1.0, tolerance);
+}
+
+// =============================================================================
+// Matrix Properties Tests
+// =============================================================================
+
+TEST_F(MatrixTest, IsSymmetric) {
+    // Mirror-image entries across the diagonal: (0,1)=(1,0)=2, (0,2)=(2,0)=3, (1,2)=(2,1)=5
+    Matrix<double, 3, 3> mirrored{{1, 2, 3},
+                                  {2, 4, 5},
+                                  {3, 5, 6}};
+    EXPECT_TRUE(mirrored.is_symmetric(tolerance));
+    Matrix<double, 3, 3> sequential{{1, 2, 3},
+                                    {4, 5, 6},
+                                    {7, 8, 9}};
+    EXPECT_FALSE(sequential.is_symmetric(tolerance));
+}
+
+TEST_F(MatrixTest, IsSkewSymmetric) {
+    // Zero diagonal and every (i, j) entry the negative of the (j, i) entry
+    Matrix<double, 3, 3> antisymmetric{{0, -1, 2},
+                                       {1, 0, -3},
+                                       {-2, 3, 0}};
+    EXPECT_TRUE(antisymmetric.is_skew_symmetric(tolerance));
+    Matrix<double, 3, 3> sequential{{1, 2, 3},
+                                    {4, 5, 6},
+                                    {7, 8, 9}};
+    EXPECT_FALSE(sequential.is_skew_symmetric(tolerance));
+}
+
+TEST_F(MatrixTest, IsDiagonal) {
+    // Non-zero entries only on the main diagonal
+    Matrix<double, 3, 3> pure{{1, 0, 0},
+                              {0, 2, 0},
+                              {0, 0, 3}};
+    EXPECT_TRUE(pure.is_diagonal(tolerance));
+    Matrix<double, 3, 3> perturbed{{1, 0.1, 0},  // one off-diagonal entry far above tolerance
+                                   {0, 2, 0},
+                                   {0, 0, 3}};
+    EXPECT_FALSE(perturbed.is_diagonal(tolerance));
+}
+
+// =============================================================================
+// Thread Safety Tests
+// =============================================================================
+
+TEST_F(MatrixTest, ThreadSafetyReadOnly) {
+    Matrix<double, 3, 3> shared{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+
+    std::vector<double> traces(10);
+    std::vector<std::thread> workers;
+
+    // Each worker only reads `shared` and writes its own slot of `traces`
+    for (int slot = 0; slot < 10; ++slot) {
+        workers.emplace_back([&shared, &traces, slot]() {
+            traces[static_cast<std::size_t>(slot)] = shared.trace();
+        });
+    }
+    for (std::thread& worker : workers) {
+        worker.join();
+    }
+
+    for (double value : traces) {
+        EXPECT_EQ(value, 15.0);
+    }
+}
+
+// =============================================================================
+// Memory Alignment Tests
+// =============================================================================
+
+TEST_F(MatrixTest, MemoryAlignment) {
+    Matrix<double, 3, 3> m;
+    // Compare against an unsigned zero so EXPECT_EQ does not mix signed and unsigned operands
+    const auto addr = reinterpret_cast<std::uintptr_t>(m.data());
+    EXPECT_EQ(addr % 32, 0u) << "Matrix data should be 32-byte aligned for AVX";
+}
+
+// =============================================================================
+// Utility Function Tests
+// =============================================================================
+
+TEST_F(MatrixTest, Norms) {
+    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
+    const double sum_of_squares = 1.0 + 4.0 + 9.0 + 16.0;  // = 30
+
+    // Frobenius norm is the square root of the sum of squared entries
+    EXPECT_NEAR(m.frobenius_norm(), std::sqrt(sum_of_squares), tolerance);
+
+    // Infinity norm is the largest absolute row sum: max(1 + 2, 3 + 4)
+    EXPECT_EQ(m.infinity_norm(), 7.0);
+    // One norm is the largest absolute column sum: max(1 + 3, 2 + 4)
+    EXPECT_EQ(m.one_norm(), 6.0);
+}
+
+TEST_F(MatrixTest, MinMaxElements) {
+    Matrix<double, 2, 3> mixed_signs{{3, -1, 4}, {1, -2, 5}};
+    // Extremes over all six entries: smallest is -2, largest is 5
+    EXPECT_EQ(mixed_signs.min(), -2.0);
+    EXPECT_EQ(mixed_signs.max(), 5.0);
+}
+
+TEST_F(MatrixTest, ToString) {
+    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
+
+    // Streamed form: rows in brackets, newline plus one space before the second row
+    std::stringstream out;
+    out << m;
+    EXPECT_EQ(out.str(), "[[1, 2]\n [3, 4]]");
+}
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
new file mode 100644
index 000000000..9486f409c
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
@@ -0,0 +1,528 @@
+/**
+ * @file test_MatrixExpr.cpp
+ * @brief Unit tests for MatrixExpr.h - matrix expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Matrix.h"
+#include "FE/Math/MatrixExpr.h"
+#include "FE/Math/Vector.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <memory>
+#include <atomic>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Test fixture for MatrixExpr tests
+class MatrixExprTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+
+    // Stateless allocator that counts calls and bytes in static atomics (one set per T)
+    template<typename T>
+    class TrackingAllocator {
+    public:
+        using value_type = T;
+        static std::atomic<size_t> allocations;
+        static std::atomic<size_t> deallocations;
+        static std::atomic<size_t> bytes_allocated;
+
+        TrackingAllocator() = default;
+        template<typename U>
+        TrackingAllocator(const TrackingAllocator<U>&) noexcept {}
+
+        T* allocate(size_t n) {
+            allocations.fetch_add(1);
+            bytes_allocated.fetch_add(n * sizeof(T));
+            return static_cast<T*>(::operator new(n * sizeof(T)));
+        }
+
+        // The element count is not needed to free the block, so it is left unnamed
+        void deallocate(T* p, size_t /*n*/) noexcept {
+            deallocations.fetch_add(1);
+            ::operator delete(p);
+        }
+
+        // No per-instance state, so all instances compare equal (Allocator requirement)
+        friend bool operator==(const TrackingAllocator&, const TrackingAllocator&) noexcept { return true; }
+        friend bool operator!=(const TrackingAllocator&, const TrackingAllocator&) noexcept { return false; }
+        static void reset() {
+            allocations = 0;
+            deallocations = 0;
+            bytes_allocated = 0;
+        }
+    };
+
+    void SetUp() override {
+        TrackingAllocator<double>::reset();
+    }
+    void TearDown() override {}
+
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;
+    }
+};
+
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::allocations{0};  // incremented once per allocate() call
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::deallocations{0};  // incremented once per deallocate() call
+template<typename T>
+std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::bytes_allocated{0};  // running total of n * sizeof(T)
+
+// =============================================================================
+// Lazy Evaluation Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, LazyEvaluationNoTemporaries) {
+    // Operands for a two-operator chain
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
+
+    // Keep the chain in its deduced type instead of converting it to a Matrix
+    auto chained = A + B - C;
+
+    // The deduced type must be something other than the concrete Matrix type
+    constexpr bool is_plain_matrix = std::is_same_v<decltype(chained), Matrix<double, 2, 2>>;
+    EXPECT_FALSE(is_plain_matrix);
+
+    // Converting to a Matrix yields the element-wise A + B - C
+    Matrix<double, 2, 2> evaluated = chained;
+    EXPECT_DOUBLE_EQ(evaluated(0, 0), -3.0);
+    EXPECT_DOUBLE_EQ(evaluated(0, 1), -2.0);
+    EXPECT_DOUBLE_EQ(evaluated(1, 0), -1.0);
+    EXPECT_DOUBLE_EQ(evaluated(1, 1), 0.0);
+}
+
+TEST_F(MatrixExprTest, LazyEvaluationAccessPattern) {
+    Matrix<double, 3, 3> A;
+    Matrix<double, 3, 3> B;
+    // A holds 1..9 in row-major order and B is twice A; unsigned indices, as in test_Matrix.cpp
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            A(i, j) = static_cast<double>(i * 3 + j + 1);
+            B(i, j) = 2.0 * A(i, j);
+        }
+    }
+    auto expr = A + B;
+
+    // Access individual elements without full evaluation
+    EXPECT_DOUBLE_EQ(expr(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(expr(1, 1), 15.0);
+    EXPECT_DOUBLE_EQ(expr(2, 2), 27.0);
+
+    // Size should be accessible
+    EXPECT_EQ(expr.rows(), 3u);
+    EXPECT_EQ(expr.cols(), 3u);
+}
+
+// =============================================================================
+// Matrix Multiplication Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, MatrixMultiplicationExpression) {
+    Matrix<double, 2, 3> lhs{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+    Matrix<double, 3, 2> rhs{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
+
+    // (2x3) * (3x2) collapses the shared dimension and yields a 2x2 matrix
+    Matrix<double, 2, 2> product = lhs * rhs;
+
+    // Each entry is a row of lhs dotted with a column of rhs
+    EXPECT_DOUBLE_EQ(product(0, 0), 58.0);   // 7 + 18 + 33
+    EXPECT_DOUBLE_EQ(product(0, 1), 64.0);   // 8 + 20 + 36
+    EXPECT_DOUBLE_EQ(product(1, 0), 139.0);  // 28 + 45 + 66
+    EXPECT_DOUBLE_EQ(product(1, 1), 154.0);  // 32 + 50 + 72
+}
+
+TEST_F(MatrixExprTest, ChainedMatrixMultiplication) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
+
+    // operator* associates left to right, so this is (A * B) * C
+    Matrix<double, 2, 2> triple = A * B * C;
+
+    // Intermediate product A * B, checked on its own
+    Matrix<double, 2, 2> left_pair = A * B;
+    EXPECT_DOUBLE_EQ(left_pair(0, 0), 19.0);  // 5 + 14
+    EXPECT_DOUBLE_EQ(left_pair(0, 1), 22.0);  // 6 + 16
+    EXPECT_DOUBLE_EQ(left_pair(1, 0), 43.0);  // 15 + 28
+    EXPECT_DOUBLE_EQ(left_pair(1, 1), 50.0);  // 18 + 32
+
+    // Full chain: rows of A * B dotted with columns of C
+    EXPECT_DOUBLE_EQ(triple(0, 0), 413.0);   // 171 + 242
+    EXPECT_DOUBLE_EQ(triple(0, 1), 454.0);   // 190 + 264
+    EXPECT_DOUBLE_EQ(triple(1, 0), 937.0);   // 387 + 550
+    EXPECT_DOUBLE_EQ(triple(1, 1), 1030.0);  // 430 + 600
+}
+
+// =============================================================================
+// Mixed Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, MixedMatrixOperations) {
+    Matrix<double, 3, 3> A, B, C, D;
+
+    // Fill the operands from simple index formulas; D is all ones
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            A(i, j) = static_cast<double>(i + j + 1);
+            B(i, j) = static_cast<double>((i + 1) * (j + 1));
+            C(i, j) = static_cast<double>(i * j + 1);
+            D(i, j) = 1.0;
+        }
+    }
+
+    // Complex expression: A * B + C * D
+    Matrix<double, 3, 3> result = A * B + C * D;
+
+    // Reference: evaluate each product separately, then compare every entry of the sum
+    Matrix<double, 3, 3> AB = A * B;
+    Matrix<double, 3, 3> CD = C * D;
+
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+            EXPECT_DOUBLE_EQ(result(i, j), AB(i, j) + CD(i, j));
+        }
+    }
+}
+
+TEST_F(MatrixExprTest, ScalarMultiplicationInExpression) {
+    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
+
+    Matrix<double, 2, 2> result = 2.0 * (A + B) / 3.0;
+    // EXPECT_NEAR (not EXPECT_TRUE on a bool) so a failure prints both values and the difference
+    EXPECT_NEAR(result(0, 0), 4.0, tolerance);
+    EXPECT_NEAR(result(0, 1), 16.0/3.0, tolerance);
+    EXPECT_NEAR(result(1, 0), 20.0/3.0, tolerance);
+    EXPECT_NEAR(result(1, 1), 8.0, tolerance);
+}
+
+// =============================================================================
+// Transpose Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, TransposeExpression) {
+    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
+    auto AT = transpose(A);
+
+    // The transpose of a 2x3 operand reports 3 rows and 2 columns
+    EXPECT_EQ(AT.rows(), 3u);
+    EXPECT_EQ(AT.cols(), 2u);
+
+    // Entry (i, j) of the transpose is entry (j, i) of A
+    const double expected[3][2] = {{1.0, 4.0}, {2.0, 5.0}, {3.0, 6.0}};
+
+    for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = 0; j < 2; ++j) {
+            EXPECT_DOUBLE_EQ(AT(i, j), expected[i][j]);
+        }
+    }
+}
+
+TEST_F(MatrixExprTest, TransposeInExpression) {
+    Matrix<double, 3, 2> left{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}};
+    Matrix<double, 3, 2> right{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
+
+    // left^T is 2x3 and right is 3x2, so the product is 2x2
+    Matrix<double, 2, 2> product = transpose(left) * right;
+
+    EXPECT_DOUBLE_EQ(product(0, 0), 89.0);   // 1*7 + 3*9 + 5*11
+    EXPECT_DOUBLE_EQ(product(0, 1), 98.0);   // 1*8 + 3*10 + 5*12
+    EXPECT_DOUBLE_EQ(product(1, 0), 116.0);  // 2*7 + 4*9 + 6*11
+    EXPECT_DOUBLE_EQ(product(1, 1), 128.0);  // 2*8 + 4*10 + 6*12
+}
+
+// =============================================================================
+// Unary Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, NegationInExpression) {
+    Matrix<double, 2, 2> first{{1.0, -2.0}, {3.0, -4.0}};
+    Matrix<double, 2, 2> second{{5.0, 6.0}, {-7.0, 8.0}};
+
+    Matrix<double, 2, 2> negated_sum = -first + (-second);
+    // Each entry equals -(first + second) at the same position
+    EXPECT_DOUBLE_EQ(negated_sum(0, 0), -6.0);
+    EXPECT_DOUBLE_EQ(negated_sum(0, 1), -4.0);
+    EXPECT_DOUBLE_EQ(negated_sum(1, 0), 4.0);
+    EXPECT_DOUBLE_EQ(negated_sum(1, 1), -4.0);
+}
+
+TEST_F(MatrixExprTest, AbsoluteValueExpression) {
+    Matrix<double, 2, 3> mixed_signs{{-1.5, 2.3, -4.7}, {0.0, -3.2, 5.1}};
+
+    Matrix<double, 2, 3> magnitudes = abs(mixed_signs);
+    // Signs are dropped entry by entry; zero stays zero
+    EXPECT_DOUBLE_EQ(magnitudes(0, 0), 1.5);
+    EXPECT_DOUBLE_EQ(magnitudes(0, 1), 2.3);
+    EXPECT_DOUBLE_EQ(magnitudes(0, 2), 4.7);
+    EXPECT_DOUBLE_EQ(magnitudes(1, 0), 0.0);
+    EXPECT_DOUBLE_EQ(magnitudes(1, 1), 3.2);
+    EXPECT_DOUBLE_EQ(magnitudes(1, 2), 5.1);
+}
+
+TEST_F(MatrixExprTest, SqrtExpression) {
+    Matrix<double, 2, 2> squares{{4.0, 9.0}, {16.0, 25.0}};
+
+    Matrix<double, 2, 2> roots = sqrt(squares);
+    // Inputs are perfect squares, so the element-wise roots are exact
+    EXPECT_DOUBLE_EQ(roots(0, 0), 2.0);
+    EXPECT_DOUBLE_EQ(roots(0, 1), 3.0);
+    EXPECT_DOUBLE_EQ(roots(1, 0), 4.0);
+    EXPECT_DOUBLE_EQ(roots(1, 1), 5.0);
+}
+
+// =============================================================================
+// Element-wise Operations Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, HadamardProductExpression) {
+    Matrix<double, 2, 3> left{{2.0, 3.0, 4.0}, {5.0, 6.0, 7.0}};
+    Matrix<double, 2, 3> right{{8.0, 9.0, 10.0}, {11.0, 12.0, 13.0}};
+
+    Matrix<double, 2, 3> elementwise = hadamard(left, right);
+    // Each entry is the product of the two entries at the same position
+    EXPECT_DOUBLE_EQ(elementwise(0, 0), 16.0);
+    EXPECT_DOUBLE_EQ(elementwise(0, 1), 27.0);
+    EXPECT_DOUBLE_EQ(elementwise(0, 2), 40.0);
+    EXPECT_DOUBLE_EQ(elementwise(1, 0), 55.0);
+    EXPECT_DOUBLE_EQ(elementwise(1, 1), 72.0);
+    EXPECT_DOUBLE_EQ(elementwise(1, 2), 91.0);
+}
+
+TEST_F(MatrixExprTest, HadamardDivisionExpression) {
+    Matrix<double, 2, 2> numer{{10.0, 18.0}, {28.0, 36.0}};
+    Matrix<double, 2, 2> denom{{2.0, 3.0}, {4.0, 6.0}};
+
+    Matrix<double, 2, 2> quotient = hadamard_div(numer, denom);
+    // Each entry is numer / denom at the same position
+    EXPECT_DOUBLE_EQ(quotient(0, 0), 5.0);
+    EXPECT_DOUBLE_EQ(quotient(0, 1), 6.0);
+    EXPECT_DOUBLE_EQ(quotient(1, 0), 7.0);
+    EXPECT_DOUBLE_EQ(quotient(1, 1), 6.0);
+}
+
+// =============================================================================
+// Norm and Trace Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, FrobeniusNormOfExpression) {
+    Matrix<double, 2, 2> minuend{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> subtrahend{{2.0, 2.0}, {2.0, 2.0}};
+
+    double squared = frobenius_norm_squared(minuend - subtrahend);
+    double plain = frobenius_norm(minuend - subtrahend);
+
+    // (minuend - subtrahend) = [[-1, 0], [1, 2]]
+    // sum of squared entries = 1 + 0 + 1 + 4 = 6
+    EXPECT_DOUBLE_EQ(squared, 6.0);
+    EXPECT_DOUBLE_EQ(plain, std::sqrt(6.0));
+}
+
+TEST_F(MatrixExprTest, TraceOfExpression) {
+    Matrix<double, 3, 3> low;
+    Matrix<double, 3, 3> high;
+
+    // Fill both operands as diagonal matrices (zeros off the diagonal)
+    for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 3; ++c) {
+            low(r, c) = (r == c) ? (r + 1) : 0.0;   // diag(1, 2, 3)
+            high(r, c) = (r == c) ? (r + 4) : 0.0;  // diag(4, 5, 6)
+        }
+    }
+
+    double diag_total = trace(low + high);
+
+    // low + high = diag(5, 7, 9), whose diagonal sums to 21
+    EXPECT_DOUBLE_EQ(diag_total, 21.0);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, TypeDeductionCorrectness) {
+    Matrix<float, 2, 2> Mf{{1.0f, 2.0f}, {3.0f, 4.0f}};
+    Matrix<double, 2, 2> Md{{5.0, 6.0}, {7.0, 8.0}};
+    // A same-type sum must yield elements of its operands' scalar type
+    // Float expression
+    auto expr = Mf + Mf;
+    using ExprType = decltype(expr(0, 0));
+    EXPECT_TRUE((std::is_same_v<ExprType, float>));
+    EXPECT_TRUE((std::is_same_v<decltype((Md + Md)(0, 0)), double>));  // double path
+    // Test that expression evaluates correctly
+    Matrix<float, 2, 2> result = expr;
+    EXPECT_FLOAT_EQ(result(0, 0), 2.0f);
+    EXPECT_FLOAT_EQ(result(1, 1), 8.0f);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SFINAEConstraints) {
+    // Test that MatrixExpr operators only work with MatrixExpr types
+    Matrix<double, 2, 2> M1{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> M2{{5.0, 6.0}, {7.0, 8.0}};
+
+    // This should compile, and the materialized sum must hold the right values
+    auto expr = M1 + M2;
+    Matrix<double, 2, 2> result = expr;
+    EXPECT_DOUBLE_EQ(result(0, 0), 6.0);  // 1 + 5; result was previously never read
+    // Verify the constraint checking: Matrix derives from its own MatrixExpr base
+    EXPECT_TRUE((std::is_base_of_v<MatrixExpr<Matrix<double, 2, 2>>, Matrix<double, 2, 2>>));
+}
+
+// =============================================================================
+// Aliasing and Self-Assignment Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SelfAssignmentWithExpression) {
+    Matrix<double, 2, 2> accum{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> delta{{5.0, 6.0}, {7.0, 8.0}};
+
+    // The assignment target is also the left operand of the sum
+    accum = accum + delta;
+
+    EXPECT_DOUBLE_EQ(accum(0, 0), 6.0);
+    EXPECT_DOUBLE_EQ(accum(0, 1), 8.0);
+    EXPECT_DOUBLE_EQ(accum(1, 0), 10.0);
+    EXPECT_DOUBLE_EQ(accum(1, 1), 12.0);
+}
+
+TEST_F(MatrixExprTest, AliasingInExpression) {
+    Matrix<double, 2, 2> target{{2.0, 3.0}, {4.0, 5.0}};
+    Matrix<double, 2, 2> all_ones{{1.0, 1.0}, {1.0, 1.0}};
+
+    // The assignment target is also the right operand of the sum
+    target = all_ones + target;
+
+    EXPECT_DOUBLE_EQ(target(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(target(0, 1), 4.0);
+    EXPECT_DOUBLE_EQ(target(1, 0), 5.0);
+    EXPECT_DOUBLE_EQ(target(1, 1), 6.0);
+}
+
+// =============================================================================
+// Edge Cases Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, SingleElementMatrix) {
+    Matrix<double, 1, 1> five{{5.0}};
+    Matrix<double, 1, 1> three{{3.0}};
+
+    Matrix<double, 1, 1> scalar_like = five + three - five * 0.5;
+    // 5 + 3 - 2.5
+    EXPECT_DOUBLE_EQ(scalar_like(0, 0), 5.5);
+}
+
+TEST_F(MatrixExprTest, NonSquareMatrixOperations) {
+    Matrix<double, 2, 4> base;
+    Matrix<double, 2, 4> twice;
+    // base counts 1..8 in row-major order; twice holds double those values
+    for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 4; ++c) {
+            base(r, c) = r * 4 + c + 1;
+            twice(r, c) = (r * 4 + c + 1) * 2;
+        }
+    }
+
+    Matrix<double, 2, 4> mixed = base + twice - base * 0.5;
+    // Compare against the same formula evaluated one scalar at a time
+    for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 4; ++c) {
+            double want = base(r, c) + twice(r, c) - base(r, c) * 0.5;
+            EXPECT_DOUBLE_EQ(mixed(r, c), want);
+        }
+    }
+}
+
+// =============================================================================
+// Diagonal Matrix Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, DiagonalMatrixExpression) {
+    Vector<double, 3> entries{1.0, 2.0, 3.0};
+
+    auto as_diag = DiagonalExpr<Vector<double, 3>>(entries);
+
+    // A length-3 vector yields a 3x3 view
+    EXPECT_EQ(as_diag.rows(), 3u);
+    EXPECT_EQ(as_diag.cols(), 3u);
+
+    // The diagonal carries the vector entries in order
+    EXPECT_DOUBLE_EQ(as_diag(0, 0), 1.0);
+    EXPECT_DOUBLE_EQ(as_diag(1, 1), 2.0);
+    EXPECT_DOUBLE_EQ(as_diag(2, 2), 3.0);
+
+    // Entries away from the diagonal read as zero
+    EXPECT_DOUBLE_EQ(as_diag(0, 1), 0.0);
+    EXPECT_DOUBLE_EQ(as_diag(1, 0), 0.0);
+}
+
+TEST_F(MatrixExprTest, DiagonalMatrixInExpression) {
+    Vector<double, 2> entries{2.0, 3.0};
+    Matrix<double, 2, 2> all_ones{{1.0, 1.0}, {1.0, 1.0}};
+
+    auto as_diag = DiagonalExpr<Vector<double, 2>>(entries);
+    Matrix<double, 2, 2> shifted = all_ones + as_diag;
+    // Only the diagonal entries grow; off-diagonal entries stay at 1
+    EXPECT_DOUBLE_EQ(shifted(0, 0), 3.0);
+    EXPECT_DOUBLE_EQ(shifted(0, 1), 1.0);
+    EXPECT_DOUBLE_EQ(shifted(1, 0), 1.0);
+    EXPECT_DOUBLE_EQ(shifted(1, 1), 4.0);
+}
+
+// =============================================================================
+// Complex Expression Pattern Tests
+// =============================================================================
+
+TEST_F(MatrixExprTest, ComplexNestedExpression) {
+    Matrix<double, 2, 2> X{{1.0, 2.0}, {3.0, 4.0}};
+    Matrix<double, 2, 2> Y{{5.0, 6.0}, {7.0, 8.0}};
+    Matrix<double, 2, 2> Z{{9.0, 10.0}, {11.0, 12.0}};
+
+    // Scalar scaling, abs, sqrt, hadamard and scalar division in one expression
+    Matrix<double, 2, 2> total = 2.0 * abs(X - Y) + sqrt(hadamard(Z, Z)) / 3.0;
+
+    // |X - Y| = |[-4, -4], [-4, -4]| = [4, 4], [4, 4]
+    // 2 * [4, 4], [4, 4] = [8, 8], [8, 8]
+    // hadamard(Z, Z) = [81, 100], [121, 144]
+    // sqrt(hadamard(Z, Z)) = [9, 10], [11, 12]
+    // sqrt(hadamard(Z, Z)) / 3 = [3, 10/3], [11/3, 4]
+    // total = [11, 34/3], [35/3, 12]
+
+    EXPECT_DOUBLE_EQ(total(0, 0), 11.0);
+    EXPECT_TRUE(approx_equal(total(0, 1), 34.0/3.0));
+    EXPECT_TRUE(approx_equal(total(1, 0), 35.0/3.0));
+    EXPECT_DOUBLE_EQ(total(1, 1), 12.0);
+}
+
+TEST_F(MatrixExprTest, MatrixVectorMixedExpression) {
+    Matrix<double, 3, 3> eye;
+    Vector<double, 3> entries{1.0, 2.0, 3.0};
+
+    // Fill eye with ones on the diagonal and zeros elsewhere
+    for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 3; ++c) {
+            eye(r, c) = (r == c) ? 1.0 : 0.0;
+        }
+    }
+
+    // Wrap the vector as a diagonal expression and add it to eye
+    auto as_diag = DiagonalExpr<Vector<double, 3>>(entries);
+    Matrix<double, 3, 3> shifted = eye + as_diag;
+
+    // Expected outcome is diag(2, 3, 4)
+    EXPECT_DOUBLE_EQ(shifted(0, 0), 2.0);
+    EXPECT_DOUBLE_EQ(shifted(1, 1), 3.0);
+    EXPECT_DOUBLE_EQ(shifted(2, 2), 4.0);
+    EXPECT_DOUBLE_EQ(shifted(0, 1), 0.0);
+    EXPECT_DOUBLE_EQ(shifted(1, 0), 0.0);
+}
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
new file mode 100644
index 000000000..a38a71727
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_Vector.cpp
@@ -0,0 +1,589 @@
+/**
+ * @file test_Vector.cpp
+ * @brief Unit tests for Vector.h - fixed-size vectors with expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Vector.h"
+#include "FE/Math/VectorExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+using namespace svmp::FE::math;
+
+// Test fixture for Vector tests
+class VectorTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;
+    // The fixture holds no per-test state, so both hooks are empty
+    void SetUp() override {}
+    void TearDown() override {}
+    // True when a and b differ by at most tol in absolute value
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        const auto gap = std::abs(a - b);
+        return gap <= tol;
+    }
+};
+
+// =============================================================================
+// Construction and Initialization Tests
+// =============================================================================
+
+TEST_F(VectorTest, DefaultConstruction) {
+    Vector<double, 3> dbl_vec;
+    EXPECT_EQ(dbl_vec[0], 0.0);
+    EXPECT_EQ(dbl_vec[1], 0.0);
+    EXPECT_EQ(dbl_vec[2], 0.0);
+    // Same zero check for a float vector of a different size
+    Vector<float, 4> flt_vec;
+    for (size_t k = 0; k < 4; ++k) {
+        EXPECT_EQ(flt_vec[k], 0.0f);
+    }
+}
+
+TEST_F(VectorTest, FillConstruction) {
+    Vector<double, 3> fives(5.0);
+    EXPECT_EQ(fives[0], 5.0);
+    EXPECT_EQ(fives[1], 5.0);
+    EXPECT_EQ(fives[2], 5.0);
+    // Integer element type with a negative fill value
+    Vector<int, 10> neg_threes(-3);
+    for (size_t k = 0; k < 10; ++k) {
+        EXPECT_EQ(neg_threes[k], -3);
+    }
+}
+
+TEST_F(VectorTest, InitializerListConstruction) {
+    Vector<double, 3> full{1.0, 2.0, 3.0};
+    EXPECT_EQ(full[0], 1.0);
+    EXPECT_EQ(full[1], 2.0);
+    EXPECT_EQ(full[2], 3.0);
+
+    // Fewer initializers than elements: the remainder is expected to be zero
+    Vector<double, 5> partial{1.0, 2.0};
+    EXPECT_EQ(partial[0], 1.0);
+    EXPECT_EQ(partial[1], 2.0);
+    EXPECT_EQ(partial[2], 0.0);
+    EXPECT_EQ(partial[3], 0.0);
+    EXPECT_EQ(partial[4], 0.0);
+}
+
+TEST_F(VectorTest, CopyConstruction) {
+    Vector<double, 3> original{1.0, 2.0, 3.0};
+    Vector<double, 3> duplicate(original);
+
+    EXPECT_EQ(duplicate[0], 1.0);
+    EXPECT_EQ(duplicate[1], 2.0);
+    EXPECT_EQ(duplicate[2], 3.0);
+
+    // Writing to the copy must leave the original untouched
+    duplicate[0] = 10.0;
+    EXPECT_EQ(original[0], 1.0);
+    EXPECT_EQ(duplicate[0], 10.0);
+}
+
+TEST_F(VectorTest, MoveConstruction) {
+    Vector<double, 3> donor{1.0, 2.0, 3.0};
+    Vector<double, 3> receiver(std::move(donor));
+    // Only the move target is inspected afterwards
+    EXPECT_EQ(receiver[0], 1.0);
+    EXPECT_EQ(receiver[1], 2.0);
+    EXPECT_EQ(receiver[2], 3.0);
+}
+
+// =============================================================================
+// Element Access Tests
+// =============================================================================
+
+TEST_F(VectorTest, ElementAccess) {
+    Vector<double, 3> mutable_vec{1.0, 2.0, 3.0};
+
+    // Reads through operator[] on a non-const vector
+    EXPECT_EQ(mutable_vec[0], 1.0);
+    EXPECT_EQ(mutable_vec[1], 2.0);
+    EXPECT_EQ(mutable_vec[2], 3.0);
+
+    // Writes through operator[] are visible on the next read
+    mutable_vec[1] = 5.0;
+    EXPECT_EQ(mutable_vec[1], 5.0);
+
+    // Reads through operator[] on a const vector
+    const Vector<double, 3> frozen{4.0, 5.0, 6.0};
+    EXPECT_EQ(frozen[0], 4.0);
+    EXPECT_EQ(frozen[1], 5.0);
+    EXPECT_EQ(frozen[2], 6.0);
+}
+
+TEST_F(VectorTest, ElementAccessBounds) {
+    Vector<double, 3> triple{1.0, 2.0, 3.0};
+
+    // Valid indices through at() return the stored values
+    EXPECT_EQ(triple.at(0), 1.0);
+    EXPECT_EQ(triple.at(1), 2.0);
+    EXPECT_EQ(triple.at(2), 3.0);
+
+    // Indices at or beyond the size must raise std::out_of_range
+    EXPECT_THROW(triple.at(3), std::out_of_range);
+    EXPECT_THROW(triple.at(100), std::out_of_range);
+}
+
+TEST_F(VectorTest, DataPointerAccess) {
+    Vector<double, 3> mutable_vec{1.0, 2.0, 3.0};
+
+    double* raw = mutable_vec.data();
+    EXPECT_EQ(raw[0], 1.0);
+    EXPECT_EQ(raw[1], 2.0);
+    EXPECT_EQ(raw[2], 3.0);
+
+    // data() on a const vector hands back a pointer-to-const
+    const Vector<double, 3> frozen{4.0, 5.0, 6.0};
+    const double* raw_const = frozen.data();
+    EXPECT_EQ(raw_const[0], 4.0);
+    EXPECT_EQ(raw_const[1], 5.0);
+    EXPECT_EQ(raw_const[2], 6.0);
+}
+
+// =============================================================================
+// Arithmetic Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, Addition) {
+    Vector<double, 3> lhs{1.0, 2.0, 3.0};
+    Vector<double, 3> rhs{4.0, 5.0, 6.0};
+    // Component-wise sum
+    Vector<double, 3> total = lhs + rhs;
+    EXPECT_EQ(total[0], 5.0);
+    EXPECT_EQ(total[1], 7.0);
+    EXPECT_EQ(total[2], 9.0);
+}
+
+TEST_F(VectorTest, Subtraction) {
+    Vector<double, 3> lhs{5.0, 7.0, 9.0};
+    Vector<double, 3> rhs{4.0, 5.0, 6.0};
+    // Component-wise difference
+    Vector<double, 3> diff = lhs - rhs;
+    EXPECT_EQ(diff[0], 1.0);
+    EXPECT_EQ(diff[1], 2.0);
+    EXPECT_EQ(diff[2], 3.0);
+}
+
+TEST_F(VectorTest, ScalarMultiplication) {
+    Vector<double, 3> base{1.0, 2.0, 3.0};
+
+    // Scalar on the left-hand side
+    Vector<double, 3> doubled = 2.0 * base;
+    EXPECT_EQ(doubled[0], 2.0);
+    EXPECT_EQ(doubled[1], 4.0);
+    EXPECT_EQ(doubled[2], 6.0);
+
+    // Scalar on the right-hand side
+    Vector<double, 3> tripled = base * 3.0;
+    EXPECT_EQ(tripled[0], 3.0);
+    EXPECT_EQ(tripled[1], 6.0);
+    EXPECT_EQ(tripled[2], 9.0);
+}
+
+TEST_F(VectorTest, ScalarDivision) {
+    Vector<double, 3> evens{2.0, 4.0, 6.0};
+    // Every component is divided by the same scalar
+    Vector<double, 3> halved = evens / 2.0;
+    EXPECT_EQ(halved[0], 1.0);
+    EXPECT_EQ(halved[1], 2.0);
+    EXPECT_EQ(halved[2], 3.0);
+}
+
+TEST_F(VectorTest, UnaryNegation) {
+    Vector<double, 3> mixed{1.0, -2.0, 3.0};
+    // Unary minus flips the sign of every component
+    Vector<double, 3> flipped = -mixed;
+    EXPECT_EQ(flipped[0], -1.0);
+    EXPECT_EQ(flipped[1], 2.0);
+    EXPECT_EQ(flipped[2], -3.0);
+}
+
+TEST_F(VectorTest, CompoundAssignment) {
+    Vector<double, 3> acc{1.0, 2.0, 3.0};
+    Vector<double, 3> step{4.0, 5.0, 6.0};
+
+    // In-place addition
+    acc += step;
+    EXPECT_EQ(acc[0], 5.0);
+    EXPECT_EQ(acc[1], 7.0);
+    EXPECT_EQ(acc[2], 9.0);
+
+    // In-place subtraction restores the starting values
+    acc -= step;
+    EXPECT_EQ(acc[0], 1.0);
+    EXPECT_EQ(acc[1], 2.0);
+    EXPECT_EQ(acc[2], 3.0);
+
+    // In-place scaling by a scalar
+    acc *= 2.0;
+    EXPECT_EQ(acc[0], 2.0);
+    EXPECT_EQ(acc[1], 4.0);
+    EXPECT_EQ(acc[2], 6.0);
+
+    // In-place division by the same scalar restores the starting values
+    acc /= 2.0;
+    EXPECT_EQ(acc[0], 1.0);
+    EXPECT_EQ(acc[1], 2.0);
+    EXPECT_EQ(acc[2], 3.0);
+}
+
+// =============================================================================
+// Vector Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, DotProduct) {
+    Vector<double, 3> u{1.0, 2.0, 3.0};
+    Vector<double, 3> w{4.0, 5.0, 6.0};
+
+    double uw = u.dot(w);
+    EXPECT_EQ(uw, 32.0);  // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
+
+    // Swapping the operands must give the same value
+    EXPECT_EQ(w.dot(u), uw);
+
+    // Two distinct coordinate axes have a zero dot product
+    Vector<double, 3> ex{1.0, 0.0, 0.0};
+    Vector<double, 3> ey{0.0, 1.0, 0.0};
+    EXPECT_EQ(ex.dot(ey), 0.0);
+}
+
+TEST_F(VectorTest, CrossProduct3D) {
+    Vector<double, 3> ex{1.0, 0.0, 0.0};
+    Vector<double, 3> ey{0.0, 1.0, 0.0};
+    Vector<double, 3> ez{0.0, 0.0, 1.0};
+
+    // ex x ey points along +z
+    Vector<double, 3> ex_ey = ex.cross(ey);
+    EXPECT_EQ(ex_ey[0], 0.0);
+    EXPECT_EQ(ex_ey[1], 0.0);
+    EXPECT_EQ(ex_ey[2], 1.0);
+
+    Vector<double, 3> ey_ex = ey.cross(ex);
+    EXPECT_EQ(ey_ex[0], 0.0);
+    EXPECT_EQ(ey_ex[1], 0.0);
+    EXPECT_EQ(ey_ex[2], -1.0);
+
+    // Arbitrary operands, checked component by component
+    Vector<double, 3> u{1.0, 2.0, 3.0};
+    Vector<double, 3> w{4.0, 5.0, 6.0};
+    Vector<double, 3> uxw = u.cross(w);
+
+    EXPECT_EQ(uxw[0], -3.0);  // 2*6 - 3*5 = 12 - 15 = -3
+    EXPECT_EQ(uxw[1], 6.0);   // 3*4 - 1*6 = 12 - 6 = 6
+    EXPECT_EQ(uxw[2], -3.0);  // 1*5 - 2*4 = 5 - 8 = -3
+}
+
+TEST_F(VectorTest, Norm) {
+    Vector<double, 3> three_four{3.0, 4.0, 0.0};
+    EXPECT_EQ(three_four.norm(), 5.0);
+    // A coordinate axis has length one
+    Vector<double, 3> axis{1.0, 0.0, 0.0};
+    EXPECT_EQ(axis.norm(), 1.0);
+    // The all-zero vector has length zero
+    Vector<double, 3> origin{0.0, 0.0, 0.0};
+    EXPECT_EQ(origin.norm(), 0.0);
+}
+
+TEST_F(VectorTest, NormSquared) {
+    Vector<double, 3> three_four{3.0, 4.0, 0.0};
+    EXPECT_EQ(three_four.norm_squared(), 25.0);
+
+    Vector<double, 3> ramp{1.0, 2.0, 3.0};
+    EXPECT_EQ(ramp.norm_squared(), 14.0);  // 1 + 4 + 9 = 14
+}
+
+TEST_F(VectorTest, Normalize) {
+    Vector<double, 3> raw{3.0, 4.0, 0.0};
+    Vector<double, 3> unit_copy = raw.normalized();
+
+    EXPECT_NEAR(unit_copy[0], 0.6, tolerance);
+    EXPECT_NEAR(unit_copy[1], 0.8, tolerance);
+    EXPECT_NEAR(unit_copy[2], 0.0, tolerance);
+    EXPECT_NEAR(unit_copy.norm(), 1.0, tolerance);
+
+    // normalize() rescales the vector it is called on
+    raw.normalize();
+    EXPECT_NEAR(raw[0], 0.6, tolerance);
+    EXPECT_NEAR(raw[1], 0.8, tolerance);
+    EXPECT_NEAR(raw.norm(), 1.0, tolerance);
+}
+
+// =============================================================================
+// Expression Template Tests
+// =============================================================================
+
+TEST_F(VectorTest, ExpressionTemplatesNoTemporaries) {
+    Vector<double, 3> p{1.0, 2.0, 3.0};
+    Vector<double, 3> q{4.0, 5.0, 6.0};
+    Vector<double, 3> r{7.0, 8.0, 9.0};
+    Vector<double, 3> s{10.0, 11.0, 12.0};
+
+    // Chained expression; only the resulting values are checked here
+    Vector<double, 3> chained = p + q - r + s;
+
+    EXPECT_EQ(chained[0], 8.0);   // 1 + 4 - 7 + 10
+    EXPECT_EQ(chained[1], 10.0);  // 2 + 5 - 8 + 11
+    EXPECT_EQ(chained[2], 12.0);  // 3 + 6 - 9 + 12
+}
+
+TEST_F(VectorTest, LazyEvaluation) {
+    Vector<double, 3> lhs{1.0, 2.0, 3.0};
+    Vector<double, 3> rhs{4.0, 5.0, 6.0};
+
+    // Hold the sum as an expression object before converting it to a Vector
+    auto pending = lhs + rhs;  // stored with auto, not yet a Vector
+
+    Vector<double, 3> materialized = pending;  // conversion to a concrete Vector
+    EXPECT_EQ(materialized[0], 5.0);
+    EXPECT_EQ(materialized[1], 7.0);
+    EXPECT_EQ(materialized[2], 9.0);
+}
+
+TEST_F(VectorTest, MixedExpressions) {
+    Vector<double, 3> u{1.0, 2.0, 3.0};
+    Vector<double, 3> w{4.0, 5.0, 6.0};
+    double factor = 2.0;
+
+    // Scalar scaling, addition, subtraction and scalar division together
+    Vector<double, 3> blended = factor * (u + w) - u / factor;
+
+    EXPECT_NEAR(blended[0], 9.5, tolerance);   // 2*(1+4) - 1/2
+    EXPECT_NEAR(blended[1], 13.0, tolerance);  // 2*(2+5) - 2/2
+    EXPECT_NEAR(blended[2], 16.5, tolerance);  // 2*(3+6) - 3/2
+}
+
+// =============================================================================
+// Special Values Tests
+// =============================================================================
+
+TEST_F(VectorTest, ZeroVector) {
+    Vector<double, 3> origin = Vector<double, 3>::zero();
+    EXPECT_EQ(origin[0], 0.0);
+    EXPECT_EQ(origin[1], 0.0);
+    EXPECT_EQ(origin[2], 0.0);
+    EXPECT_EQ(origin.norm(), 0.0);
+}
+
+TEST_F(VectorTest, OnesVector) {
+    Vector<double, 3> all_ones = Vector<double, 3>::ones();
+    EXPECT_EQ(all_ones[0], 1.0);
+    EXPECT_EQ(all_ones[1], 1.0);
+    EXPECT_EQ(all_ones[2], 1.0);
+}
+
+TEST_F(VectorTest, BasisVectors) {
+    auto axis0 = Vector<double, 3>::basis(0);
+    EXPECT_EQ(axis0[0], 1.0);
+    EXPECT_EQ(axis0[1], 0.0);
+    EXPECT_EQ(axis0[2], 0.0);
+    // basis(1): the single 1 sits at index 1
+    auto axis1 = Vector<double, 3>::basis(1);
+    EXPECT_EQ(axis1[0], 0.0);
+    EXPECT_EQ(axis1[1], 1.0);
+    EXPECT_EQ(axis1[2], 0.0);
+    // basis(2): the single 1 sits at index 2
+    auto axis2 = Vector<double, 3>::basis(2);
+    EXPECT_EQ(axis2[0], 0.0);
+    EXPECT_EQ(axis2[1], 0.0);
+    EXPECT_EQ(axis2[2], 1.0);
+}
+
+// =============================================================================
+// Edge Cases and Error Handling Tests
+// =============================================================================
+
+TEST_F(VectorTest, DivisionByZero) {
+    Vector<double, 3> finite{1.0, 2.0, 3.0};
+
+    // Dividing nonzero components by 0.0 is expected to give infinities
+    Vector<double, 3> blown_up = finite / 0.0;
+    EXPECT_TRUE(std::isinf(blown_up[0]));
+    EXPECT_TRUE(std::isinf(blown_up[1]));
+    EXPECT_TRUE(std::isinf(blown_up[2]));
+}
+
+TEST_F(VectorTest, NormalizeZeroVector) {
+    Vector<double, 3> origin{0.0, 0.0, 0.0};
+
+    // Either outcome is accepted for the first component: NaN or exactly zero
+    Vector<double, 3> attempted = origin.normalized();
+    EXPECT_TRUE(std::isnan(attempted[0]) || attempted[0] == 0.0);
+}
+
+TEST_F(VectorTest, ExtremeLargeValues) {
+    double huge = 1e308;  // Close to the largest finite double
+    Vector<double, 3> big{huge, huge, huge};
+
+    // Halving must stay finite and match the plain scalar division
+    Vector<double, 3> halved = big / 2.0;
+    EXPECT_FALSE(std::isinf(halved[0]));
+    EXPECT_EQ(halved[0], huge / 2.0);
+}
+
+TEST_F(VectorTest, ExtremeSmallValues) {
+    double minute = 1e-308;  // Magnitude near the bottom of the double range
+    Vector<double, 3> small{minute, minute, minute};
+
+    // Scaling must match the plain scalar multiplication exactly
+    Vector<double, 3> twice = small * 2.0;
+    EXPECT_EQ(twice[0], minute * 2.0);
+}
+
+// =============================================================================
+// Numerical Precision Tests
+// =============================================================================
+
+TEST_F(VectorTest, NumericalStability) {
+    // Checks element-wise addition of widely different magnitudes (no Kahan summation is run)
+    Vector<double, 4> v{1e16, 1.0, -1e16, 1.0};
+    // Plain left-to-right scalar sum of v; computed but deliberately not asserted on
+    [[maybe_unused]] double sum = v[0] + v[1] + v[2] + v[3];
+
+    // Every component of a + b below has one zero operand,
+    // so each element-wise sum is expected to be exact
+    Vector<double, 4> a{1e16, 0.0, -1e16, 0.0};
+    Vector<double, 4> b{0.0, 1.0, 0.0, 1.0};
+    Vector<double, 4> c = a + b;
+
+    EXPECT_EQ(c[0], 1e16);
+    EXPECT_EQ(c[1], 1.0);
+    EXPECT_EQ(c[2], -1e16);
+    EXPECT_EQ(c[3], 1.0);
+}
+
+TEST_F(VectorTest, OrthogonalityPreservation) {
+    // The first vector leaks a 1e-15 component onto the second one's axis
+    Vector<double, 3> tilted{1.0, 1e-15, 0.0};
+    Vector<double, 3> axis{0.0, 1.0, 0.0};
+
+    double overlap = tilted.dot(axis);
+    EXPECT_NEAR(overlap, 1e-15, 1e-16);
+}
+
+// =============================================================================
+// Comparison Operations Tests
+// =============================================================================
+
+TEST_F(VectorTest, Equality) {
+    Vector<double, 3> reference{1.0, 2.0, 3.0};
+    Vector<double, 3> twin{1.0, 2.0, 3.0};
+    Vector<double, 3> off_by_tenth{1.0, 2.0, 3.1};
+    // operator== and operator!= must give opposite answers for each pair
+    EXPECT_TRUE(reference == twin);
+    EXPECT_FALSE(reference == off_by_tenth);
+    EXPECT_FALSE(reference != twin);
+    EXPECT_TRUE(reference != off_by_tenth);
+}
+
+TEST_F(VectorTest, ApproximateEquality) {
+    Vector<double, 3> exact{1.0, 2.0, 3.0};
+    Vector<double, 3> nudged{1.0 + 1e-15, 2.0 - 1e-15, 3.0 + 1e-15};
+    // Perturbations of about 1e-15 pass at 1e-14 and fail at 1e-16
+    EXPECT_TRUE(exact.approx_equal(nudged, 1e-14));
+    EXPECT_FALSE(exact.approx_equal(nudged, 1e-16));
+}
+
+// =============================================================================
+// Thread Safety Tests
+// =============================================================================
+
+TEST_F(VectorTest, ThreadSafetyReadOnly) {
+    Vector<double, 3> shared{1.0, 2.0, 3.0};
+
+    // Ten workers read the same vector; each writes only its own output slot
+    std::vector<std::thread> workers;
+    std::vector<double> norms(10);
+
+    for (int slot = 0; slot < 10; ++slot) {
+        workers.emplace_back([&shared, &norms, slot]() {
+            norms[static_cast<std::size_t>(slot)] = shared.norm();
+        });
+    }
+
+    for (auto& worker : workers) {
+        worker.join();
+    }
+
+    // Every worker must have seen the value the main thread computes
+    double reference = shared.norm();
+    for (double seen : norms) {
+        EXPECT_EQ(seen, reference);
+    }
+}
+
+TEST_F(VectorTest, ThreadSafetyIsolated) {
+    // Every worker builds a private vector and stores a scaled copy in its slot
+    std::vector<std::thread> workers;
+    std::vector<Vector<double, 3>> outputs(10);
+
+    for (int slot = 0; slot < 10; ++slot) {
+        workers.emplace_back([&outputs, slot]() {
+            Vector<double, 3> mine{static_cast<double>(slot), 0.0, 0.0};
+            outputs[static_cast<std::size_t>(slot)] = mine * 2.0;
+        });
+    }
+
+    for (auto& worker : workers) {
+        worker.join();
+    }
+
+    // Slot k must hold 2 * k in its first component
+    for (int slot = 0; slot < 10; ++slot) {
+        EXPECT_EQ(outputs[static_cast<std::size_t>(slot)][0], 2.0 * slot);
+    }
+}
+
+// =============================================================================
+// Memory Alignment Tests
+// =============================================================================
+
+TEST_F(VectorTest, MemoryAlignment) {
+    Vector<double, 3> v;
+    // The storage address must be a multiple of 32 bytes (cited below as the AVX requirement).
+    // The literal is unsigned so EXPECT_EQ does not compare uintptr_t against a signed int.
+    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(v.data());
+    EXPECT_EQ(addr % 32, 0u) << "Vector data should be 32-byte aligned for AVX";
+}
+
+// =============================================================================
+// Utility Function Tests
+// =============================================================================
+
+TEST_F(VectorTest, MinMaxElements) {
+    Vector<double, 5> samples{3.0, -1.0, 4.0, 1.0, -2.0};
+    // Smallest value -2.0 sits at index 4, largest value 4.0 at index 2
+    EXPECT_EQ(samples.min(), -2.0);
+    EXPECT_EQ(samples.max(), 4.0);
+    EXPECT_EQ(samples.min_index(), 4);
+    EXPECT_EQ(samples.max_index(), 2);
+}
+
+TEST_F(VectorTest, Sum) {
+    Vector<double, 4> one_to_four{1.0, 2.0, 3.0, 4.0};
+    EXPECT_EQ(one_to_four.sum(), 10.0);
+    // An all-zero vector sums to zero
+    Vector<double, 3> origin{0.0, 0.0, 0.0};
+    EXPECT_EQ(origin.sum(), 0.0);
+}
+
+TEST_F(VectorTest, Mean) {
+    Vector<double, 4> one_to_four{1.0, 2.0, 3.0, 4.0};
+    EXPECT_EQ(one_to_four.mean(), 2.5);  // (1 + 2 + 3 + 4) / 4
+}
+
+TEST_F(VectorTest, ToString) {
+    Vector<double, 3> triple{1.0, 2.0, 3.0};
+    std::stringstream sink;
+    sink << triple;
+    // Streamed form: bracketed, comma-separated, no trailing zeros
+    std::string want = "[1, 2, 3]";
+    EXPECT_EQ(sink.str(), want);
+}
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
new file mode 100644
index 000000000..bd6d85d51
--- /dev/null
+++ b/tests/unitTests/FE/Math/test_VectorExpr.cpp
@@ -0,0 +1,409 @@
+/**
+ * @file test_VectorExpr.cpp
+ * @brief Unit tests for VectorExpr.h - vector expression templates
+ */
+
+#include <gtest/gtest.h>
+#include "FE/Math/Vector.h"
+#include "FE/Math/VectorExpr.h"
+#include "FE/Math/MathConstants.h"
+#include <limits>
+#include <cmath>
+#include <memory>
+#include <atomic>
+#include <type_traits>
+
+using namespace svmp::FE::math;
+
+// Fixture for the VectorExpr tests: shared tolerance, approx_equal and allocation counters
+class VectorExprTest : public ::testing::Test {
+protected:
+    static constexpr double tolerance = 1e-14;  // default absolute tolerance for approx_equal
+
+    // Allocator that counts allocate/deallocate calls and the bytes requested
+    template<typename T>
+    class TrackingAllocator {
+    public:
+        using value_type = T;
+
+        static std::atomic<size_t> allocations;      // number of allocate() calls
+        static std::atomic<size_t> deallocations;    // number of deallocate() calls
+        static std::atomic<size_t> bytes_allocated;  // running total of requested bytes
+
+        TrackingAllocator() = default;
+
+        template<typename U>
+        TrackingAllocator(const TrackingAllocator<U>&) noexcept {}  // rebind copy; no state
+
+        T* allocate(size_t n) {
+            allocations.fetch_add(1);
+            bytes_allocated.fetch_add(n * sizeof(T));
+            return static_cast<T*>(::operator new(n * sizeof(T)));
+        }
+
+        void deallocate(T* p, size_t /*n*/) {  // count unused: unnamed to avoid a warning
+            deallocations.fetch_add(1);
+            ::operator delete(p);
+        }
+
+        static void reset() {  // zero every counter
+            allocations = 0;
+            deallocations = 0;
+            bytes_allocated = 0;
+        }
+    };
+
+    void SetUp() override {
+        TrackingAllocator<double>::reset();  // each test starts from zeroed counters
+    }
+
+    void TearDown() override {}
+
+    template<typename T>
+    bool approx_equal(T a, T b, T tol = tolerance) {
+        return std::abs(a - b) <= tol;  // absolute (not relative) comparison
+    }
+};
+
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::allocations{0};
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::deallocations{0};
+template<typename T>
+std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::bytes_allocated{0};
+
+// =============================================================================
+// Lazy Evaluation Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, LazyEvaluationNoTemporaries) {
+    // Composing operators must produce an expression node rather than a Vector
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{7.0, 8.0, 9.0};
+
+    // Build the expression; nothing is assigned to a Vector yet
+    auto expr = a + b - c;
+
+    // Only the deduced type is checked here; allocations are not measured
+    using ExprType = decltype(expr);
+    EXPECT_FALSE((std::is_same_v<ExprType, Vector<double, 3>>));
+
+    // Converting to a Vector evaluates it: (1+4-7, 2+5-8, 3+6-9)
+    Vector<double, 3> result = expr;
+    EXPECT_DOUBLE_EQ(result[0], -2.0);
+    EXPECT_DOUBLE_EQ(result[1], -1.0);
+    EXPECT_DOUBLE_EQ(result[2], 0.0);
+}
+
+TEST_F(VectorExprTest, LazyEvaluationAccessPattern) {
+    Vector<double, 4> a{1.0, 2.0, 3.0, 4.0};
+    Vector<double, 4> b{5.0, 6.0, 7.0, 8.0};
+
+    auto expr = a + b;
+
+    // Index the expression directly, without converting it to a Vector: 1+5 and 3+7
+    EXPECT_DOUBLE_EQ(expr[0], 6.0);
+    EXPECT_DOUBLE_EQ(expr[2], 10.0);
+
+    // The expression must report the same length as its 4-element operands
+    EXPECT_EQ(expr.size(), 4u);
+}
+
+// =============================================================================
+// Expression Chaining Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ChainedAdditionSubtraction) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{2.0, 3.0, 4.0};
+    Vector<double, 3> d{1.0, 1.0, 1.0};
+
+    // a + b - c + d = (1+4-2+1, 2+5-3+1, 3+6-4+1)
+    Vector<double, 3> result = a + b - c + d;
+
+    EXPECT_DOUBLE_EQ(result[0], 4.0);
+    EXPECT_DOUBLE_EQ(result[1], 5.0);
+    EXPECT_DOUBLE_EQ(result[2], 6.0);
+}
+
+TEST_F(VectorExprTest, DeepExpressionNesting) {
+    Vector<double, 2> v1{1.0, 2.0};
+    Vector<double, 2> v2{3.0, 4.0};
+    Vector<double, 2> v3{5.0, 6.0};
+    Vector<double, 2> v4{7.0, 8.0};
+    Vector<double, 2> v5{9.0, 10.0};
+
+    // (4, 6) - (-2, -2) + (9, 10) = (15, 18); both sides of the '-' are sub-expressions
+    Vector<double, 2> result = ((v1 + v2) - (v3 - v4)) + v5;
+
+    EXPECT_DOUBLE_EQ(result[0], 15.0);
+    EXPECT_DOUBLE_EQ(result[1], 18.0);
+}
+
+// =============================================================================
+// Mixed Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ScalarMultiplicationInExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    Vector<double, 3> result = 2.0 * (a + b) / 3.0;  // 2 * (5, 7, 9) / 3
+
+    EXPECT_TRUE(approx_equal(result[0], 10.0/3.0));
+    EXPECT_TRUE(approx_equal(result[1], 14.0/3.0));
+    EXPECT_TRUE(approx_equal(result[2], 6.0));
+}
+
+TEST_F(VectorExprTest, MixedScalarVectorOperations) {
+    Vector<double, 4> v{2.0, 4.0, 6.0, 8.0};
+
+    // 1.5*v + 0.5*v - 1.0*v == v, so the result must reproduce v itself
+    Vector<double, 4> result = 3.0 * v / 2.0 + v * 0.5 - 1.0 * v;
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 4.0);
+    EXPECT_DOUBLE_EQ(result[2], 6.0);
+    EXPECT_DOUBLE_EQ(result[3], 8.0);
+}
+
+// =============================================================================
+// Unary Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, NegationInExpression) {
+    Vector<double, 3> a{1.0, -2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, -6.0};
+
+    Vector<double, 3> result = -a + (-b);  // (-1-4, 2-5, -3+6)
+
+    EXPECT_DOUBLE_EQ(result[0], -5.0);
+    EXPECT_DOUBLE_EQ(result[1], -3.0);
+    EXPECT_DOUBLE_EQ(result[2], 3.0);
+}
+
+TEST_F(VectorExprTest, AbsoluteValueExpression) {
+    Vector<double, 4> v{-1.5, 2.3, -4.7, 0.0};  // negative, positive and zero entries
+
+    Vector<double, 4> result = abs(v);  // each entry is expected as its magnitude
+
+    EXPECT_DOUBLE_EQ(result[0], 1.5);
+    EXPECT_DOUBLE_EQ(result[1], 2.3);
+    EXPECT_DOUBLE_EQ(result[2], 4.7);
+    EXPECT_DOUBLE_EQ(result[3], 0.0);
+}
+
+TEST_F(VectorExprTest, SqrtExpression) {
+    Vector<double, 3> v{4.0, 9.0, 16.0};  // perfect squares, so the roots are exact
+
+    Vector<double, 3> result = sqrt(v);
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 3.0);
+    EXPECT_DOUBLE_EQ(result[2], 4.0);
+}
+
+// =============================================================================
+// Element-wise Operations Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, HadamardProductExpression) {
+    Vector<double, 3> a{2.0, 3.0, 4.0};
+    Vector<double, 3> b{5.0, 6.0, 7.0};
+
+    Vector<double, 3> result = hadamard(a, b);  // (2*5, 3*6, 4*7)
+
+    EXPECT_DOUBLE_EQ(result[0], 10.0);
+    EXPECT_DOUBLE_EQ(result[1], 18.0);
+    EXPECT_DOUBLE_EQ(result[2], 28.0);
+}
+
+TEST_F(VectorExprTest, HadamardDivisionExpression) {
+    Vector<double, 3> a{10.0, 18.0, 28.0};
+    Vector<double, 3> b{2.0, 3.0, 4.0};
+
+    Vector<double, 3> result = hadamard_div(a, b);  // (10/2, 18/3, 28/4)
+
+    EXPECT_DOUBLE_EQ(result[0], 5.0);
+    EXPECT_DOUBLE_EQ(result[1], 6.0);
+    EXPECT_DOUBLE_EQ(result[2], 7.0);
+}
+
+// =============================================================================
+// Dot Product and Norm Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, DotProductOfExpressions) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{2.0, 2.0, 2.0};
+
+    // (a + b) . c = (5, 7, 9) . (2, 2, 2) = 10 + 14 + 18; the first argument is unevaluated
+    double result = dot(a + b, c);
+
+    EXPECT_DOUBLE_EQ(result, 42.0);
+}
+
+TEST_F(VectorExprTest, NormOfExpression) {
+    Vector<double, 2> a{3.0, 0.0};
+    Vector<double, 2> b{0.0, 4.0};
+
+    double result = norm(a + b);  // norm is applied to the unevaluated sum (3, 4)
+
+    EXPECT_DOUBLE_EQ(result, 5.0);  // norm of (3,4) = 5
+}
+
+TEST_F(VectorExprTest, NormalizeExpression) {
+    Vector<double, 3> v{3.0, 0.0, 4.0};  // length sqrt(9 + 0 + 16) = 5
+
+    Vector<double, 3> result = normalize(v);  // expected v / 5
+
+    EXPECT_DOUBLE_EQ(result[0], 0.6);
+    EXPECT_DOUBLE_EQ(result[1], 0.0);
+    EXPECT_DOUBLE_EQ(result[2], 0.8);
+}
+
+// =============================================================================
+// Type Deduction Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, TypeDeductionCorrectness) {
+    Vector<float, 3> vf{1.0f, 2.0f, 3.0f};
+
+    // Same-type operands: indexing the expression must yield float, i.e. the
+    // element type is not silently promoted to double.
+    auto expr = vf + vf;  // float expression
+    using ExprType = decltype(expr[0]);
+    EXPECT_TRUE((std::is_same_v<ExprType, float>));
+
+    // Evaluating into a float Vector must give vf + vf = (2, 4, 6)
+    Vector<float, 3> result = expr;
+    EXPECT_FLOAT_EQ(result[0], 2.0f);
+    EXPECT_FLOAT_EQ(result[1], 4.0f);
+    EXPECT_FLOAT_EQ(result[2], 6.0f);
+}
+
+// =============================================================================
+// SFINAE and Compile-time Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SFINAEConstraints) {
+    // Test that VectorExpr operators only work with VectorExpr types
+    Vector<double, 3> v1{1.0, 2.0, 3.0};
+    Vector<double, 3> v2{4.0, 5.0, 6.0};
+
+    // This should compile, and the evaluated value is checked so `result` is not dead
+    auto expr = v1 + v2;
+    Vector<double, 3> result = expr;
+    EXPECT_DOUBLE_EQ(result[0], 5.0);  // 1 + 4
+    // Vector takes part in the operators by deriving from VectorExpr<Vector>
+    EXPECT_TRUE((std::is_base_of_v<VectorExpr<Vector<double, 3>>, Vector<double, 3>>));
+}
+
+// =============================================================================
+// Aliasing and Self-Assignment Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SelfAssignmentWithExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+
+    // a is both the destination and the left operand: expected (1+4, 2+5, 3+6)
+    a = a + b;
+
+    EXPECT_DOUBLE_EQ(a[0], 5.0);
+    EXPECT_DOUBLE_EQ(a[1], 7.0);
+    EXPECT_DOUBLE_EQ(a[2], 9.0);
+}
+
+TEST_F(VectorExprTest, AliasingInExpression) {
+    Vector<double, 3> a{2.0, 3.0, 4.0};
+    Vector<double, 3> b{1.0, 1.0, 1.0};
+
+    // a is the destination and the right operand: expected (1+2, 1+3, 1+4)
+    a = b + a;
+
+    EXPECT_DOUBLE_EQ(a[0], 3.0);
+    EXPECT_DOUBLE_EQ(a[1], 4.0);
+    EXPECT_DOUBLE_EQ(a[2], 5.0);
+}
+
+// =============================================================================
+// Edge Cases Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, SingleElementVector) {
+    Vector<double, 1> a{5.0};
+    Vector<double, 1> b{3.0};
+
+    Vector<double, 1> result = a + b - a * 0.5;  // 5 + 3 - 2.5 on a length-1 vector
+
+    EXPECT_DOUBLE_EQ(result[0], 5.5);
+}
+
+TEST_F(VectorExprTest, EmptyExpression) {
+    Vector<double, 3> v{1.0, 2.0, 3.0};
+
+    // v * 0.0 is a zero vector, so adding it must leave v unchanged
+    Vector<double, 3> result = v + v * 0.0;
+
+    EXPECT_DOUBLE_EQ(result[0], 1.0);
+    EXPECT_DOUBLE_EQ(result[1], 2.0);
+    EXPECT_DOUBLE_EQ(result[2], 3.0);
+}
+
+TEST_F(VectorExprTest, LargeVectorExpression) {
+    const size_t N = 100;  // constant-initialized, so usable as a template argument
+    Vector<double, N> a, b, c;
+
+    for (size_t i = 0; i < N; ++i) {  // a = i, b = 2i, c = 3i
+        a[i] = static_cast<double>(i);
+        b[i] = static_cast<double>(i * 2);
+        c[i] = static_cast<double>(i * 3);
+    }
+
+    Vector<double, N> result = a + b - c / 2.0;
+
+    for (size_t i = 0; i < N; ++i) {
+        EXPECT_DOUBLE_EQ(result[i], i + 2.0 * i - 1.5 * i);  // a + b - c/2 at index i
+    }
+}
+
+// =============================================================================
+// Complex Expression Pattern Tests
+// =============================================================================
+
+TEST_F(VectorExprTest, ComplexNestedExpression) {
+    Vector<double, 3> a{1.0, 2.0, 3.0};
+    Vector<double, 3> b{4.0, 5.0, 6.0};
+    Vector<double, 3> c{7.0, 8.0, 9.0};
+
+    // Combines scalar scaling, abs, sqrt, hadamard and scalar division in one expression
+    Vector<double, 3> result = 2.0 * abs(a - b) + sqrt(hadamard(c, c)) / 3.0;
+
+    // Expected values, derived step by step:
+    // |a - b| = |(-3, -3, -3)| = (3, 3, 3)
+    // 2 * (3, 3, 3) = (6, 6, 6)
+    // c * c = (49, 64, 81)
+    // sqrt(c * c) = (7, 8, 9)
+    // sqrt(c * c) / 3 = (7/3, 8/3, 3)
+    // result = (6 + 7/3, 6 + 8/3, 6 + 3) = (25/3, 26/3, 9)
+
+    EXPECT_TRUE(approx_equal(result[0], 25.0/3.0));
+    EXPECT_TRUE(approx_equal(result[1], 26.0/3.0));
+    EXPECT_DOUBLE_EQ(result[2], 9.0);
+}
+
+TEST_F(VectorExprTest, ChainedUnaryOperations) {
+    Vector<double, 4> v{-4.0, -9.0, -16.0, -25.0};
+
+    // -v = (4, 9, 16, 25); abs leaves it unchanged; sqrt gives (2, 3, 4, 5)
+    Vector<double, 4> result = sqrt(abs(-v));
+
+    EXPECT_DOUBLE_EQ(result[0], 2.0);
+    EXPECT_DOUBLE_EQ(result[1], 3.0);
+    EXPECT_DOUBLE_EQ(result[2], 4.0);
+    EXPECT_DOUBLE_EQ(result[3], 5.0);
+}

From dfdeead1edd2813a4a24bdba45fc13b883c6a919 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Fri, 5 Jun 2026 12:44:00 -0700
Subject: [PATCH 02/22] Update FSI HEX8 FE Basis reference results

Regenerate affected FSI and FSI-ustruct HEX8 result_005.vtu references for the FE Basis path with nonzero HEX8 Hessian contributions.

Update the pipe_3d PETSc and Trilinos references to match the base pipe_3d reference, preserving the existing shared-reference pattern across linear algebra variants.
---
 tests/cases/fsi/pipe_3d/result_005.vtu             | 4 ++--
 tests/cases/fsi/pipe_3d_petsc/result_005.vtu       | 4 ++--
 tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu | 4 ++--
 tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu | 4 ++--
 tests/cases/fsi/pipe_RCR_3d/result_005.vtu         | 4 ++--
 tests/cases/fsi_ustruct/pipe_3d/result_005.vtu     | 4 ++--
 tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/cases/fsi/pipe_3d/result_005.vtu b/tests/cases/fsi/pipe_3d/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_petsc/result_005.vtu b/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_petsc/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu b/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_trilinos_bj/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu b/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
index b78ea6500..a7ca69daf 100644
--- a/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
+++ b/tests/cases/fsi/pipe_3d_trilinos_ml/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ac116931be9b2a7d5024de8359f9ea09cae964e9bd34ba949f4bfb9312c8af
-size 210065
+oid sha256:b13d09a343a3fd8d033b0e3ecaf2cd94ce68e2ee8665144f7a53cca201db4266
+size 227356
diff --git a/tests/cases/fsi/pipe_RCR_3d/result_005.vtu b/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
index 79eaced8c..6945fd005 100644
--- a/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
+++ b/tests/cases/fsi/pipe_RCR_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f194a3c364de0bf1a6cc79ba542306469e151de36275a06564022730c3f2c84c
-size 209865
+oid sha256:25a08e99ae0163800e73ea54720557d742548fe75a0eb6b68461d8bdb366972f
+size 227320
diff --git a/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu b/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
index c838c9c3f..8b5f73c2a 100644
--- a/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
+++ b/tests/cases/fsi_ustruct/pipe_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:262ffb4d7b644280f15bb2e32c8e5fc5ddade7fa5cabd845c31fe3803e9ef0a0
-size 207864
+oid sha256:16f0f2b2ea6a133f54db03954e76ea7586b0fb56d36e2e350ccd21ebadaf4bfb
+size 228764
diff --git a/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu b/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
index e9e051d73..7d6c64d9b 100644
--- a/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
+++ b/tests/cases/fsi_ustruct/pipe_RCR_3d/result_005.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7dec176a56b610ed6b754f66e532a15ac1563b72c25198f49a0bc53adc6e4552
-size 207628
+oid sha256:5c00d715542a495f37a6ea1cd514cc654d3215360170a06c3af1440b71f7d093
+size 228708

From 8b47802fbcaf83ec07a4636de7ac6e6084db1364 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Sun, 7 Jun 2026 23:21:16 -0700
Subject: [PATCH 03/22] store Matrix/Vector expression operands by value to
 fix dangling references to temporaries (e.g. A + B)

---
 Code/Source/solver/FE/Math/MatrixExpr.h | 20 ++++++++++----------
 Code/Source/solver/FE/Math/VectorExpr.h | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index da2f8c8d6..097f35361 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -82,8 +82,8 @@ class MatrixExpr {
 template<typename LHS, typename RHS, typename Op>
 class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
     Op op_;
 
 public:
@@ -131,7 +131,7 @@ class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
 template<typename Expr, typename Op>
 class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Op op_;
 
 public:
@@ -178,7 +178,7 @@ class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
 template<typename Expr, typename Scalar>
 class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -225,7 +225,7 @@ class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
 template<typename Expr, typename Scalar>
 class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -274,8 +274,8 @@ class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>>
 template<typename LHS, typename RHS>
 class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
 
 public:
     /**
@@ -326,7 +326,7 @@ class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
 template<typename Expr>
 class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
 
 public:
     /**
@@ -370,7 +370,7 @@ class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
 template<typename VecExpr>
 class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
 private:
-    const VecExpr& vec_;
+    VecExpr vec_;
     std::size_t n_;
 
 public:
@@ -623,4 +623,4 @@ constexpr auto trace(const MatrixExpr<Expr>& expr) {
 } // namespace FE
 } // namespace svmp
 
-#endif // SVMP_FE_MATH_MATRIX_EXPR_H
\ No newline at end of file
+#endif // SVMP_FE_MATH_MATRIX_EXPR_H
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 8b9c8e382..627d2fd88 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -72,8 +72,8 @@ class VectorExpr {
 template<typename LHS, typename RHS, typename Op>
 class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
 private:
-    const LHS& lhs_;
-    const RHS& rhs_;
+    LHS lhs_;
+    RHS rhs_;
     Op op_;
 
 public:
@@ -112,7 +112,7 @@ class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
 template<typename Expr, typename Op>
 class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Op op_;
 
 public:
@@ -150,7 +150,7 @@ class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
 template<typename Expr, typename Scalar>
 class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -188,7 +188,7 @@ class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
 template<typename Expr, typename Scalar>
 class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
 private:
-    const Expr& expr_;
+    Expr expr_;
     Scalar scalar_;
 
 public:
@@ -415,4 +415,4 @@ constexpr auto normalize(const VectorExpr<Expr>& expr) {
 } // namespace FE
 } // namespace svmp
 
-#endif // SVMP_FE_MATH_VECTOR_EXPR_H
\ No newline at end of file
+#endif // SVMP_FE_MATH_VECTOR_EXPR_H

From 4d6baaa57f809a9ee4d6261069a84aec6efc9806 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 00:41:06 -0700
Subject: [PATCH 04/22] fixing fetch content for google tests

---
 Code/Source/solver/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index e42391862..4f317cf79 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -348,11 +348,11 @@ if(ENABLE_UNIT_TEST)
 
   # install Google Test
   #if(NOT TARGET gtest_main AND NOT TARGET gtest)
-  include(FetchContent)
   FetchContent_Declare(
-    googletest
-    URL https://github.com/google/googletest/archive/refs/heads/main.zip
-    DOWNLOAD_EXTRACT_TIMESTAMP TRUE
+          googletest
+          GIT_REPOSITORY https://github.com/google/googletest.git
+          GIT_TAG v1.17.0
+          DOWNLOAD_EXTRACT_TIMESTAMP TRUE
   )
   FetchContent_MakeAvailable(googletest)
   #endif()

From 81cad5461e14d754b42cd44b89d2abba10993d71 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 00:51:47 -0700
Subject: [PATCH 05/22] restore include(FetchContent) when unit tests are enabled

---
 Code/Source/solver/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index 4f317cf79..1adc6be78 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -345,7 +345,7 @@ if(ENABLE_UNIT_TEST)
 
   # link pthread on ubuntu20
   find_package(Threads REQUIRED)
-
+  include(FetchContent)
   # install Google Test
   #if(NOT TARGET gtest_main AND NOT TARGET gtest)
   FetchContent_Declare(

From 004e678ca0e830d5a54f93b2a468fe347bf0456c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 12:20:53 -0700
Subject: [PATCH 06/22] removing basis optimizations, caching, pyramid support,
 manual/static reference tables and related unit tests

---
 Code/Source/solver/CMakeLists.txt             |    8 +
 Code/Source/solver/FE/Basis/BasisCache.cpp    |  309 -
 Code/Source/solver/FE/Basis/BasisCache.h      |  456 -
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |   81 +-
 Code/Source/solver/FE/Basis/BasisFactory.h    |    6 -
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  262 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  367 +-
 Code/Source/solver/FE/Basis/BasisTolerance.h  |   52 -
 Code/Source/solver/FE/Basis/BasisTraits.h     |   55 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 8661 +----------------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  123 +-
 .../solver/FE/Basis/LagrangeBasisFast.h       | 1378 ---
 .../solver/FE/Basis/LagrangeBasisPyramid.cpp  | 2069 ----
 .../solver/FE/Basis/LagrangeBasisPyramid.h    |   67 -
 .../solver/FE/Basis/LagrangeBasisSimplex.cpp  | 2457 -----
 .../solver/FE/Basis/LagrangeBasisSimplex.h    |   78 -
 .../solver/FE/Basis/LagrangeBasisUtility.h    |   25 -
 .../FE/Basis/NodeOrderingConventions.cpp      |  580 +-
 .../solver/FE/Basis/NodeOrderingConventions.h |  508 +-
 .../solver/FE/Basis/PyramidModalBasis.h       |  265 -
 .../solver/FE/Basis/SerendipityBasis.cpp      |   74 +-
 .../Source/solver/FE/Basis/SerendipityBasis.h |    7 -
 Code/Source/solver/FE/Basis/VectorBasis.h     |  255 -
 .../FE/Basis/VectorBasisEvaluationHelpers.cpp |  593 --
 .../FE/Basis/VectorBasisEvaluationHelpers.h   |  751 --
 .../FE/Basis/VectorBasisModalPolynomial.h     |   77 -
 Code/Source/solver/FE/Common/Alignment.h      |   23 -
 Code/Source/solver/FE/Common/Types.h          |    9 +-
 Code/Source/solver/FE/Math/Matrix.h           |    2 +-
 Code/Source/solver/FE/Math/Vector.h           |    2 +-
 .../solver/FE/Quadrature/QuadratureRule.h     |  237 -
 Code/Source/solver/Timer.h                    |   21 +-
 Code/Source/solver/load_msh.cpp               |    2 -
 Code/Source/solver/utils.cpp                  |   14 +-
 .../eigen3/unsupported/Eigen/CXX11/Tensor     |    2 +
 .../FE/Basis/test_BasisCacheFactory.cpp       |  256 -
 .../FE/Basis/test_BasisErrorPaths.cpp         |   60 +-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |   32 -
 .../FE/Basis/test_ConstexprBasis.cpp          |  135 +-
 ...ePyramid.cpp => test_HigherOrderWedge.cpp} |   66 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp | 3198 +-----
 .../FE/Basis/test_SerendipityTensorModal.cpp  |   12 +-
 tests/unitTests/test_common.h                 |    3 +-
 43 files changed, 1060 insertions(+), 22578 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Basis/BasisCache.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/BasisCache.h
 delete mode 100644 Code/Source/solver/FE/Basis/BasisTolerance.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisFast.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
 delete mode 100644 Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
 delete mode 100644 Code/Source/solver/FE/Basis/PyramidModalBasis.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasis.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
 delete mode 100644 Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
 delete mode 100644 Code/Source/solver/FE/Common/Alignment.h
 delete mode 100644 Code/Source/solver/FE/Quadrature/QuadratureRule.h
 delete mode 100644 tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
 rename tests/unitTests/FE/Basis/{test_HigherOrderWedgePyramid.cpp => test_HigherOrderWedge.cpp} (64%)

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index 1adc6be78..bdebc4a52 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -355,6 +355,14 @@ if(ENABLE_UNIT_TEST)
           DOWNLOAD_EXTRACT_TIMESTAMP TRUE
   )
   FetchContent_MakeAvailable(googletest)
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_STANDARD GREATER_EQUAL 20)  # Clang + C++20 only
+    foreach(GTEST_TARGET gtest gtest_main gmock gmock_main)
+      if(TARGET ${GTEST_TARGET})  # skip targets this GoogleTest build does not define
+        set_target_properties(${GTEST_TARGET} PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS ON)  # gnu++17, one -std flag
+      endif()
+    endforeach()
+  endif()
   #endif()
 
   enable_testing()
diff --git a/Code/Source/solver/FE/Basis/BasisCache.cpp b/Code/Source/solver/FE/Basis/BasisCache.cpp
deleted file mode 100644
index 6d8a4ede3..000000000
--- a/Code/Source/solver/FE/Basis/BasisCache.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#include "BasisCache.h"
-#include <utility>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-namespace {
-
-QuadratureCacheKey make_quadrature_cache_key(const quadrature::QuadratureRule& quad) noexcept {
-    const auto fingerprint = quad.point_fingerprint();
-    return QuadratureCacheKey{fingerprint.dimension,
-                              fingerprint.num_points,
-                              fingerprint.points_hash_a,
-                              fingerprint.points_hash_b};
-}
-
-void mix_hash_word(std::uint64_t word,
-                   std::uint64_t& hash_a,
-                   std::uint64_t& hash_b) noexcept {
-    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
-    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
-}
-
-std::pair<std::uint64_t, std::uint64_t>
-identity_fingerprint(const std::string& identity) noexcept {
-    std::uint64_t hash_a = 0xa4093822299f31d0ULL;
-    std::uint64_t hash_b = 0x082efa98ec4e6c89ULL;
-    mix_hash_word(static_cast<std::uint64_t>(identity.size()), hash_a, hash_b);
-    for (const char c : identity) {
-        mix_hash_word(static_cast<std::uint64_t>(static_cast<unsigned char>(c)), hash_a, hash_b);
-    }
-    return {hash_a, hash_b};
-}
-
-BasisCacheKey make_basis_cache_key(const BasisFunction& basis,
-                                   const quadrature::QuadratureRule& quad,
-                                   bool gradients,
-                                   bool hessians) {
-    StructuralBasisKey structural_key{
-        basis.basis_type(),
-        basis.element_type(),
-        basis.dimension(),
-        basis.order(),
-        basis.size(),
-        basis.is_vector_valued(),
-        make_quadrature_cache_key(quad),
-        gradients,
-        hessians
-    };
-
-    BasisCacheKey key;
-    const bool uses_basis_identity = !basis.cache_identity_is_structural();
-    if (!uses_basis_identity) {
-        key.value = structural_key;
-        return key;
-    }
-
-    std::vector<std::uint64_t> basis_identity_words;
-    const bool uses_structured_identity = basis.cache_identity_words(basis_identity_words);
-    if (!uses_structured_identity) {
-        basis_identity_words.clear();
-    }
-    const std::string basis_identity =
-        uses_structured_identity ? std::string{} : basis.cache_identity();
-    BasisIdentityFingerprint cached_identity_hash{};
-    const bool has_cached_identity_hash =
-        uses_structured_identity &&
-        basis.cache_identity_fingerprint(cached_identity_hash.hash_a,
-                                         cached_identity_hash.hash_b);
-    const auto identity_hash = uses_structured_identity
-        ? has_cached_identity_hash
-              ? std::pair<std::uint64_t, std::uint64_t>{
-                    cached_identity_hash.hash_a,
-                    cached_identity_hash.hash_b}
-              : [&basis_identity_words] {
-                    const auto fingerprint =
-                        compute_basis_identity_fingerprint(basis_identity_words);
-                    return std::pair<std::uint64_t, std::uint64_t>{
-                        fingerprint.hash_a,
-                        fingerprint.hash_b};
-                }()
-        : identity_fingerprint(basis_identity);
-    key.value = ParameterizedBasisKey{
-        structural_key,
-        uses_structured_identity,
-        identity_hash.first,
-        identity_hash.second,
-        std::move(basis_identity_words),
-        basis_identity
-    };
-    return key;
-}
-
-} // namespace
-
-BasisCache& BasisCache::instance() {
-    static BasisCache cache;
-    return cache;
-}
-
-const BasisCacheEntry& BasisCache::get_or_compute(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return *get_or_compute_shared(basis, quad, gradients, hessians);
-}
-
-std::shared_ptr<const BasisCacheEntry> BasisCache::get_or_compute_shared(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    const BasisCacheKey key = make_basis_cache_key(basis, quad, gradients, hessians);
-
-    // Warm path: shared (reader) lock allows concurrent cache hits.
-    {
-        std::shared_lock<std::shared_mutex> read_lock(mutex_);
-        auto it = slots_.find(key);
-        if (it != slots_.end() && it->second.entry) {
-            return it->second.entry;
-        }
-    }
-
-    std::shared_ptr<InFlightComputation> in_flight;
-    bool owner = false;
-    {
-        std::unique_lock<std::shared_mutex> write_lock(mutex_);
-        auto& slot = slots_[key];
-        if (slot.entry) {
-            return slot.entry;
-        }
-
-        if (!slot.pending) {
-            in_flight = std::make_shared<InFlightComputation>();
-            slot.pending = in_flight;
-            owner = true;
-        } else {
-            in_flight = slot.pending;
-        }
-    }
-
-    if (!owner) {
-        std::unique_lock<std::mutex> wait_lock(in_flight->mutex);
-        in_flight->ready_cv.wait(wait_lock, [&in_flight] { return in_flight->ready; });
-        if (in_flight->exception) {
-            std::rethrow_exception(in_flight->exception);
-        }
-        return in_flight->entry;
-    }
-
-    try {
-        auto entry = std::make_shared<BasisCacheEntry>(compute(basis, quad, gradients, hessians));
-        {
-            std::unique_lock<std::shared_mutex> write_lock(mutex_);
-            auto slot_it = slots_.find(key);
-            if (slot_it == slots_.end()) {
-                slot_it = slots_.emplace(key, CacheSlot{}).first;
-            }
-            auto& slot = slot_it->second;
-            if (slot.entry) {
-                entry = slot.entry;
-            } else {
-                slot.entry = entry;
-            }
-            if (slot.pending == in_flight) {
-                slot.pending.reset();
-            }
-        }
-        {
-            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
-            in_flight->entry = entry;
-            in_flight->ready = true;
-        }
-        in_flight->ready_cv.notify_all();
-        return entry;
-    } catch (...) {
-        {
-            std::lock_guard<std::mutex> ready_lock(in_flight->mutex);
-            in_flight->exception = std::current_exception();
-            in_flight->ready = true;
-        }
-        {
-            std::unique_lock<std::shared_mutex> write_lock(mutex_);
-            auto slot_it = slots_.find(key);
-            if (slot_it != slots_.end() && slot_it->second.pending == in_flight) {
-                slot_it->second.pending.reset();
-                if (!slot_it->second.entry) {
-                    slots_.erase(slot_it);
-                }
-            }
-        }
-        in_flight->ready_cv.notify_all();
-        throw;
-    }
-}
-
-const BasisCacheEntry& BasisCache::prewarm(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return get_or_compute(basis, quad, gradients, hessians);
-}
-
-BasisCacheHandle BasisCache::prewarm_handle(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) {
-    return BasisCacheHandle(get_or_compute_shared(basis, quad, gradients, hessians));
-}
-
-BasisCacheEntry BasisCache::compute_uncached(
-    const BasisFunction& basis,
-    const quadrature::QuadratureRule& quad,
-    bool gradients,
-    bool hessians) const {
-    return compute(basis, quad, gradients, hessians);
-}
-
-void BasisCache::clear() {
-    std::unique_lock<std::shared_mutex> lock(mutex_);
-    for (auto it = slots_.begin(); it != slots_.end();) {
-        if (it->second.pending) {
-            it->second.entry.reset();
-            ++it;
-        } else {
-            it = slots_.erase(it);
-        }
-    }
-}
-
-std::size_t BasisCache::size() const {
-    std::shared_lock<std::shared_mutex> lock(mutex_);
-    std::size_t completed = 0;
-    for (const auto& [key, slot] : slots_) {
-        (void)key;
-        if (slot.entry) {
-            ++completed;
-        }
-    }
-    return completed;
-}
-
-BasisCacheEntry BasisCache::compute(const BasisFunction& basis,
-                                    const quadrature::QuadratureRule& quad,
-                                    bool gradients,
-                                    bool hessians) const {
-    BasisCacheEntry entry;
-    const auto& points = quad.points();
-    entry.num_qpts = points.size();
-    entry.num_dofs = basis.size();
-
-    const bool vector_basis = basis.is_vector_valued();
-    if (!vector_basis) {
-        entry.scalar_values.assign(entry.num_dofs * entry.num_qpts, Real(0));
-        if (gradients) {
-            entry.gradients.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        }
-        if (hessians) {
-            entry.hessians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
-        }
-    } else {
-        entry.vector_values_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        if (gradients && basis.supports_vector_jacobians()) {
-            entry.vector_jacobians.assign(entry.num_dofs * 9u * entry.num_qpts, Real(0));
-        }
-        if (gradients && basis.supports_curl()) {
-            entry.vector_curls_xyz.assign(entry.num_dofs * 3u * entry.num_qpts, Real(0));
-        }
-        if (gradients && basis.supports_divergence()) {
-            entry.vector_divergence.assign(entry.num_dofs * entry.num_qpts, Real(0));
-        }
-    }
-
-    if (vector_basis) {
-        if (entry.num_dofs > 0 && entry.num_qpts > 0) {
-            basis.evaluate_vector_at_quadrature_points(
-                points,
-                entry.vector_values_xyz.data(),
-                entry.vector_jacobians.empty() ? nullptr : entry.vector_jacobians.data(),
-                entry.vector_curls_xyz.empty() ? nullptr : entry.vector_curls_xyz.data(),
-                entry.vector_divergence.empty() ? nullptr : entry.vector_divergence.data());
-        }
-        return entry;
-    }
-
-    if (entry.num_dofs > 0 && entry.num_qpts > 0) {
-        basis.fill_scalar_cache_entry(points,
-                                      entry.num_qpts,
-                                      entry.scalar_values.data(),
-                                      gradients ? entry.gradients.data() : nullptr,
-                                      hessians ? entry.hessians.data() : nullptr);
-    }
-
-    return entry;
-}
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisCache.h b/Code/Source/solver/FE/Basis/BasisCache.h
deleted file mode 100644
index a84c0e87a..000000000
--- a/Code/Source/solver/FE/Basis/BasisCache.h
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_BASISCACHE_H
-#define SVMP_FE_BASIS_BASISCACHE_H
-
-/**
- * @file BasisCache.h
- * @brief Cache for basis evaluations at quadrature points
- */
-
-#include "BasisFunction.h"
-#include "Quadrature/QuadratureRule.h"
-#include <cstddef>
-#include <condition_variable>
-#include <exception>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <cstdint>
-#include <shared_mutex>
-#include <span>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <utility>
-#include <variant>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-struct QuadratureCacheKey {
-    int dimension{0};
-    std::size_t num_points{0};
-    // Quadrature coordinates are intentionally fingerprinted from their exact
-    // Real bit patterns. Values such as -0.0 and +0.0 therefore produce
-    // distinct cache keys unless a future API explicitly normalizes them. The
-    // key intentionally ignores weights and rule class because basis values only
-    // depend on reference coordinates; bit-identical point sets share entries.
-    std::uint64_t points_hash_a{0};
-    std::uint64_t points_hash_b{0};
-
-    bool operator==(const QuadratureCacheKey& other) const noexcept {
-        return dimension == other.dimension &&
-               num_points == other.num_points &&
-               points_hash_a == other.points_hash_a &&
-               points_hash_b == other.points_hash_b;
-    }
-};
-
-struct StructuralBasisKey {
-    BasisType basis_type{BasisType::Custom};
-    ElementType element_type{ElementType::Unknown};
-    int dimension{0};
-    int order{0};
-    std::size_t num_dofs{0};
-    bool vector_valued{false};
-    QuadratureCacheKey quadrature;
-    bool with_gradients{false};
-    bool with_hessians{false};
-
-    bool operator==(const StructuralBasisKey& other) const noexcept {
-        return basis_type == other.basis_type &&
-               element_type == other.element_type &&
-               dimension == other.dimension &&
-               order == other.order &&
-               num_dofs == other.num_dofs &&
-               vector_valued == other.vector_valued &&
-               quadrature == other.quadrature &&
-               with_gradients == other.with_gradients &&
-               with_hessians == other.with_hessians;
-    }
-};
-
-struct ParameterizedBasisKey {
-    StructuralBasisKey structural;
-    bool uses_structured_identity{false};
-    std::uint64_t identity_hash_a{0};
-    std::uint64_t identity_hash_b{0};
-    std::vector<std::uint64_t> basis_identity_words;
-    std::string basis_identity;
-
-    bool operator==(const ParameterizedBasisKey& other) const noexcept {
-        return structural == other.structural &&
-               uses_structured_identity == other.uses_structured_identity &&
-               identity_hash_a == other.identity_hash_a &&
-               identity_hash_b == other.identity_hash_b &&
-               basis_identity_words == other.basis_identity_words &&
-               basis_identity == other.basis_identity;
-    }
-};
-
-struct BasisCacheKey {
-    std::variant<StructuralBasisKey, ParameterizedBasisKey> value;
-
-    bool operator==(const BasisCacheKey& other) const noexcept {
-        return value == other.value;
-    }
-};
-
-struct BasisCacheKeyHash {
-    std::size_t operator()(const BasisCacheKey& key) const noexcept {
-        std::size_t seed = 0;
-        auto combine = [&seed](std::size_t value) noexcept {
-            seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
-        };
-
-        auto hash_structural = [&](const StructuralBasisKey& structural) noexcept {
-            combine(std::hash<int>()(structural.quadrature.dimension));
-            combine(std::hash<std::size_t>()(structural.quadrature.num_points));
-            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_a));
-            combine(std::hash<std::uint64_t>()(structural.quadrature.points_hash_b));
-            combine(std::hash<int>()(static_cast<int>(structural.basis_type)));
-            combine(std::hash<int>()(static_cast<int>(structural.element_type)));
-            combine(std::hash<int>()(structural.dimension));
-            combine(std::hash<int>()(structural.order));
-            combine(std::hash<std::size_t>()(structural.num_dofs));
-            unsigned flags = 0u;
-            flags |= structural.vector_valued ? 1u : 0u;
-            flags |= structural.with_gradients ? 2u : 0u;
-            flags |= structural.with_hessians ? 4u : 0u;
-            combine(std::hash<unsigned>()(flags));
-        };
-
-        std::visit([&](const auto& active_key) {
-            using ActiveKey = std::decay_t<decltype(active_key)>;
-            if constexpr (std::is_same_v<ActiveKey, StructuralBasisKey>) {
-                combine(0x5354525543544b45ULL);
-                hash_structural(active_key);
-            } else {
-                combine(0x504152414d4b4559ULL);
-                hash_structural(active_key.structural);
-                combine(active_key.uses_structured_identity ? 1u : 0u);
-                combine(std::hash<std::uint64_t>()(active_key.identity_hash_a));
-                combine(std::hash<std::uint64_t>()(active_key.identity_hash_b));
-            }
-        }, key.value);
-        return seed;
-    }
-};
-
-struct BasisCacheEntry {
-    std::size_t num_qpts{0};
-    std::size_t num_dofs{0};
-    // Scalar basis values in dof-major SoA layout: [dof * num_qpts + qp].
-    std::vector<Real> scalar_values;
-    // Scalar reference gradients in dof/component/qpt SoA layout:
-    // [(dof * 3 + component) * num_qpts + qp].
-    std::vector<Real> gradients;
-    // Scalar reference Hessians in dof/component/qpt SoA layout:
-    // [(dof * 9 + row * 3 + col) * num_qpts + qp].
-    std::vector<Real> hessians;
-
-    // Vector basis values in dof/component/qpt SoA layout:
-    // [(dof * 3 + component) * num_qpts + qp].
-    std::vector<Real> vector_values_xyz;
-    // Vector basis reference Jacobians in dof/component/derivative/qpt layout:
-    // [(dof * 9 + component * 3 + derivative) * num_qpts + qp].
-    std::vector<Real> vector_jacobians;
-    // Vector basis curls in dof/component/qpt SoA layout.
-    std::vector<Real> vector_curls_xyz;
-    // Vector basis divergences in dof/qpt SoA layout.
-    std::vector<Real> vector_divergence;
-
-    // The object-returning accessors below are convenience helpers for tests,
-    // diagnostics, and occasional scalar use. Hot loops should prefer the SoA
-    // span accessors so they do not reconstruct Gradient, Hessian, or matrix
-    // objects per DOF and quadrature point.
-
-    [[nodiscard]] Real scalarValue(std::size_t dof, std::size_t qp) const noexcept {
-        return scalar_values[dof * num_qpts + qp];
-    }
-
-    [[nodiscard]] std::span<const Real> scalarValuesForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(scalar_values.data() + dof * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] Real gradientValue(std::size_t dof,
-                                     std::size_t component,
-                                     std::size_t qp) const noexcept {
-        return gradients[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] Gradient gradientVector(std::size_t dof, std::size_t qp) const noexcept {
-        Gradient out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = gradientValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> gradientsForDofComponent(std::size_t dof,
-                                                                  std::size_t component) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(gradients.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> gradientsForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(gradients.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real hessianValue(std::size_t dof,
-                                    std::size_t row,
-                                    std::size_t col,
-                                    std::size_t qp) const noexcept {
-        return hessians[(dof * 9u + row * 3u + col) * num_qpts + qp];
-    }
-
-    [[nodiscard]] Hessian hessianMatrix(std::size_t dof, std::size_t qp) const noexcept {
-        Hessian out{};
-        for (std::size_t row = 0; row < 3u; ++row) {
-            for (std::size_t col = 0; col < 3u; ++col) {
-                out(row, col) = hessianValue(dof, row, col, qp);
-            }
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> hessiansForDofComponent(std::size_t dof,
-                                                                 std::size_t row,
-                                                                 std::size_t col) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(hessians.data() + (dof * 9u + row * 3u + col) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> hessiansForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(hessians.data() + dof * 9u * num_qpts, 9u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorValue(std::size_t dof,
-                                   std::size_t component,
-                                   std::size_t qp) const noexcept {
-        return vector_values_xyz[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] math::Vector<Real, 3> vectorValue(std::size_t dof,
-                                                     std::size_t qp) const noexcept {
-        math::Vector<Real, 3> out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = vectorValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorValuesForDofComponent(std::size_t dof,
-                                                                     std::size_t component) const noexcept {
-        if (num_qpts == 0) return {};
-        return std::span<const Real>(vector_values_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorValuesForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_values_xyz.empty()) return {};
-        return std::span<const Real>(vector_values_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorJacobianValue(std::size_t dof,
-                                           std::size_t component,
-                                           std::size_t derivative,
-                                           std::size_t qp) const noexcept {
-        return vector_jacobians[(dof * 9u + component * 3u + derivative) * num_qpts + qp];
-    }
-
-    [[nodiscard]] VectorJacobian vectorJacobianMatrix(std::size_t dof,
-                                                       std::size_t qp) const noexcept {
-        VectorJacobian out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
-                out(component, derivative) =
-                    vectorJacobianValue(dof, component, derivative, qp);
-            }
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorJacobiansForDofComponentDerivative(
-        std::size_t dof,
-        std::size_t component,
-        std::size_t derivative) const noexcept {
-        if (num_qpts == 0 || vector_jacobians.empty()) return {};
-        return std::span<const Real>(
-            vector_jacobians.data() + (dof * 9u + component * 3u + derivative) * num_qpts,
-            num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorJacobiansForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_jacobians.empty()) return {};
-        return std::span<const Real>(vector_jacobians.data() + dof * 9u * num_qpts, 9u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorCurlValue(std::size_t dof,
-                                       std::size_t component,
-                                       std::size_t qp) const noexcept {
-        return vector_curls_xyz[(dof * 3u + component) * num_qpts + qp];
-    }
-
-    [[nodiscard]] math::Vector<Real, 3> vectorCurl(std::size_t dof,
-                                                    std::size_t qp) const noexcept {
-        math::Vector<Real, 3> out{};
-        for (std::size_t component = 0; component < 3u; ++component) {
-            out[component] = vectorCurlValue(dof, component, qp);
-        }
-        return out;
-    }
-
-    [[nodiscard]] std::span<const Real> vectorCurlsForDofComponent(std::size_t dof,
-                                                                    std::size_t component) const noexcept {
-        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
-        return std::span<const Real>(vector_curls_xyz.data() + (dof * 3u + component) * num_qpts, num_qpts);
-    }
-
-    [[nodiscard]] std::span<const Real> vectorCurlsForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_curls_xyz.empty()) return {};
-        return std::span<const Real>(vector_curls_xyz.data() + dof * 3u * num_qpts, 3u * num_qpts);
-    }
-
-    [[nodiscard]] Real vectorDivergenceValue(std::size_t dof,
-                                             std::size_t qp) const noexcept {
-        return vector_divergence[dof * num_qpts + qp];
-    }
-
-    [[nodiscard]] std::span<const Real> vectorDivergenceForDof(std::size_t dof) const noexcept {
-        if (num_qpts == 0 || vector_divergence.empty()) return {};
-        return std::span<const Real>(vector_divergence.data() + dof * num_qpts, num_qpts);
-    }
-};
-
-class BasisCacheHandle {
-public:
-    BasisCacheHandle() = default;
-
-    [[nodiscard]] const BasisCacheEntry& entry() const {
-        BASIS_CHECK_CONFIG(entry_ != nullptr,
-                           "BasisCacheHandle: attempted to access an empty handle");
-        return *entry_;
-    }
-
-    [[nodiscard]] bool valid() const noexcept { return entry_ != nullptr; }
-    explicit operator bool() const noexcept { return valid(); }
-
-private:
-    friend class BasisCache;
-
-    explicit BasisCacheHandle(std::shared_ptr<const BasisCacheEntry> entry)
-        : entry_(std::move(entry)) {}
-
-    std::shared_ptr<const BasisCacheEntry> entry_;
-};
-
-class BasisCache {
-public:
-    static BasisCache& instance();
-
-    const BasisCacheEntry& get_or_compute(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Compute an entry without consulting, publishing to, or waiting on
-     * the shared cache.
-     */
-    BasisCacheEntry compute_uncached(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false) const;
-
-    /**
-     * @brief Eagerly populate the cache for the given (basis, quadrature) key
-     *
-     * Pays the compute cost up front so that subsequent get_or_compute calls
-     * for the same key hit the warm-cache path immediately. Equivalent to
-     * calling get_or_compute and discarding the return value.
-     *
-     * Returns the inserted (or pre-existing) entry for convenience.
-     */
-    const BasisCacheEntry& prewarm(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Eagerly populate the cache and return a hot-loop handle.
-     *
-     * The returned handle owns a shared reference to the completed entry. Access
-     * through BasisCacheHandle::entry() performs no key construction, hashing,
-     * map lookup, or cache mutex acquisition. Calling clear() removes the entry
-     * from the global lookup map but does not invalidate existing handles.
-     */
-    BasisCacheHandle prewarm_handle(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients = true,
-        bool hessians = false);
-
-    /**
-     * @brief Remove completed cache entries.
-     *
-     * This is a soft clear: computations that were already in flight before
-     * clear() was called are allowed to publish their completed entry afterward.
-     * This preserves the returned-reference lifetime contract for concurrent
-     * get_or_compute() callers while still dropping all entries that had already
-     * completed at the time of the call.
-     */
-    void clear();
-    std::size_t size() const;
-
-private:
-    struct InFlightComputation {
-        std::mutex mutex;
-        std::condition_variable ready_cv;
-        bool ready{false};
-        std::shared_ptr<BasisCacheEntry> entry;
-        std::exception_ptr exception;
-    };
-
-    struct CacheSlot {
-        std::shared_ptr<BasisCacheEntry> entry;
-        std::shared_ptr<InFlightComputation> pending;
-    };
-
-    BasisCache() = default;
-    BasisCache(const BasisCache&) = delete;
-    BasisCache& operator=(const BasisCache&) = delete;
-
-    BasisCacheEntry compute(const BasisFunction& basis,
-                            const quadrature::QuadratureRule& quad,
-                            bool gradients,
-                            bool hessians) const;
-
-    std::shared_ptr<const BasisCacheEntry> get_or_compute_shared(
-        const BasisFunction& basis,
-        const quadrature::QuadratureRule& quad,
-        bool gradients,
-        bool hessians);
-
-    mutable std::shared_mutex mutex_;
-    std::unordered_map<BasisCacheKey, CacheSlot, BasisCacheKeyHash> slots_;
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_BASISCACHE_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index dddbd4c5c..9f0867959 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -10,29 +10,12 @@
 #include "LagrangeBasis.h"
 #include "SerendipityBasis.h"
 
-#include <mutex>
-#include <unordered_map>
-#include <utility>
-
 namespace svmp {
 namespace FE {
 namespace basis {
 
 namespace {
 
-using CustomRegistryMap =
-    std::unordered_map<std::string, basis_factory::CustomFactory>;
-
-CustomRegistryMap& custom_registry() {
-    static CustomRegistryMap registry;
-    return registry;
-}
-
-std::mutex& custom_registry_mutex() {
-    static std::mutex mutex;
-    return mutex;
-}
-
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
@@ -50,12 +33,12 @@ int require_basis_order(const BasisRequest& req,
 void require_scalar_c0_request(const BasisRequest& req) {
     if (req.field_type != FieldType::Scalar) {
         throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases currently support scalar fields only",
+            "BasisFactory: Lagrange/Serendipity bases support scalar fields only",
             __FILE__, __LINE__, __func__);
     }
     if (req.continuity != Continuity::C0) {
         throw BasisConfigurationException(
-            "BasisFactory: migrated Lagrange/Serendipity scope supports C0 continuity only",
+            "BasisFactory: Lagrange/Serendipity bases support C0 continuity only",
             __FILE__, __LINE__, __func__);
     }
 }
@@ -78,36 +61,6 @@ std::shared_ptr<BasisFunction> create_serendipity(const BasisRequest& req) {
     return std::make_shared<SerendipityBasis>(req.element_type, order);
 }
 
-std::shared_ptr<BasisFunction> create_custom(const BasisRequest& req) {
-    if (req.custom_id.empty()) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom basis requests require custom_id",
-            __FILE__, __LINE__, __func__);
-    }
-
-    basis_factory::CustomFactory factory;
-    {
-        std::lock_guard<std::mutex> lock(custom_registry_mutex());
-        const auto it = custom_registry().find(req.custom_id);
-        if (it == custom_registry().end()) {
-            throw BasisConfigurationException(
-                "BasisFactory: no custom basis factory registered for id '" +
-                    req.custom_id + "'",
-                __FILE__, __LINE__, __func__);
-        }
-        factory = it->second;
-    }
-
-    auto basis = factory(req);
-    if (!basis) {
-        throw BasisConstructionException(
-            "BasisFactory: custom factory returned null basis for id '" +
-                req.custom_id + "'",
-            __FILE__, __LINE__, __func__);
-    }
-    return basis;
-}
-
 } // namespace
 
 namespace basis_factory {
@@ -118,41 +71,13 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
             return create_lagrange(req);
         case BasisType::Serendipity:
             return create_serendipity(req);
-        case BasisType::Custom:
-            return create_custom(req);
         default:
             throw BasisConfigurationException(
-                "BasisFactory: requested basis family is outside the migrated Lagrange/Serendipity scope",
+                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope",
                 __FILE__, __LINE__, __func__);
     }
 }
 
-void register_custom(std::string custom_id, CustomFactory factory) {
-    if (custom_id.empty()) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom factory id must not be empty",
-            __FILE__, __LINE__, __func__);
-    }
-    if (!factory) {
-        throw BasisConfigurationException(
-            "BasisFactory: custom factory must be callable",
-            __FILE__, __LINE__, __func__);
-    }
-
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry()[std::move(custom_id)] = std::move(factory);
-}
-
-void unregister_custom(const std::string& custom_id) {
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry().erase(custom_id);
-}
-
-void clear_custom_registry_for_tests() {
-    std::lock_guard<std::mutex> lock(custom_registry_mutex());
-    custom_registry().clear();
-}
-
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index cedf1ba6d..c937dd4a0 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -14,7 +14,6 @@
  */
 
 #include "BasisFunction.h"
-#include <functional>
 #include <memory>
 #include <optional>
 #include <string>
@@ -41,12 +40,7 @@ struct BasisRequest {
 
 namespace basis_factory {
 
-using CustomFactory = std::function<std::shared_ptr<BasisFunction>(const BasisRequest&)>;
-
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
-void register_custom(std::string custom_id, CustomFactory factory);
-void unregister_custom(const std::string& custom_id);
-void clear_custom_registry_for_tests();
 
 } // namespace basis_factory
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 49c8d8763..2a1d4f6b0 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -6,11 +6,8 @@
  */
 
 #include "BasisFunction.h"
-#include "VectorBasisEvaluationHelpers.h"
+
 #include <algorithm>
-#include <iomanip>
-#include <limits>
-#include <sstream>
 
 namespace svmp {
 namespace FE {
@@ -19,81 +16,26 @@ namespace basis {
 namespace {
 
 struct BasisFunctionScratch {
-    std::vector<Real> scalar_values;
-    std::vector<Gradient> scalar_gradients;
-    std::vector<Hessian> scalar_hessians;
-    std::vector<math::Vector<Real, 3>> vector_values;
-    std::vector<VectorJacobian> vector_jacobians;
-    std::vector<math::Vector<Real, 3>> vector_curls;
-    std::vector<Real> vector_divergences;
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
 
     void prewarm(std::size_t max_size) {
-        scalar_values.reserve(max_size);
-        scalar_gradients.reserve(max_size);
-        scalar_hessians.reserve(max_size);
-        vector_values.reserve(max_size);
-        vector_jacobians.reserve(max_size);
-        vector_curls.reserve(max_size);
-        vector_divergences.reserve(max_size);
+        values.reserve(max_size);
+        gradients.reserve(max_size);
+        hessians.reserve(max_size);
     }
 };
 
-BasisFunctionScratch& basis_function_scratch() {
-    // Scratch is intentionally thread-local: production assembly uses a
-    // persistent worker-thread team, so buffers stay warm on each worker.
-    static thread_local BasisFunctionScratch scratch;
-    return scratch;
-}
-
-void mix_identity_hash_word(std::uint64_t word,
-                            std::uint64_t& hash_a,
-                            std::uint64_t& hash_b) noexcept {
-    hash_a ^= word + 0x9e3779b97f4a7c15ULL + (hash_a << 6u) + (hash_a >> 2u);
-    hash_b ^= (word + 0xbf58476d1ce4e5b9ULL) + (hash_b << 7u) + (hash_b >> 3u);
+BasisFunctionScratch& scratch() {
+    static thread_local BasisFunctionScratch data;
+    return data;
 }
 
 } // namespace
 
-BasisIdentityFingerprint
-compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept {
-    BasisIdentityFingerprint fingerprint{0x243f6a8885a308d3ULL,
-                                         0x13198a2e03707344ULL};
-    mix_identity_hash_word(static_cast<std::uint64_t>(words.size()),
-                           fingerprint.hash_a,
-                           fingerprint.hash_b);
-    for (const auto word : words) {
-        mix_identity_hash_word(word, fingerprint.hash_a, fingerprint.hash_b);
-    }
-    return fingerprint;
-}
-
-std::string BasisFunction::cache_identity() const {
-    std::ostringstream oss;
-    oss << "basis=" << static_cast<int>(basis_type())
-        << "|elem=" << static_cast<int>(element_type())
-        << "|dim=" << dimension()
-        << "|order=" << order()
-        << "|size=" << size()
-        << "|vector=" << is_vector_valued();
-    return oss.str();
-}
-
-bool BasisFunction::cache_identity_words(std::vector<std::uint64_t>& words) const {
-    (void)words;
-    return false;
-}
-
-bool BasisFunction::cache_identity_fingerprint(std::uint64_t& hash_a,
-                                               std::uint64_t& hash_b) const {
-    (void)hash_a;
-    (void)hash_b;
-    return false;
-}
-
-void prewarm_basis_function_scratch(std::size_t max_size,
-                                    std::size_t max_qpts) {
-    (void)max_qpts;
-    basis_function_scratch().prewarm(max_size);
+void prewarm_basis_function_scratch(std::size_t max_size) {
+    scratch().prewarm(max_size);
 }
 
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -123,7 +65,7 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT values_out) const {
-    auto& tmp = basis_function_scratch().scalar_values;
+    auto& tmp = scratch().values;
     tmp.resize(size());
     evaluate_values(xi, tmp);
     std::copy_n(tmp.data(), tmp.size(), values_out);
@@ -131,7 +73,7 @@ void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                           Real* SVMP_RESTRICT gradients_out) const {
-    auto& tmp = basis_function_scratch().scalar_gradients;
+    auto& tmp = scratch().gradients;
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
@@ -143,7 +85,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                          Real* SVMP_RESTRICT hessians_out) const {
-    auto& tmp = basis_function_scratch().scalar_hessians;
+    auto& tmp = scratch().hessians;
     tmp.resize(size());
     evaluate_hessians(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
@@ -151,165 +93,6 @@ void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
     }
 }
 
-void BasisFunction::evaluate_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(
-        points, points.size(), values_out, gradients_out, hessians_out);
-}
-
-void BasisFunction::evaluate_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException(
-            "BasisFunction strided evaluation requires output_stride >= points.size()",
-            __FILE__, __LINE__, __func__);
-    }
-
-    auto& scratch = basis_function_scratch();
-    auto& v_tmp = scratch.scalar_values;
-    auto& g_tmp = scratch.scalar_gradients;
-    auto& h_tmp = scratch.scalar_hessians;
-    if (values_out) v_tmp.resize(num_dofs);
-    if (gradients_out) g_tmp.resize(num_dofs);
-    if (hessians_out) h_tmp.resize(num_dofs);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out && gradients_out && hessians_out) {
-            evaluate_all(points[q], v_tmp, g_tmp, h_tmp);
-        } else {
-            if (values_out) evaluate_values(points[q], v_tmp);
-            if (gradients_out) evaluate_gradients(points[q], g_tmp);
-            if (hessians_out) evaluate_hessians(points[q], h_tmp);
-        }
-
-        if (values_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                values_out[dof * output_stride + q] = v_tmp[dof];
-            }
-        }
-        if (gradients_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    gradients_out[(dof * 3u + component) * output_stride + q] =
-                        g_tmp[dof][component];
-                }
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-                store_hessian_strided(
-                    h_tmp[dof], hessians_out + dof * 9u * output_stride, output_stride, q);
-            }
-        }
-    }
-}
-
-void BasisFunction::fill_scalar_cache_entry(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(
-        points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-void BasisFunction::evaluate_vector_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) const {
-    evaluate_vector_at_quadrature_points_strided(
-        points, points.size(), values_out, jacobians_out, curls_out, divergence_out);
-}
-
-void BasisFunction::evaluate_vector_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    detail::vector_common::validate_vector_strided_outputs(
-        num_qpts, output_stride, "BasisFunction");
-
-    auto& scratch = basis_function_scratch();
-    auto& v_tmp = scratch.vector_values;
-    auto& j_tmp = scratch.vector_jacobians;
-    auto& c_tmp = scratch.vector_curls;
-    auto& d_tmp = scratch.vector_divergences;
-    if (values_out) v_tmp.resize(num_dofs);
-    if (jacobians_out) j_tmp.resize(num_dofs);
-    if (curls_out) c_tmp.resize(num_dofs);
-    if (divergence_out) d_tmp.resize(num_dofs);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out) {
-            evaluate_vector_values(points[q], v_tmp);
-            detail::vector_common::write_vector_values_strided(
-                v_tmp, num_dofs, output_stride, q, values_out);
-        }
-
-        if (jacobians_out) {
-            evaluate_vector_jacobians(points[q], j_tmp);
-            detail::vector_common::write_vector_jacobians_strided(
-                j_tmp, num_dofs, output_stride, q, jacobians_out);
-        }
-
-        if (curls_out) {
-            evaluate_curl(points[q], c_tmp);
-            detail::vector_common::write_vector_curl_strided(
-                c_tmp, num_dofs, output_stride, q, curls_out);
-        }
-
-        if (divergence_out) {
-            evaluate_divergence(points[q], d_tmp);
-            detail::vector_common::write_vector_divergence_strided(
-                d_tmp, num_dofs, output_stride, q, divergence_out);
-        }
-    }
-}
-
-void BasisFunction::evaluate_vector_values(
-    const math::Vector<Real, 3>&,
-    std::vector<math::Vector<Real, 3>>&) const {
-    throw BasisEvaluationException("Vector-valued evaluation requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_vector_jacobians(
-    const math::Vector<Real, 3>&,
-    std::vector<VectorJacobian>&) const {
-    throw BasisEvaluationException("Vector-basis Jacobian evaluation requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_divergence(
-    const math::Vector<Real, 3>&,
-    std::vector<Real>&) const {
-    throw BasisEvaluationException("Divergence requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void BasisFunction::evaluate_curl(
-    const math::Vector<Real, 3>&,
-    std::vector<math::Vector<Real, 3>>&) const {
-    throw BasisEvaluationException("Curl requested on scalar basis",
-                                   __FILE__, __LINE__, __func__);
-}
-
 void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients,
                                        Real eps) const {
@@ -320,11 +103,12 @@ void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
         math::Vector<Real, 3> backward = xi;
-        const std::size_t idx = static_cast<std::size_t>(d);
+        const auto idx = static_cast<std::size_t>(d);
         forward[idx] += eps;
         backward[idx] -= eps;
 
-        std::vector<Real> fwd, bwd;
+        std::vector<Real> fwd;
+        std::vector<Real> bwd;
         evaluate_values(forward, fwd);
         evaluate_values(backward, bwd);
 
@@ -344,18 +128,20 @@ void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
         math::Vector<Real, 3> backward = xi;
-        const std::size_t col = static_cast<std::size_t>(d);
+        const auto col = static_cast<std::size_t>(d);
         forward[col] += eps;
         backward[col] -= eps;
 
-        std::vector<Gradient> g_forward, g_backward;
+        std::vector<Gradient> g_forward;
+        std::vector<Gradient> g_backward;
         evaluate_gradients(forward, g_forward);
         evaluate_gradients(backward, g_backward);
 
         for (std::size_t i = 0; i < base_grad.size(); ++i) {
             for (int k = 0; k < dimension(); ++k) {
-                const std::size_t row = static_cast<std::size_t>(k);
-                hessians[i](row, col) = (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
+                const auto row = static_cast<std::size_t>(k);
+                hessians[i](row, col) =
+                    (g_forward[i][row] - g_backward[i][row]) / (Real(2) * eps);
             }
         }
     }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index ee38a5b19..dbabf7061 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -8,24 +8,12 @@
 #ifndef SVMP_FE_BASIS_BASISFUNCTION_H
 #define SVMP_FE_BASIS_BASISFUNCTION_H
 
-/**
- * @file BasisFunction.h
- * @brief Abstract interface for basis function evaluation on reference elements
- *
- * The Basis module operates purely on reference elements and is independent of
- * mesh-specific data structures. Implementations may leverage Math and
- * Quadrature utilities but must not read mesh connectivity or geometry.
- */
-
-#include "Types.h"
 #include "BasisExceptions.h"
-#include "Math/Vector.h"
 #include "Math/Matrix.h"
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <span>
-#include <string>
+#include "Math/Vector.h"
+#include "Types.h"
+
+#include <cstddef>
 #include <vector>
 
 namespace svmp {
@@ -34,18 +22,8 @@ namespace basis {
 
 using Gradient = math::Vector<Real, 3>;
 using Hessian  = math::Matrix<Real, 3, 3>;
-using VectorJacobian = math::Matrix<Real, 3, 3>;
-
-struct BasisIdentityFingerprint {
-    std::uint64_t hash_a{0};
-    std::uint64_t hash_b{0};
-};
-
-[[nodiscard]] BasisIdentityFingerprint
-compute_basis_identity_fingerprint(std::span<const std::uint64_t> words) noexcept;
 
-void prewarm_basis_function_scratch(std::size_t max_size,
-                                    std::size_t max_qpts = 0);
+void prewarm_basis_function_scratch(std::size_t max_size);
 
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
                                                     Real yy,
@@ -57,363 +35,80 @@ void prewarm_basis_function_scratch(std::size_t max_size,
     hessian(0, 0) = xx;
     hessian(1, 1) = yy;
     hessian(2, 2) = zz;
-    hessian(0, 1) = xy;
-    hessian(1, 0) = xy;
-    hessian(0, 2) = xz;
-    hessian(2, 0) = xz;
-    hessian(1, 2) = yz;
-    hessian(2, 1) = yz;
+    hessian(0, 1) = hessian(1, 0) = xy;
+    hessian(0, 2) = hessian(2, 0) = xz;
+    hessian(1, 2) = hessian(2, 1) = yz;
     return hessian;
 }
 
-// Raw Hessian buffers use row-major 3x3 blocks:
-// dst[row * 3 + col] = H(row, col).
 inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
-    dst[0u] = hessian(0u, 0u);
-    dst[1u] = hessian(0u, 1u);
-    dst[2u] = hessian(0u, 2u);
-    dst[3u] = hessian(1u, 0u);
-    dst[4u] = hessian(1u, 1u);
-    dst[5u] = hessian(1u, 2u);
-    dst[6u] = hessian(2u, 0u);
-    dst[7u] = hessian(2u, 1u);
-    dst[8u] = hessian(2u, 2u);
-}
-
-inline void store_hessian_strided(const Hessian& hessian,
-                                  Real* dst,
-                                  std::size_t stride,
-                                  std::size_t offset) noexcept {
-    dst[0u * stride + offset] = hessian(0u, 0u);
-    dst[1u * stride + offset] = hessian(0u, 1u);
-    dst[2u * stride + offset] = hessian(0u, 2u);
-    dst[3u * stride + offset] = hessian(1u, 0u);
-    dst[4u * stride + offset] = hessian(1u, 1u);
-    dst[5u * stride + offset] = hessian(1u, 2u);
-    dst[6u * stride + offset] = hessian(2u, 0u);
-    dst[7u * stride + offset] = hessian(2u, 1u);
-    dst[8u * stride + offset] = hessian(2u, 2u);
-}
-
-inline void scatter_hessian_components_strided(const Real* src,
-                                               Real* dst,
-                                               std::size_t stride,
-                                               std::size_t offset) noexcept {
-    dst[0u * stride + offset] = src[0u];
-    dst[1u * stride + offset] = src[1u];
-    dst[2u * stride + offset] = src[2u];
-    dst[3u * stride + offset] = src[3u];
-    dst[4u * stride + offset] = src[4u];
-    dst[5u * stride + offset] = src[5u];
-    dst[6u * stride + offset] = src[6u];
-    dst[7u * stride + offset] = src[7u];
-    dst[8u * stride + offset] = src[8u];
+    dst[0] = hessian(0, 0);
+    dst[1] = hessian(0, 1);
+    dst[2] = hessian(0, 2);
+    dst[3] = hessian(1, 0);
+    dst[4] = hessian(1, 1);
+    dst[5] = hessian(1, 2);
+    dst[6] = hessian(2, 0);
+    dst[7] = hessian(2, 1);
+    dst[8] = hessian(2, 2);
 }
 
 [[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
     Hessian hessian{};
-    hessian(0u, 0u) = src[0u];
-    hessian(0u, 1u) = src[1u];
-    hessian(0u, 2u) = src[2u];
-    hessian(1u, 0u) = src[3u];
-    hessian(1u, 1u) = src[4u];
-    hessian(1u, 2u) = src[5u];
-    hessian(2u, 0u) = src[6u];
-    hessian(2u, 1u) = src[7u];
-    hessian(2u, 2u) = src[8u];
+    hessian(0, 0) = src[0];
+    hessian(0, 1) = src[1];
+    hessian(0, 2) = src[2];
+    hessian(1, 0) = src[3];
+    hessian(1, 1) = src[4];
+    hessian(1, 2) = src[5];
+    hessian(2, 0) = src[6];
+    hessian(2, 1) = src[7];
+    hessian(2, 2) = src[8];
     return hessian;
 }
 
 inline void add_scaled_hessian(Hessian& target,
                                const Hessian& source,
                                Real scale) noexcept {
-    target(0u, 0u) += scale * source(0u, 0u);
-    target(0u, 1u) += scale * source(0u, 1u);
-    target(0u, 2u) += scale * source(0u, 2u);
-    target(1u, 0u) += scale * source(1u, 0u);
-    target(1u, 1u) += scale * source(1u, 1u);
-    target(1u, 2u) += scale * source(1u, 2u);
-    target(2u, 0u) += scale * source(2u, 0u);
-    target(2u, 1u) += scale * source(2u, 1u);
-    target(2u, 2u) += scale * source(2u, 2u);
+    for (std::size_t r = 0; r < 3u; ++r) {
+        for (std::size_t c = 0; c < 3u; ++c) {
+            target(r, c) += scale * source(r, c);
+        }
+    }
 }
 
-/**
- * @brief Base interface for scalar and vector-valued basis families
- *
- * All basis implementations operate in reference space. Physical mappings are
- * handled by the Geometry module. Derivatives are returned with unused
- * components set to zero for lower dimensional elements.
- */
 class BasisFunction {
 public:
     virtual ~BasisFunction() = default;
 
-    /// Basis family identifier
     virtual BasisType basis_type() const noexcept = 0;
-
-    /// Underlying element type on the reference domain
     virtual ElementType element_type() const noexcept = 0;
-
-    /// Reference dimensionality (1, 2, or 3)
     virtual int dimension() const noexcept = 0;
-
-    /// Polynomial order (modal/nodal definition dependent)
     virtual int order() const noexcept = 0;
-
-    /// Number of basis functions (scalar or vector-valued)
     virtual std::size_t size() const noexcept = 0;
 
-    /**
-     * @brief Whether BasisCache can key this basis from common structural fields.
-     *
-     * Return true only when basis_type/element_type/dimension/order/size and
-     * vector-valued status fully determine evaluation behavior. Parameterized
-     * bases such as splines and custom user bases should keep the default false
-     * so BasisCache includes cache_identity() in the key.
-     */
-    virtual bool cache_identity_is_structural() const noexcept { return false; }
-
-    /// Whether the basis is vector-valued (H(div)/H(curl))
-    virtual bool is_vector_valued() const noexcept { return false; }
-
-    /// Whether vector-valued basis Jacobians are available.
-    virtual bool supports_vector_jacobians() const noexcept { return false; }
-
-    /// Whether vector-valued basis curls are available.
-    virtual bool supports_curl() const noexcept { return false; }
-
-    /// Whether vector-valued basis divergences are available.
-    virtual bool supports_divergence() const noexcept { return false; }
-
-    /**
-     * @brief Stable semantic identity used by BasisCache
-     *
-     * Derived classes should override this when evaluation depends on
-     * additional state beyond basis family / element / order metadata.
-     */
-    virtual std::string cache_identity() const;
-
-    /**
-     * @brief Optional exact structured identity payload for BasisCache keys.
-     *
-     * Parameterized bases may append stable integer/bit-pattern words and
-     * return true to let BasisCache avoid using cache_identity() as the exact
-     * key payload. The human-readable cache_identity() remains available for
-     * diagnostics and for custom bases that do not implement this path.
-     */
-    virtual bool cache_identity_words(std::vector<std::uint64_t>& words) const;
-
-    /**
-     * @brief Optional cached fingerprint for structured identity words.
-     *
-     * Implementations that precompute cache_identity_words() may also cache the
-     * corresponding fingerprint. BasisCache still retains exact identity words
-     * for equality after hash matches.
-     */
-    virtual bool cache_identity_fingerprint(std::uint64_t& hash_a,
-                                            std::uint64_t& hash_b) const;
-
-    /**
-     * @brief Evaluate scalar basis values at a reference point
-     * @param xi Reference coordinates (unused entries are ignored)
-     * @param[out] values Output array resized to size()
-     */
     virtual void evaluate_values(const math::Vector<Real, 3>& xi,
                                  std::vector<Real>& values) const = 0;
-
-    /**
-     * @brief Evaluate gradients of scalar basis functions
-     *
-     * Production bases must override this with analytic derivatives.
-     * Use numerical_gradient explicitly in tests or diagnostics when a finite
-     * difference approximation is intended.
-     */
     virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
-
-    /**
-     * @brief Evaluate Hessians of scalar basis functions
-     *
-     * Production bases must override this with analytic second derivatives.
-     * Use numerical_hessian explicitly in tests or diagnostics when a finite
-     * difference approximation is intended.
-     */
     virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
-
-    /**
-     * @brief Fused evaluation of values, gradients, and Hessians at one point
-     *
-     * Default implementation calls evaluate_values, evaluate_gradients, and
-     * evaluate_hessians in sequence. Bases that share intermediate
-     * computations (e.g., LagrangeBasis sharing per-axis 1D evaluations)
-     * should override this to avoid redundant work.
-     */
     virtual void evaluate_all(const math::Vector<Real, 3>& xi,
                               std::vector<Real>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
-    /**
-     * @brief Fill SoA buffers with basis evaluations at all quadrature points
-     *
-     * Outputs are written directly to caller-provided strided buffers in
-     * DOF-major SoA layout — no scratch+transpose required by the caller.
-     * Pass `nullptr` for any output that is not needed.
-     *
-     *   values_out:    size num_dofs * num_qpts; element [d * num_qpts + q]
-     *   gradients_out: size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   hessians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + r*3 + c) * num_qpts + q]
-     *
-     * Non-null output ranges must not overlap each other. Implementations may
-     * fill requested quantities in any order that is efficient for the basis.
-     *
-     * Default implementation calls evaluate_all (or evaluate_values/gradients/
-     * hessians as appropriate) per QP, materializing into temp buffers then
-     * scatter-writing to the output. Performance-sensitive bases must override
-     * this path so batched assembly does not fall back to Q virtual point
-     * evaluations. Unit coverage keeps an explicit list of hot bases that are
-     * expected to provide a direct strided implementation.
-     */
-    virtual void evaluate_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill strided SoA buffers with basis evaluations at quadrature points
-     *
-     * Same component layout as evaluate_at_quadrature_points, but each
-     * dof/component row advances by `output_stride` rather than `points.size()`.
-     * This lets padded SIMD cache storage be filled directly. Non-null output
-     * ranges have the same non-overlap requirement.
-     */
-    virtual void evaluate_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill zero-initialized scalar cache storage.
-     *
-     * BasisCache allocates and zero-initializes its scalar SoA buffers before
-     * calling this hook. The default implementation overwrites all requested
-     * entries through the public strided evaluator. Sparse-support bases may
-     * override this and write only active entries, relying on the caller's
-     * zero-initialization for inactive DOFs and unused derivative components.
-     */
-    virtual void fill_scalar_cache_entry(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const;
-
-    /**
-     * @brief Fill SoA buffers with vector-basis evaluations at all quadrature points
-     *
-     * Outputs are written in DOF-major SoA layout. Pass `nullptr` for any
-     * quantity that is not needed.
-     *
-     *   values_out:     size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   jacobians_out:  size num_dofs * 9 * num_qpts; element [(d*9 + c*3 + r) * num_qpts + q]
-     *   curls_out:      size num_dofs * 3 * num_qpts; element [(d*3 + c) * num_qpts + q]
-     *   divergence_out: size num_dofs * num_qpts; element [d * num_qpts + q]
-     *
-     * Non-null output ranges must not overlap each other. Implementations may
-     * fill requested quantities in any order that is efficient for the basis.
-     */
-    virtual void evaluate_vector_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const;
-
-    /**
-     * @brief Fill strided SoA buffers with vector-basis evaluations
-     *
-     * Same component layout as evaluate_vector_at_quadrature_points, but each
-     * dof/component row advances by `output_stride` rather than `points.size()`.
-     * Non-null output ranges have the same non-overlap requirement.
-     *
-     * The base fallback loops over quadrature points through virtual point
-     * evaluation. H(div)/H(curl) bases used in assembly should override this
-     * method directly, and tests track the current hot vector families.
-     */
-    virtual void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const;
-
-    /**
-     * @brief Evaluate scalar basis values into a caller-provided raw buffer
-     *
-     * Caller is responsible for providing a buffer of at least size() Real
-     * entries. This avoids the per-call std::vector::resize() cost of the
-     * vector-output overload. Default implementation forwards through a temp
-     * vector; bases should override for direct write.
-     */
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
                                     Real* SVMP_RESTRICT values_out) const;
-
-    /**
-     * @brief Evaluate gradients into a flat caller-provided buffer
-     *
-     * Layout: gradients_out[i * 3 + c] = component c of gradient of basis i.
-     * Caller provides a buffer of size() * 3 Real entries.
-     */
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT gradients_out) const;
-
-    /**
-     * @brief Evaluate Hessians into a flat caller-provided buffer
-     *
-     * Layout: hessians_out[i * 9 + r * 3 + c] = H_i(r, c).
-     */
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                       Real* SVMP_RESTRICT hessians_out) const;
 
-    /**
-     * @brief Evaluate vector-valued basis functions (H(div)/H(curl))
-     *
-     * Default implementation throws; vector bases must override.
-     */
-    virtual void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                        std::vector<math::Vector<Real, 3>>& values) const;
-
-    /**
-     * @brief Evaluate reference-space Jacobians of vector-valued basis functions
-     *
-     * The returned matrix for basis function `i` has entries
-     * `jacobians[i](component, derivative_direction) = d phi_i_component / d xi_direction`.
-     * Unused rows/columns are zero-filled for lower-dimensional elements.
-     */
-    virtual void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                           std::vector<VectorJacobian>& jacobians) const;
-
-    /// Evaluate divergence of vector-valued basis functions (if applicable)
-    virtual void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>& divergence) const;
-
-    /// Evaluate curl of vector-valued basis functions (if applicable)
-    virtual void evaluate_curl(const math::Vector<Real, 3>& xi,
-                               std::vector<math::Vector<Real, 3>>& curl) const;
-
 protected:
-    /// Finite-difference helper for gradients of scalar bases
     void numerical_gradient(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients,
                             Real eps = Real(1e-6)) const;
-
-    /// Finite-difference helper for Hessians of scalar bases
     void numerical_hessian(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians,
                            Real eps = Real(1e-5)) const;
diff --git a/Code/Source/solver/FE/Basis/BasisTolerance.h b/Code/Source/solver/FE/Basis/BasisTolerance.h
deleted file mode 100644
index 423551f09..000000000
--- a/Code/Source/solver/FE/Basis/BasisTolerance.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_BASISTOLERANCE_H
-#define SVMP_FE_BASIS_BASISTOLERANCE_H
-
-#include "Types.h"
-
-#include <limits>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
-    return value < Real(0) ? -value : value;
-}
-
-[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
-    return lhs < rhs ? rhs : lhs;
-}
-
-[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
-                                                    Real multiplier = Real(64)) noexcept {
-    return multiplier * std::numeric_limits<Real>::epsilon() *
-           basis_max(Real(1), basis_abs(scale));
-}
-
-[[nodiscard]] constexpr bool basis_near_zero(Real value,
-                                             Real scale = Real(1),
-                                             Real multiplier = Real(64)) noexcept {
-    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
-                                                Real b,
-                                                Real multiplier = Real(64)) noexcept {
-    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
-    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_BASISTOLERANCE_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index 835dfe705..d97b59f1f 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -11,6 +11,7 @@
 #include "Types.h"
 
 #include <cstddef>
+#include <limits>
 
 namespace svmp {
 namespace FE {
@@ -25,9 +26,39 @@ enum class BasisTopology {
     Tetrahedron,
     Hexahedron,
     Wedge,
-    Pyramid,
 };
 
+namespace detail {
+
+[[nodiscard]] constexpr Real basis_abs(Real value) noexcept {
+    return value < Real(0) ? -value : value;
+}
+
+[[nodiscard]] constexpr Real basis_max(Real lhs, Real rhs) noexcept {
+    return lhs < rhs ? rhs : lhs;
+}
+
+[[nodiscard]] constexpr Real basis_scaled_tolerance(Real scale = Real(1),
+                                                    Real multiplier = Real(64)) noexcept {
+    return multiplier * std::numeric_limits<Real>::epsilon() *
+           basis_max(Real(1), basis_abs(scale));
+}
+
+[[nodiscard]] constexpr bool basis_near_zero(Real value,
+                                             Real scale = Real(1),
+                                             Real multiplier = Real(64)) noexcept {
+    return basis_abs(value) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+[[nodiscard]] constexpr bool basis_nearly_equal(Real a,
+                                                Real b,
+                                                Real multiplier = Real(64)) noexcept {
+    const Real scale = basis_max(Real(1), basis_max(basis_abs(a), basis_abs(b)));
+    return basis_abs(a - b) <= basis_scaled_tolerance(scale, multiplier);
+}
+
+} // namespace detail
+
 [[nodiscard]] constexpr bool is_point(ElementType type) noexcept {
     return type == ElementType::Point1;
 }
@@ -60,8 +91,8 @@ enum class BasisTopology {
 }
 
 [[nodiscard]] constexpr bool is_pyramid(ElementType type) noexcept {
-    return type == ElementType::Pyramid5 || type == ElementType::Pyramid13 ||
-           type == ElementType::Pyramid14;
+    (void)type;
+    return false;
 }
 
 [[nodiscard]] constexpr bool is_simplex(ElementType type) noexcept {
@@ -98,9 +129,6 @@ enum class BasisTopology {
     if (is_wedge(type)) {
         return BasisTopology::Wedge;
     }
-    if (is_pyramid(type)) {
-        return BasisTopology::Pyramid;
-    }
     return BasisTopology::Unknown;
 }
 
@@ -124,9 +152,6 @@ enum class BasisTopology {
         case ElementType::Wedge6:
         case ElementType::Wedge18:
             return ElementType::Wedge6;
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return ElementType::Pyramid5;
         default:
             return type;
     }
@@ -140,7 +165,6 @@ enum class BasisTopology {
         case ElementType::Tetra4:
         case ElementType::Hex8:
         case ElementType::Wedge6:
-        case ElementType::Pyramid5:
             return 1;
         case ElementType::Line3:
         case ElementType::Triangle6:
@@ -148,7 +172,6 @@ enum class BasisTopology {
         case ElementType::Tetra10:
         case ElementType::Hex27:
         case ElementType::Wedge18:
-        case ElementType::Pyramid14:
             return 2;
         default:
             return -1;
@@ -179,14 +202,6 @@ enum class BasisTopology {
     return triangle_lagrange_size(order) * line_lagrange_size(order);
 }
 
-[[nodiscard]] constexpr std::size_t pyramid_lagrange_size(int order) noexcept {
-    if (order < 0) {
-        return 0u;
-    }
-    const std::size_t p = static_cast<std::size_t>(order);
-    return (p + 1u) * (p + 2u) * (2u * p + 3u) / 6u;
-}
-
 [[nodiscard]] constexpr std::size_t complete_lagrange_alias_size(ElementType type) noexcept {
     const int order = complete_lagrange_alias_order(type);
     switch (canonical_lagrange_type(type)) {
@@ -204,8 +219,6 @@ enum class BasisTopology {
             return hex_lagrange_size(order);
         case ElementType::Wedge6:
             return wedge_lagrange_size(order);
-        case ElementType::Pyramid5:
-            return pyramid_lagrange_size(order);
         default:
             return 0u;
     }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 63b947516..7516d514a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -6,16 +6,11 @@
  */
 
 #include "LagrangeBasis.h"
-#include "BasisTraits.h"
-#include "BasisTolerance.h"
-#include "LagrangeBasisFast.h"
 #include "NodeOrderingConventions.h"
-#include "LagrangeBasisPyramid.h"
-#include "LagrangeBasisSimplex.h"
-#include "LagrangeBasisUtility.h"
+
 #include <algorithm>
+#include <array>
 #include <cmath>
-#include <unordered_map>
 
 namespace svmp {
 namespace FE {
@@ -23,8299 +18,597 @@ namespace basis {
 
 namespace {
 
-using LagrangeTopology = BasisTopology;
-
-#if defined(_MSC_VER)
-#define SVMP_LAGRANGE_NOINLINE __declspec(noinline)
-#define SVMP_LAGRANGE_ALIGN64
-#elif defined(__GNUC__) || defined(__clang__)
-#define SVMP_LAGRANGE_NOINLINE __attribute__((noinline))
-#define SVMP_LAGRANGE_ALIGN64 __attribute__((aligned(64)))
-#else
-#define SVMP_LAGRANGE_NOINLINE
-#define SVMP_LAGRANGE_ALIGN64
-#endif
+using Vec3 = math::Vector<Real, 3>;
 
-#ifndef FE_ALWAYS_INLINE
-#if defined(_MSC_VER)
-#define FE_ALWAYS_INLINE __forceinline
-#elif defined(__GNUC__) || defined(__clang__)
-#define FE_ALWAYS_INLINE __attribute__((always_inline)) inline
-#else
-#define FE_ALWAYS_INLINE inline
-#endif
-#endif
-
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out);
+inline constexpr Real equispaced_pm_one_coord(int i, int order) {
+    if (order <= 0) {
+        return Real(0);
+    }
+    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
+}
 
-struct LagrangeTopologyTraits {
-    LagrangeTopology topology;
-    int dimension;
+struct AxisEval {
+    std::vector<Real> value;
+    std::vector<Real> first;
+    std::vector<Real> second;
 };
 
-struct SimplexExponentHash {
-    std::size_t operator()(const std::array<int, 4>& exponents) const noexcept {
-        std::size_t seed = 0x9e3779b97f4a7c15ull;
-        for (const int exponent : exponents) {
-            const auto value = static_cast<std::size_t>(exponent);
-            seed ^= value + 0x9e3779b97f4a7c15ull + (seed << 6u) + (seed >> 2u);
-        }
-        return seed;
-    }
+struct SimplexEval {
+    std::vector<Real> value;
+    std::vector<Gradient> gradient;
+    std::vector<Hessian> hessian;
 };
 
-template<typename T, std::size_t N>
-void assign_array(std::vector<T>& out, const std::array<T, N>& values) {
-    out.assign(values.begin(), values.end());
-}
-
-bool coordinate_matches_expected(Real coord, Real expected) noexcept {
-    return detail::basis_nearly_equal(coord, expected);
-}
+struct NormalizedLagrangeRequest {
+    ElementType element_type;
+    int order;
+};
 
-template<typename FastBasis>
-void evaluate_fast_outputs(const math::Vector<Real, 3>& xi,
-                           std::vector<Real>* values,
-                           std::vector<Gradient>* gradients,
-                           std::vector<Hessian>* hessians) {
-    if (values != nullptr) {
-        std::array<Real, FastBasis::n_dofs> fast_values{};
-        FastBasis::evaluate(xi, fast_values);
-        assign_array(*values, fast_values);
-    }
-    if (gradients != nullptr) {
-        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-        FastBasis::evaluate_gradients(xi, fast_gradients);
-        assign_array(*gradients, fast_gradients);
-    }
-    if (hessians != nullptr) {
-        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-        FastBasis::evaluate_hessians(xi, fast_hessians);
-        assign_array(*hessians, fast_hessians);
+BasisTopology supported_lagrange_topology(ElementType type) {
+    const BasisTopology top = topology(type);
+    if (top == BasisTopology::Unknown) {
+        throw BasisElementCompatibilityException("LagrangeBasis: unsupported element type",
+                                                __FILE__, __LINE__, __func__);
     }
+    return top;
 }
 
-template<typename FastBasis>
-void evaluate_fast_outputs_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT values_out,
-                              Real* SVMP_RESTRICT gradients_out,
-                              Real* SVMP_RESTRICT hessians_out) {
-    if (values_out != nullptr) {
-        std::array<Real, FastBasis::n_dofs> fast_values{};
-        FastBasis::evaluate(xi, fast_values);
-        for (std::size_t i = 0; i < fast_values.size(); ++i) {
-            values_out[i] = fast_values[i];
-        }
-    }
-    if (gradients_out != nullptr) {
-        std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-        FastBasis::evaluate_gradients(xi, fast_gradients);
-        for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-            gradients_out[i * 3u + 0u] = fast_gradients[i][0];
-            gradients_out[i * 3u + 1u] = fast_gradients[i][1];
-            gradients_out[i * 3u + 2u] = fast_gradients[i][2];
-        }
-    }
-    if (hessians_out != nullptr) {
-        std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-        FastBasis::evaluate_hessians(xi, fast_hessians);
-        for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-            store_hessian(fast_hessians[i], hessians_out + i * 9u);
-        }
+NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
+    switch (element_type) {
+        case ElementType::Line3:
+            return {ElementType::Line2, std::max(order, 2)};
+        case ElementType::Triangle6:
+            return {ElementType::Triangle3, std::max(order, 2)};
+        case ElementType::Quad9:
+            return {ElementType::Quad4, std::max(order, 2)};
+        case ElementType::Tetra10:
+            return {ElementType::Tetra4, std::max(order, 2)};
+        case ElementType::Hex27:
+            return {ElementType::Hex8, std::max(order, 2)};
+        case ElementType::Wedge18:
+            return {ElementType::Wedge6, std::max(order, 2)};
+        case ElementType::Quad8:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Hex20:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Wedge15:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis",
+                __FILE__, __LINE__, __func__);
+        case ElementType::Pyramid5:
+        case ElementType::Pyramid13:
+        case ElementType::Pyramid14:
+            throw BasisElementCompatibilityException(
+                "LagrangeBasis: pyramid support has been removed from the current solver basis scope",
+                __FILE__, __LINE__, __func__);
+        default:
+            return {element_type, order};
     }
 }
 
-template<typename FastBasis>
-void evaluate_fast_outputs_strided(const std::vector<math::Vector<Real, 3>>& points,
-                                   std::size_t output_stride,
-                                   Real* SVMP_RESTRICT values_out,
-                                   Real* SVMP_RESTRICT gradients_out,
-                                   Real* SVMP_RESTRICT hessians_out) {
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        if (values_out != nullptr) {
-            std::array<Real, FastBasis::n_dofs> fast_values{};
-            FastBasis::evaluate(xi, fast_values);
-            for (std::size_t i = 0; i < fast_values.size(); ++i) {
-                values_out[i * output_stride + q] = fast_values[i];
-            }
-        }
-        if (gradients_out != nullptr) {
-            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-            FastBasis::evaluate_gradients(xi, fast_gradients);
-            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-                Real* g = gradients_out + i * 3u * output_stride;
-                g[0u * output_stride + q] = fast_gradients[i][0];
-                g[1u * output_stride + q] = fast_gradients[i][1];
-                g[2u * output_stride + q] = fast_gradients[i][2];
-            }
-        }
-        if (hessians_out != nullptr) {
-            std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-            FastBasis::evaluate_hessians(xi, fast_hessians);
-            for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-                const Hessian& hessian = fast_hessians[i];
-                Real* H = hessians_out + i * 9u * output_stride;
-                H[0u * output_stride + q] = hessian(0, 0);
-                H[1u * output_stride + q] = hessian(0, 1);
-                H[2u * output_stride + q] = hessian(0, 2);
-                H[3u * output_stride + q] = hessian(1, 0);
-                H[4u * output_stride + q] = hessian(1, 1);
-                H[5u * output_stride + q] = hessian(1, 2);
-                H[6u * output_stride + q] = hessian(2, 0);
-                H[7u * output_stride + q] = hessian(2, 1);
-                H[8u * output_stride + q] = hessian(2, 2);
-            }
-        }
+std::size_t axis_index_pm_one(Real coord, int order) {
+    if (order <= 0) {
+        return 0u;
     }
+    const Real scaled = (coord + Real(1)) * Real(order) / Real(2);
+    return static_cast<std::size_t>(std::llround(scaled));
 }
 
-template<int Order>
-bool evaluate_fixed_lagrange_fast_order(LagrangeTopology topology,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs<LagrangeLineFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs<LagrangeQuadFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs<LagrangeHexFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs<LagrangeTriFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs<LagrangeTetFast<Order>>(xi, values, gradients, hessians);
-            return true;
-        default:
-            return false;
+int simplex_lattice_index(Real value, int order) {
+    if (order <= 0) {
+        return 0;
     }
+    return static_cast<int>(std::llround(value * Real(order)));
 }
 
-template<int Order>
-bool evaluate_fixed_lagrange_fast_to_order(LagrangeTopology topology,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs_to<LagrangeLineFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs_to<LagrangeQuadFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs_to<LagrangeHexFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs_to<LagrangeTriFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs_to<LagrangeTetFast<Order>>(xi, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
+LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
+                                                           BasisTopology top,
+                                                           int order) {
+    LagrangeBasis::SimplexExponent e{0, 0, 0, 0};
+    if (order <= 0) {
+        return e;
     }
-}
-
-template<int Order>
-bool evaluate_fixed_lagrange_fast_strided_order(
-    LagrangeTopology topology,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (topology) {
-        case LagrangeTopology::Line:
-            evaluate_fast_outputs_strided<LagrangeLineFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Quadrilateral:
-            evaluate_fast_outputs_strided<LagrangeQuadFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Hexahedron:
-            evaluate_fast_outputs_strided<LagrangeHexFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Triangle:
-            evaluate_fast_outputs_strided<LagrangeTriFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case LagrangeTopology::Tetrahedron:
-            evaluate_fast_outputs_strided<LagrangeTetFast<Order>>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
+    if (top == BasisTopology::Triangle) {
+        e[1] = simplex_lattice_index(p[0], order);
+        e[2] = simplex_lattice_index(p[1], order);
+        e[0] = order - e[1] - e[2];
+    } else {
+        e[1] = simplex_lattice_index(p[0], order);
+        e[2] = simplex_lattice_index(p[1], order);
+        e[3] = simplex_lattice_index(p[2], order);
+        e[0] = order - e[1] - e[2] - e[3];
     }
+    return e;
 }
 
-void evaluate_triangle_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-
-    if (points.size() == 4u && output_stride == 4u) {
-        Real p10[4];
-        Real p11[4];
-        Real p12[4];
-        Real p20[4];
-        Real p21[4];
-        Real p22[4];
-        Real p30[4];
-        Real p31[4];
-        Real p32[4];
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-
-            p10[q] = Real(3) * l0;
-            p11[q] = Real(3) * l1;
-            p12[q] = Real(3) * l2;
-            p20[q] = Real(0.5) * p10[q] * (p10[q] - Real(1));
-            p21[q] = Real(0.5) * p11[q] * (p11[q] - Real(1));
-            p22[q] = Real(0.5) * p12[q] * (p12[q] - Real(1));
-            p30[q] = (p10[q] * (p10[q] - Real(1)) * (p10[q] - Real(2))) / Real(6);
-            p31[q] = (p11[q] * (p11[q] - Real(1)) * (p11[q] - Real(2))) / Real(6);
-            p32[q] = (p12[q] * (p12[q] - Real(1)) * (p12[q] - Real(2))) / Real(6);
-        }
+void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
+    const std::size_t n = nodes.size();
+    out.value.assign(n, Real(0));
+    out.first.assign(n, Real(0));
+    out.second.assign(n, Real(0));
 
-        row0[0] = p30[0]; row0[1] = p30[1]; row0[2] = p30[2]; row0[3] = p30[3];
-        row1[0] = p31[0]; row1[1] = p31[1]; row1[2] = p31[2]; row1[3] = p31[3];
-        row2[0] = p32[0]; row2[1] = p32[1]; row2[2] = p32[2]; row2[3] = p32[3];
-        row3[0] = p20[0] * p11[0];
-        row3[1] = p20[1] * p11[1];
-        row3[2] = p20[2] * p11[2];
-        row3[3] = p20[3] * p11[3];
-        row4[0] = p10[0] * p21[0];
-        row4[1] = p10[1] * p21[1];
-        row4[2] = p10[2] * p21[2];
-        row4[3] = p10[3] * p21[3];
-        row5[0] = p21[0] * p12[0];
-        row5[1] = p21[1] * p12[1];
-        row5[2] = p21[2] * p12[2];
-        row5[3] = p21[3] * p12[3];
-        row6[0] = p11[0] * p22[0];
-        row6[1] = p11[1] * p22[1];
-        row6[2] = p11[2] * p22[2];
-        row6[3] = p11[3] * p22[3];
-        row7[0] = p10[0] * p22[0];
-        row7[1] = p10[1] * p22[1];
-        row7[2] = p10[2] * p22[2];
-        row7[3] = p10[3] * p22[3];
-        row8[0] = p20[0] * p12[0];
-        row8[1] = p20[1] * p12[1];
-        row8[2] = p20[2] * p12[2];
-        row8[3] = p20[3] * p12[3];
-        row9[0] = p10[0] * p11[0] * p12[0];
-        row9[1] = p10[1] * p11[1] * p12[1];
-        row9[2] = p10[2] * p11[2] * p12[2];
-        row9[3] = p10[3] * p11[3] * p12[3];
+    if (n == 1u) {
+        out.value[0] = Real(1);
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        const Real p10 = Real(3) * l0;
-        const Real p11 = Real(3) * l1;
-        const Real p12 = Real(3) * l2;
-        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
-        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
-        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
-        const Real p30 = (p10 * (p10 - Real(1)) * (p10 - Real(2))) / Real(6);
-        const Real p31 = (p11 * (p11 - Real(1)) * (p11 - Real(2))) / Real(6);
-        const Real p32 = (p12 * (p12 - Real(1)) * (p12 - Real(2))) / Real(6);
-
-        row0[q] = p30;
-        row1[q] = p31;
-        row2[q] = p32;
-        row3[q] = p20 * p11;
-        row4[q] = p10 * p21;
-        row5[q] = p21 * p12;
-        row6[q] = p11 * p22;
-        row7[q] = p10 * p22;
-        row8[q] = p20 * p12;
-        row9[q] = p10 * p11 * p12;
-    }
-}
+    for (std::size_t i = 0; i < n; ++i) {
+        Real denom = Real(1);
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j != i) {
+                denom *= nodes[i] - nodes[j];
+            }
+        }
 
-void evaluate_triangle_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
+        Real value = Real(1);
+        for (std::size_t j = 0; j < n; ++j) {
+            if (j != i) {
+                value *= x - nodes[j];
+            }
+        }
+        out.value[i] = value / denom;
 
-    if (points.size() == 4u && output_stride == 4u) {
-        Real l0[4];
-        Real l1[4];
-        Real l2[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            l1[q] = xi[0];
-            l2[q] = xi[1];
-            l0[q] = Real(1) - l1[q] - l2[q];
+        Real first = Real(0);
+        for (std::size_t m = 0; m < n; ++m) {
+            if (m == i) {
+                continue;
+            }
+            Real product = Real(1);
+            for (std::size_t j = 0; j < n; ++j) {
+                if (j != i && j != m) {
+                    product *= x - nodes[j];
+                }
+            }
+            first += product;
         }
+        out.first[i] = first / denom;
 
-        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
-        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
-        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
-        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
-        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
-        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
-        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
-        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
-        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
-        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
-        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
-        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
-        row3[0] = Real(4) * l0[0] * l1[0];
-        row3[1] = Real(4) * l0[1] * l1[1];
-        row3[2] = Real(4) * l0[2] * l1[2];
-        row3[3] = Real(4) * l0[3] * l1[3];
-        row4[0] = Real(4) * l1[0] * l2[0];
-        row4[1] = Real(4) * l1[1] * l2[1];
-        row4[2] = Real(4) * l1[2] * l2[2];
-        row4[3] = Real(4) * l1[3] * l2[3];
-        row5[0] = Real(4) * l0[0] * l2[0];
-        row5[1] = Real(4) * l0[1] * l2[1];
-        row5[2] = Real(4) * l0[2] * l2[2];
-        row5[3] = Real(4) * l0[3] * l2[3];
-        return;
+        Real second = Real(0);
+        for (std::size_t m = 0; m < n; ++m) {
+            if (m == i) {
+                continue;
+            }
+            for (std::size_t l = 0; l < n; ++l) {
+                if (l == i || l == m) {
+                    continue;
+                }
+                Real product = Real(1);
+                for (std::size_t j = 0; j < n; ++j) {
+                    if (j != i && j != m && j != l) {
+                        product *= x - nodes[j];
+                    }
+                }
+                second += product;
+            }
+        }
+        out.second[i] = second / denom;
     }
+}
 
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        row0[q] = l0 * (Real(2) * l0 - Real(1));
-        row1[q] = l1 * (Real(2) * l1 - Real(1));
-        row2[q] = l2 * (Real(2) * l2 - Real(1));
-        row3[q] = Real(4) * l0 * l1;
-        row4[q] = Real(4) * l1 * l2;
-        row5[q] = Real(4) * l0 * l2;
-    };
+std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
+    Real value = Real(1);
+    Real first = Real(0);
+    Real second = Real(0);
 
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
+    for (int m = 0; m < alpha; ++m) {
+        const Real factor = Real(order) * lambda - Real(m);
+        const Real inv = Real(1) / Real(m + 1);
+        const Real old_value = value;
+        const Real old_first = first;
+        const Real old_second = second;
+        value = old_value * factor * inv;
+        first = (old_first * factor + old_value * Real(order)) * inv;
+        second = (old_second * factor + Real(2) * old_first * Real(order)) * inv;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
+    return {value, first, second};
 }
 
-void evaluate_triangle_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
+void evaluate_simplex(const Vec3& xi,  // Evaluates every entry of exponents at xi and writes values, gradients and Hessians into out.
+                      BasisTopology top,  // Triangle => 3 barycentric coordinates; any other value is handled as the 4-coordinate case
+                      int order,  // passed through to simplex_factor; order 0 with a single entry takes the constant shortcut
+                      const std::vector<LagrangeBasis::SimplexExponent>& exponents,  // exponents[i][a] is paired with lambda[a] for output slot i
+                      SimplexEval& out) {  // all three vectors are resized to exponents.size() and overwritten
+    const std::size_t n = exponents.size();
+    out.value.assign(n, Real(0));  // reset values to zero
+    out.gradient.assign(n, Gradient{});  // value-initialised; the += accumulation below relies on that being zero
+    out.hessian.assign(n, Hessian{});  // same assumption as for the gradients
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        row0[q] = Real(1) - xi[0] - xi[1];
-        row1[q] = xi[0];
-        row2[q] = xi[1];
+    if (n == 1u && order == 0) {  // single entry at order 0: the value is the constant 1
+        out.value[0] = Real(1);
+        return;  // gradient and Hessian keep their freshly reset state
     }
-}
 
-void evaluate_triangle_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
+    const int bary_count = top == BasisTopology::Triangle ? 3 : 4;  // number of barycentric coordinates in use
+    std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};  // lambda[a]: barycentric coordinates of xi; slot 3 stays 0 for a triangle
+    std::array<Gradient, 4> lambda_grad{};  // lambda_grad[a][c] = d(lambda[a]) / d(xi[c]); value-initialised (assumed zero) before the non-zero entries are set
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        const Real g0 = Real(1) - Real(4) * l0;
-        row0[0u * output_stride + q] = g0;
-        row0[1u * output_stride + q] = g0;
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real(4) * l1 - Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(4) * l2 - Real(1);
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(4) * (l0 - l1);
-        row3[1u * output_stride + q] = Real(-4) * l1;
-        row3[2u * output_stride + q] = Real(0);
-        row4[0u * output_stride + q] = Real(4) * l2;
-        row4[1u * output_stride + q] = Real(4) * l1;
-        row4[2u * output_stride + q] = Real(0);
-        row5[0u * output_stride + q] = Real(-4) * l2;
-        row5[1u * output_stride + q] = Real(4) * (l0 - l2);
-        row5[2u * output_stride + q] = Real(0);
+    lambda[1] = xi[0];  // lambda1 = xi0 and lambda2 = xi1 for both topologies
+    lambda[2] = xi[1];
+    lambda_grad[1][0] = Real(1);
+    lambda_grad[2][1] = Real(1);
+    if (top == BasisTopology::Triangle) {  // triangle: lambda0 = 1 - xi0 - xi1
+        lambda[0] = Real(1) - xi[0] - xi[1];
+        lambda_grad[0][0] = Real(-1);
+        lambda_grad[0][1] = Real(-1);
+    } else {  // 4-coordinate case: lambda3 = xi2 and lambda0 = 1 - xi0 - xi1 - xi2
+        lambda[3] = xi[2];
+        lambda[0] = Real(1) - xi[0] - xi[1] - xi[2];
+        lambda_grad[0][0] = Real(-1);
+        lambda_grad[0][1] = Real(-1);
+        lambda_grad[0][2] = Real(-1);
+        lambda_grad[3][2] = Real(1);
     }
-}
 
-inline void write_constant_hessian_q4(Real* SVMP_RESTRICT row,
-                                      std::size_t output_stride,
-                                      Real h00,
-                                      Real h01,
-                                      Real h02,
-                                      Real h10,
-                                      Real h11,
-                                      Real h12,
-                                      Real h20,
-                                      Real h21,
-                                      Real h22) {
-    Real* c0 = row + 0u * output_stride;
-    Real* c1 = row + 1u * output_stride;
-    Real* c2 = row + 2u * output_stride;
-    Real* c3 = row + 3u * output_stride;
-    Real* c4 = row + 4u * output_stride;
-    Real* c5 = row + 5u * output_stride;
-    Real* c6 = row + 6u * output_stride;
-    Real* c7 = row + 7u * output_stride;
-    Real* c8 = row + 8u * output_stride;
+    for (std::size_t i = 0; i < n; ++i) {  // one pass per output slot
+        std::array<std::array<Real, 3>, 4> f{};  // f[a] = the 3 numbers from simplex_factor, used as [0] value, [1] first- and [2] second-derivative factor (presumably w.r.t. lambda[a]; confirm in simplex_factor)
+        for (int a = 0; a < bary_count; ++a) {
+            f[static_cast<std::size_t>(a)] =
+                simplex_factor(exponents[i][static_cast<std::size_t>(a)],
+                               lambda[static_cast<std::size_t>(a)],
+                               order);
+        }
 
-    c0[0] = h00; c0[1] = h00; c0[2] = h00; c0[3] = h00;
-    c1[0] = h01; c1[1] = h01; c1[2] = h01; c1[3] = h01;
-    c2[0] = h02; c2[1] = h02; c2[2] = h02; c2[3] = h02;
-    c3[0] = h10; c3[1] = h10; c3[2] = h10; c3[3] = h10;
-    c4[0] = h11; c4[1] = h11; c4[2] = h11; c4[3] = h11;
-    c5[0] = h12; c5[1] = h12; c5[2] = h12; c5[3] = h12;
-    c6[0] = h20; c6[1] = h20; c6[2] = h20; c6[3] = h20;
-    c7[0] = h21; c7[1] = h21; c7[2] = h21; c7[3] = h21;
-    c8[0] = h22; c8[1] = h22; c8[2] = h22; c8[3] = h22;
-}
+        Real value = Real(1);  // value = product of the [0] factors over the active coordinates
+        for (int a = 0; a < bary_count; ++a) {
+            value *= f[static_cast<std::size_t>(a)][0];
+        }
+        out.value[i] = value;
 
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(4), Real(0),
-                              Real(4), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
-                              output_stride,
-                              Real(-8), Real(-4), Real(0),
-                              Real(-4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(4), Real(0),
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(-4), Real(0),
-                              Real(-4), Real(-8), Real(0),
-                              Real(0), Real(0), Real(0));
-}
+        for (int a = 0; a < bary_count; ++a) {  // gradient: sum over a of f[a][1] * prod_{b != a} f[b][0] * grad(lambda[a])
+            Real product = f[static_cast<std::size_t>(a)][1];
+            for (int b = 0; b < bary_count; ++b) {
+                if (b != a) {
+                    product *= f[static_cast<std::size_t>(b)][0];
+                }
+            }
+            for (std::size_t c = 0; c < 3u; ++c) {
+                out.gradient[i][c] += product * lambda_grad[static_cast<std::size_t>(a)][c];
+            }
+        }
 
-SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_constant_hessian_q4(hessians_out + 0u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(4), Real(4),
-                              Real(4), Real(4), Real(4),
-                              Real(4), Real(4), Real(4));
-    write_constant_hessian_q4(hessians_out + 1u * 9u * output_stride,
-                              output_stride,
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 2u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(4), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 3u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(4));
-    write_constant_hessian_q4(hessians_out + 4u * 9u * output_stride,
-                              output_stride,
-                              Real(-8), Real(-4), Real(-4),
-                              Real(-4), Real(0), Real(0),
-                              Real(-4), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 5u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(4), Real(0),
-                              Real(4), Real(0), Real(0),
-                              Real(0), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 6u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(-4), Real(0),
-                              Real(-4), Real(-8), Real(-4),
-                              Real(0), Real(-4), Real(0));
-    write_constant_hessian_q4(hessians_out + 7u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(-4),
-                              Real(0), Real(0), Real(-4),
-                              Real(-4), Real(-4), Real(-8));
-    write_constant_hessian_q4(hessians_out + 8u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(4),
-                              Real(0), Real(0), Real(0),
-                              Real(4), Real(0), Real(0));
-    write_constant_hessian_q4(hessians_out + 9u * 9u * output_stride,
-                              output_stride,
-                              Real(0), Real(0), Real(0),
-                              Real(0), Real(0), Real(4),
-                              Real(0), Real(4), Real(0));
+        for (int a = 0; a < bary_count; ++a) {  // Hessian: sum over (a, b) of coeff(a, b) * grad(lambda[a])[r] * grad(lambda[b])[c]
+            for (int b = 0; b < bary_count; ++b) {
+                Real product = (a == b)  // coeff starts as f[a][2] on the diagonal, f[a][1] * f[b][1] off it
+                    ? f[static_cast<std::size_t>(a)][2]
+                    : f[static_cast<std::size_t>(a)][1] *
+                      f[static_cast<std::size_t>(b)][1];
+                for (int c = 0; c < bary_count; ++c) {  // multiply in the [0] factor of every coordinate other than a and b
+                    if (c != a && c != b) {
+                        product *= f[static_cast<std::size_t>(c)][0];
+                    }
+                }
+                for (std::size_t r = 0; r < 3u; ++r) {
+                    for (std::size_t c = 0; c < 3u; ++c) {
+                        out.hessian[i](r, c) +=
+                            product *
+                            lambda_grad[static_cast<std::size_t>(a)][r] *
+                            lambda_grad[static_cast<std::size_t>(b)][c];
+                    }
+                }
+            }
+        }
+    }
 }
 
-void evaluate_tet_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        row0[q] = Real(1) - xi[0] - xi[1] - xi[2];
-        row1[q] = xi[0];
-        row2[q] = xi[1];
-        row3[q] = xi[2];
-    }
+void store_gradient(const Gradient& gradient, Real* dst) {  // Copies components 0..2 of gradient into dst[0..2]; dst must have room for three Reals.
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
 }
 
-void evaluate_tet_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
+} // namespace
 
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        row0[0u * output_stride + q] = Real(-1);
-        row0[1u * output_stride + q] = Real(-1);
-        row0[2u * output_stride + q] = Real(-1);
-        row1[0u * output_stride + q] = Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(1);
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(0);
-        row3[1u * output_stride + q] = Real(0);
-        row3[2u * output_stride + q] = Real(1);
-    }
+void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {  // Forwards a single size, max((max_order + 1)^3, max_qpts), to prewarm_basis_function_scratch.
+    const auto n = static_cast<std::size_t>(std::max(0, max_order) + 1);  // negative orders are clamped to 0, so n >= 1
+    prewarm_basis_function_scratch(std::max(n * n * n, max_qpts));  // n^3 is computed in std::size_t
 }
 
-void evaluate_zero_hessians_strided(
-    std::size_t num_nodes,
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (num_qpts == 4u) {
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            write_constant_hessian_q4(hessians_out + node * 9u * output_stride,
-                                      output_stride,
-                                      Real(0), Real(0), Real(0),
-                                      Real(0), Real(0), Real(0),
-                                      Real(0), Real(0), Real(0));
-        }
-        return;
+LagrangeBasis::LagrangeBasis(ElementType type, int order)  // Builds the basis for (type, order); throws BasisConfigurationException when the normalised order is negative.
+    : element_type_(type), order_(order) {
+    const auto normalized = normalize_lagrange_request(element_type_, order_);  // may replace both the element type and the order
+    element_type_ = normalized.element_type;
+    order_ = normalized.order;
+    if (order_ < 0) {  // the check runs on the normalised order, not on the raw argument
+        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
+                                          __FILE__, __LINE__, __func__);
     }
 
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        Real* row = hessians_out + node * 9u * output_stride;
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            row[0u * output_stride + q] = Real(0);
-            row[1u * output_stride + q] = Real(0);
-            row[2u * output_stride + q] = Real(0);
-            row[3u * output_stride + q] = Real(0);
-            row[4u * output_stride + q] = Real(0);
-            row[5u * output_stride + q] = Real(0);
-            row[6u * output_stride + q] = Real(0);
-            row[7u * output_stride + q] = Real(0);
-            row[8u * output_stride + q] = Real(0);
-        }
-    }
+    topology_ = supported_lagrange_topology(element_type_);  // topology and reference dimension come from the normalised element type
+    dimension_ = reference_dimension(element_type_);
+    init_nodes();  // fills nodes_ and the index tables for the chosen topology
 }
 
-void evaluate_tet_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-
-    if (points.size() == 4u && output_stride == 4u) {
-        Real l0[4];
-        Real l1[4];
-        Real l2[4];
-        Real l3[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            l1[q] = xi[0];
-            l2[q] = xi[1];
-            l3[q] = xi[2];
-            l0[q] = Real(1) - l1[q] - l2[q] - l3[q];
-        }
-
-        row0[0] = l0[0] * (Real(2) * l0[0] - Real(1));
-        row0[1] = l0[1] * (Real(2) * l0[1] - Real(1));
-        row0[2] = l0[2] * (Real(2) * l0[2] - Real(1));
-        row0[3] = l0[3] * (Real(2) * l0[3] - Real(1));
-        row1[0] = l1[0] * (Real(2) * l1[0] - Real(1));
-        row1[1] = l1[1] * (Real(2) * l1[1] - Real(1));
-        row1[2] = l1[2] * (Real(2) * l1[2] - Real(1));
-        row1[3] = l1[3] * (Real(2) * l1[3] - Real(1));
-        row2[0] = l2[0] * (Real(2) * l2[0] - Real(1));
-        row2[1] = l2[1] * (Real(2) * l2[1] - Real(1));
-        row2[2] = l2[2] * (Real(2) * l2[2] - Real(1));
-        row2[3] = l2[3] * (Real(2) * l2[3] - Real(1));
-        row3[0] = l3[0] * (Real(2) * l3[0] - Real(1));
-        row3[1] = l3[1] * (Real(2) * l3[1] - Real(1));
-        row3[2] = l3[2] * (Real(2) * l3[2] - Real(1));
-        row3[3] = l3[3] * (Real(2) * l3[3] - Real(1));
-        row4[0] = Real(4) * l0[0] * l1[0];
-        row4[1] = Real(4) * l0[1] * l1[1];
-        row4[2] = Real(4) * l0[2] * l1[2];
-        row4[3] = Real(4) * l0[3] * l1[3];
-        row5[0] = Real(4) * l1[0] * l2[0];
-        row5[1] = Real(4) * l1[1] * l2[1];
-        row5[2] = Real(4) * l1[2] * l2[2];
-        row5[3] = Real(4) * l1[3] * l2[3];
-        row6[0] = Real(4) * l0[0] * l2[0];
-        row6[1] = Real(4) * l0[1] * l2[1];
-        row6[2] = Real(4) * l0[2] * l2[2];
-        row6[3] = Real(4) * l0[3] * l2[3];
-        row7[0] = Real(4) * l0[0] * l3[0];
-        row7[1] = Real(4) * l0[1] * l3[1];
-        row7[2] = Real(4) * l0[2] * l3[2];
-        row7[3] = Real(4) * l0[3] * l3[3];
-        row8[0] = Real(4) * l1[0] * l3[0];
-        row8[1] = Real(4) * l1[1] * l3[1];
-        row8[2] = Real(4) * l1[2] * l3[2];
-        row8[3] = Real(4) * l1[3] * l3[3];
-        row9[0] = Real(4) * l2[0] * l3[0];
-        row9[1] = Real(4) * l2[1] * l3[1];
-        row9[2] = Real(4) * l2[2] * l3[2];
-        row9[3] = Real(4) * l2[3] * l3[3];
-        return;
+void LagrangeBasis::init_equispaced_1d_nodes() {  // Fills nodes_1d_ with order_ + 1 coordinates; entry i comes from equispaced_pm_one_coord(i, order_).
+    nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));  // relies on order_ >= 0, which the constructor enforces before init_nodes runs
+    for (int i = 0; i <= order_; ++i) {
+        nodes_1d_[static_cast<std::size_t>(i)] =
+            equispaced_pm_one_coord(i, order_);
     }
+}
 
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        row0[q] = l0 * (Real(2) * l0 - Real(1));
-        row1[q] = l1 * (Real(2) * l1 - Real(1));
-        row2[q] = l2 * (Real(2) * l2 - Real(1));
-        row3[q] = l3 * (Real(2) * l3 - Real(1));
-        row4[q] = Real(4) * l0 * l1;
-        row5[q] = Real(4) * l1 * l2;
-        row6[q] = Real(4) * l0 * l2;
-        row7[q] = Real(4) * l0 * l3;
-        row8[q] = Real(4) * l1 * l3;
-        row9[q] = Real(4) * l2 * l3;
-    };
+void LagrangeBasis::init_nodes() {  // Clears every node table, then dispatches on topology_ to the matching builder; throws BasisElementCompatibilityException for any other topology.
+    nodes_.clear();
+    nodes_1d_.clear();
+    tensor_indices_.clear();
+    simplex_exponents_.clear();
+    wedge_indices_.clear();
 
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
+    switch (topology_) {
+        case BasisTopology::Point:
+            build_point_nodes();
+            return;
+        case BasisTopology::Line:  // line, quadrilateral and hexahedron share the tensor-product builder with 1, 2 and 3 axes
+            build_tensor_product_nodes(1);
+            return;
+        case BasisTopology::Quadrilateral:
+            build_tensor_product_nodes(2);
+            return;
+        case BasisTopology::Hexahedron:
+            build_tensor_product_nodes(3);
+            return;
+        case BasisTopology::Triangle:  // triangles and tetrahedra share one builder
+        case BasisTopology::Tetrahedron:
+            build_simplex_nodes();
+            return;
+        case BasisTopology::Wedge:
+            build_wedge_nodes();
+            return;
+        default:  // leave the switch and reach the throw below
+            break;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
+    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
+                                             __FILE__, __LINE__, __func__);
 }
 
-inline void write_tet_order2_gradient_q(Real* SVMP_RESTRICT row,
-                                        std::size_t output_stride,
-                                        std::size_t q,
-                                        Real gx,
-                                        Real gy,
-                                        Real gz) {
-    row[0u * output_stride + q] = gx;
-    row[1u * output_stride + q] = gy;
-    row[2u * output_stride + q] = gz;
+void LagrangeBasis::build_point_nodes() {  // Appends a single node at the origin to nodes_.
+    nodes_.push_back(Vec3{Real(0), Real(0), Real(0)});
 }
 
-SVMP_LAGRANGE_NOINLINE void evaluate_tet_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-    Real* row9 = gradients_out + 9u * 3u * output_stride;
-
-    auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        const Real four = Real(4);
-        const Real g0 = Real(1) - four * l0;
-
-        write_tet_order2_gradient_q(row0, output_stride, q, g0, g0, g0);
-        write_tet_order2_gradient_q(row1, output_stride, q, four * l1 - Real(1), Real(0), Real(0));
-        write_tet_order2_gradient_q(row2, output_stride, q, Real(0), four * l2 - Real(1), Real(0));
-        write_tet_order2_gradient_q(row3, output_stride, q, Real(0), Real(0), four * l3 - Real(1));
-        write_tet_order2_gradient_q(row4, output_stride, q, four * (l0 - l1), -four * l1, -four * l1);
-        write_tet_order2_gradient_q(row5, output_stride, q, four * l2, four * l1, Real(0));
-        write_tet_order2_gradient_q(row6, output_stride, q, -four * l2, four * (l0 - l2), -four * l2);
-        write_tet_order2_gradient_q(row7, output_stride, q, -four * l3, -four * l3, four * (l0 - l3));
-        write_tet_order2_gradient_q(row8, output_stride, q, four * l3, Real(0), four * l1);
-        write_tet_order2_gradient_q(row9, output_stride, q, Real(0), four * l3, four * l2);
-    };
-
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
-    }
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
+void LagrangeBasis::build_tensor_product_nodes(int dimensions) {  // Builds nodes_ and one tensor_indices_ entry per node; dimensions (1, 2 or 3 as passed by init_nodes) is how many axes get an index.
+    init_equispaced_1d_nodes();
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);  // node coordinates come from the reference layout for this element type and order
+    tensor_indices_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        TensorNodeIndex idx{0u, 0u, 0u};  // axes beyond `dimensions` keep index 0
+        idx[0] = axis_index_pm_one(node[0], order_);  // presumably the position of this coordinate in the 1D node set; confirm in axis_index_pm_one
+        if (dimensions >= 2) {
+            idx[1] = axis_index_pm_one(node[1], order_);
+        }
+        if (dimensions >= 3) {
+            idx[2] = axis_index_pm_one(node[2], order_);
+        }
+        tensor_indices_.push_back(idx);
     }
 }
 
-inline void fill_simplex_order3_factor_values(Real lambda, Real* SVMP_RESTRICT phi) {
-    const Real t = Real(3) * lambda;
-    phi[0] = Real(1);
-    phi[1] = t;
-    phi[2] = phi[1] * (t - Real(1)) * Real(0.5);
-    phi[3] = phi[2] * (t - Real(2)) / Real(3);
-}
-
-void evaluate_tet_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-    Real* row16 = values_out + 16u * output_stride;
-    Real* row17 = values_out + 17u * output_stride;
-    Real* row18 = values_out + 18u * output_stride;
-    Real* row19 = values_out + 19u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        Real p0[4];
-        Real p1[4];
-        Real p2[4];
-        Real p3[4];
-        fill_simplex_order3_factor_values(l0, p0);
-        fill_simplex_order3_factor_values(l1, p1);
-        fill_simplex_order3_factor_values(l2, p2);
-        fill_simplex_order3_factor_values(l3, p3);
-
-        row0[q] = p0[3];
-        row1[q] = p1[3];
-        row2[q] = p2[3];
-        row3[q] = p3[3];
-        row4[q] = p0[2] * p1[1];
-        row5[q] = p0[1] * p1[2];
-        row6[q] = p1[2] * p2[1];
-        row7[q] = p1[1] * p2[2];
-        row8[q] = p0[1] * p2[2];
-        row9[q] = p0[2] * p2[1];
-        row10[q] = p0[2] * p3[1];
-        row11[q] = p0[1] * p3[2];
-        row12[q] = p1[2] * p3[1];
-        row13[q] = p1[1] * p3[2];
-        row14[q] = p2[2] * p3[1];
-        row15[q] = p2[1] * p3[2];
-        row16[q] = p0[1] * p1[1] * p2[1];
-        row17[q] = p0[1] * p1[1] * p3[1];
-        row18[q] = p1[1] * p2[1] * p3[1];
-        row19[q] = p0[1] * p2[1] * p3[1];
+void LagrangeBasis::build_simplex_nodes() {  // Builds nodes_ and one simplex_exponents_ entry per node; init_nodes calls this for triangles and tetrahedra.
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    simplex_exponents_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        simplex_exponents_.push_back(simplex_exponent_from_point(node, topology_, order_));  // exponent is derived from the node position, topology_ and order_
     }
 }
 
-void evaluate_triangle_order3_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* rows[10] = {
-        gradients_out + 0u * 3u * output_stride,
-        gradients_out + 1u * 3u * output_stride,
-        gradients_out + 2u * 3u * output_stride,
-        gradients_out + 3u * 3u * output_stride,
-        gradients_out + 4u * 3u * output_stride,
-        gradients_out + 5u * 3u * output_stride,
-        gradients_out + 6u * 3u * output_stride,
-        gradients_out + 7u * 3u * output_stride,
-        gradients_out + 8u * 3u * output_stride,
-        gradients_out + 9u * 3u * output_stride,
-    };
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        const Real p10 = Real(3) * l0;
-        const Real p11 = Real(3) * l1;
-        const Real p12 = Real(3) * l2;
-        const Real p20 = Real(0.5) * p10 * (p10 - Real(1));
-        const Real p21 = Real(0.5) * p11 * (p11 - Real(1));
-        const Real p22 = Real(0.5) * p12 * (p12 - Real(1));
-        const Real d10 = Real(3);
-        const Real d11 = Real(3);
-        const Real d12 = Real(3);
-        const Real d20 = Real(3) * p10 - Real(1.5);
-        const Real d21 = Real(3) * p11 - Real(1.5);
-        const Real d22 = Real(3) * p12 - Real(1.5);
-        const Real d30 = Real(1.5) * p10 * p10 - Real(3) * p10 + Real(1);
-        const Real d31 = Real(1.5) * p11 * p11 - Real(3) * p11 + Real(1);
-        const Real d32 = Real(1.5) * p12 * p12 - Real(3) * p12 + Real(1);
-
-        const Real dl0[10] = {
-            d30,
-            Real(0),
-            Real(0),
-            d20 * p11,
-            d10 * p21,
-            Real(0),
-            Real(0),
-            d10 * p22,
-            d20 * p12,
-            d10 * p11 * p12,
-        };
-        const Real dl1[10] = {
-            Real(0),
-            d31,
-            Real(0),
-            p20 * d11,
-            p10 * d21,
-            d21 * p12,
-            d11 * p22,
-            Real(0),
-            Real(0),
-            p10 * d11 * p12,
-        };
-        const Real dl2[10] = {
-            Real(0),
-            Real(0),
-            d32,
-            Real(0),
-            Real(0),
-            p21 * d12,
-            p11 * d22,
-            p10 * d22,
-            p20 * d12,
-            p10 * p11 * d12,
-        };
+void LagrangeBasis::build_wedge_nodes() {  // Builds wedge nodes_ plus, per node, a (triangle index, axis index) pair in wedge_indices_; throws BasisConstructionException if a node has no matching triangle exponent.
+    init_equispaced_1d_nodes();
+    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
+    const auto tri_nodes =  // triangle node set at the same order, requested via ElementType::Triangle3
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
+    simplex_exponents_.reserve(tri_nodes.size());  // here simplex_exponents_ holds the triangle's exponents, not one entry per wedge node
+    for (const auto& tri_node : tri_nodes) {
+        simplex_exponents_.push_back(
+            simplex_exponent_from_point(tri_node, BasisTopology::Triangle, order_));
+    }
 
-        for (std::size_t node = 0; node < 10u; ++node) {
-            Real* g = rows[node];
-            g[0u * output_stride + q] = dl1[node] - dl0[node];
-            g[1u * output_stride + q] = dl2[node] - dl0[node];
-            g[2u * output_stride + q] = Real(0);
+    wedge_indices_.reserve(nodes_.size());
+    for (const auto& node : nodes_) {
+        const auto tri_exp =  // exponent of the wedge node evaluated as a triangle point
+            simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
+        auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);  // linear search for the matching triangle exponent
+        if (it == simplex_exponents_.end()) {  // no triangle exponent matches this wedge node
+            throw BasisConstructionException("LagrangeBasis: wedge node triangle index lookup failed",
+                                             __FILE__, __LINE__, __func__);
         }
+        const std::size_t tri_index =
+            static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
+        wedge_indices_.push_back({tri_index, axis_index_pm_one(node[2], order_)});  // pair the triangle index with the index derived from node[2]
     }
 }
 
-void evaluate_hex_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-
-    const auto write_q = [&](std::size_t q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        const Real lxly = lx * ly;
-        const Real uxly = ux * ly;
-        const Real uxuy = ux * uy;
-        const Real lxuy = lx * uy;
-        row0[q] = lxly * lz;
-        row1[q] = uxly * lz;
-        row2[q] = uxuy * lz;
-        row3[q] = lxuy * lz;
-        row4[q] = lxly * uz;
-        row5[q] = uxly * uz;
-        row6[q] = uxuy * uz;
-        row7[q] = lxuy * uz;
-    };
-    if (points.size() == 4u) {
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
+void LagrangeBasis::evaluate_all_to(const Vec3& xi,  // Evaluates the basis at xi into flat output buffers.
+                                    Real* SVMP_RESTRICT values_out,  // One Real per node; skipped when null.
+                                    Real* SVMP_RESTRICT gradients_out,  // Three Reals per node; skipped when null.
+                                    Real* SVMP_RESTRICT hessians_out) const {  // Nine Reals per node; skipped when null.
+    if (topology_ == BasisTopology::Point) {  // Point: a single entry with value 1 and zero derivatives.
+        if (values_out) {
+            values_out[0] = Real(1);
+        }
+        if (gradients_out) {
+            gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+        }
+        if (hessians_out) {
+            std::fill_n(hessians_out, 9u, Real(0));
+        }
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        write_q(q);
-    }
-}
-
-template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-void evaluate_hex_order1_outputs_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr Real half = Real(0.5);
-    constexpr std::array<Real, 8> dx{{-half, half, half, -half, -half, half, half, -half}};
-    constexpr std::array<Real, 8> dy{{-half, -half, half, half, -half, -half, half, half}};
-    constexpr std::array<Real, 8> dz{{-half, -half, -half, -half, half, half, half, half}};
+    if (topology_ == BasisTopology::Line ||  // Line/Quad/Hex: products of per-axis 1D evaluations.
+        topology_ == BasisTopology::Quadrilateral ||
+        topology_ == BasisTopology::Hexahedron) {
+        AxisEval ax;
+        AxisEval ay;
+        AxisEval az;
+        evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
+        if (dimension_ >= 2) {  // ay is only filled when a second axis exists.
+            evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+        }
+        if (dimension_ >= 3) {  // az is only filled when a third axis exists.
+            evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+        }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * half;
-        const Real ly = (Real(1) - xi[1]) * half;
-        const Real lz = (Real(1) - xi[2]) * half;
-        const Real ux = (Real(1) + xi[0]) * half;
-        const Real uy = (Real(1) + xi[1]) * half;
-        const Real uz = (Real(1) + xi[2]) * half;
-        const Real xval[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
-        const Real yval[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
-        const Real zval[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
+        for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
+            const auto& idx = tensor_indices_[node];  // Per-axis 1D index for this node.
+            const Real vx = ax.value[idx[0]];
+            const Real dx = ax.first[idx[0]];
+            const Real d2x = ax.second[idx[0]];
+            const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);  // Absent axes act as the constant 1...
+            const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);  // ...so their derivatives are 0.
+            const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
+            const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
+            const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
+            const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
 
-        for (std::size_t node = 0; node < 8u; ++node) {
-            if constexpr (NeedValues) {
-                values_out[node * output_stride + q] =
-                    xval[node] * yval[node] * zval[node];
+            if (values_out) {
+                values_out[node] = vx * vy * vz;
             }
-            if constexpr (NeedGradients) {
-                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dx[node] * yval[node] * zval[node];
-                g[1u * output_stride + q] = xval[node] * dy[node] * zval[node];
-                g[2u * output_stride + q] = xval[node] * yval[node] * dz[node];
+            if (gradients_out) {
+                Real* g = gradients_out + node * 3u;  // Start of this node's three gradient entries.
+                g[0] = dx * vy * vz;
+                g[1] = vx * dy * vz;
+                g[2] = vx * vy * dz;
             }
-            if constexpr (NeedHessians) {
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                const Real hxy = dx[node] * dy[node] * zval[node];
-                const Real hxz = dx[node] * yval[node] * dz[node];
-                const Real hyz = xval[node] * dy[node] * dz[node];
-                H[0u * output_stride + q] = Real(0);
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = Real(0);
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = Real(0);
+            if (hessians_out) {
+                Real* h = hessians_out + node * 9u;  // Start of this node's nine Hessian entries (3x3).
+                h[0] = d2x * vy * vz;  // xx
+                h[1] = dx * dy * vz;  // xy
+                h[2] = dx * vy * dz;  // xz
+                h[3] = h[1];  // yx mirrors xy (symmetric).
+                h[4] = vx * d2y * vz;  // yy
+                h[5] = vx * dy * dz;  // yz
+                h[6] = h[2];  // zx mirrors xz.
+                h[7] = h[5];  // zy mirrors yz.
+                h[8] = vx * vy * d2z;  // zz
             }
         }
+        return;
     }
-}
-
-void evaluate_quad_order1_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
 
-    if (points.size() == 4u && output_stride == 4u) {
-        Real lx[4];
-        Real ux[4];
-        Real ly[4];
-        Real uy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            lx[q] = (Real(1) - xi[0]) * Real(0.5);
-            ux[q] = (Real(1) + xi[0]) * Real(0.5);
-            ly[q] = (Real(1) - xi[1]) * Real(0.5);
-            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+    if (topology_ == BasisTopology::Triangle || topology_ == BasisTopology::Tetrahedron) {  // Simplex topologies.
+        SimplexEval simplex;
+        evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);  // Helper defined outside this view.
+        for (std::size_t i = 0; i < simplex.value.size(); ++i) {  // Copy out only the requested outputs.
+            if (values_out) {
+                values_out[i] = simplex.value[i];
+            }
+            if (gradients_out) {
+                store_gradient(simplex.gradient[i], gradients_out + i * 3u);
+            }
+            if (hessians_out) {
+                store_hessian(simplex.hessian[i], hessians_out + i * 9u);
+            }
         }
-        row0[0] = lx[0] * ly[0];
-        row0[1] = lx[1] * ly[1];
-        row0[2] = lx[2] * ly[2];
-        row0[3] = lx[3] * ly[3];
-        row1[0] = ux[0] * ly[0];
-        row1[1] = ux[1] * ly[1];
-        row1[2] = ux[2] * ly[2];
-        row1[3] = ux[3] * ly[3];
-        row2[0] = ux[0] * uy[0];
-        row2[1] = ux[1] * uy[1];
-        row2[2] = ux[2] * uy[2];
-        row2[3] = ux[3] * uy[3];
-        row3[0] = lx[0] * uy[0];
-        row3[1] = lx[1] * uy[1];
-        row3[2] = lx[2] * uy[2];
-        row3[3] = lx[3] * uy[3];
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        row0[q] = lx * ly;
-        row1[q] = ux * ly;
-        row2[q] = ux * uy;
-        row3[q] = lx * uy;
-    }
-}
+    if (topology_ == BasisTopology::Wedge) {  // Wedge: triangle evaluation times a 1D evaluation in xi[2].
+        SimplexEval tri;
+        AxisEval z_axis;
+        evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
+        evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
 
-void evaluate_quad_order1_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
+        for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
+            const auto [tri_idx, z_idx] = wedge_indices_[node];  // (triangle index, 1D index) pair for this node.
+            const Real tv = tri.value[tri_idx];
+            const Real zv = z_axis.value[z_idx];
+            const Real dz = z_axis.first[z_idx];
+            const Real d2z = z_axis.second[z_idx];
 
-    if (points.size() == 4u) {
-        Real lx[4];
-        Real ly[4];
-        Real ux[4];
-        Real uy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            lx[q] = (Real(1) - xi[0]) * Real(0.5);
-            ly[q] = (Real(1) - xi[1]) * Real(0.5);
-            ux[q] = (Real(1) + xi[0]) * Real(0.5);
-            uy[q] = (Real(1) + xi[1]) * Real(0.5);
+            if (values_out) {
+                values_out[node] = tv * zv;
+            }
+            if (gradients_out) {
+                Real* g = gradients_out + node * 3u;
+                g[0] = tri.gradient[tri_idx][0] * zv;
+                g[1] = tri.gradient[tri_idx][1] * zv;
+                g[2] = tv * dz;
+            }
+            if (hessians_out) {
+                Real* h = hessians_out + node * 9u;
+                const Hessian& th = tri.hessian[tri_idx];
+                const Gradient& tg = tri.gradient[tri_idx];
+                h[0] = th(0, 0) * zv;
+                h[1] = th(0, 1) * zv;
+                h[2] = tg[0] * dz;  // Mixed term: triangle x-derivative times 1D first derivative.
+                h[3] = h[1];  // Symmetric counterpart of h[1].
+                h[4] = th(1, 1) * zv;
+                h[5] = tg[1] * dz;  // Mixed term: triangle y-derivative times 1D first derivative.
+                h[6] = h[2];  // Symmetric counterpart of h[2].
+                h[7] = h[5];  // Symmetric counterpart of h[5].
+                h[8] = tv * d2z;
+            }
         }
-
-        auto write_component = [](Real* SVMP_RESTRICT row,
-                                  Real a0,
-                                  Real a1,
-                                  Real a2,
-                                  Real a3) {
-            row[0] = a0;
-            row[1] = a1;
-            row[2] = a2;
-            row[3] = a3;
-        };
-
-        write_component(row0, Real(-0.5) * ly[0], Real(-0.5) * ly[1],
-                        Real(-0.5) * ly[2], Real(-0.5) * ly[3]);
-        write_component(row0 + output_stride, Real(-0.5) * lx[0], Real(-0.5) * lx[1],
-                        Real(-0.5) * lx[2], Real(-0.5) * lx[3]);
-        write_component(row0 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row1, Real(0.5) * ly[0], Real(0.5) * ly[1],
-                        Real(0.5) * ly[2], Real(0.5) * ly[3]);
-        write_component(row1 + output_stride, Real(-0.5) * ux[0], Real(-0.5) * ux[1],
-                        Real(-0.5) * ux[2], Real(-0.5) * ux[3]);
-        write_component(row1 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row2, Real(0.5) * uy[0], Real(0.5) * uy[1],
-                        Real(0.5) * uy[2], Real(0.5) * uy[3]);
-        write_component(row2 + output_stride, Real(0.5) * ux[0], Real(0.5) * ux[1],
-                        Real(0.5) * ux[2], Real(0.5) * ux[3]);
-        write_component(row2 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
-
-        write_component(row3, Real(-0.5) * uy[0], Real(-0.5) * uy[1],
-                        Real(-0.5) * uy[2], Real(-0.5) * uy[3]);
-        write_component(row3 + output_stride, Real(0.5) * lx[0], Real(0.5) * lx[1],
-                        Real(0.5) * lx[2], Real(0.5) * lx[3]);
-        write_component(row3 + 2u * output_stride, Real(0), Real(0), Real(0), Real(0));
         return;
     }
 
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        row0[0u * output_stride + q] = Real(-0.5) * ly;
-        row0[1u * output_stride + q] = Real(-0.5) * lx;
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real( 0.5) * ly;
-        row1[1u * output_stride + q] = Real(-0.5) * ux;
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real( 0.5) * uy;
-        row2[1u * output_stride + q] = Real( 0.5) * ux;
-        row2[2u * output_stride + q] = Real(0);
-        row3[0u * output_stride + q] = Real(-0.5) * uy;
-        row3[1u * output_stride + q] = Real( 0.5) * lx;
-        row3[2u * output_stride + q] = Real(0);
-    }
+    throw BasisEvaluationException("Unsupported element in LagrangeBasis evaluation",  // No topology branch above matched.
+                                   __FILE__, __LINE__, __func__);
 }
 
-inline void write_quad_order1_hessian_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real xy) {
-    row[0u * output_stride + q] = Real(0);
-    row[1u * output_stride + q] = xy;
-    row[2u * output_stride + q] = Real(0);
-    row[3u * output_stride + q] = xy;
-    row[4u * output_stride + q] = Real(0);
-    row[5u * output_stride + q] = Real(0);
-    row[6u * output_stride + q] = Real(0);
-    row[7u * output_stride + q] = Real(0);
-    row[8u * output_stride + q] = Real(0);
+void LagrangeBasis::evaluate_values(const Vec3& xi,  // Vector overload: fills `values` with the basis values at xi.
+                                    std::vector<Real>& values) const {
+    values.resize(size());  // Size the output to size() entries before exposing its raw storage.
+    evaluate_values_to(xi, values.data());  // The pointer-based overload writes directly into the vector.
 }
 
-void evaluate_quad_order1_hessians_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-
-    constexpr Real positive = Real(0.25);
-    constexpr Real negative = Real(-0.25);
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        write_quad_order1_hessian_q(row0, output_stride, q, positive);
-        write_quad_order1_hessian_q(row1, output_stride, q, negative);
-        write_quad_order1_hessian_q(row2, output_stride, q, positive);
-        write_quad_order1_hessian_q(row3, output_stride, q, negative);
+void LagrangeBasis::evaluate_gradients(const Vec3& xi,  // Vector overload: fills `gradients` with size() entries at xi.
+                                       std::vector<Gradient>& gradients) const {
+    gradients.resize(size());
+    std::vector<Real> flat(size() * 3u, Real(0));  // Zero-initialized scratch buffer, three Reals per entry.
+    evaluate_gradients_to(xi, flat.data());
+    for (std::size_t i = 0; i < size(); ++i) {  // Repack each consecutive triple into a Gradient.
+        gradients[i][0] = flat[i * 3u + 0u];
+        gradients[i][1] = flat[i * 3u + 1u];
+        gradients[i][2] = flat[i * 3u + 2u];
     }
 }
 
-template <std::size_t Q>
-inline void write_quad_order1_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][2],
-    const Real ly[4][2],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = (i == 0u) ? Real(-0.5) : Real(0.5);
-    const Real yd = (j == 0u) ? Real(-0.5) : Real(0.5);
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = Real(0);
-    hess_row[4u * output_stride + Q] = Real(0);
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order1_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<1>();
-
-    Real lx[4][2];
-    Real ly[4][2];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        lx[q][0] = (Real(1) - xi[0]) * Real(0.5);
-        lx[q][1] = (Real(1) + xi[0]) * Real(0.5);
-        ly[q][0] = (Real(1) - xi[1]) * Real(0.5);
-        ly[q][1] = (Real(1) + xi[1]) * Real(0.5);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order1_all_q4<0u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<1u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<2u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
-        write_quad_order1_all_q4<3u>(
-            output_stride, i, j, lx, ly, value_row, grad_row, hess_row);
+void LagrangeBasis::evaluate_hessians(const Vec3& xi,  // Vector overload: fills `hessians` with size() entries at xi.
+                                      std::vector<Hessian>& hessians) const {
+    hessians.resize(size());
+    std::vector<Real> flat(size() * 9u, Real(0));  // Zero-initialized scratch buffer, nine Reals per entry.
+    evaluate_hessians_to(xi, flat.data());
+    for (std::size_t i = 0; i < size(); ++i) {  // Convert each block of nine Reals via load_hessian.
+        hessians[i] = load_hessian(flat.data() + i * 9u);
     }
 }
 
-void evaluate_quad_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-
-        row0[q] = x0 * y0;
-        row1[q] = x1 * y0;
-        row2[q] = x1 * y1;
-        row3[q] = x0 * y1;
-        row4[q] = x2 * y0;
-        row5[q] = x1 * y2;
-        row6[q] = x2 * y1;
-        row7[q] = x0 * y2;
-        row8[q] = x2 * y2;
+void LagrangeBasis::evaluate_all(const Vec3& xi,  // Vector overload: values, gradients and Hessians at xi in one call.
+                                 std::vector<Real>& values,
+                                 std::vector<Gradient>& gradients,
+                                 std::vector<Hessian>& hessians) const {
+    values.resize(size());  // All three outputs are sized to size() entries.
+    gradients.resize(size());
+    hessians.resize(size());
+    std::vector<Real> flat_g(size() * 3u, Real(0));  // Scratch: three Reals per entry for gradients.
+    std::vector<Real> flat_h(size() * 9u, Real(0));  // Scratch: nine Reals per entry for Hessians.
+    evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());  // Values land in `values` directly.
+    for (std::size_t i = 0; i < size(); ++i) {  // Repack the flat scratch buffers into the typed outputs.
+        gradients[i][0] = flat_g[i * 3u + 0u];
+        gradients[i][1] = flat_g[i * 3u + 1u];
+        gradients[i][2] = flat_g[i * 3u + 2u];
+        hessians[i] = load_hessian(flat_h.data() + i * 9u);
     }
 }
 
-inline void write_quad_order2_gradient_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real dx,
-    Real dy) {
-    row[0u * output_stride + q] = dx;
-    row[1u * output_stride + q] = dy;
-    row[2u * output_stride + q] = Real(0);
+void LagrangeBasis::evaluate_values_to(const Vec3& xi,  // Values-only entry point writing to a raw buffer.
+                                       Real* SVMP_RESTRICT values_out) const {
+    evaluate_all_to(xi, values_out, nullptr, nullptr);  // Null pointers switch off the gradient and Hessian outputs.
 }
 
-void evaluate_quad_order2_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    if (points.size() == 4u) {
-        Real xv[4][3];
-        Real yv[4][3];
-        Real xd[4][3];
-        Real yd[4][3];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const auto& xi = points[q];
-            const Real x = xi[0];
-            const Real y = xi[1];
-            xv[q][0] = x * (x - Real(1)) * Real(0.5);
-            xv[q][1] = x * (x + Real(1)) * Real(0.5);
-            xv[q][2] = Real(1) - x * x;
-            yv[q][0] = y * (y - Real(1)) * Real(0.5);
-            yv[q][1] = y * (y + Real(1)) * Real(0.5);
-            yv[q][2] = Real(1) - y * y;
-            xd[q][0] = x - Real(0.5);
-            xd[q][1] = x + Real(0.5);
-            xd[q][2] = Real(-2) * x;
-            yd[q][0] = y - Real(0.5);
-            yd[q][1] = y + Real(0.5);
-            yd[q][2] = Real(-2) * y;
-        }
-
-        auto write_node = [&](std::size_t node, std::size_t i, std::size_t j) {
-            Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
-            row[0u] = xd[0][i] * yv[0][j];
-            row[1u] = xd[1][i] * yv[1][j];
-            row[2u] = xd[2][i] * yv[2][j];
-            row[3u] = xd[3][i] * yv[3][j];
-            row[output_stride + 0u] = xv[0][i] * yd[0][j];
-            row[output_stride + 1u] = xv[1][i] * yd[1][j];
-            row[output_stride + 2u] = xv[2][i] * yd[2][j];
-            row[output_stride + 3u] = xv[3][i] * yd[3][j];
-            row[2u * output_stride + 0u] = Real(0);
-            row[2u * output_stride + 1u] = Real(0);
-            row[2u * output_stride + 2u] = Real(0);
-            row[2u * output_stride + 3u] = Real(0);
-        };
-
-        write_node(0u, 0u, 0u);
-        write_node(1u, 1u, 0u);
-        write_node(2u, 1u, 1u);
-        write_node(3u, 0u, 1u);
-        write_node(4u, 2u, 0u);
-        write_node(5u, 1u, 2u);
-        write_node(6u, 2u, 1u);
-        write_node(7u, 0u, 2u);
-        write_node(8u, 2u, 2u);
-        return;
-    }
-
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real dx0 = x - Real(0.5);
-        const Real dx1 = x + Real(0.5);
-        const Real dx2 = Real(-2) * x;
-        const Real dy0 = y - Real(0.5);
-        const Real dy1 = y + Real(0.5);
-        const Real dy2 = Real(-2) * y;
+void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,  // Gradients-only entry point writing to a raw buffer.
+                                          Real* SVMP_RESTRICT gradients_out) const {
+    evaluate_all_to(xi, nullptr, gradients_out, nullptr);  // Null pointers switch off the value and Hessian outputs.
+}
 
-        write_quad_order2_gradient_q(row0, output_stride, q, dx0 * y0, x0 * dy0);
-        write_quad_order2_gradient_q(row1, output_stride, q, dx1 * y0, x1 * dy0);
-        write_quad_order2_gradient_q(row2, output_stride, q, dx1 * y1, x1 * dy1);
-        write_quad_order2_gradient_q(row3, output_stride, q, dx0 * y1, x0 * dy1);
-        write_quad_order2_gradient_q(row4, output_stride, q, dx2 * y0, x2 * dy0);
-        write_quad_order2_gradient_q(row5, output_stride, q, dx1 * y2, x1 * dy2);
-        write_quad_order2_gradient_q(row6, output_stride, q, dx2 * y1, x2 * dy1);
-        write_quad_order2_gradient_q(row7, output_stride, q, dx0 * y2, x0 * dy2);
-        write_quad_order2_gradient_q(row8, output_stride, q, dx2 * y2, x2 * dy2);
-    }
-}
-
-inline void write_quad_order2_hessian_q(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    std::size_t q,
-    Real hxx,
-    Real hxy,
-    Real hyy) {
-    row[0u * output_stride + q] = hxx;
-    row[1u * output_stride + q] = hxy;
-    row[2u * output_stride + q] = Real(0);
-    row[3u * output_stride + q] = hxy;
-    row[4u * output_stride + q] = hyy;
-    row[5u * output_stride + q] = Real(0);
-    row[6u * output_stride + q] = Real(0);
-    row[7u * output_stride + q] = Real(0);
-    row[8u * output_stride + q] = Real(0);
-}
-
-void evaluate_quad_order2_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-    Real* row4 = hessians_out + 4u * 9u * output_stride;
-    Real* row5 = hessians_out + 5u * 9u * output_stride;
-    Real* row6 = hessians_out + 6u * 9u * output_stride;
-    Real* row7 = hessians_out + 7u * 9u * output_stride;
-    Real* row8 = hessians_out + 8u * 9u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real dx0 = x - Real(0.5);
-        const Real dx1 = x + Real(0.5);
-        const Real dx2 = Real(-2) * x;
-        const Real dy0 = y - Real(0.5);
-        const Real dy1 = y + Real(0.5);
-        const Real dy2 = Real(-2) * y;
-
-        write_quad_order2_hessian_q(row0, output_stride, q, y0, dx0 * dy0, x0);
-        write_quad_order2_hessian_q(row1, output_stride, q, y0, dx1 * dy0, x1);
-        write_quad_order2_hessian_q(row2, output_stride, q, y1, dx1 * dy1, x1);
-        write_quad_order2_hessian_q(row3, output_stride, q, y1, dx0 * dy1, x0);
-        write_quad_order2_hessian_q(row4, output_stride, q, Real(-2) * y0, dx2 * dy0, x2);
-        write_quad_order2_hessian_q(row5, output_stride, q, y2, dx1 * dy2, Real(-2) * x1);
-        write_quad_order2_hessian_q(row6, output_stride, q, Real(-2) * y1, dx2 * dy1, x2);
-        write_quad_order2_hessian_q(row7, output_stride, q, y2, dx0 * dy2, Real(-2) * x0);
-        write_quad_order2_hessian_q(row8, output_stride, q, Real(-2) * y2, dx2 * dy2, Real(-2) * x2);
-    }
-}
-
-inline void fill_order3_axis_values(Real x, Real* SVMP_RESTRICT values) {
-    const Real x2 = x * x;
-    values[0] = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
-    values[1] = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
-    values[2] = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
-    values[3] = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
-}
-
-inline void fill_order3_axis_value_scalars(Real x,
-                                           Real& v0,
-                                           Real& v1,
-                                           Real& v2,
-                                           Real& v3) {
-    const Real x2 = x * x;
-    v0 = Real(-9.0 / 16.0) * (x - Real(1)) * (x2 - Real(1.0 / 9.0));
-    v1 = Real( 9.0 / 16.0) * (x + Real(1)) * (x2 - Real(1.0 / 9.0));
-    v2 = Real(27.0 / 16.0) * (x2 - Real(1)) * (x - Real(1.0 / 3.0));
-    v3 = Real(-27.0 / 16.0) * (x2 - Real(1)) * (x + Real(1.0 / 3.0));
-}
-
-void evaluate_line_order1_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const Real x = points[q][0];
-        row0[q] = (Real(1) - x) * Real(0.5);
-        row1[q] = (Real(1) + x) * Real(0.5);
-    }
-}
-
-void evaluate_line_order2_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const Real x = points[q][0];
-        row0[q] = x * (x - Real(1)) * Real(0.5);
-        row1[q] = x * (x + Real(1)) * Real(0.5);
-        row2[q] = Real(1) - x * x;
-    }
-}
-
-void evaluate_line_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        fill_order3_axis_values(points[q][0], values);
-        row0[q] = values[0];
-        row1[q] = values[1];
-        row2[q] = values[2];
-        row3[q] = values[3];
-    }
-}
-
-inline void fill_order3_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first);
-
-inline void fill_order3_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second);
-
-inline void write_line_gradient_q4_row(Real* SVMP_RESTRICT row,
-                                       std::size_t output_stride,
-                                       Real g0,
-                                       Real g1,
-                                       Real g2,
-                                       Real g3) {
-    row[0] = g0;
-    row[1] = g1;
-    row[2] = g2;
-    row[3] = g3;
-    row[output_stride + 0u] = Real(0);
-    row[output_stride + 1u] = Real(0);
-    row[output_stride + 2u] = Real(0);
-    row[output_stride + 3u] = Real(0);
-    row[2u * output_stride + 0u] = Real(0);
-    row[2u * output_stride + 1u] = Real(0);
-    row[2u * output_stride + 2u] = Real(0);
-    row[2u * output_stride + 3u] = Real(0);
-}
-
-inline void write_line_hessian_q4_row(Real* SVMP_RESTRICT row,
-                                      std::size_t output_stride,
-                                      Real h0,
-                                      Real h1,
-                                      Real h2,
-                                      Real h3) {
-    row[0] = h0;
-    row[1] = h1;
-    row[2] = h2;
-    row[3] = h3;
-    for (std::size_t component = 1u; component < 9u; ++component) {
-        Real* slot = row + component * output_stride;
-        slot[0] = Real(0);
-        slot[1] = Real(0);
-        slot[2] = Real(0);
-        slot[3] = Real(0);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_gradients_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
-                               output_stride,
-                               Real(-0.5), Real(-0.5), Real(-0.5), Real(-0.5));
-    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
-                               output_stride,
-                               Real(0.5), Real(0.5), Real(0.5), Real(0.5));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
-                              output_stride, Real(0), Real(0), Real(0), Real(0));
-    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
-                              output_stride, Real(0), Real(0), Real(0), Real(0));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order1_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_line_order1_values_q4(points, output_stride, values_out);
-    evaluate_line_order1_gradients_q4(output_stride, gradients_out);
-    evaluate_line_order1_hessians_q4(output_stride, hessians_out);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    const Real x0 = points[0][0];
-    const Real x1 = points[1][0];
-    const Real x2 = points[2][0];
-    const Real x3 = points[3][0];
-    write_line_gradient_q4_row(gradients_out + 0u * 3u * output_stride,
-                               output_stride,
-                               x0 - Real(0.5), x1 - Real(0.5),
-                               x2 - Real(0.5), x3 - Real(0.5));
-    write_line_gradient_q4_row(gradients_out + 1u * 3u * output_stride,
-                               output_stride,
-                               x0 + Real(0.5), x1 + Real(0.5),
-                               x2 + Real(0.5), x3 + Real(0.5));
-    write_line_gradient_q4_row(gradients_out + 2u * 3u * output_stride,
-                               output_stride,
-                               Real(-2) * x0, Real(-2) * x1,
-                               Real(-2) * x2, Real(-2) * x3);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_hessians_q4(
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    write_line_hessian_q4_row(hessians_out + 0u * 9u * output_stride,
-                              output_stride, Real(1), Real(1), Real(1), Real(1));
-    write_line_hessian_q4_row(hessians_out + 1u * 9u * output_stride,
-                              output_stride, Real(1), Real(1), Real(1), Real(1));
-    write_line_hessian_q4_row(hessians_out + 2u * 9u * output_stride,
-                              output_stride, Real(-2), Real(-2), Real(-2), Real(-2));
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order2_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_line_order2_values_q4(points, output_stride, values_out);
-    evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
-    evaluate_line_order2_hessians_q4(output_stride, hessians_out);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real first[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        fill_order3_axis_values_first(points[q][0], values, first[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][node], first[1][node],
-                                   first[2][node], first[3][node]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real second[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real values[4];
-        Real first[4];
-        fill_order3_axis_values_first_second(points[q][0], values, first, second[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][node], second[1][node],
-                                  second[2][node], second[3][node]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_line_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][4];
-    Real first[4][4];
-    Real second[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_order3_axis_values_first_second(points[q][0], values[q], first[q], second[q]);
-    }
-    for (std::size_t node = 0; node < 4u; ++node) {
-        Real* value_row = values_out + node * output_stride;
-        value_row[0] = values[0][node];
-        value_row[1] = values[1][node];
-        value_row[2] = values[2][node];
-        value_row[3] = values[3][node];
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][node], first[1][node],
-                                   first[2][node], first[3][node]);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][node], second[1][node],
-                                  second[2][node], second[3][node]);
-    }
-}
-
-inline void fill_order3_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    fill_order3_axis_values(x, values);
-    const Real x2 = x * x;
-    first[0] = Real(-9.0 / 16.0) * (Real(3) * x2 - Real(2) * x - Real(1.0 / 9.0));
-    first[1] = Real( 9.0 / 16.0) * (Real(3) * x2 + Real(2) * x - Real(1.0 / 9.0));
-    first[2] = Real(27.0 / 16.0) * (Real(3) * x2 - Real(2.0 / 3.0) * x - Real(1));
-    first[3] = Real(-27.0 / 16.0) * (Real(3) * x2 + Real(2.0 / 3.0) * x - Real(1));
-}
-
-inline void fill_order3_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second) {
-    fill_order3_axis_values_first(x, values, first);
-    second[0] = Real(-9.0 / 16.0) * (Real(6) * x - Real(2));
-    second[1] = Real( 9.0 / 16.0) * (Real(6) * x + Real(2));
-    second[2] = Real(27.0 / 16.0) * (Real(6) * x - Real(2.0 / 3.0));
-    second[3] = Real(-27.0 / 16.0) * (Real(6) * x + Real(2.0 / 3.0));
-}
-
-inline void write_quad_order3_value_row_q4(Real* SVMP_RESTRICT row,
-                                           const Real lx[4][4],
-                                           const Real ly[4][4],
-                                           std::size_t i,
-                                           std::size_t j) {
-    row[0] = lx[0][i] * ly[0][j];
-    row[1] = lx[1][i] * ly[1][j];
-    row[2] = lx[2][i] * ly[2][j];
-    row[3] = lx[3][i] * ly[3][j];
-}
-
-void evaluate_quad_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    if (output_stride == 4u) {
-        Real* row0 = values_out + 0u * 4u;
-        Real* row1 = values_out + 1u * 4u;
-        Real* row2 = values_out + 2u * 4u;
-        Real* row3 = values_out + 3u * 4u;
-        Real* row4 = values_out + 4u * 4u;
-        Real* row5 = values_out + 5u * 4u;
-        Real* row6 = values_out + 6u * 4u;
-        Real* row7 = values_out + 7u * 4u;
-        Real* row8 = values_out + 8u * 4u;
-        Real* row9 = values_out + 9u * 4u;
-        Real* row10 = values_out + 10u * 4u;
-        Real* row11 = values_out + 11u * 4u;
-        Real* row12 = values_out + 12u * 4u;
-        Real* row13 = values_out + 13u * 4u;
-        Real* row14 = values_out + 14u * 4u;
-        Real* row15 = values_out + 15u * 4u;
-
-        auto write_q = [&](std::size_t q) {
-            const auto& xi = points[q];
-            Real x0;
-            Real x1;
-            Real x2;
-            Real x3;
-            Real y0;
-            Real y1;
-            Real y2;
-            Real y3;
-            fill_order3_axis_value_scalars(xi[0], x0, x1, x2, x3);
-            fill_order3_axis_value_scalars(xi[1], y0, y1, y2, y3);
-            row0[q] = x0 * y0;
-            row1[q] = x1 * y0;
-            row2[q] = x1 * y1;
-            row3[q] = x0 * y1;
-            row4[q] = x2 * y0;
-            row5[q] = x3 * y0;
-            row6[q] = x1 * y2;
-            row7[q] = x1 * y3;
-            row8[q] = x3 * y1;
-            row9[q] = x2 * y1;
-            row10[q] = x0 * y3;
-            row11[q] = x0 * y2;
-            row12[q] = x2 * y2;
-            row13[q] = x3 * y2;
-            row14[q] = x2 * y3;
-            row15[q] = x3 * y3;
-        };
-
-        write_q(0u);
-        write_q(1u);
-        write_q(2u);
-        write_q(3u);
-        return;
-    }
-
-    Real lx[4][4];
-    Real ly[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values(xi[0], lx[q]);
-        fill_order3_axis_values(xi[1], ly[q]);
-    }
-
-    write_quad_order3_value_row_q4(values_out + 0u * output_stride, lx, ly, 0u, 0u);
-    write_quad_order3_value_row_q4(values_out + 1u * output_stride, lx, ly, 1u, 0u);
-    write_quad_order3_value_row_q4(values_out + 2u * output_stride, lx, ly, 1u, 1u);
-    write_quad_order3_value_row_q4(values_out + 3u * output_stride, lx, ly, 0u, 1u);
-    write_quad_order3_value_row_q4(values_out + 4u * output_stride, lx, ly, 2u, 0u);
-    write_quad_order3_value_row_q4(values_out + 5u * output_stride, lx, ly, 3u, 0u);
-    write_quad_order3_value_row_q4(values_out + 6u * output_stride, lx, ly, 1u, 2u);
-    write_quad_order3_value_row_q4(values_out + 7u * output_stride, lx, ly, 1u, 3u);
-    write_quad_order3_value_row_q4(values_out + 8u * output_stride, lx, ly, 3u, 1u);
-    write_quad_order3_value_row_q4(values_out + 9u * output_stride, lx, ly, 2u, 1u);
-    write_quad_order3_value_row_q4(values_out + 10u * output_stride, lx, ly, 0u, 3u);
-    write_quad_order3_value_row_q4(values_out + 11u * output_stride, lx, ly, 0u, 2u);
-    write_quad_order3_value_row_q4(values_out + 12u * output_stride, lx, ly, 2u, 2u);
-    write_quad_order3_value_row_q4(values_out + 13u * output_stride, lx, ly, 3u, 2u);
-    write_quad_order3_value_row_q4(values_out + 14u * output_stride, lx, ly, 2u, 3u);
-    write_quad_order3_value_row_q4(values_out + 15u * output_stride, lx, ly, 3u, 3u);
-}
-
-void evaluate_quad_order3_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    if (points.size() == 4u) {
-        evaluate_quad_order3_values_q4(points, output_stride, values_out);
-        return;
-    }
-
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        fill_order3_axis_values(xi[0], lx);
-        fill_order3_axis_values(xi[1], ly);
-        row0[q] = lx[0] * ly[0];
-        row1[q] = lx[1] * ly[0];
-        row2[q] = lx[1] * ly[1];
-        row3[q] = lx[0] * ly[1];
-        row4[q] = lx[2] * ly[0];
-        row5[q] = lx[3] * ly[0];
-        row6[q] = lx[1] * ly[2];
-        row7[q] = lx[1] * ly[3];
-        row8[q] = lx[3] * ly[1];
-        row9[q] = lx[2] * ly[1];
-        row10[q] = lx[0] * ly[3];
-        row11[q] = lx[0] * ly[2];
-        row12[q] = lx[2] * ly[2];
-        row13[q] = lx[3] * ly[2];
-        row14[q] = lx[2] * ly[3];
-        row15[q] = lx[3] * ly[3];
-    }
-}
-
-template <std::size_t N>
-inline void write_quad_gradient_row_q4(
-    Real* SVMP_RESTRICT row,
-    std::size_t output_stride,
-    const Real (&lx)[4][N],
-    const Real (&ly)[4][N],
-    const Real (&dx)[4][N],
-    const Real (&dy)[4][N],
-    std::size_t i,
-    std::size_t j) {
-    row[0u] = dx[0][i] * ly[0][j];
-    row[1u] = dx[1][i] * ly[1][j];
-    row[2u] = dx[2][i] * ly[2][j];
-    row[3u] = dx[3][i] * ly[3][j];
-    row[output_stride + 0u] = lx[0][i] * dy[0][j];
-    row[output_stride + 1u] = lx[1][i] * dy[1][j];
-    row[output_stride + 2u] = lx[2][i] * dy[2][j];
-    row[output_stride + 3u] = lx[3][i] * dy[3][j];
-    row[2u * output_stride + 0u] = Real(0);
-    row[2u * output_stride + 1u] = Real(0);
-    row[2u * output_stride + 2u] = Real(0);
-    row[2u * output_stride + 3u] = Real(0);
-}
-
-inline void fill_order4_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    const Real r = (x + Real(1)) * Real(2);
-    const Real r2 = r * r;
-    const Real r3 = r2 * r;
-    const Real f0 = r;
-    const Real f1 = r - Real(1);
-    const Real f2 = r - Real(2);
-    const Real f3 = r - Real(3);
-    const Real f4 = r - Real(4);
-    const Real f01 = f0 * f1;
-    const Real f12 = f1 * f2;
-    const Real f23 = f2 * f3;
-    const Real f34 = f3 * f4;
-
-    values[0] = (f12 * f34) / Real(24);
-    values[1] = (f01 * f23) / Real(24);
-    values[2] = -(f0 * f2 * f34) / Real(6);
-    values[3] = (f01 * f34) / Real(4);
-    values[4] = -(f01 * f2 * f4) / Real(6);
-
-    first[0] = (Real(4) * r3 - Real(30) * r2 + Real(70) * r - Real(50)) / Real(12);
-    first[1] = (Real(4) * r3 - Real(18) * r2 + Real(22) * r - Real(6)) / Real(12);
-    first[2] = (-Real(4) * r3 + Real(27) * r2 - Real(52) * r + Real(24)) / Real(3);
-    first[3] = Real(2) * r3 - Real(12) * r2 + Real(19) * r - Real(6);
-    first[4] = (-Real(4) * r3 + Real(21) * r2 - Real(28) * r + Real(8)) / Real(3);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_quad_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        write_quad_gradient_row_q4(
-            gradients_out + node * 3u * output_stride,
-            output_stride,
-            lx,
-            ly,
-            dx,
-            dy,
-            axes[0],
-            axes[1]);
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_quad_order4_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<4>();
-
-    Real lx[4][5];
-    Real ly[4][5];
-    Real dx[4][5];
-    Real dy[4][5];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order4_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order4_axis_values_first(xi[1], ly[q], dy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        write_quad_gradient_row_q4(
-            gradients_out + node * 3u * output_stride,
-            output_stride,
-            lx,
-            ly,
-            dx,
-            dy,
-            axes[0],
-            axes[1]);
-    }
-}
-
-void evaluate_quad_order3_gradients_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    if (points.size() == 4u) {
-        evaluate_quad_order3_gradients_q4(points, output_stride, gradients_out);
-        return;
-    }
-
-    Real* row0 = gradients_out + 0u * 3u * output_stride;
-    Real* row1 = gradients_out + 1u * 3u * output_stride;
-    Real* row2 = gradients_out + 2u * 3u * output_stride;
-    Real* row3 = gradients_out + 3u * 3u * output_stride;
-    Real* row4 = gradients_out + 4u * 3u * output_stride;
-    Real* row5 = gradients_out + 5u * 3u * output_stride;
-    Real* row6 = gradients_out + 6u * 3u * output_stride;
-    Real* row7 = gradients_out + 7u * 3u * output_stride;
-    Real* row8 = gradients_out + 8u * 3u * output_stride;
-    Real* row9 = gradients_out + 9u * 3u * output_stride;
-    Real* row10 = gradients_out + 10u * 3u * output_stride;
-    Real* row11 = gradients_out + 11u * 3u * output_stride;
-    Real* row12 = gradients_out + 12u * 3u * output_stride;
-    Real* row13 = gradients_out + 13u * 3u * output_stride;
-    Real* row14 = gradients_out + 14u * 3u * output_stride;
-    Real* row15 = gradients_out + 15u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        Real dx[4];
-        Real dy[4];
-        fill_order3_axis_values_first(xi[0], lx, dx);
-        fill_order3_axis_values_first(xi[1], ly, dy);
-        write_quad_order2_gradient_q(row0, output_stride, q, dx[0] * ly[0], lx[0] * dy[0]);
-        write_quad_order2_gradient_q(row1, output_stride, q, dx[1] * ly[0], lx[1] * dy[0]);
-        write_quad_order2_gradient_q(row2, output_stride, q, dx[1] * ly[1], lx[1] * dy[1]);
-        write_quad_order2_gradient_q(row3, output_stride, q, dx[0] * ly[1], lx[0] * dy[1]);
-        write_quad_order2_gradient_q(row4, output_stride, q, dx[2] * ly[0], lx[2] * dy[0]);
-        write_quad_order2_gradient_q(row5, output_stride, q, dx[3] * ly[0], lx[3] * dy[0]);
-        write_quad_order2_gradient_q(row6, output_stride, q, dx[1] * ly[2], lx[1] * dy[2]);
-        write_quad_order2_gradient_q(row7, output_stride, q, dx[1] * ly[3], lx[1] * dy[3]);
-        write_quad_order2_gradient_q(row8, output_stride, q, dx[3] * ly[1], lx[3] * dy[1]);
-        write_quad_order2_gradient_q(row9, output_stride, q, dx[2] * ly[1], lx[2] * dy[1]);
-        write_quad_order2_gradient_q(row10, output_stride, q, dx[0] * ly[3], lx[0] * dy[3]);
-        write_quad_order2_gradient_q(row11, output_stride, q, dx[0] * ly[2], lx[0] * dy[2]);
-        write_quad_order2_gradient_q(row12, output_stride, q, dx[2] * ly[2], lx[2] * dy[2]);
-        write_quad_order2_gradient_q(row13, output_stride, q, dx[3] * ly[2], lx[3] * dy[2]);
-        write_quad_order2_gradient_q(row14, output_stride, q, dx[2] * ly[3], lx[2] * dy[3]);
-        write_quad_order2_gradient_q(row15, output_stride, q, dx[3] * ly[3], lx[3] * dy[3]);
-    }
-}
-
-void evaluate_quad_order3_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real* row0 = hessians_out + 0u * 9u * output_stride;
-    Real* row1 = hessians_out + 1u * 9u * output_stride;
-    Real* row2 = hessians_out + 2u * 9u * output_stride;
-    Real* row3 = hessians_out + 3u * 9u * output_stride;
-    Real* row4 = hessians_out + 4u * 9u * output_stride;
-    Real* row5 = hessians_out + 5u * 9u * output_stride;
-    Real* row6 = hessians_out + 6u * 9u * output_stride;
-    Real* row7 = hessians_out + 7u * 9u * output_stride;
-    Real* row8 = hessians_out + 8u * 9u * output_stride;
-    Real* row9 = hessians_out + 9u * 9u * output_stride;
-    Real* row10 = hessians_out + 10u * 9u * output_stride;
-    Real* row11 = hessians_out + 11u * 9u * output_stride;
-    Real* row12 = hessians_out + 12u * 9u * output_stride;
-    Real* row13 = hessians_out + 13u * 9u * output_stride;
-    Real* row14 = hessians_out + 14u * 9u * output_stride;
-    Real* row15 = hessians_out + 15u * 9u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        Real lx[4];
-        Real ly[4];
-        Real dx[4];
-        Real dy[4];
-        Real hx[4];
-        Real hy[4];
-        fill_order3_axis_values_first_second(xi[0], lx, dx, hx);
-        fill_order3_axis_values_first_second(xi[1], ly, dy, hy);
-        write_quad_order2_hessian_q(row0, output_stride, q, hx[0] * ly[0], dx[0] * dy[0], lx[0] * hy[0]);
-        write_quad_order2_hessian_q(row1, output_stride, q, hx[1] * ly[0], dx[1] * dy[0], lx[1] * hy[0]);
-        write_quad_order2_hessian_q(row2, output_stride, q, hx[1] * ly[1], dx[1] * dy[1], lx[1] * hy[1]);
-        write_quad_order2_hessian_q(row3, output_stride, q, hx[0] * ly[1], dx[0] * dy[1], lx[0] * hy[1]);
-        write_quad_order2_hessian_q(row4, output_stride, q, hx[2] * ly[0], dx[2] * dy[0], lx[2] * hy[0]);
-        write_quad_order2_hessian_q(row5, output_stride, q, hx[3] * ly[0], dx[3] * dy[0], lx[3] * hy[0]);
-        write_quad_order2_hessian_q(row6, output_stride, q, hx[1] * ly[2], dx[1] * dy[2], lx[1] * hy[2]);
-        write_quad_order2_hessian_q(row7, output_stride, q, hx[1] * ly[3], dx[1] * dy[3], lx[1] * hy[3]);
-        write_quad_order2_hessian_q(row8, output_stride, q, hx[3] * ly[1], dx[3] * dy[1], lx[3] * hy[1]);
-        write_quad_order2_hessian_q(row9, output_stride, q, hx[2] * ly[1], dx[2] * dy[1], lx[2] * hy[1]);
-        write_quad_order2_hessian_q(row10, output_stride, q, hx[0] * ly[3], dx[0] * dy[3], lx[0] * hy[3]);
-        write_quad_order2_hessian_q(row11, output_stride, q, hx[0] * ly[2], dx[0] * dy[2], lx[0] * hy[2]);
-        write_quad_order2_hessian_q(row12, output_stride, q, hx[2] * ly[2], dx[2] * dy[2], lx[2] * hy[2]);
-        write_quad_order2_hessian_q(row13, output_stride, q, hx[3] * ly[2], dx[3] * dy[2], lx[3] * hy[2]);
-        write_quad_order2_hessian_q(row14, output_stride, q, hx[2] * ly[3], dx[2] * dy[3], lx[2] * hy[3]);
-        write_quad_order2_hessian_q(row15, output_stride, q, hx[3] * ly[3], dx[3] * dy[3], lx[3] * hy[3]);
-    }
-}
-
-template <std::size_t Q>
-inline void write_quad_order3_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][4],
-    const Real ly[4][4],
-    const Real dx[4][4],
-    const Real dy[4][4],
-    const Real hx[4][4],
-    const Real hy[4][4],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real hx[4][4];
-    Real hy[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order3_all_q4<0u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<1u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<2u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order3_all_q4<3u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order3_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values(xi[0], lx[q]);
-        fill_order3_axis_values(xi[1], ly[q]);
-        fill_order3_axis_values(xi[2], lz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = values_out + node * output_stride;
-        row[0] = lx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = lx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = lx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = lx[3][i] * ly[3][j] * lz[3][k];
-    }
-}
-
-void evaluate_hex_order3_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real dz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order3_axis_values_first(xi[1], ly[q], dy[q]);
-        fill_order3_axis_values_first(xi[2], lz[q], dz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
-        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
-        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
-        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
-        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
-        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
-        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
-        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
-        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
-    }
-}
-
-template <std::size_t Q, bool WriteValue, bool WriteGradient>
-inline void write_hex_order3_q4_hessian_outputs(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    std::size_t k,
-    const Real lx[4][4],
-    const Real ly[4][4],
-    const Real lz[4][4],
-    const Real dx[4][4],
-    const Real dy[4][4],
-    const Real dz[4][4],
-    const Real hx[4][4],
-    const Real hy[4][4],
-    const Real hz[4][4],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real zv = lz[Q][k];
-    const Real yz = yv * zv;
-
-    if constexpr (WriteValue) {
-        value_row[Q] = xv * yz;
-    }
-
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real zd = dz[Q][k];
-    const Real yd_z = yd * zv;
-    const Real yv_zd = yv * zd;
-
-    if constexpr (WriteGradient) {
-        grad_row[0u * output_stride + Q] = xd * yz;
-        grad_row[1u * output_stride + Q] = xv * yd_z;
-        grad_row[2u * output_stride + Q] = xv * yv_zd;
-    }
-
-    const Real hxy = xd * yd_z;
-    const Real hxz = xd * yv_zd;
-    const Real hyz = xv * yd * zd;
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
-    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <bool WriteValue, bool WriteGradient>
-void evaluate_hex_order3_q4_hessian_outputs(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<3>();
-
-    Real lx[4][4];
-    Real ly[4][4];
-    Real lz[4][4];
-    Real dx[4][4];
-    Real dy[4][4];
-    Real dz[4][4];
-    Real hx[4][4];
-    Real hy[4][4];
-    Real hz[4][4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order3_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order3_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-        fill_order3_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_hex_order3_q4_hessian_outputs<0u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<1u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<2u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-        write_hex_order3_q4_hessian_outputs<3u, WriteValue, WriteGradient>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz,
-            value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order3_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_hex_order3_q4_hessian_outputs<false, false>(
-        points, output_stride, nullptr, nullptr, hessians_out);
-}
-
-void evaluate_hex_order3_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    evaluate_hex_order3_q4_hessian_outputs<true, true>(
-        points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-void evaluate_hex_order2_values_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    Real* row5 = values_out + 5u * output_stride;
-    Real* row6 = values_out + 6u * output_stride;
-    Real* row7 = values_out + 7u * output_stride;
-    Real* row8 = values_out + 8u * output_stride;
-    Real* row9 = values_out + 9u * output_stride;
-    Real* row10 = values_out + 10u * output_stride;
-    Real* row11 = values_out + 11u * output_stride;
-    Real* row12 = values_out + 12u * output_stride;
-    Real* row13 = values_out + 13u * output_stride;
-    Real* row14 = values_out + 14u * output_stride;
-    Real* row15 = values_out + 15u * output_stride;
-    Real* row16 = values_out + 16u * output_stride;
-    Real* row17 = values_out + 17u * output_stride;
-    Real* row18 = values_out + 18u * output_stride;
-    Real* row19 = values_out + 19u * output_stride;
-    Real* row20 = values_out + 20u * output_stride;
-    Real* row21 = values_out + 21u * output_stride;
-    Real* row22 = values_out + 22u * output_stride;
-    Real* row23 = values_out + 23u * output_stride;
-    Real* row24 = values_out + 24u * output_stride;
-    Real* row25 = values_out + 25u * output_stride;
-    Real* row26 = values_out + 26u * output_stride;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
-        const Real x0 = x * (x - Real(1)) * Real(0.5);
-        const Real x1 = x * (x + Real(1)) * Real(0.5);
-        const Real x2 = Real(1) - x * x;
-        const Real y0 = y * (y - Real(1)) * Real(0.5);
-        const Real y1 = y * (y + Real(1)) * Real(0.5);
-        const Real y2 = Real(1) - y * y;
-        const Real z0 = z * (z - Real(1)) * Real(0.5);
-        const Real z1 = z * (z + Real(1)) * Real(0.5);
-        const Real z2 = Real(1) - z * z;
-        const Real x0y0 = x0 * y0;
-        const Real x1y0 = x1 * y0;
-        const Real x1y1 = x1 * y1;
-        const Real x0y1 = x0 * y1;
-        const Real x2y0 = x2 * y0;
-        const Real x1y2 = x1 * y2;
-        const Real x2y1 = x2 * y1;
-        const Real x0y2 = x0 * y2;
-        const Real x2y2 = x2 * y2;
-
-        row0[q] = x0y0 * z0;
-        row1[q] = x1y0 * z0;
-        row2[q] = x1y1 * z0;
-        row3[q] = x0y1 * z0;
-        row4[q] = x0y0 * z1;
-        row5[q] = x1y0 * z1;
-        row6[q] = x1y1 * z1;
-        row7[q] = x0y1 * z1;
-        row8[q] = x2y0 * z0;
-        row9[q] = x1y2 * z0;
-        row10[q] = x2y1 * z0;
-        row11[q] = x0y2 * z0;
-        row12[q] = x2y0 * z1;
-        row13[q] = x1y2 * z1;
-        row14[q] = x2y1 * z1;
-        row15[q] = x0y2 * z1;
-        row16[q] = x0y0 * z2;
-        row17[q] = x1y0 * z2;
-        row18[q] = x1y1 * z2;
-        row19[q] = x0y1 * z2;
-        row20[q] = x2y2 * z0;
-        row21[q] = x2y2 * z1;
-        row22[q] = x2y0 * z2;
-        row23[q] = x1y2 * z2;
-        row24[q] = x2y1 * z2;
-        row25[q] = x0y2 * z2;
-        row26[q] = x2y2 * z2;
-    }
-}
-
-inline void fill_order2_axis_values_first(Real x,
-                                          Real* SVMP_RESTRICT values,
-                                          Real* SVMP_RESTRICT first) {
-    values[0] = x * (x - Real(1)) * Real(0.5);
-    values[1] = x * (x + Real(1)) * Real(0.5);
-    values[2] = Real(1) - x * x;
-    first[0] = x - Real(0.5);
-    first[1] = x + Real(0.5);
-    first[2] = Real(-2) * x;
-}
-
-inline void fill_order2_axis_values_first_second(Real x,
-                                                 Real* SVMP_RESTRICT values,
-                                                 Real* SVMP_RESTRICT first,
-                                                 Real* SVMP_RESTRICT second) {
-    fill_order2_axis_values_first(x, values, first);
-    second[0] = Real(1);
-    second[1] = Real(1);
-    second[2] = Real(-2);
-}
-
-template <std::size_t Q>
-inline void write_hex_order2_hessian_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    std::size_t k,
-    const Real lx[4][3],
-    const Real ly[4][3],
-    const Real lz[4][3],
-    const Real dx[4][3],
-    const Real dy[4][3],
-    const Real dz[4][3],
-    const Real hx[4][3],
-    const Real hy[4][3],
-    const Real hz[4][3],
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real zv = lz[Q][k];
-    const Real yz = yv * zv;
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real zd = dz[Q][k];
-    const Real yd_z = yd * zv;
-    const Real yv_zd = yv * zd;
-    const Real hxy = xd * yd_z;
-    const Real hxz = xd * yv_zd;
-    const Real hyz = xv * yd * zd;
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yz;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j] * zv;
-    hess_row[8u * output_stride + Q] = xv * yv * hz[Q][k];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-void evaluate_hex_order2_hessians_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_hex_tensor_node_axes<2>();
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real lz[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real dz[4][3];
-    Real hx[4][3];
-    Real hy[4][3];
-    Real hz[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-        fill_order2_axis_values_first_second(xi[2], lz[q], dz[q], hz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_hex_order2_hessian_q4<0u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<1u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<2u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-        write_hex_order2_hessian_q4<3u>(
-            output_stride, i, j, k, lx, ly, lz, dx, dy, dz, hx, hy, hz, hess_row);
-    }
-}
-
-template <std::size_t Q>
-inline void write_quad_order2_all_q4(
-    std::size_t output_stride,
-    std::size_t i,
-    std::size_t j,
-    const Real lx[4][3],
-    const Real ly[4][3],
-    const Real dx[4][3],
-    const Real dy[4][3],
-    const Real hx[4][3],
-    const Real hy[4][3],
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT grad_row,
-    Real* SVMP_RESTRICT hess_row) {
-    const Real xv = lx[Q][i];
-    const Real yv = ly[Q][j];
-    const Real xd = dx[Q][i];
-    const Real yd = dy[Q][j];
-    const Real hxy = xd * yd;
-
-    value_row[Q] = xv * yv;
-    grad_row[0u * output_stride + Q] = xd * yv;
-    grad_row[1u * output_stride + Q] = xv * yd;
-    grad_row[2u * output_stride + Q] = Real(0);
-    hess_row[0u * output_stride + Q] = hx[Q][i] * yv;
-    hess_row[4u * output_stride + Q] = xv * hy[Q][j];
-    hess_row[8u * output_stride + Q] = Real(0);
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = Real(0);
-    hess_row[6u * output_stride + Q] = Real(0);
-    hess_row[5u * output_stride + Q] = Real(0);
-    hess_row[7u * output_stride + Q] = Real(0);
-}
-
-void evaluate_quad_order2_all_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    constexpr auto node_axes = detail::make_quad_tensor_node_axes<2>();
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real hx[4][3];
-    Real hy[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first_second(xi[0], lx[q], dx[q], hx[q]);
-        fill_order2_axis_values_first_second(xi[1], ly[q], dy[q], hy[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        Real* value_row = values_out + node * output_stride;
-        Real* grad_row = gradients_out + node * 3u * output_stride;
-        Real* hess_row = hessians_out + node * 9u * output_stride;
-        write_quad_order2_all_q4<0u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<1u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<2u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-        write_quad_order2_all_q4<3u>(
-            output_stride, i, j, lx, ly, dx, dy, hx, hy, value_row, grad_row, hess_row);
-    }
-}
-
-void evaluate_hex_order2_gradients_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr std::array<std::array<std::size_t, 3>, 27> node_axes = {{
-        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
-        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
-        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
-        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
-        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
-        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
-        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
-    }};
-
-    Real lx[4][3];
-    Real ly[4][3];
-    Real lz[4][3];
-    Real dx[4][3];
-    Real dy[4][3];
-    Real dz[4][3];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        fill_order2_axis_values_first(xi[0], lx[q], dx[q]);
-        fill_order2_axis_values_first(xi[1], ly[q], dy[q]);
-        fill_order2_axis_values_first(xi[2], lz[q], dz[q]);
-    }
-
-    for (std::size_t node = 0; node < node_axes.size(); ++node) {
-        const auto& axes = node_axes[node];
-        const std::size_t i = axes[0];
-        const std::size_t j = axes[1];
-        const std::size_t k = axes[2];
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = dx[0][i] * ly[0][j] * lz[0][k];
-        row[1] = dx[1][i] * ly[1][j] * lz[1][k];
-        row[2] = dx[2][i] * ly[2][j] * lz[2][k];
-        row[3] = dx[3][i] * ly[3][j] * lz[3][k];
-        row[output_stride + 0u] = lx[0][i] * dy[0][j] * lz[0][k];
-        row[output_stride + 1u] = lx[1][i] * dy[1][j] * lz[1][k];
-        row[output_stride + 2u] = lx[2][i] * dy[2][j] * lz[2][k];
-        row[output_stride + 3u] = lx[3][i] * dy[3][j] * lz[3][k];
-        row[2u * output_stride + 0u] = lx[0][i] * ly[0][j] * dz[0][k];
-        row[2u * output_stride + 1u] = lx[1][i] * ly[1][j] * dz[1][k];
-        row[2u * output_stride + 2u] = lx[2][i] * ly[2][j] * dz[2][k];
-        row[2u * output_stride + 3u] = lx[3][i] * ly[3][j] * dz[3][k];
-    }
-}
-
-template<typename FastBasis>
-void evaluate_constant_fast_hessians_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    std::array<Hessian, FastBasis::n_dofs> fast_hessians{};
-    FastBasis::evaluate_hessians(math::Vector<Real, 3>{}, fast_hessians);
-    for (std::size_t i = 0; i < fast_hessians.size(); ++i) {
-        const Hessian& hessian = fast_hessians[i];
-        Real* H = hessians_out + i * 9u * output_stride;
-        const Real h00 = hessian(0, 0);
-        const Real h01 = hessian(0, 1);
-        const Real h02 = hessian(0, 2);
-        const Real h10 = hessian(1, 0);
-        const Real h11 = hessian(1, 1);
-        const Real h12 = hessian(1, 2);
-        const Real h20 = hessian(2, 0);
-        const Real h21 = hessian(2, 1);
-        const Real h22 = hessian(2, 2);
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            H[0u * output_stride + q] = h00;
-            H[1u * output_stride + q] = h01;
-            H[2u * output_stride + q] = h02;
-            H[3u * output_stride + q] = h10;
-            H[4u * output_stride + q] = h11;
-            H[5u * output_stride + q] = h12;
-            H[6u * output_stride + q] = h20;
-            H[7u * output_stride + q] = h21;
-            H[8u * output_stride + q] = h22;
-        }
-    }
-}
-
-template<typename FastBasis>
-void evaluate_fast_outputs_with_constant_hessians_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        if (values_out != nullptr) {
-            std::array<Real, FastBasis::n_dofs> fast_values{};
-            FastBasis::evaluate(xi, fast_values);
-            for (std::size_t i = 0; i < fast_values.size(); ++i) {
-                values_out[i * output_stride + q] = fast_values[i];
-            }
-        }
-        if (gradients_out != nullptr) {
-            std::array<Gradient, FastBasis::n_dofs> fast_gradients{};
-            FastBasis::evaluate_gradients(xi, fast_gradients);
-            for (std::size_t i = 0; i < fast_gradients.size(); ++i) {
-                Real* g = gradients_out + i * 3u * output_stride;
-                g[0u * output_stride + q] = fast_gradients[i][0];
-                g[1u * output_stride + q] = fast_gradients[i][1];
-                g[2u * output_stride + q] = fast_gradients[i][2];
-            }
-        }
-    }
-    evaluate_constant_fast_hessians_strided<FastBasis>(
-        points.size(), output_stride, hessians_out);
-}
-
-template<int Order>
-void evaluate_wedge_fast_outputs_strided(
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 1 && Order <= 2,
-                  "wedge fast outputs rely on low-order public triangle ordering");
-    using TriFast = LagrangeTriFast<Order>;
-    constexpr std::size_t axis_size = static_cast<std::size_t>(Order + 1);
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        std::array<Real, TriFast::n_dofs> tri_values{};
-        std::array<Gradient, TriFast::n_dofs> tri_gradients{};
-        std::array<Hessian, TriFast::n_dofs> tri_hessians{};
-        std::array<Real, axis_size> z_values{};
-        std::array<Real, axis_size> z_first{};
-        std::array<Real, axis_size> z_second{};
-
-        TriFast::evaluate(xi, tri_values);
-        if (need_grad || need_hess) {
-            TriFast::evaluate_gradients(xi, tri_gradients);
-        }
-        if (need_hess) {
-            TriFast::evaluate_hessians(xi, tri_hessians);
-            detail::fill_axis_values_first_second<Order>(xi[2], z_values, z_first, z_second);
-        } else if (need_grad) {
-            detail::fill_axis_values_first<Order>(xi[2], z_values, z_first);
-        } else {
-            detail::fill_axis_values<Order>(xi[2], z_values);
-        }
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            const Real tri_v = tri_values[tri];
-            const Real zv = z_values[z];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = tri_v * zv;
-            }
-
-            if (gradients_out != nullptr) {
-                Real* g = gradients_out + node * 3u * output_stride;
-                const Gradient& tri_g = tri_gradients[tri];
-                g[0u * output_stride + q] = tri_g[0] * zv;
-                g[1u * output_stride + q] = tri_g[1] * zv;
-                g[2u * output_stride + q] = tri_v * z_first[z];
-            }
-
-            if (hessians_out != nullptr) {
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Gradient& tri_g = tri_gradients[tri];
-                const Hessian& tri_H = tri_hessians[tri];
-                const Real zd = z_first[z];
-                const Real hxz = tri_g[0] * zd;
-                const Real hxy = tri_H(0, 1) * zv;
-                const Real hyz = tri_g[1] * zd;
-                H[0u * output_stride + q] = tri_H(0, 0) * zv;
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = tri_H(1, 1) * zv;
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = tri_v * z_second[z];
-            }
-        }
-    }
-}
-
-template <int Order>
-inline void fill_triangle_simplex_product_factors(Real lambda, Real* SVMP_RESTRICT factors) {
-    const Real t = static_cast<Real>(Order) * lambda;
-    factors[0] = Real(1);
-    for (int a = 1; a <= Order; ++a) {
-        factors[a] =
-            factors[a - 1] *
-            (t - static_cast<Real>(a - 1)) /
-            static_cast<Real>(a);
-    }
-}
-
-template <int Order>
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool evaluate_wedge_values_product_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    constexpr std::size_t tri_count =
-        static_cast<std::size_t>((Order + 1) * (Order + 2) / 2);
-    if (simplex_exponents.size() != tri_count || points.size() != 4u) {
-        return false;
-    }
-
-    Real tri_values[4][tri_count];
-    std::array<Real, Order + 1> z_values[4];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        Real f0[Order + 1];
-        Real f1[Order + 1];
-        Real f2[Order + 1];
-        fill_triangle_simplex_product_factors<Order>(l0, f0);
-        fill_triangle_simplex_product_factors<Order>(l1, f1);
-        fill_triangle_simplex_product_factors<Order>(l2, f2);
-        detail::fill_axis_values<Order>(xi[2], z_values[q]);
-
-        for (std::size_t tri = 0; tri < tri_count; ++tri) {
-            const auto& e = simplex_exponents[tri];
-            tri_values[q][tri] =
-                f0[static_cast<std::size_t>(e[0])] *
-                f1[static_cast<std::size_t>(e[1])] *
-                f2[static_cast<std::size_t>(e[2])];
-        }
-    }
-
-    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-        const auto& index = wedge_indices[node];
-        const std::size_t tri = index[0];
-        const std::size_t z = index[1];
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = tri_values[0][tri] * z_values[0][z];
-        row[1] = tri_values[1][tri] * z_values[1][z];
-        row[2] = tri_values[2][tri] * z_values[2][z];
-        row[3] = tri_values[3][tri] * z_values[3][z];
-    }
-    return true;
-}
-
-bool try_evaluate_wedge_values_product_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-        case 4:
-            return evaluate_wedge_values_product_q4<4>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 5:
-            return evaluate_wedge_values_product_q4<5>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 6:
-            return evaluate_wedge_values_product_q4<6>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 7:
-            return evaluate_wedge_values_product_q4<7>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        case 8:
-            return evaluate_wedge_values_product_q4<8>(
-                simplex_exponents, wedge_indices, points, output_stride, values_out);
-        default:
-            return false;
-    }
-}
-
-void evaluate_wedge_order1_values_q4(
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real tri[4][3];
-    Real axis[4][2];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        tri[q][0] = Real(1) - xi[0] - xi[1];
-        tri[q][1] = xi[0];
-        tri[q][2] = xi[1];
-        axis[q][0] = (Real(1) - xi[2]) * Real(0.5);
-        axis[q][1] = (Real(1) + xi[2]) * Real(0.5);
-    }
-
-    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-        const auto& index = wedge_indices[node];
-        const std::size_t tri_node = index[0];
-        const std::size_t axis_node = index[1];
-        Real* row = values_out + node * output_stride;
-        row[0] = tri[0][tri_node] * axis[0][axis_node];
-        row[1] = tri[1][tri_node] * axis[1][axis_node];
-        row[2] = tri[2][tri_node] * axis[2][axis_node];
-        row[3] = tri[3][tri_node] * axis[3][axis_node];
-    }
-}
-
-bool evaluate_wedge_fast_strided(
-    int order,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (order == 3) {
-        return false;
-    }
-    if (order == 1 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_wedge_order1_values_q4(wedge_indices, points, output_stride, values_out);
-        return true;
-    }
-
-    switch (order) {
-        case 1:
-            evaluate_wedge_fast_outputs_strided<1>(
-                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 2:
-            evaluate_wedge_fast_outputs_strided<2>(
-                wedge_indices, points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast(LagrangeTopology topology,
-                                  int order,
-                                  const math::Vector<Real, 3>& xi,
-                                  std::vector<Real>* values,
-                                  std::vector<Gradient>* gradients,
-                                  std::vector<Hessian>* hessians) {
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_order<1>(
-                topology, xi, values, gradients, hessians);
-        case 2:
-            return evaluate_fixed_lagrange_fast_order<2>(
-                topology, xi, values, gradients, hessians);
-        case 3:
-            return evaluate_fixed_lagrange_fast_order<3>(
-                topology, xi, values, gradients, hessians);
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast_strided(LagrangeTopology topology,
-                                          int order,
-                                          const std::vector<math::Vector<Real, 3>>& points,
-                                          std::size_t output_stride,
-                                          Real* SVMP_RESTRICT values_out,
-                                          Real* SVMP_RESTRICT gradients_out,
-                                          Real* SVMP_RESTRICT hessians_out) {
-    if (topology == LagrangeTopology::Line &&
-        points.size() == 4u) {
-        const bool values_only =
-            values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr;
-        const bool gradients_only =
-            values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr;
-        const bool hessians_only =
-            values_out == nullptr && gradients_out == nullptr && hessians_out != nullptr;
-        const bool all_outputs =
-            values_out != nullptr && gradients_out != nullptr && hessians_out != nullptr;
-        if (values_only) {
-            if (order == 1) {
-                evaluate_line_order1_values_q4(points, output_stride, values_out);
-                return true;
-            }
-            if (order == 2) {
-                evaluate_line_order2_values_q4(points, output_stride, values_out);
-                return true;
-            }
-            if (order == 3) {
-                evaluate_line_order3_values_q4(points, output_stride, values_out);
-                return true;
-            }
-        }
-        if (order == 1) {
-            if (gradients_only) {
-                evaluate_line_order1_gradients_q4(output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order1_hessians_q4(output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order1_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-        if (order == 2) {
-            if (gradients_only) {
-                evaluate_line_order2_gradients_q4(points, output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order2_hessians_q4(output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order2_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-        if (order == 3) {
-            if (gradients_only) {
-                evaluate_line_order3_gradients_q4(points, output_stride, gradients_out);
-                return true;
-            }
-            if (hessians_only) {
-                evaluate_line_order3_hessians_q4(points, output_stride, hessians_out);
-                return true;
-            }
-            if (all_outputs) {
-                evaluate_line_order3_all_q4(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-    }
-
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 3 &&
-        (gradients_out != nullptr || hessians_out != nullptr)) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        hessians_out != nullptr) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order1_values_strided(points, output_stride, values_out);
-        evaluate_triangle_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        evaluate_zero_hessians_strided(3u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order1_values_strided(points, output_stride, values_out);
-        evaluate_tet_order1_gradients_strided(points.size(), output_stride, gradients_out);
-        evaluate_zero_hessians_strided(4u, points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_triangle_order2_values_strided(points, output_stride, values_out);
-        evaluate_triangle_order2_gradients_strided(points, output_stride, gradients_out);
-        evaluate_triangle_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_tet_order2_values_strided(points, output_stride, values_out);
-        evaluate_tet_order2_gradients_strided(points, output_stride, gradients_out);
-        evaluate_tet_order2_hessians_q4(output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Tetrahedron &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_tet_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Triangle &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_triangle_order3_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order1_outputs_strided<false, true, false>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order1_outputs_strided<false, false, true>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order1_outputs_strided<true, true, true>(
-            points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order1_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order1_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order1_hessians_strided(points.size(), output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 1 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order1_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order2_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order2_hessians_strided(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order2_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order3_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_quad_order3_gradients_strided(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order3_hessians_strided(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_quad_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order2_values_strided(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 2 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order2_values_strided(points, output_stride, values_out);
-        evaluate_hex_order2_gradients_q4(points, output_stride, gradients_out);
-        evaluate_hex_order2_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        output_stride == 4u &&
-        hessians_out != nullptr) {
-        return false;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out == nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order3_values_q4(points, output_stride, values_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out != nullptr &&
-        hessians_out == nullptr) {
-        evaluate_hex_order3_gradients_q4(points, output_stride, gradients_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order3_hessians_q4(points, output_stride, hessians_out);
-        return true;
-    }
-    if (topology == LagrangeTopology::Hexahedron &&
-        order == 3 &&
-        points.size() == 4u &&
-        values_out != nullptr &&
-        gradients_out != nullptr &&
-        hessians_out != nullptr) {
-        evaluate_hex_order3_all_q4(points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    }
-    if (hessians_out != nullptr && order > 1 &&
-        (topology == LagrangeTopology::Quadrilateral ||
-         topology == LagrangeTopology::Hexahedron)) {
-        return false;
-    }
-    if (hessians_out != nullptr) {
-        const bool hessians_only = values_out == nullptr && gradients_out == nullptr;
-        if (order == 1) {
-            if (topology == LagrangeTopology::Triangle && hessians_only) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<1>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-            if (topology == LagrangeTopology::Tetrahedron) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<1>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        } else if (order == 2) {
-            if (topology == LagrangeTopology::Triangle && hessians_only) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTriFast<2>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-            if (topology == LagrangeTopology::Tetrahedron) {
-                evaluate_fast_outputs_with_constant_hessians_strided<LagrangeTetFast<2>>(
-                    points, output_stride, values_out, gradients_out, hessians_out);
-                return true;
-            }
-        }
-    }
-
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_strided_order<1>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        case 2:
-            return evaluate_fixed_lagrange_fast_strided_order<2>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        case 3:
-            return evaluate_fixed_lagrange_fast_strided_order<3>(
-                topology, points, output_stride, values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-bool evaluate_fixed_lagrange_fast_to(LagrangeTopology topology,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     Real* SVMP_RESTRICT values_out,
-                                     Real* SVMP_RESTRICT gradients_out,
-                                     Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-        case 1:
-            return evaluate_fixed_lagrange_fast_to_order<1>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        case 2:
-            return evaluate_fixed_lagrange_fast_to_order<2>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        case 3:
-            return evaluate_fixed_lagrange_fast_to_order<3>(
-                topology, xi, values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-template<std::size_t N>
-struct AxisMonomialCoefficientTable {
-    std::array<Real, N * N> values{};
-    std::array<Real, N * (N > 1 ? N - 1 : 0)> first{};
-    std::array<Real, N * (N > 2 ? N - 2 : 0)> second{};
-};
-
-template<std::size_t N>
-constexpr AxisMonomialCoefficientTable<N> make_axis_monomial_coefficient_table() {
-    AxisMonomialCoefficientTable<N> table{};
-    std::array<Real, N> nodes{};
-    constexpr int order = static_cast<int>(N) - 1;
-    for (std::size_t i = 0; i < N; ++i) {
-        nodes[i] = detail::equispaced_pm_one_coord(static_cast<int>(i), order);
-    }
-
-    for (std::size_t i = 0; i < N; ++i) {
-        std::array<Real, N> coeffs{};
-        std::array<Real, N> next{};
-        coeffs[0] = Real(1);
-        std::size_t degree = 0;
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j == i) {
-                continue;
-            }
-            next = {};
-            for (std::size_t k = 0; k <= degree; ++k) {
-                next[k] -= nodes[j] * coeffs[k];
-                next[k + 1] += coeffs[k];
-            }
-            coeffs = next;
-            ++degree;
-        }
-
-        Real denominator = Real(1);
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j != i) {
-                denominator *= nodes[i] - nodes[j];
-            }
-        }
-        const Real inv_denominator = Real(1) / denominator;
-        for (std::size_t k = 0; k < N; ++k) {
-            table.values[i * N + k] = coeffs[k] * inv_denominator;
-        }
-        if constexpr (N >= 2) {
-            for (std::size_t k = 1; k < N; ++k) {
-                table.first[i * (N - 1) + (k - 1)] =
-                    static_cast<Real>(k) * table.values[i * N + k];
-            }
-        }
-        if constexpr (N >= 3) {
-            for (std::size_t k = 2; k < N; ++k) {
-                table.second[i * (N - 2) + (k - 2)] =
-                    static_cast<Real>(k * (k - 1)) * table.values[i * N + k];
-            }
-        }
-    }
-
-    return table;
-}
-
-template<std::size_t N>
-void assign_axis_coefficient_table(const AxisMonomialCoefficientTable<N>& table,
-                                   std::vector<Real>& values,
-                                   std::vector<Real>& first,
-                                   std::vector<Real>& second) {
-    assign_array(values, table.values);
-    assign_array(first, table.first);
-    assign_array(second, table.second);
-}
-
-bool assign_precomputed_axis_coefficients(int n_axis,
-                                          std::vector<Real>& values,
-                                          std::vector<Real>& first,
-                                          std::vector<Real>& second) {
-    static constexpr auto kAxisCoefficients1 = make_axis_monomial_coefficient_table<1>();
-    static constexpr auto kAxisCoefficients2 = make_axis_monomial_coefficient_table<2>();
-    static constexpr auto kAxisCoefficients3 = make_axis_monomial_coefficient_table<3>();
-    static constexpr auto kAxisCoefficients4 = make_axis_monomial_coefficient_table<4>();
-    static constexpr auto kAxisCoefficients5 = make_axis_monomial_coefficient_table<5>();
-
-    switch (n_axis) {
-        case 1:
-            assign_axis_coefficient_table(kAxisCoefficients1, values, first, second);
-            return true;
-        case 2:
-            assign_axis_coefficient_table(kAxisCoefficients2, values, first, second);
-            return true;
-        case 3:
-            assign_axis_coefficient_table(kAxisCoefficients3, values, first, second);
-            return true;
-        case 4:
-            assign_axis_coefficient_table(kAxisCoefficients4, values, first, second);
-            return true;
-        case 5:
-            assign_axis_coefficient_table(kAxisCoefficients5, values, first, second);
-            return true;
-        default:
-            return false;
-    }
-}
-
-LagrangeTopologyTraits lagrange_topology_traits(ElementType type) {
-    const auto topo = topology(type);
-    if (topo != LagrangeTopology::Unknown) {
-        return {topo, reference_dimension(type)};
-    }
-
-    throw BasisElementCompatibilityException("Unsupported element type for LagrangeBasis",
-                                             __FILE__, __LINE__, __func__);
-}
-
-std::size_t lattice_index_pm_one(Real coord, int order, const char* context) {
-    if (order <= 0) {
-        if (!coordinate_matches_expected(coord, Real(0))) {
-            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-        }
-        return 0;
-    }
-
-    const Real scaled = (coord + Real(1)) * static_cast<Real>(order) / Real(2);
-    const long idx = std::lround(scaled);
-    if (idx < 0 || idx > order ||
-        !coordinate_matches_expected(
-            coord,
-            detail::equispaced_pm_one_coord(static_cast<int>(idx), order))) {
-        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-    }
-    return static_cast<std::size_t>(idx);
-}
-
-int simplex_lattice_index(Real coord, int order, const char* context) {
-    if (order <= 0) {
-        if (!coordinate_matches_expected(coord, Real(0)) &&
-            !coordinate_matches_expected(coord, Real(0.25)) &&
-            !coordinate_matches_expected(coord, Real(1) / Real(3))) {
-            throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-        }
-        return 0;
-    }
-
-    const Real scaled = coord * static_cast<Real>(order);
-    const long idx = std::lround(scaled);
-    const Real reconstructed = static_cast<Real>(idx) / static_cast<Real>(order);
-    if (idx < 0 || idx > order || !coordinate_matches_expected(coord, reconstructed)) {
-        throw BasisNodeOrderingException(context, __FILE__, __LINE__, __func__);
-    }
-    return static_cast<int>(idx);
-}
-
-std::array<int, 4> triangle_exponents_from_public_node(const math::Vector<Real, 3>& node,
-                                                       int order) {
-    if (order == 0) {
-        return {0, 0, 0, 0};
-    }
-
-    const int j = simplex_lattice_index(node[0], order,
-                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
-    const int k = simplex_lattice_index(node[1], order,
-                                        "LagrangeBasis: invalid triangle node coordinate for public ordering");
-    const int i = order - j - k;
-    if (i < 0) {
-        throw BasisNodeOrderingException("LagrangeBasis: invalid triangle barycentric coordinates for public ordering",
-                                         __FILE__, __LINE__, __func__);
-    }
-    return {i, j, k, 0};
-}
-
-std::array<int, 4> tetrahedron_exponents_from_public_node(const math::Vector<Real, 3>& node,
-                                                          int order) {
-    if (order == 0) {
-        return {0, 0, 0, 0};
-    }
-
-    const int j = simplex_lattice_index(node[0], order,
-                                        "LagrangeBasis: invalid tetrahedron node x-coordinate for public ordering");
-    const int k = simplex_lattice_index(node[1], order,
-                                        "LagrangeBasis: invalid tetrahedron node y-coordinate for public ordering");
-    const int l = simplex_lattice_index(node[2], order,
-                                        "LagrangeBasis: invalid tetrahedron node z-coordinate for public ordering");
-    const int i = order - j - k - l;
-    if (i < 0) {
-        throw BasisNodeOrderingException("LagrangeBasis: invalid tetrahedron barycentric coordinates for public ordering",
-                                         __FILE__, __LINE__, __func__);
-    }
-    return {i, j, k, l};
-}
-
-struct NormalizedLagrangeRequest {
-    ElementType element_type;
-    int order;
-};
-
-// Non-owning view of the per-axis 1D Lagrange basis evaluations
-// (values, first derivative, second derivative), each of length `size`.
-struct AxisBasisEvaluations {
-    const Real* values;
-    const Real* first;
-    const Real* second;
-    std::size_t size;
-};
-
-AxisBasisEvaluations constant_axis_basis() {
-    static const Real kOne[1]  = {Real(1)};
-    static const Real kZero[1] = {Real(0)};
-    return AxisBasisEvaluations{kOne, kZero, kZero, 1};
-}
-
-// Horner-form evaluator for the precomputed 1D Lagrange basis.
-//
-// Inputs are precomputed monomial coefficients of L_i(x), L_i'(x), L_i''(x)
-// (built once at LagrangeBasis construction). Evaluation is purely
-// multiply-add on the coefficients — no divisions and no node-position
-// lookups in the hot path. Templated on N for compile-time loop unrolling
-// and FMA-friendly straight-line code on the common Hex/Quad/Line orders.
-//
-// Layout:
-//   v_coeffs:  N * N entries; row i holds [c_i0, c_i1, ..., c_i(N-1)]
-//              such that L_i(x) = sum_k c_ik * x^k
-//   d_coeffs:  N * (N-1) entries; row i holds derivative coefficients of L_i'(x)
-//   d2_coeffs: N * (N-2) entries; row i holds coefficients of L_i''(x)
-//              (only valid when N >= 3)
-template<int N>
-inline void evaluate_1d_horner_impl(const Real* v_coeffs,
-                                    const Real* d_coeffs,
-                                    const Real* d2_coeffs,
-                                    Real xi,
-                                    Real* values, Real* first, Real* second) {
-    if constexpr (N == 1) {
-        values[0] = v_coeffs[0];
-        if (first)  first[0]  = Real(0);
-        if (second) second[0] = Real(0);
-        return;
-    } else {
-        // Values: degree N-1 polynomials.
-        for (int i = 0; i < N; ++i) {
-            const Real* c = v_coeffs + i * N;
-            Real r = c[N - 1];
-            for (int k = N - 1; k > 0; --k) {
-                r = r * xi + c[k - 1];
-            }
-            values[i] = r;
-        }
-
-        if (!first && !second) return;
-
-        if (first) {
-            // First derivatives: degree N-2 polynomials (per row of d_coeffs).
-            for (int i = 0; i < N; ++i) {
-                const Real* c = d_coeffs + i * (N - 1);
-                Real r = c[N - 2];
-                for (int k = N - 2; k > 0; --k) {
-                    r = r * xi + c[k - 1];
-                }
-                first[i] = r;
-            }
-        }
-
-        if (!second) return;
-
-        if constexpr (N <= 2) {
-            for (int i = 0; i < N; ++i) second[i] = Real(0);
-        } else {
-            // Second derivatives: degree N-3 polynomials (per row of d2_coeffs).
-            for (int i = 0; i < N; ++i) {
-                const Real* c = d2_coeffs + i * (N - 2);
-                Real r = c[N - 3];
-                for (int k = N - 3; k > 0; --k) {
-                    r = r * xi + c[k - 1];
-                }
-                second[i] = r;
-            }
-        }
-    }
-}
-
-void fill_equispaced_barycentric_weights(int n_axis, Real* weights) {
-    const int order = n_axis - 1;
-    Real weight = (order % 2 == 0) ? Real(1) : Real(-1);
-    Real max_abs = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        weights[i] = weight;
-        max_abs = std::max(max_abs, std::abs(weight));
-        if (i < order) {
-            weight *= -static_cast<Real>(order - i) / static_cast<Real>(i + 1);
-        }
-    }
-
-    if (max_abs > Real(0)) {
-        const Real inv_scale = Real(1) / max_abs;
-        for (int i = 0; i < n_axis; ++i) {
-            weights[i] *= inv_scale;
-        }
-    }
-}
-
-bool coordinate_matches_axis_node(Real xi, Real node) {
-    return coordinate_matches_expected(xi, node);
-}
-
-struct CompensatedSum {
-    Real sum{Real(0)};
-    Real compensation{Real(0)};
-
-    void add(Real value) noexcept {
-        const Real y = value - compensation;
-        const Real t = sum + y;
-        compensation = (t - sum) - y;
-        sum = t;
-    }
-};
-
-void distribute_residual_by_abs(int n_axis, Real* values, Real residual) {
-    if (values == nullptr || n_axis <= 0 || residual == Real(0)) {
-        return;
-    }
-
-    CompensatedSum abs_sum;
-    int largest_index = 0;
-    Real largest_abs = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        const Real magnitude = std::abs(values[i]);
-        abs_sum.add(magnitude);
-        if (magnitude > largest_abs) {
-            largest_abs = magnitude;
-            largest_index = i;
-        }
-    }
-
-    if (abs_sum.sum <= Real(0)) {
-        values[0] += residual;
-        return;
-    }
-
-    const Real inv_abs_sum = Real(1) / abs_sum.sum;
-    CompensatedSum applied;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real correction = residual * std::abs(values[i]) * inv_abs_sum;
-        values[i] += correction;
-        applied.add(correction);
-    }
-    values[largest_index] += residual - applied.sum;
-}
-
-void evaluate_1d_barycentric_runtime(int n_axis,
-                                     Real xi,
-                                     const Real* weights,
-                                     Real* values,
-                                     Real* first,
-                                     Real* second) {
-    const int order = n_axis - 1;
-    BASIS_CHECK_EVAL(weights != nullptr,
-                     "LagrangeBasis: missing cached barycentric weights for runtime axis evaluation");
-
-    int node_index = -1;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        if (coordinate_matches_axis_node(xi, node)) {
-            node_index = i;
-            break;
-        }
-    }
-
-    if (node_index >= 0) {
-        std::fill(values, values + n_axis, Real(0));
-        values[node_index] = Real(1);
-        if (!first && !second) {
-            return;
-        }
-
-        const Real xk = detail::equispaced_pm_one_coord(node_index, order);
-        const Real wk = weights[static_cast<std::size_t>(node_index)];
-        Real reciprocal_sum = Real(0);
-        if (second) {
-            for (int m = 0; m < n_axis; ++m) {
-                if (m == node_index) {
-                    continue;
-                }
-                const Real xm = detail::equispaced_pm_one_coord(m, order);
-                reciprocal_sum += Real(1) / (xk - xm);
-            }
-        }
-
-        Real first_diagonal = Real(0);
-        Real second_diagonal = Real(0);
-        if (first) {
-            std::fill(first, first + n_axis, Real(0));
-        }
-        if (second) {
-            std::fill(second, second + n_axis, Real(0));
-        }
-
-        for (int j = 0; j < n_axis; ++j) {
-            if (j == node_index) {
-                continue;
-            }
-            const Real xj = detail::equispaced_pm_one_coord(j, order);
-            const Real distance = xk - xj;
-            const Real offdiag_first = weights[static_cast<std::size_t>(j)] / (wk * distance);
-            first_diagonal -= offdiag_first;
-            if (first) {
-                first[j] = offdiag_first;
-            }
-            if (second) {
-                const Real offdiag_second =
-                    Real(2) * offdiag_first * (reciprocal_sum - Real(1) / distance);
-                second[j] = offdiag_second;
-                second_diagonal -= offdiag_second;
-            }
-        }
-        if (first) {
-            first[node_index] = first_diagonal;
-        }
-        if (second) {
-            second[node_index] = second_diagonal;
-        }
-        return;
-    }
-
-    Real sum0 = Real(0);
-    Real sum1 = Real(0);
-    Real sum2 = Real(0);
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        const Real inv_distance = Real(1) / (xi - node);
-        const Real weighted = weights[static_cast<std::size_t>(i)] * inv_distance;
-        sum0 += weighted;
-        sum1 += weighted * inv_distance;
-        sum2 += weighted * inv_distance * inv_distance;
-    }
-
-    const Real inv_sum0 = Real(1) / sum0;
-    const Real first_ratio = sum1 * inv_sum0;
-    const Real second_ratio = sum2 * inv_sum0;
-    const Real first_ratio_sq = first_ratio * first_ratio;
-
-    CompensatedSum value_sum;
-    CompensatedSum first_sum;
-    CompensatedSum second_sum;
-    for (int i = 0; i < n_axis; ++i) {
-        const Real node = detail::equispaced_pm_one_coord(i, order);
-        const Real inv_distance = Real(1) / (xi - node);
-        const Real value = weights[static_cast<std::size_t>(i)] * inv_distance * inv_sum0;
-        values[i] = value;
-        value_sum.add(value);
-        if (first || second) {
-            const Real derivative_factor = first_ratio - inv_distance;
-            if (first) {
-                first[i] = value * derivative_factor;
-                first_sum.add(first[i]);
-            }
-            if (second) {
-                second[i] = value * (derivative_factor * derivative_factor +
-                                     inv_distance * inv_distance -
-                                     Real(2) * second_ratio +
-                                     first_ratio_sq);
-                second_sum.add(second[i]);
-            }
-        }
-    }
-
-    distribute_residual_by_abs(n_axis, values, Real(1) - value_sum.sum);
-    if (first) {
-        distribute_residual_by_abs(n_axis, first, -first_sum.sum);
-    }
-    if (second) {
-        distribute_residual_by_abs(n_axis, second, -second_sum.sum);
-    }
-}
-
-// 1D Lagrange-basis evaluator. Writes n_axis entries to each non-null output
-// buffer. Dispatches to compile-time Horner specializations for sizes 1..9
-// (orders 0..8 — the Lagrange performance sweep) and uses barycentric
-// evaluation above that threshold to avoid high-order monomial conditioning
-// issues.
-void evaluate_1d_basis_to(const Real* v_coeffs,
-                          const Real* d_coeffs,
-                          const Real* d2_coeffs,
-                          const Real* barycentric_weights,
-                          int n_axis, Real xi,
-                          Real* values, Real* first, Real* second) {
-    switch (n_axis) {
-        case 1: evaluate_1d_horner_impl<1>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 2: evaluate_1d_horner_impl<2>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 3: evaluate_1d_horner_impl<3>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 4: evaluate_1d_horner_impl<4>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 5: evaluate_1d_horner_impl<5>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 6: evaluate_1d_horner_impl<6>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 7: evaluate_1d_horner_impl<7>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 8: evaluate_1d_horner_impl<8>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        case 9: evaluate_1d_horner_impl<9>(v_coeffs, d_coeffs, d2_coeffs, xi, values, first, second); return;
-        default:
-            evaluate_1d_barycentric_runtime(n_axis, xi, barycentric_weights, values, first, second);
-            return;
-    }
-}
-
-// Selects which derivative passes are computed by the 1D evaluator.
-enum class AxisDeriv {
-    ValuesOnly,           // skip first and second
-    ValuesAndFirst,       // for gradients
-    ValuesAndFirstAndSecond, // for hessians or fused evaluate_all
-};
-
-// Per-axis storage (values, first derivative, second derivative). Backed by
-// per-thread scratch that grows lazily; subsequent calls reuse capacity with no
-// reallocation.
-struct AxisScratch {
-    std::vector<Real> values;
-    std::vector<Real> first;
-    std::vector<Real> second;
-
-    void reserveFor(std::size_t n) {
-        if (values.size() < n) values.resize(n);
-        if (first.size() < n) first.resize(n);
-        if (second.size() < n) second.resize(n);
-    }
-};
-
-struct AxisBatchScratch {
-    std::vector<Real> values;
-    std::vector<Real> first;
-    std::vector<Real> second;
-
-    void resizeFor(std::size_t count, AxisDeriv level) {
-        if (values.size() < count) values.resize(count);
-        if (level != AxisDeriv::ValuesOnly && first.size() < count) first.resize(count);
-        if (level == AxisDeriv::ValuesAndFirstAndSecond && second.size() < count) second.resize(count);
-    }
-};
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-inline void fill_simplex_factor_sequence_fixed(Real lambda,
-                                               Real* SVMP_RESTRICT phi,
-                                               Real* SVMP_RESTRICT dphi,
-                                               Real* SVMP_RESTRICT d2phi) {
-    static_assert(!NeedSecond || NeedFirst,
-                  "second derivative factors require first-derivative recurrence state");
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        dphi[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        d2phi[0] = Real(0);
-    }
-
-    const Real t = static_cast<Real>(Order) * lambda;
-    const Real dt_dlambda = static_cast<Real>(Order);
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-    for (int a = 1; a <= Order; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt_old = dphi_dt_prev;
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
-            dphi[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
-                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-template<int Order, bool NeedSecond>
-inline void fill_triangle_factors_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real (&phi0)[4][Order + 1],
-    Real (&phi1)[4][Order + 1],
-    Real (&phi2)[4][Order + 1],
-    Real (&dphi0)[4][Order + 1],
-    Real (&dphi1)[4][Order + 1],
-    Real (&dphi2)[4][Order + 1],
-    Real (&d2phi0)[4][Order + 1],
-    Real (&d2phi1)[4][Order + 1],
-    Real (&d2phi2)[4][Order + 1]) {
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        if constexpr (NeedSecond) {
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l0, phi0[q], dphi0[q], d2phi0[q]);
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l1, phi1[q], dphi1[q], d2phi1[q]);
-            fill_simplex_factor_sequence_fixed<Order, true, true>(
-                l2, phi2[q], dphi2[q], d2phi2[q]);
-        } else {
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l0, phi0[q], dphi0[q], nullptr);
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l1, phi1[q], dphi1[q], nullptr);
-            fill_simplex_factor_sequence_fixed<Order, true, false>(
-                l2, phi2[q], dphi2[q], nullptr);
-        }
-    }
-}
-
-template<std::size_t Q>
-inline void write_wedge_gradient_strided_q(std::size_t tri_stride,
-                                           std::size_t axis_stride,
-                                           std::size_t tri,
-                                           std::size_t z,
-                                           std::size_t output_stride,
-                                           const Real* SVMP_RESTRICT tri_values,
-                                           const Real* SVMP_RESTRICT tri_g,
-                                           const AxisBatchScratch& axis_batch,
-                                           Real* SVMP_RESTRICT g) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    g[0u * output_stride + Q] = tri_g[0u * tri_stride + Q] * zv;
-    g[1u * output_stride + Q] = tri_g[1u * tri_stride + Q] * zv;
-    g[2u * output_stride + Q] = tri_v * axis_batch.first[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_gradient_stride4_q(std::size_t tri_stride,
-                                           std::size_t axis_stride,
-                                           std::size_t tri,
-                                           std::size_t z,
-                                           const Real* SVMP_RESTRICT tri_values,
-                                           const Real* SVMP_RESTRICT tri_g,
-                                           const AxisBatchScratch& axis_batch,
-                                           Real* SVMP_RESTRICT g) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    g[Q] = tri_g[0u * tri_stride + Q] * zv;
-    g[4u + Q] = tri_g[1u * tri_stride + Q] * zv;
-    g[8u + Q] = tri_v * axis_batch.first[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_hessian_strided_q(std::size_t tri_stride,
-                                          std::size_t axis_stride,
-                                          std::size_t tri,
-                                          std::size_t z,
-                                          std::size_t output_stride,
-                                          const Real* SVMP_RESTRICT tri_values,
-                                          const Real* SVMP_RESTRICT tri_g,
-                                          const Real* SVMP_RESTRICT tri_H,
-                                          const AxisBatchScratch& axis_batch,
-                                          Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    H[0u * output_stride + Q] = tri_hxx * zv;
-    H[1u * output_stride + Q] = hxy;
-    H[2u * output_stride + Q] = hxz;
-    H[3u * output_stride + Q] = hxy;
-    H[4u * output_stride + Q] = tri_hyy * zv;
-    H[5u * output_stride + Q] = hyz;
-    H[6u * output_stride + Q] = hxz;
-    H[7u * output_stride + Q] = hyz;
-    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_hessian_stride4_q(std::size_t tri_stride,
-                                          std::size_t axis_stride,
-                                          std::size_t tri,
-                                          std::size_t z,
-                                          const Real* SVMP_RESTRICT tri_values,
-                                          const Real* SVMP_RESTRICT tri_g,
-                                          const Real* SVMP_RESTRICT tri_H,
-                                          const AxisBatchScratch& axis_batch,
-                                          Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    H[Q] = tri_hxx * zv;
-    H[4u + Q] = hxy;
-    H[8u + Q] = hxz;
-    H[12u + Q] = hxy;
-    H[16u + Q] = tri_hyy * zv;
-    H[20u + Q] = hyz;
-    H[24u + Q] = hxz;
-    H[28u + Q] = hyz;
-    H[32u + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_all_strided_q(std::size_t tri_stride,
-                                      std::size_t axis_stride,
-                                      std::size_t tri,
-                                      std::size_t z,
-                                      std::size_t output_stride,
-                                      const Real* SVMP_RESTRICT tri_values,
-                                      const Real* SVMP_RESTRICT tri_g,
-                                      const Real* SVMP_RESTRICT tri_H,
-                                      const AxisBatchScratch& axis_batch,
-                                      Real* SVMP_RESTRICT value_row,
-                                      Real* SVMP_RESTRICT g,
-                                      Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    value_row[Q] = tri_v * zv;
-    g[0u * output_stride + Q] = tri_gx * zv;
-    g[1u * output_stride + Q] = tri_gy * zv;
-    g[2u * output_stride + Q] = tri_v * zd;
-    H[0u * output_stride + Q] = tri_hxx * zv;
-    H[1u * output_stride + Q] = hxy;
-    H[2u * output_stride + Q] = hxz;
-    H[3u * output_stride + Q] = hxy;
-    H[4u * output_stride + Q] = tri_hyy * zv;
-    H[5u * output_stride + Q] = hyz;
-    H[6u * output_stride + Q] = hxz;
-    H[7u * output_stride + Q] = hyz;
-    H[8u * output_stride + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<std::size_t Q>
-inline void write_wedge_all_stride4_q(std::size_t tri_stride,
-                                      std::size_t axis_stride,
-                                      std::size_t tri,
-                                      std::size_t z,
-                                      const Real* SVMP_RESTRICT tri_values,
-                                      const Real* SVMP_RESTRICT tri_g,
-                                      const Real* SVMP_RESTRICT tri_H,
-                                      const AxisBatchScratch& axis_batch,
-                                      Real* SVMP_RESTRICT value_row,
-                                      Real* SVMP_RESTRICT g,
-                                      Real* SVMP_RESTRICT H) {
-    const std::size_t tri_q = tri * tri_stride + Q;
-    const std::size_t z_q = Q * axis_stride + z;
-    const Real tri_v = tri_values[tri_q];
-    const Real zv = axis_batch.values[z_q];
-    const Real zd = axis_batch.first[z_q];
-    const Real tri_gx = tri_g[0u * tri_stride + Q];
-    const Real tri_gy = tri_g[1u * tri_stride + Q];
-    const Real tri_hxx = tri_H[0u * tri_stride + Q];
-    const Real tri_hxy = tri_H[1u * tri_stride + Q];
-    const Real tri_hyy = tri_H[2u * tri_stride + Q];
-    const Real hxz = tri_gx * zd;
-    const Real hxy = tri_hxy * zv;
-    const Real hyz = tri_gy * zd;
-
-    value_row[Q] = tri_v * zv;
-    g[Q] = tri_gx * zv;
-    g[4u + Q] = tri_gy * zv;
-    g[8u + Q] = tri_v * zd;
-    H[Q] = tri_hxx * zv;
-    H[4u + Q] = hxy;
-    H[8u + Q] = hxz;
-    H[12u + Q] = hxy;
-    H[16u + Q] = tri_hyy * zv;
-    H[20u + Q] = hyz;
-    H[24u + Q] = hxz;
-    H[28u + Q] = hyz;
-    H[32u + Q] = tri_v * axis_batch.second[z_q];
-}
-
-template<int Order, bool NeedHess>
-bool evaluate_wedge_fused_stride4_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    const std::vector<math::Vector<Real, 3>>& points,
-    const AxisBatchScratch& axis_batch,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 3 && Order <= 8, "fused wedge q4 path covers orders 3..8");
-    const std::size_t tri_count = simplex_exponents.size();
-    const std::size_t z_count = static_cast<std::size_t>(n_axis);
-    if (points.size() != 4u ||
-        z_count != static_cast<std::size_t>(Order + 1) ||
-        wedge_node_by_tri_z.size() != tri_count * z_count) {
-        return false;
-    }
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-    fill_triangle_factors_q4<Order, NeedHess>(
-        points, phi0, phi1, phi2, dphi0, dphi1, dphi2, d2phi0, d2phi1, d2phi2);
-
-    for (std::size_t tri = 0; tri < tri_count; ++tri) {
-        const auto& e = simplex_exponents[tri];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-        Real tri_v[4];
-        Real tri_gx[4];
-        Real tri_gy[4];
-        Real tri_hxx[4];
-        Real tri_hxy[4];
-        Real tri_hyy[4];
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real dl0 = D0 * v1 * v2;
-            tri_v[q] = v0 * v1 * v2;
-            tri_gx[q] = v0 * D1 * v2 - dl0;
-            tri_gy[q] = v0 * v1 * D2 - dl0;
-
-            if constexpr (NeedHess) {
-                const Real DD0 = d2phi0[q][i0];
-                const Real DD1 = d2phi1[q][i1];
-                const Real DD2 = d2phi2[q][i2];
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-                tri_hxx[q] = H00 - Real(2) * H01 + H11;
-                tri_hxy[q] = H00 - H01 - H02 + H12;
-                tri_hyy[q] = H00 - Real(2) * H02 + H22;
-            }
-        }
-
-        for (std::size_t z = 0; z < z_count; ++z) {
-            const std::size_t node = wedge_node_by_tri_z[tri * z_count + z];
-            Real* SVMP_RESTRICT value_row =
-                values_out != nullptr ? values_out + node * 4u : nullptr;
-            Real* SVMP_RESTRICT g =
-                gradients_out != nullptr ? gradients_out + node * 12u : nullptr;
-            Real* SVMP_RESTRICT H =
-                hessians_out != nullptr ? hessians_out + node * 36u : nullptr;
-
-            const Real z0 = axis_batch.values[z];
-            const Real z1 = axis_batch.values[z_count + z];
-            const Real z2 = axis_batch.values[2u * z_count + z];
-            const Real z3 = axis_batch.values[3u * z_count + z];
-            const Real dz0 = axis_batch.first[z];
-            const Real dz1 = axis_batch.first[z_count + z];
-            const Real dz2 = axis_batch.first[2u * z_count + z];
-            const Real dz3 = axis_batch.first[3u * z_count + z];
-
-            if (value_row != nullptr) {
-                value_row[0] = tri_v[0] * z0;
-                value_row[1] = tri_v[1] * z1;
-                value_row[2] = tri_v[2] * z2;
-                value_row[3] = tri_v[3] * z3;
-            }
-            if (g != nullptr) {
-                g[0] = tri_gx[0] * z0;
-                g[1] = tri_gx[1] * z1;
-                g[2] = tri_gx[2] * z2;
-                g[3] = tri_gx[3] * z3;
-                g[4] = tri_gy[0] * z0;
-                g[5] = tri_gy[1] * z1;
-                g[6] = tri_gy[2] * z2;
-                g[7] = tri_gy[3] * z3;
-                g[8] = tri_v[0] * dz0;
-                g[9] = tri_v[1] * dz1;
-                g[10] = tri_v[2] * dz2;
-                g[11] = tri_v[3] * dz3;
-            }
-            if constexpr (NeedHess) {
-                if (H != nullptr) {
-                    const Real d2z0 = axis_batch.second[z];
-                    const Real d2z1 = axis_batch.second[z_count + z];
-                    const Real d2z2 = axis_batch.second[2u * z_count + z];
-                    const Real d2z3 = axis_batch.second[3u * z_count + z];
-                    const Real hxz0 = tri_gx[0] * dz0;
-                    const Real hxz1 = tri_gx[1] * dz1;
-                    const Real hxz2 = tri_gx[2] * dz2;
-                    const Real hxz3 = tri_gx[3] * dz3;
-                    const Real hyz0 = tri_gy[0] * dz0;
-                    const Real hyz1 = tri_gy[1] * dz1;
-                    const Real hyz2 = tri_gy[2] * dz2;
-                    const Real hyz3 = tri_gy[3] * dz3;
-                    H[0] = tri_hxx[0] * z0;
-                    H[1] = tri_hxx[1] * z1;
-                    H[2] = tri_hxx[2] * z2;
-                    H[3] = tri_hxx[3] * z3;
-                    H[4] = tri_hxy[0] * z0;
-                    H[5] = tri_hxy[1] * z1;
-                    H[6] = tri_hxy[2] * z2;
-                    H[7] = tri_hxy[3] * z3;
-                    H[8] = hxz0;
-                    H[9] = hxz1;
-                    H[10] = hxz2;
-                    H[11] = hxz3;
-                    H[12] = H[4];
-                    H[13] = H[5];
-                    H[14] = H[6];
-                    H[15] = H[7];
-                    H[16] = tri_hyy[0] * z0;
-                    H[17] = tri_hyy[1] * z1;
-                    H[18] = tri_hyy[2] * z2;
-                    H[19] = tri_hyy[3] * z3;
-                    H[20] = hyz0;
-                    H[21] = hyz1;
-                    H[22] = hyz2;
-                    H[23] = hyz3;
-                    H[24] = hxz0;
-                    H[25] = hxz1;
-                    H[26] = hxz2;
-                    H[27] = hxz3;
-                    H[28] = hyz0;
-                    H[29] = hyz1;
-                    H[30] = hyz2;
-                    H[31] = hyz3;
-                    H[32] = tri_v[0] * d2z0;
-                    H[33] = tri_v[1] * d2z1;
-                    H[34] = tri_v[2] * d2z2;
-                    H[35] = tri_v[3] * d2z3;
-                }
-            }
-        }
-    }
-    return true;
-}
-
-template<bool NeedHess>
-bool try_evaluate_wedge_fused_stride4_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    const AxisBatchScratch& axis_batch,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-        case 3:
-            return evaluate_wedge_fused_stride4_q4<3, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 4:
-            return evaluate_wedge_fused_stride4_q4<4, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 5:
-            return evaluate_wedge_fused_stride4_q4<5, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 6:
-            return evaluate_wedge_fused_stride4_q4<6, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 7:
-            return evaluate_wedge_fused_stride4_q4<7, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        case 8:
-            return evaluate_wedge_fused_stride4_q4<8, NeedHess>(
-                simplex_exponents, wedge_node_by_tri_z, points, axis_batch, n_axis,
-                values_out, gradients_out, hessians_out);
-        default:
-            return false;
-    }
-}
-
-struct TensorProductTableScratch {
-    std::vector<Real> vv;
-    std::vector<Real> dv;
-    std::vector<Real> vd;
-    std::vector<Real> d2v;
-    std::vector<Real> vd2;
-    std::vector<Real> dd;
-
-    void resizeFor(std::size_t count) {
-        if (vv.size() < count) vv.resize(count);
-        if (dv.size() < count) dv.resize(count);
-        if (vd.size() < count) vd.resize(count);
-        if (d2v.size() < count) d2v.resize(count);
-        if (vd2.size() < count) vd2.resize(count);
-        if (dd.size() < count) dd.resize(count);
-    }
-};
-
-// Caller-provided scratch buffers used by tensor-product evaluation. Three
-// independent axes plus reusable simplex/wedge intermediates.
-struct LagrangeEvaluateScratch {
-    AxisScratch axis_x;
-    AxisScratch axis_y;
-    AxisScratch axis_z;
-    AxisBatchScratch axis_x_batch;
-    AxisBatchScratch axis_y_batch;
-    AxisBatchScratch axis_z_batch;
-    TensorProductTableScratch tensor_tables;
-
-    std::vector<Real> tri_values;
-    std::vector<Gradient> tri_gradients;
-    std::vector<Hessian> tri_hessians;
-    std::vector<Real> tri_gradient_components;
-    std::vector<Real> tri_hessian_components;
-    std::vector<Real> wedge_tri_values_batch;
-    std::vector<Real> wedge_tri_gradient_batch;
-    std::vector<Real> wedge_tri_hessian_batch;
-
-    std::vector<Real> strided_values_tmp;
-    std::vector<Real> strided_gradients_tmp;
-    std::vector<Real> strided_hessians_tmp;
-
-    void prewarm(int max_order, std::size_t max_qpts) {
-        const int clamped_order = std::max(max_order, 0);
-        const std::size_t axis_size = static_cast<std::size_t>(clamped_order) + 1u;
-        const std::size_t axis_batch_size = axis_size * max_qpts;
-        const std::size_t tensor_table_size =
-            axis_size * axis_size * std::max<std::size_t>(max_qpts, 1u);
-        const std::size_t tensor_dofs = tensor_table_size * axis_size;
-        const std::size_t tri_count = axis_size * (axis_size + 1u) / 2u;
-
-        axis_x.reserveFor(axis_size);
-        axis_y.reserveFor(axis_size);
-        axis_z.reserveFor(axis_size);
-        axis_x_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        axis_y_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        axis_z_batch.resizeFor(axis_batch_size, AxisDeriv::ValuesAndFirstAndSecond);
-        tensor_tables.resizeFor(tensor_table_size);
-        tri_values.reserve(tri_count);
-        tri_gradients.reserve(tri_count);
-        tri_hessians.reserve(tri_count);
-        tri_gradient_components.reserve(tri_count * 3u);
-        tri_hessian_components.reserve(tri_count * 9u);
-        wedge_tri_values_batch.reserve(tri_count * max_qpts);
-        wedge_tri_gradient_batch.reserve(tri_count * 3u * max_qpts);
-        wedge_tri_hessian_batch.reserve(tri_count * 9u * max_qpts);
-        strided_values_tmp.reserve(tensor_dofs);
-        strided_gradients_tmp.reserve(tensor_dofs * 3u);
-        strided_hessians_tmp.reserve(tensor_dofs * 9u);
-    }
-};
-
-LagrangeEvaluateScratch& evaluate_scratch() {
-    // Scratch is intentionally thread-local: assembly and benchmark callers run
-    // evaluation on persistent worker threads, so capacity is reused by thread.
-    static thread_local LagrangeEvaluateScratch s;
-    return s;
-}
-
-// Fill axis scratch and return a non-owning view. Uncomputed slots still have
-// valid pointers to scratch storage (they may hold stale data) — callers must
-// only read the slots they requested via `level`. Common low orders use
-// precomputed Horner coefficients; high orders use barycentric axis evaluation.
-AxisBasisEvaluations fill_axis_scratch(AxisScratch& s,
-                                       const Real* v_coeffs,
-                                       const Real* d_coeffs,
-                                       const Real* d2_coeffs,
-                                       const Real* barycentric_weights,
-                                       int n_axis, Real xi,
-                                       AxisDeriv level) {
-    const std::size_t n = static_cast<std::size_t>(n_axis);
-    s.reserveFor(n);
-    Real* first  = (level == AxisDeriv::ValuesOnly) ? nullptr : s.first.data();
-    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? s.second.data() : nullptr;
-    evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights,
-                         n_axis, xi, s.values.data(), first, second);
-    return AxisBasisEvaluations{s.values.data(), s.first.data(), s.second.data(), n};
-}
-
-void fill_axis_batch(AxisBatchScratch& scratch,
-                     const std::vector<math::Vector<Real, 3>>& points,
-                     std::size_t component,
-                     const Real* v_coeffs,
-                     const Real* d_coeffs,
-                     const Real* d2_coeffs,
-                     const Real* barycentric_weights,
-                     int n_axis,
-                     AxisDeriv level) {
-    const std::size_t count = points.size() * static_cast<std::size_t>(n_axis);
-    scratch.resizeFor(count, level);
-    Real* first = (level == AxisDeriv::ValuesOnly) ? nullptr : scratch.first.data();
-    Real* second = (level == AxisDeriv::ValuesAndFirstAndSecond) ? scratch.second.data() : nullptr;
-    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        evaluate_1d_basis_to(v_coeffs, d_coeffs, d2_coeffs, barycentric_weights, n_axis,
-                             points[q][component],
-                             scratch.values.data() + q * axis_stride,
-                             first ? first + q * axis_stride : nullptr,
-                             second ? second + q * axis_stride : nullptr);
-    }
-}
-
-// Maximum yz-table footprint that fits comfortably on the stack for the
-// Lagrange performance sweep. Order-8 hex q=4 needs 4*(9x9) entries per table.
-// Higher orders fall back to thread_local heap buffers.
-inline constexpr std::size_t kMaxStackYZ = 384;
-
-struct TensorProductVectorSink {
-    std::vector<Real>* values;
-    std::vector<Gradient>* gradients;
-    std::vector<Hessian>* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t n_nodes) const {
-        if (values)    values->resize(n_nodes);
-        if (gradients) gradients->resize(n_nodes);
-        if (hessians)  hessians->resize(n_nodes);
-    }
-
-    void write_value(std::size_t n, Real value) const {
-        (*values)[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {
-        auto& g = (*gradients)[n];
-        g[0] = dx;
-        g[1] = dy;
-        g[2] = dz;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        (*hessians)[n] = make_symmetric_hessian(xx, yy, zz, xy, xz, yz);
-    }
-};
-
-struct TensorProductRawSink {  // Sink that writes per-node results into caller-owned flat Real arrays; a null pointer marks an output as not requested.
-    Real* values;  // 1 Real per node, or nullptr
-    Real* gradients;  // 3 Reals per node (dx, dy, dz), or nullptr
-    Real* hessians;  // 9 Reals per node (3x3 symmetric matrix), or nullptr
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t) const {}  // no-op: the raw buffers are never resized here, so the caller must have sized them
-
-    void write_value(std::size_t n, Real value) const {  // no null check; only call when wants_values()
-        values[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real dx, Real dy, Real dz) const {  // no null check; only call when wants_gradients()
-        Real* g = gradients + n * 3u;
-        g[0] = dx;
-        g[1] = dy;
-        g[2] = dz;
-    }
-
-    void write_hessian(std::size_t n,  // no null check; only call when wants_hessians()
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Real* H = hessians + n * 9u;
-        H[0] = xx;  // diagonal entries sit at flat indices 0, 4, 8
-        H[4] = yy;
-        H[8] = zz;
-        H[1] = xy; H[3] = xy;  // each off-diagonal is stored twice so the 3x3 block is symmetric
-        H[2] = xz; H[6] = xz;
-        H[5] = yz; H[7] = yz;
-    }
-};
-
-// Fused sum-factorized tensor-product evaluator.
-//
-// Precomputes one to six (ny x nz)-shaped tables of partial products
-// `M_xy[j*nz + k]` so that the inner per-node loop performs at most one
-// multiplication per output instead of two. With all three output buffers
-// supplied, this is the fused values + gradients + hessians path that shares
-// every per-axis evaluation.
-//
-// Per-node multiply count (vs. the unfactored variants):
-//   values only       : 1  (was 2)
-//   gradients only    : 3  (was 6)
-//   hessians only     : 6  (was 12)
-//   all three         : 10 (was 20)
-//
-// Dimensional scope: works uniformly for Line/Quadrilateral/Hexahedron with
-// the unused axes' size folded to 1 via constant_axis_basis().
-template <typename Sink>  // Sink must provide wants_values/wants_gradients/wants_hessians, prepare and write_value/write_gradient/write_hessian
-void evaluate_tensor_product_factorized_impl(  // evaluates every node at one point from per-axis 1D data and hands the results to `sink`
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,  // per node: {index into x_axis, index into y_axis, index into z_axis}
-    const AxisBasisEvaluations& x_axis,  // .values/.first/.second are read at idx[0]
-    const AxisBasisEvaluations& y_axis,  // .size supplies ny
-    const AxisBasisEvaluations& z_axis,  // .size supplies nz
-    const Sink& sink) {  // sink.wants_*() decides which tables are built and which outputs are written
-    const std::size_t ny = y_axis.size;
-    const std::size_t nz = z_axis.size;
-    const std::size_t nyz = ny * nz;
-    const bool need_values = sink.wants_values();
-    const bool need_grad = sink.wants_gradients();
-    const bool need_hess = sink.wants_hessians();
-
-    Real Mvv_stack[kMaxStackYZ];  // y-value * z-value
-    Real Mdv_stack[kMaxStackYZ];  // y-first * z-value
-    Real Mvd_stack[kMaxStackYZ];  // y-value * z-first
-    Real Md2v_stack[kMaxStackYZ];  // y-second * z-value
-    Real Mvd2_stack[kMaxStackYZ];  // y-value * z-second
-    Real Mdd_stack[kMaxStackYZ];  // y-first * z-first
-
-    Real* Mvv;
-    Real* Mdv;
-    Real* Mvd;
-    Real* Md2v;
-    Real* Mvd2;
-    Real* Mdd;
-    if (nyz <= kMaxStackYZ) {  // tables fit in the fixed-size stack arrays
-        Mvv = Mvv_stack;
-        Mdv = Mdv_stack;
-        Mvd = Mvd_stack;
-        Md2v = Md2v_stack;
-        Mvd2 = Mvd2_stack;
-        Mdd = Mdd_stack;
-    } else {  // too large for the stack arrays: use the scratch tables returned by evaluate_scratch() (defined outside this chunk)
-        auto& tables = evaluate_scratch().tensor_tables;
-        tables.resizeFor(nyz);
-        Mvv = tables.vv.data();
-        Mdv = tables.dv.data();
-        Mvd = tables.vd.data();
-        Md2v = tables.d2v.data();
-        Mvd2 = tables.vd2.data();
-        Mdd = tables.dd.data();
-    }
-
-    // M_vv is required by every output (values, ∂ξ, ∂ξ²).
-    for (std::size_t j = 0; j < ny; ++j) {
-        const Real yv = y_axis.values[j];
-        for (std::size_t k = 0; k < nz; ++k) {
-            Mvv[j * nz + k] = yv * z_axis.values[k];
-        }
-    }
-
-    if (need_grad || need_hess) {  // first-derivative tables also feed the mixed xy/xz Hessian entries
-        for (std::size_t j = 0; j < ny; ++j) {
-            const Real yv = y_axis.values[j];
-            const Real yd = y_axis.first[j];
-            for (std::size_t k = 0; k < nz; ++k) {
-                Mdv[j * nz + k] = yd * z_axis.values[k];
-                Mvd[j * nz + k] = yv * z_axis.first[k];
-            }
-        }
-    }
-
-    if (need_hess) {  // second-derivative and mixed y/z tables are only filled for Hessians
-        for (std::size_t j = 0; j < ny; ++j) {
-            const Real yv = y_axis.values[j];
-            const Real yd = y_axis.first[j];
-            const Real yd2 = y_axis.second[j];
-            for (std::size_t k = 0; k < nz; ++k) {
-                Md2v[j * nz + k] = yd2 * z_axis.values[k];
-                Mvd2[j * nz + k] = yv  * z_axis.second[k];
-                Mdd[j * nz + k]  = yd  * z_axis.first[k];
-            }
-        }
-    }
-
-    const std::size_t n_nodes = tensor_indices.size();
-    sink.prepare(n_nodes);
-
-    for (std::size_t n = 0; n < n_nodes; ++n) {
-        const auto& idx = tensor_indices[n];
-        const std::size_t i = idx[0];
-        const std::size_t jk = idx[1] * nz + idx[2];  // flat (j, k) slot shared by all six tables; indices are not range-checked here
-
-        const Real Lx = x_axis.values[i];
-
-        if (need_values) {
-            sink.write_value(n, Lx * Mvv[jk]);
-        }
-
-        if (need_grad) {
-            const Real dLx = x_axis.first[i];
-            sink.write_gradient(n,
-                                dLx * Mvv[jk],
-                                Lx  * Mdv[jk],
-                                Lx  * Mvd[jk]);
-        }
-
-        if (need_hess) {
-            const Real dLx  = x_axis.first[i];
-            const Real d2Lx = x_axis.second[i];
-            sink.write_hessian(n,  // argument order is xx, yy, zz, xy, xz, yz
-                               d2Lx * Mvv[jk],
-                               Lx   * Md2v[jk],
-                               Lx   * Mvd2[jk],
-                               dLx  * Mdv[jk],
-                               dLx  * Mvd[jk],
-                               Lx   * Mdd[jk]);
-        }
-    }
-}
-
-void evaluate_tensor_product_factorized(  // std::vector front end: wraps the outputs in a TensorProductVectorSink and forwards to the _impl template
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const AxisBasisEvaluations& x_axis,
-    const AxisBasisEvaluations& y_axis,
-    const AxisBasisEvaluations& z_axis,
-    std::vector<Real>* values_out,  // nullptr skips values
-    std::vector<Gradient>* gradients_out,  // nullptr skips gradients
-    std::vector<Hessian>* hessians_out) {  // nullptr skips Hessians
-    const TensorProductVectorSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
-}
-
-void evaluate_tensor_product_factorized_to(  // raw-buffer front end: wraps the outputs in a TensorProductRawSink and forwards to the _impl template
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const AxisBasisEvaluations& x_axis,
-    const AxisBasisEvaluations& y_axis,
-    const AxisBasisEvaluations& z_axis,
-    Real* SVMP_RESTRICT values_out,  // 1 Real per node, or nullptr to skip
-    Real* SVMP_RESTRICT gradients_out,  // 3 Reals per node, or nullptr to skip
-    Real* SVMP_RESTRICT hessians_out) {  // 9 Reals per node, or nullptr to skip; buffers are not resized, so the caller sizes them
-    const TensorProductRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tensor_product_factorized_impl(tensor_indices, x_axis, y_axis, z_axis, sink);
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_value_strided_q(  // writes the value of one node at point slot Q into value_row[Q]
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in Mvv
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,  // y-value * z-value table, indexed as Q * nyz + jk
-    Real* SVMP_RESTRICT value_row) {
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    value_row[Q] = x_batch.values[q_axis + i] * Mvv[slot];
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_hessian_strided_q(  // writes the 9 Hessian components of one node at point slot Q
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    std::size_t output_stride,  // distance in Reals between consecutive Hessian components in hess_row
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT hess_row) {  // component c of point Q lands at hess_row[c * output_stride + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];  // each mixed term is computed once and stored in both symmetric positions
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
-    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
-    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_hessian_stride4_q(  // same as the _strided_q Hessian writer with the component stride fixed at 4
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT hess_row) {  // component c of point Q lands at hess_row[4 * c + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[Q] = x2 * Mvv[slot];  // component 0 (xx)
-    hess_row[16u + Q] = xv * Md2v[slot];  // component 4 (yy)
-    hess_row[32u + Q] = xv * Mvd2[slot];  // component 8 (zz)
-    hess_row[4u + Q] = hxy;  // components 1 and 3
-    hess_row[12u + Q] = hxy;
-    hess_row[8u + Q] = hxz;  // components 2 and 6
-    hess_row[24u + Q] = hxz;
-    hess_row[20u + Q] = hyz;  // components 5 and 7
-    hess_row[28u + Q] = hyz;
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_gradient_strided_q(  // writes the 3 gradient components of one node at point slot Q
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    std::size_t output_stride,  // distance in Reals between consecutive gradient components in grad_row
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    Real* SVMP_RESTRICT grad_row) {  // component c of point Q lands at grad_row[c * output_stride + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
-    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
-    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_gradient_stride4_q(  // same as the _strided_q gradient writer with the component stride fixed at 4
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    Real* SVMP_RESTRICT grad_row) {  // component c of point Q lands at grad_row[4 * c + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    grad_row[Q] = xd * Mvv[slot];
-    grad_row[4u + Q] = xv * Mdv[slot];
-    grad_row[8u + Q] = xv * Mvd[slot];
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_all_strided_q(  // writes value, gradient and Hessian of one node at point slot Q in a single pass
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    std::size_t output_stride,  // component stride used for both grad_row and hess_row
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT value_row,  // value written at value_row[Q]
-    Real* SVMP_RESTRICT grad_row,  // component c written at grad_row[c * output_stride + Q]
-    Real* SVMP_RESTRICT hess_row) {  // component c written at hess_row[c * output_stride + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    value_row[Q] = xv * Mvv[slot];
-    grad_row[0u * output_stride + Q] = xd * Mvv[slot];
-    grad_row[1u * output_stride + Q] = xv * Mdv[slot];
-    grad_row[2u * output_stride + Q] = xv * Mvd[slot];
-
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[0u * output_stride + Q] = x2 * Mvv[slot];
-    hess_row[4u * output_stride + Q] = xv * Md2v[slot];
-    hess_row[8u * output_stride + Q] = xv * Mvd2[slot];
-    hess_row[1u * output_stride + Q] = hxy;
-    hess_row[3u * output_stride + Q] = hxy;
-    hess_row[2u * output_stride + Q] = hxz;
-    hess_row[6u * output_stride + Q] = hxz;
-    hess_row[5u * output_stride + Q] = hyz;
-    hess_row[7u * output_stride + Q] = hyz;
-}
-
-template <std::size_t Q>  // Q: compile-time index of the point slot within the batch
-inline void write_tensor_product_all_stride4_q(  // same as the _all_strided_q writer with the component stride fixed at 4
-    std::size_t axis_stride,  // number of x-axis entries stored per point in x_batch
-    std::size_t nyz,  // number of (j, k) entries stored per point in each M table
-    std::size_t i,  // x-axis index of the node
-    std::size_t jk,  // flat (y, z) table index of the node
-    const AxisBatchScratch& x_batch,
-    const Real* SVMP_RESTRICT Mvv,
-    const Real* SVMP_RESTRICT Mdv,
-    const Real* SVMP_RESTRICT Mvd,
-    const Real* SVMP_RESTRICT Md2v,
-    const Real* SVMP_RESTRICT Mvd2,
-    const Real* SVMP_RESTRICT Mdd,
-    Real* SVMP_RESTRICT value_row,  // value written at value_row[Q]
-    Real* SVMP_RESTRICT grad_row,  // component c written at grad_row[4 * c + Q]
-    Real* SVMP_RESTRICT hess_row) {  // component c written at hess_row[4 * c + Q]
-    const std::size_t q_axis = Q * axis_stride;
-    const std::size_t slot = Q * nyz + jk;
-    const Real xv = x_batch.values[q_axis + i];
-    const Real xd = x_batch.first[q_axis + i];
-    value_row[Q] = xv * Mvv[slot];
-    grad_row[Q] = xd * Mvv[slot];
-    grad_row[4u + Q] = xv * Mdv[slot];
-    grad_row[8u + Q] = xv * Mvd[slot];
-
-    const Real x2 = x_batch.second[q_axis + i];
-    const Real hxy = xd * Mdv[slot];
-    const Real hxz = xd * Mvd[slot];
-    const Real hyz = xv * Mdd[slot];
-    hess_row[Q] = x2 * Mvv[slot];
-    hess_row[16u + Q] = xv * Md2v[slot];
-    hess_row[32u + Q] = xv * Mvd2[slot];
-    hess_row[4u + Q] = hxy;
-    hess_row[12u + Q] = hxy;
-    hess_row[8u + Q] = hxz;
-    hess_row[24u + Q] = hxz;
-    hess_row[20u + Q] = hyz;
-    hess_row[28u + Q] = hyz;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when the table would not fit on the stack or values_out is null
-evaluate_tensor_product_values_stride4_q4_transposed(  // values of every node at 4 points, written as 4 consecutive Reals per node
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,  // per node: {x index, y index, z index}; not range-checked
-    std::size_t axis_stride,  // entries per point in each batch array; also used as both ny and nz
-    const AxisBatchScratch& x_batch,  // .values read at q * axis_stride + index for q = 0..3
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT values_out) {  // node n, point q is written at values_out[4 * n + q]
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;  // the table holds all 4 points, so it needs 4 * nyz entries
-    if (table_count > kMaxStackYZ || values_out == nullptr) {
-        return false;
-    }
-
-    Real Mvv_stack[kMaxStackYZ];  // "transposed" layout: the 4 points of one (j, k) pair are adjacent
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;
-            Mvv_stack[base + 0u] = yv0 * z_batch.values[k];
-            Mvv_stack[base + 1u] = yv1 * z_batch.values[axis_stride + k];
-            Mvv_stack[base + 2u] = yv2 * z_batch.values[2u * axis_stride + k];
-            Mvv_stack[base + 3u] = yv3 * z_batch.values[3u * axis_stride + k];
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // start of this node's 4-point group in the table
-        Real* SVMP_RESTRICT value_row = values_out + node * 4u;
-        value_row[0u] = x_batch.values[i] * Mvv_stack[jk + 0u];
-        value_row[1u] = x_batch.values[axis_stride + i] * Mvv_stack[jk + 1u];
-        value_row[2u] = x_batch.values[2u * axis_stride + i] * Mvv_stack[jk + 2u];
-        value_row[3u] = x_batch.values[3u * axis_stride + i] * Mvv_stack[jk + 3u];
-    }
-
-    return true;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when the tables would not fit on the stack or gradients_out is null
-evaluate_tensor_product_gradients_stride4_q4_transposed(  // gradients of every node at 4 points, written as 12 consecutive Reals per node
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,  // per node: {x index, y index, z index}; not range-checked
-    std::size_t axis_stride,  // entries per point in each batch array; also used as both ny and nz
-    const AxisBatchScratch& x_batch,  // .values/.first read at q * axis_stride + index for q = 0..3
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT gradients_out) {  // node n, component c, point q is written at gradients_out[12 * n + 4 * c + q]
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;  // each table holds all 4 points, so it needs 4 * nyz entries
-    if (table_count > kMaxStackYZ || gradients_out == nullptr) {
-        return false;
-    }
-
-    Real Mvv_stack[kMaxStackYZ];  // y-value * z-value
-    Real Mdv_stack[kMaxStackYZ];  // y-first * z-value
-    Real Mvd_stack[kMaxStackYZ];  // y-value * z-first
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        const Real yd0 = y_batch.first[j];
-        const Real yd1 = y_batch.first[axis_stride + j];
-        const Real yd2 = y_batch.first[2u * axis_stride + j];
-        const Real yd3 = y_batch.first[3u * axis_stride + j];
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;  // the 4 points of one (j, k) pair are adjacent
-            const Real zv0 = z_batch.values[k];
-            const Real zv1 = z_batch.values[axis_stride + k];
-            const Real zv2 = z_batch.values[2u * axis_stride + k];
-            const Real zv3 = z_batch.values[3u * axis_stride + k];
-            const Real zd0 = z_batch.first[k];
-            const Real zd1 = z_batch.first[axis_stride + k];
-            const Real zd2 = z_batch.first[2u * axis_stride + k];
-            const Real zd3 = z_batch.first[3u * axis_stride + k];
-
-            Mvv_stack[base + 0u] = yv0 * zv0;
-            Mvv_stack[base + 1u] = yv1 * zv1;
-            Mvv_stack[base + 2u] = yv2 * zv2;
-            Mvv_stack[base + 3u] = yv3 * zv3;
-            Mdv_stack[base + 0u] = yd0 * zv0;
-            Mdv_stack[base + 1u] = yd1 * zv1;
-            Mdv_stack[base + 2u] = yd2 * zv2;
-            Mdv_stack[base + 3u] = yd3 * zv3;
-            Mvd_stack[base + 0u] = yv0 * zd0;
-            Mvd_stack[base + 1u] = yv1 * zd1;
-            Mvd_stack[base + 2u] = yv2 * zd2;
-            Mvd_stack[base + 3u] = yv3 * zd3;
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // start of this node's 4-point group in each table
-
-        const Real xv0 = x_batch.values[i];
-        const Real xv1 = x_batch.values[axis_stride + i];
-        const Real xv2 = x_batch.values[2u * axis_stride + i];
-        const Real xv3 = x_batch.values[3u * axis_stride + i];
-        const Real xd0 = x_batch.first[i];
-        const Real xd1 = x_batch.first[axis_stride + i];
-        const Real xd2 = x_batch.first[2u * axis_stride + i];
-        const Real xd3 = x_batch.first[3u * axis_stride + i];
-
-        Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
-        grad_row[0u] = xd0 * Mvv_stack[jk + 0u];  // entries 0..3: x-derivative at points 0..3
-        grad_row[1u] = xd1 * Mvv_stack[jk + 1u];
-        grad_row[2u] = xd2 * Mvv_stack[jk + 2u];
-        grad_row[3u] = xd3 * Mvv_stack[jk + 3u];
-        grad_row[4u] = xv0 * Mdv_stack[jk + 0u];  // entries 4..7: y-derivative at points 0..3
-        grad_row[5u] = xv1 * Mdv_stack[jk + 1u];
-        grad_row[6u] = xv2 * Mdv_stack[jk + 2u];
-        grad_row[7u] = xv3 * Mdv_stack[jk + 3u];
-        grad_row[8u] = xv0 * Mvd_stack[jk + 0u];  // entries 8..11: z-derivative at points 0..3
-        grad_row[9u] = xv1 * Mvd_stack[jk + 1u];
-        grad_row[10u] = xv2 * Mvd_stack[jk + 2u];
-        grad_row[11u] = xv3 * Mvd_stack[jk + 3u];
-    }
-
-    return true;
-}
-
-template<bool NeedAllOutputs>  // true: also write values and gradients; false: Hessians only (values_out/gradients_out are never touched)
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 bool  // returns false (nothing written) when the tables would not fit on the stack or a required output is null
-evaluate_tensor_product_second_stride4_q4_transposed(  // Hessians of every node at 4 points, written as 36 consecutive Reals per node
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,  // per node: {x index, y index, z index}; not range-checked
-    std::size_t axis_stride,  // entries per point in each batch array; also used as both ny and nz
-    const AxisBatchScratch& x_batch,  // .values/.first/.second read at q * axis_stride + index for q = 0..3
-    const AxisBatchScratch& y_batch,
-    const AxisBatchScratch& z_batch,
-    Real* SVMP_RESTRICT values_out,  // node n, point q at values_out[4 * n + q]; required only when NeedAllOutputs
-    Real* SVMP_RESTRICT gradients_out,  // node n, component c, point q at gradients_out[12 * n + 4 * c + q]; required only when NeedAllOutputs
-    Real* SVMP_RESTRICT hessians_out) {  // node n, component c, point q at hessians_out[36 * n + 4 * c + q]
-    const std::size_t nyz = axis_stride * axis_stride;
-    const std::size_t table_count = 4u * nyz;  // each table holds all 4 points, so it needs 4 * nyz entries
-    if (table_count > kMaxStackYZ || hessians_out == nullptr) {
-        return false;
-    }
-    if constexpr (NeedAllOutputs) {
-        if (values_out == nullptr || gradients_out == nullptr) {
-            return false;
-        }
-    }
-
-    Real Mvv_stack[kMaxStackYZ];  // y-value * z-value
-    Real Mdv_stack[kMaxStackYZ];  // y-first * z-value
-    Real Mvd_stack[kMaxStackYZ];  // y-value * z-first
-    Real Md2v_stack[kMaxStackYZ];  // y-second * z-value
-    Real Mvd2_stack[kMaxStackYZ];  // y-value * z-second
-    Real Mdd_stack[kMaxStackYZ];  // y-first * z-first
-
-    for (std::size_t j = 0; j < axis_stride; ++j) {
-        const Real yv0 = y_batch.values[j];
-        const Real yv1 = y_batch.values[axis_stride + j];
-        const Real yv2 = y_batch.values[2u * axis_stride + j];
-        const Real yv3 = y_batch.values[3u * axis_stride + j];
-        const Real yd0 = y_batch.first[j];
-        const Real yd1 = y_batch.first[axis_stride + j];
-        const Real yd2 = y_batch.first[2u * axis_stride + j];
-        const Real yd3 = y_batch.first[3u * axis_stride + j];
-        const Real y20 = y_batch.second[j];
-        const Real y21 = y_batch.second[axis_stride + j];
-        const Real y22 = y_batch.second[2u * axis_stride + j];
-        const Real y23 = y_batch.second[3u * axis_stride + j];
-
-        for (std::size_t k = 0; k < axis_stride; ++k) {
-            const std::size_t base = (j * axis_stride + k) * 4u;  // the 4 points of one (j, k) pair are adjacent
-            const Real zv0 = z_batch.values[k];
-            const Real zv1 = z_batch.values[axis_stride + k];
-            const Real zv2 = z_batch.values[2u * axis_stride + k];
-            const Real zv3 = z_batch.values[3u * axis_stride + k];
-            const Real zd0 = z_batch.first[k];
-            const Real zd1 = z_batch.first[axis_stride + k];
-            const Real zd2 = z_batch.first[2u * axis_stride + k];
-            const Real zd3 = z_batch.first[3u * axis_stride + k];
-            const Real z20 = z_batch.second[k];
-            const Real z21 = z_batch.second[axis_stride + k];
-            const Real z22 = z_batch.second[2u * axis_stride + k];
-            const Real z23 = z_batch.second[3u * axis_stride + k];
-
-            Mvv_stack[base + 0u] = yv0 * zv0;
-            Mvv_stack[base + 1u] = yv1 * zv1;
-            Mvv_stack[base + 2u] = yv2 * zv2;
-            Mvv_stack[base + 3u] = yv3 * zv3;
-            Mdv_stack[base + 0u] = yd0 * zv0;
-            Mdv_stack[base + 1u] = yd1 * zv1;
-            Mdv_stack[base + 2u] = yd2 * zv2;
-            Mdv_stack[base + 3u] = yd3 * zv3;
-            Mvd_stack[base + 0u] = yv0 * zd0;
-            Mvd_stack[base + 1u] = yv1 * zd1;
-            Mvd_stack[base + 2u] = yv2 * zd2;
-            Mvd_stack[base + 3u] = yv3 * zd3;
-            Md2v_stack[base + 0u] = y20 * zv0;
-            Md2v_stack[base + 1u] = y21 * zv1;
-            Md2v_stack[base + 2u] = y22 * zv2;
-            Md2v_stack[base + 3u] = y23 * zv3;
-            Mvd2_stack[base + 0u] = yv0 * z20;
-            Mvd2_stack[base + 1u] = yv1 * z21;
-            Mvd2_stack[base + 2u] = yv2 * z22;
-            Mvd2_stack[base + 3u] = yv3 * z23;
-            Mdd_stack[base + 0u] = yd0 * zd0;
-            Mdd_stack[base + 1u] = yd1 * zd1;
-            Mdd_stack[base + 2u] = yd2 * zd2;
-            Mdd_stack[base + 3u] = yd3 * zd3;
-        }
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t jk = (idx[1] * axis_stride + idx[2]) * 4u;  // start of this node's 4-point group in each table
-
-        const Real xv0 = x_batch.values[i];
-        const Real xv1 = x_batch.values[axis_stride + i];
-        const Real xv2 = x_batch.values[2u * axis_stride + i];
-        const Real xv3 = x_batch.values[3u * axis_stride + i];
-        const Real xd0 = x_batch.first[i];
-        const Real xd1 = x_batch.first[axis_stride + i];
-        const Real xd2 = x_batch.first[2u * axis_stride + i];
-        const Real xd3 = x_batch.first[3u * axis_stride + i];
-        const Real x20 = x_batch.second[i];
-        const Real x21 = x_batch.second[axis_stride + i];
-        const Real x22 = x_batch.second[2u * axis_stride + i];
-        const Real x23 = x_batch.second[3u * axis_stride + i];
-
-        const Real mvv0 = Mvv_stack[jk + 0u];
-        const Real mvv1 = Mvv_stack[jk + 1u];
-        const Real mvv2 = Mvv_stack[jk + 2u];
-        const Real mvv3 = Mvv_stack[jk + 3u];
-        const Real mdv0 = Mdv_stack[jk + 0u];
-        const Real mdv1 = Mdv_stack[jk + 1u];
-        const Real mdv2 = Mdv_stack[jk + 2u];
-        const Real mdv3 = Mdv_stack[jk + 3u];
-        const Real mvd0 = Mvd_stack[jk + 0u];
-        const Real mvd1 = Mvd_stack[jk + 1u];
-        const Real mvd2 = Mvd_stack[jk + 2u];
-        const Real mvd3 = Mvd_stack[jk + 3u];
-        const Real md2v0 = Md2v_stack[jk + 0u];
-        const Real md2v1 = Md2v_stack[jk + 1u];
-        const Real md2v2 = Md2v_stack[jk + 2u];
-        const Real md2v3 = Md2v_stack[jk + 3u];
-        const Real mvd20 = Mvd2_stack[jk + 0u];
-        const Real mvd21 = Mvd2_stack[jk + 1u];
-        const Real mvd22 = Mvd2_stack[jk + 2u];
-        const Real mvd23 = Mvd2_stack[jk + 3u];
-        const Real mdd0 = Mdd_stack[jk + 0u];
-        const Real mdd1 = Mdd_stack[jk + 1u];
-        const Real mdd2 = Mdd_stack[jk + 2u];
-        const Real mdd3 = Mdd_stack[jk + 3u];
-
-        if constexpr (NeedAllOutputs) {  // fused path: values and gradients reuse the table entries loaded above
-            Real* SVMP_RESTRICT value_row = values_out + node * 4u;
-            value_row[0u] = xv0 * mvv0;
-            value_row[1u] = xv1 * mvv1;
-            value_row[2u] = xv2 * mvv2;
-            value_row[3u] = xv3 * mvv3;
-
-            Real* SVMP_RESTRICT grad_row = gradients_out + node * 12u;
-            grad_row[0u] = xd0 * mvv0;
-            grad_row[1u] = xd1 * mvv1;
-            grad_row[2u] = xd2 * mvv2;
-            grad_row[3u] = xd3 * mvv3;
-            grad_row[4u] = xv0 * mdv0;
-            grad_row[5u] = xv1 * mdv1;
-            grad_row[6u] = xv2 * mdv2;
-            grad_row[7u] = xv3 * mdv3;
-            grad_row[8u] = xv0 * mvd0;
-            grad_row[9u] = xv1 * mvd1;
-            grad_row[10u] = xv2 * mvd2;
-            grad_row[11u] = xv3 * mvd3;
-        }
-
-        const Real hxy0 = xd0 * mdv0;  // each mixed term is computed once and stored in both symmetric positions below
-        const Real hxy1 = xd1 * mdv1;
-        const Real hxy2 = xd2 * mdv2;
-        const Real hxy3 = xd3 * mdv3;
-        const Real hxz0 = xd0 * mvd0;
-        const Real hxz1 = xd1 * mvd1;
-        const Real hxz2 = xd2 * mvd2;
-        const Real hxz3 = xd3 * mvd3;
-        const Real hyz0 = xv0 * mdd0;
-        const Real hyz1 = xv1 * mdd1;
-        const Real hyz2 = xv2 * mdd2;
-        const Real hyz3 = xv3 * mdd3;
-
-        Real* SVMP_RESTRICT hess_row = hessians_out + node * 36u;
-        hess_row[0u] = x20 * mvv0;  // entries 0..3: component 0 (xx) at points 0..3
-        hess_row[1u] = x21 * mvv1;
-        hess_row[2u] = x22 * mvv2;
-        hess_row[3u] = x23 * mvv3;
-        hess_row[4u] = hxy0;  // entries 4..7: component 1 (xy)
-        hess_row[5u] = hxy1;
-        hess_row[6u] = hxy2;
-        hess_row[7u] = hxy3;
-        hess_row[8u] = hxz0;  // entries 8..11: component 2 (xz)
-        hess_row[9u] = hxz1;
-        hess_row[10u] = hxz2;
-        hess_row[11u] = hxz3;
-        hess_row[12u] = hxy0;  // entries 12..15: component 3 (xy again)
-        hess_row[13u] = hxy1;
-        hess_row[14u] = hxy2;
-        hess_row[15u] = hxy3;
-        hess_row[16u] = xv0 * md2v0;  // entries 16..19: component 4 (yy)
-        hess_row[17u] = xv1 * md2v1;
-        hess_row[18u] = xv2 * md2v2;
-        hess_row[19u] = xv3 * md2v3;
-        hess_row[20u] = hyz0;  // entries 20..23: component 5 (yz)
-        hess_row[21u] = hyz1;
-        hess_row[22u] = hyz2;
-        hess_row[23u] = hyz3;
-        hess_row[24u] = hxz0;  // entries 24..27: component 6 (xz again)
-        hess_row[25u] = hxz1;
-        hess_row[26u] = hxz2;
-        hess_row[27u] = hxz3;
-        hess_row[28u] = hyz0;  // entries 28..31: component 7 (yz again)
-        hess_row[29u] = hyz1;
-        hess_row[30u] = hyz2;
-        hess_row[31u] = hyz3;
-        hess_row[32u] = xv0 * mvd20;  // entries 32..35: component 8 (zz)
-        hess_row[33u] = xv1 * mvd21;
-        hess_row[34u] = xv2 * mvd22;
-        hess_row[35u] = xv3 * mvd23;
-    }
-
-    return true;
-}
-
-template<int N>  // N: number of entries along the axis
-constexpr std::size_t line_public_axis_index(std::size_t node) noexcept {  // maps an output node number to its axis index
-    return node == 0u ? 0u : (node == 1u ? static_cast<std::size_t>(N - 1) : node - 1u);  // node 0 -> 0, node 1 -> N-1, node k>=2 -> k-1
-}
-
-template<int N>  // N: number of integer nodes 0..N-1
-constexpr std::array<Real, N> make_line_axis_inv_denoms() noexcept {  // returns 1 / prod_{j != i} (i - j) for each i in 0..N-1
-    std::array<Real, N> inv_denoms{};
-    for (int i = 0; i < N; ++i) {
-        Real denom = Real(1);
-        for (int j = 0; j < N; ++j) {
-            if (j != i) {
-                denom *= static_cast<Real>(i - j);  // signed difference, so the product keeps its sign
-            }
-        }
-        inv_denoms[static_cast<std::size_t>(i)] = Real(1) / denom;
-    }
-    return inv_denoms;
-}
-
-template<int N>  // N: number of entries written to `values`
-void fill_line_values_product(Real x, Real* SVMP_RESTRICT values) {  // values[i] = prod_{j != i} (r - j) / prod_{j != i} (i - j), i.e. Lagrange values on integer nodes 0..N-1
-    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
-    const Real p = static_cast<Real>(N - 1);
-    const Real r = (x + Real(1)) * p * Real(0.5);  // affine map: x = -1 -> r = 0, x = +1 -> r = N-1
-    Real prefix[N];  // prefix[i] = prod_{j < i} (r - j)
-    Real suffix[N];  // suffix[i] = prod_{j > i} (r - j)
-    prefix[0] = Real(1);
-    for (int i = 1; i < N; ++i) {
-        prefix[i] = prefix[i - 1] * (r - static_cast<Real>(i - 1));
-    }
-    suffix[N - 1] = Real(1);
-    for (int i = N - 2; i >= 0; --i) {
-        suffix[i] = suffix[i + 1] * (r - static_cast<Real>(i + 1));
-    }
-    for (int i = 0; i < N; ++i) {
-        const std::size_t slot = static_cast<std::size_t>(i);
-        values[slot] = prefix[i] * suffix[i] * inv_denoms[slot];
-    }
-}
-
-template<int N>  // N: number of entries written to each output array
-void fill_line_values_product_derivatives(Real x,  // same values as fill_line_values_product, plus first/second derivatives with respect to x
-                                          Real* SVMP_RESTRICT values,  // always written (N entries)
-                                          Real* SVMP_RESTRICT first,  // written when non-null
-                                          Real* SVMP_RESTRICT second) {  // written when non-null
-    static constexpr auto inv_denoms = make_line_axis_inv_denoms<N>();
-    const Real p = static_cast<Real>(N - 1);
-    const Real drdx = p * Real(0.5);
-    const Real d2rdx2 = drdx * drdx;  // holds (dr/dx)^2, the chain-rule factor applied to the second derivative below
-    const Real r = (x + Real(1)) * drdx;  // affine map: x = -1 -> r = 0, x = +1 -> r = N-1
-
-    Real prefix[N + 1];  // prefix[i] = prod_{j < i} (r - j)
-    Real prefix_d1[N + 1];  // first derivative of prefix[i] with respect to r
-    Real prefix_d2[N + 1];  // second derivative of prefix[i]; only filled when `second` is requested
-    Real suffix[N + 1];  // suffix[i] = prod_{j >= i} (r - j)
-    Real suffix_d1[N + 1];  // first derivative of suffix[i] with respect to r
-    Real suffix_d2[N + 1];  // second derivative of suffix[i]; only filled when `second` is requested
-
-    const bool need_second = second != nullptr;
-
-    prefix[0] = Real(1);
-    prefix_d1[0] = Real(0);
-    if (need_second) {
-        prefix_d2[0] = Real(0);
-    }
-    for (int i = 0; i < N; ++i) {
-        const Real factor = r - static_cast<Real>(i);
-        prefix[i + 1] = prefix[i] * factor;
-        prefix_d1[i + 1] = prefix_d1[i] * factor + prefix[i];  // product rule; d(factor)/dr = 1
-        if (need_second) {
-            prefix_d2[i + 1] = prefix_d2[i] * factor + Real(2) * prefix_d1[i];
-        }
-    }
-
-    suffix[N] = Real(1);
-    suffix_d1[N] = Real(0);
-    if (need_second) {
-        suffix_d2[N] = Real(0);
-    }
-    for (int i = N - 1; i >= 0; --i) {
-        const Real factor = r - static_cast<Real>(i);
-        suffix[i] = suffix[i + 1] * factor;
-        suffix_d1[i] = suffix_d1[i + 1] * factor + suffix[i + 1];
-        if (need_second) {
-            suffix_d2[i] = suffix_d2[i + 1] * factor + Real(2) * suffix_d1[i + 1];
-        }
-    }
-
-    for (int i = 0; i < N; ++i) {
-        const std::size_t slot = static_cast<std::size_t>(i);
-        const Real inv = inv_denoms[slot];
-        const Real pre = prefix[i];  // factors with j < i
-        const Real suf = suffix[i + 1];  // factors with j > i, so factor i itself is skipped
-        const Real pre_d1 = prefix_d1[i];
-        const Real suf_d1 = suffix_d1[i + 1];
-        values[slot] = pre * suf * inv;
-        if (first != nullptr) {
-            first[slot] = (pre_d1 * suf + pre * suf_d1) * inv * drdx;
-        }
-        if (second != nullptr) {  // the *_d2 arrays are read only here, under the same condition that filled them
-            const Real d2 =
-                prefix_d2[i] * suf +
-                Real(2) * pre_d1 * suf_d1 +
-                pre * suffix_d2[i + 1];
-            second[slot] = d2 * inv * d2rdx2;
-        }
-    }
-}
-
-template<int N>  // N: entries per point along the axis
-void fill_axis_batch_product_q4(  // fills `scratch` with 1D values (and derivatives, per `level`) for exactly 4 points
-    AxisBatchScratch& scratch,  // resized via resizeFor(4 * N, level); point q occupies [q * N, q * N + N)
-    const std::vector<math::Vector<Real, 3>>& points,  // points[0..3] are read without a size check
-    std::size_t component,  // which coordinate of each point is evaluated
-    AxisDeriv level) {
-    constexpr std::size_t axis_stride = static_cast<std::size_t>(N);
-    scratch.resizeFor(4u * axis_stride, level);
-    for (std::size_t q = 0; q < 4u; ++q) {
-        Real* values = scratch.values.data() + q * axis_stride;
-        if (level == AxisDeriv::ValuesOnly) {
-            fill_line_values_product<N>(points[q][component], values);
-        } else {  // any other level fills first derivatives; second derivatives only for ValuesAndFirstAndSecond
-            Real* first = scratch.first.data() + q * axis_stride;
-            Real* second = level == AxisDeriv::ValuesAndFirstAndSecond
-                ? scratch.second.data() + q * axis_stride
-                : nullptr;
-            fill_line_values_product_derivatives<N>(
-                points[q][component], values, first, second);
-        }
-    }
-}
-
-bool try_fill_axis_batch_product_q4(  // runtime dispatch to fill_axis_batch_product_q4<N>; returns false (scratch untouched) when n_axis is not 5..9
-    AxisBatchScratch& scratch,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t component,
-    int n_axis,  // entries per point along the axis; selects the template instantiation
-    AxisDeriv level) {
-    switch (n_axis) {
-        case 5:
-            fill_axis_batch_product_q4<5>(scratch, points, component, level);
-            return true;
-        case 6:
-            fill_axis_batch_product_q4<6>(scratch, points, component, level);
-            return true;
-        case 7:
-            fill_axis_batch_product_q4<7>(scratch, points, component, level);
-            return true;
-        case 8:
-            fill_axis_batch_product_q4<8>(scratch, points, component, level);
-            return true;
-        case 9:
-            fill_axis_batch_product_q4<9>(scratch, points, component, level);
-            return true;
-        default:  // no instantiation for this size; the caller has to use another path
-            return false;
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real q0[N];
-    Real q1[N];
-    Real q2[N];
-    Real q3[N];
-    fill_line_values_product<N>(points[0][0], q0);
-    fill_line_values_product<N>(points[1][0], q1);
-    fill_line_values_product<N>(points[2][0], q2);
-    fill_line_values_product<N>(points[3][0], q3);
-
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        Real* row = values_out + node * output_stride;
-        row[0] = q0[i];
-        row[1] = q1[i];
-        row[2] = q2[i];
-        row[3] = q3[i];
-    }
-}
-
-FE_ALWAYS_INLINE void write_line_order4_values_q(
-    Real x,
-    std::size_t q,
-    Real* SVMP_RESTRICT row0,
-    Real* SVMP_RESTRICT row1,
-    Real* SVMP_RESTRICT row2,
-    Real* SVMP_RESTRICT row3,
-    Real* SVMP_RESTRICT row4) {
-    const Real r = (x + Real(1)) * Real(2);
-    const Real f0 = r;
-    const Real f1 = r - Real(1);
-    const Real f2 = r - Real(2);
-    const Real f3 = r - Real(3);
-    const Real f4 = r - Real(4);
-    const Real f01 = f0 * f1;
-    const Real f12 = f1 * f2;
-    const Real f23 = f2 * f3;
-    const Real f34 = f3 * f4;
-    const Real v0 = (f12 * f34) / Real(24);
-    const Real v1 = -(f0 * f2 * f34) / Real(6);
-    const Real v2 = (f01 * f34) / Real(4);
-    const Real v3 = -(f01 * f2 * f4) / Real(6);
-    const Real v4 = (f01 * f23) / Real(24);
-    row0[q] = v0;
-    row1[q] = v4;
-    row2[q] = v1;
-    row3[q] = v2;
-    row4[q] = v3;
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_line_order4_values_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real* row0 = values_out + 0u * output_stride;
-    Real* row1 = values_out + 1u * output_stride;
-    Real* row2 = values_out + 2u * output_stride;
-    Real* row3 = values_out + 3u * output_stride;
-    Real* row4 = values_out + 4u * output_stride;
-    write_line_order4_values_q(points[0][0], 0u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[1][0], 1u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[2][0], 2u, row0, row1, row2, row3, row4);
-    write_line_order4_values_q(points[3][0], 3u, row0, row1, row2, row3, row4);
-}
-
-SVMP_LAGRANGE_NOINLINE void evaluate_triangle_order1_gradients_strided(
-    std::size_t num_qpts,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    Real* SVMP_RESTRICT row0 = gradients_out + 0u * 3u * output_stride;
-    Real* SVMP_RESTRICT row1 = gradients_out + 1u * 3u * output_stride;
-    Real* SVMP_RESTRICT row2 = gradients_out + 2u * 3u * output_stride;
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        row0[0u * output_stride + q] = Real(-1);
-        row0[1u * output_stride + q] = Real(-1);
-        row0[2u * output_stride + q] = Real(0);
-        row1[0u * output_stride + q] = Real(1);
-        row1[1u * output_stride + q] = Real(0);
-        row1[2u * output_stride + q] = Real(0);
-        row2[0u * output_stride + q] = Real(0);
-        row2[1u * output_stride + q] = Real(1);
-        row2[2u * output_stride + q] = Real(0);
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE void evaluate_line_hessians_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][N];
-    Real second[4][N];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], values[q], nullptr, second[q]);
-    }
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][i], second[1][i],
-                                  second[2][i], second[3][i]);
-    }
-}
-
-template<int N>
-SVMP_LAGRANGE_NOINLINE void evaluate_line_all_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    Real values[4][N];
-    Real first[4][N];
-    Real second[4][N];
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], values[q], first[q], second[q]);
-    }
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        Real* value_row = values_out + node * output_stride;
-        value_row[0] = values[0][i];
-        value_row[1] = values[1][i];
-        value_row[2] = values[2][i];
-        value_row[3] = values[3][i];
-        write_line_gradient_q4_row(gradients_out + node * 3u * output_stride,
-                                   output_stride,
-                                   first[0][i], first[1][i],
-                                   first[2][i], first[3][i]);
-        write_line_hessian_q4_row(hessians_out + node * 9u * output_stride,
-                                  output_stride,
-                                  second[0][i], second[1][i],
-                                  second[2][i], second[3][i]);
-    }
-}
-
-inline void write_quad_product_value_row_q4(
-    Real* SVMP_RESTRICT row,
-    const Real* SVMP_RESTRICT x0,
-    const Real* SVMP_RESTRICT x1,
-    const Real* SVMP_RESTRICT x2,
-    const Real* SVMP_RESTRICT x3,
-    const Real* SVMP_RESTRICT y0,
-    const Real* SVMP_RESTRICT y1,
-    const Real* SVMP_RESTRICT y2,
-    const Real* SVMP_RESTRICT y3,
-    std::size_t i,
-    std::size_t j) {
-    row[0] = x0[i] * y0[j];
-    row[1] = x1[i] * y1[j];
-    row[2] = x2[i] * y2[j];
-    row[3] = x3[i] * y3[j];
-}
-
-template<int N>
-void evaluate_quad_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    Real x0[N];
-    Real x1[N];
-    Real x2[N];
-    Real x3[N];
-    Real y0[N];
-    Real y1[N];
-    Real y2[N];
-    Real y3[N];
-    fill_line_values_product<N>(points[0][0], x0);
-    fill_line_values_product<N>(points[1][0], x1);
-    fill_line_values_product<N>(points[2][0], x2);
-    fill_line_values_product<N>(points[3][0], x3);
-    fill_line_values_product<N>(points[0][1], y0);
-    fill_line_values_product<N>(points[1][1], y1);
-    fill_line_values_product<N>(points[2][1], y2);
-    fill_line_values_product<N>(points[3][1], y3);
-
-    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
-    std::size_t node = 0u;
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, 0u);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, p, 0u);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, p, p);
-    write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                    x0, x1, x2, x3, y0, y1, y2, y3, 0u, p);
-
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                        x0, x1, x2, x3, y0, y1, y2, y3, 0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_quad_product_value_row_q4(values_out + node++ * output_stride,
-                                            x0, x1, x2, x3, y0, y1, y2, y3, i, j);
-        }
-    }
-}
-
-template<int N>
-void evaluate_quad_derivatives_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    Real xv[4][N];
-    Real xd[4][N];
-    Real x2[4][N];
-    Real yv[4][N];
-    Real yd[4][N];
-    Real y2[4][N];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(
-            points[q][0], xv[q], (need_grad || need_hess) ? xd[q] : nullptr,
-            need_hess ? x2[q] : nullptr);
-        fill_line_values_product_derivatives<N>(
-            points[q][1], yv[q], (need_grad || need_hess) ? yd[q] : nullptr,
-            need_hess ? y2[q] : nullptr);
-    }
-
-    constexpr std::size_t p = static_cast<std::size_t>(N - 1);
-    std::size_t node = 0u;
-    auto write_node = [&](std::size_t i, std::size_t j) {
-        Real* value_row = values_out != nullptr ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out != nullptr ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out != nullptr ? hessians_out + node * 9u * output_stride : nullptr;
-        if (grad_row != nullptr) {
-            grad_row[2u * output_stride + 0u] = Real(0);
-            grad_row[2u * output_stride + 1u] = Real(0);
-            grad_row[2u * output_stride + 2u] = Real(0);
-            grad_row[2u * output_stride + 3u] = Real(0);
-        }
-        if (hess_row != nullptr) {
-            hess_row[2u * output_stride + 0u] = Real(0);
-            hess_row[2u * output_stride + 1u] = Real(0);
-            hess_row[2u * output_stride + 2u] = Real(0);
-            hess_row[2u * output_stride + 3u] = Real(0);
-            hess_row[5u * output_stride + 0u] = Real(0);
-            hess_row[5u * output_stride + 1u] = Real(0);
-            hess_row[5u * output_stride + 2u] = Real(0);
-            hess_row[5u * output_stride + 3u] = Real(0);
-            hess_row[6u * output_stride + 0u] = Real(0);
-            hess_row[6u * output_stride + 1u] = Real(0);
-            hess_row[6u * output_stride + 2u] = Real(0);
-            hess_row[6u * output_stride + 3u] = Real(0);
-            hess_row[7u * output_stride + 0u] = Real(0);
-            hess_row[7u * output_stride + 1u] = Real(0);
-            hess_row[7u * output_stride + 2u] = Real(0);
-            hess_row[7u * output_stride + 3u] = Real(0);
-            hess_row[8u * output_stride + 0u] = Real(0);
-            hess_row[8u * output_stride + 1u] = Real(0);
-            hess_row[8u * output_stride + 2u] = Real(0);
-            hess_row[8u * output_stride + 3u] = Real(0);
-        }
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real x_value = xv[q][i];
-            const Real y_value = yv[q][j];
-            if (value_row != nullptr) {
-                value_row[q] = x_value * y_value;
-            }
-            if (grad_row != nullptr) {
-                grad_row[0u * output_stride + q] = xd[q][i] * y_value;
-                grad_row[1u * output_stride + q] = x_value * yd[q][j];
-            }
-            if (hess_row != nullptr) {
-                const Real hxy = xd[q][i] * yd[q][j];
-                hess_row[0u * output_stride + q] = x2[q][i] * y_value;
-                hess_row[1u * output_stride + q] = hxy;
-                hess_row[3u * output_stride + q] = hxy;
-                hess_row[4u * output_stride + q] = x_value * y2[q][j];
-            }
-        }
-        ++node;
-    };
-
-    write_node(0u, 0u);
-    write_node(p, 0u);
-    write_node(p, p);
-    write_node(0u, p);
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_node(i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_node(p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_node(i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_node(0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_node(i, j);
-        }
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE SVMP_LAGRANGE_ALIGN64 void evaluate_quad_order8_gradients_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    constexpr int N = 9;
-    constexpr std::size_t p = 8u;
-    Real xv[4][N];
-    Real xd[4][N];
-    Real yv[4][N];
-    Real yd[4][N];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        fill_line_values_product_derivatives<N>(points[q][0], xv[q], xd[q], nullptr);
-        fill_line_values_product_derivatives<N>(points[q][1], yv[q], yd[q], nullptr);
-    }
-
-    std::size_t node = 0u;
-    auto write_node = [&](std::size_t i, std::size_t j) {
-        Real* SVMP_RESTRICT row = gradients_out + node * 3u * output_stride;
-        row[0u] = xd[0][i] * yv[0][j];
-        row[1u] = xd[1][i] * yv[1][j];
-        row[2u] = xd[2][i] * yv[2][j];
-        row[3u] = xd[3][i] * yv[3][j];
-        row[output_stride + 0u] = xv[0][i] * yd[0][j];
-        row[output_stride + 1u] = xv[1][i] * yd[1][j];
-        row[output_stride + 2u] = xv[2][i] * yd[2][j];
-        row[output_stride + 3u] = xv[3][i] * yd[3][j];
-        row[2u * output_stride + 0u] = Real(0);
-        row[2u * output_stride + 1u] = Real(0);
-        row[2u * output_stride + 2u] = Real(0);
-        row[2u * output_stride + 3u] = Real(0);
-        ++node;
-    };
-
-    write_node(0u, 0u);
-    write_node(p, 0u);
-    write_node(p, p);
-    write_node(0u, p);
-    for (std::size_t i = 1u; i < p; ++i) {
-        write_node(i, 0u);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        write_node(p, j);
-    }
-    for (std::size_t i = p - 1u; i > 0u; --i) {
-        write_node(i, p);
-    }
-    for (std::size_t j = p - 1u; j > 0u; --j) {
-        write_node(0u, j);
-    }
-    for (std::size_t j = 1u; j < p; ++j) {
-        for (std::size_t i = 1u; i < p; ++i) {
-            write_node(i, j);
-        }
-    }
-}
-
-template<int N>
-void evaluate_line_gradients_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT d_coeffs,
-    Real* SVMP_RESTRICT gradients_out) {
-    const Real x0 = points[0][0];
-    const Real x1 = points[1][0];
-    const Real x2 = points[2][0];
-    const Real x3 = points[3][0];
-
-    for (std::size_t node = 0; node < static_cast<std::size_t>(N); ++node) {
-        const std::size_t i = line_public_axis_index<N>(node);
-        const Real* c = d_coeffs + i * static_cast<std::size_t>(N - 1);
-        Real r0 = c[N - 2];
-        Real r1 = c[N - 2];
-        Real r2 = c[N - 2];
-        Real r3 = c[N - 2];
-        for (int k = N - 2; k > 0; --k) {
-            const Real ck = c[k - 1];
-            r0 = r0 * x0 + ck;
-            r1 = r1 * x1 + ck;
-            r2 = r2 * x2 + ck;
-            r3 = r3 * x3 + ck;
-        }
-        Real* row = gradients_out + node * 3u * output_stride;
-        row[0] = r0;
-        row[1] = r1;
-        row[2] = r2;
-        row[3] = r3;
-        row[output_stride + 0u] = Real(0);
-        row[output_stride + 1u] = Real(0);
-        row[output_stride + 2u] = Real(0);
-        row[output_stride + 3u] = Real(0);
-        row[2u * output_stride + 0u] = Real(0);
-        row[2u * output_stride + 1u] = Real(0);
-        row[2u * output_stride + 2u] = Real(0);
-        row[2u * output_stride + 3u] = Real(0);
-    }
-}
-
-bool try_evaluate_line_values_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT v_coeffs,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out) {
-    (void)v_coeffs;
-    switch (n_axis) {
-        case 5:
-            evaluate_line_order4_values_q4(points, output_stride, values_out);
-            return true;
-        case 6:
-            evaluate_line_values_product_q4<6>(points, output_stride, values_out);
-            return true;
-        case 7:
-            evaluate_line_values_product_q4<7>(points, output_stride, values_out);
-            return true;
-        case 8:
-            evaluate_line_values_product_q4<8>(points, output_stride, values_out);
-            return true;
-        case 9:
-            evaluate_line_values_product_q4<9>(points, output_stride, values_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-bool try_evaluate_line_gradients_horner_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* SVMP_RESTRICT d_coeffs,
-    int n_axis,
-    Real* SVMP_RESTRICT gradients_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_gradients_horner_q4<5>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 6:
-            evaluate_line_gradients_horner_q4<6>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 7:
-            evaluate_line_gradients_horner_q4<7>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 8:
-            evaluate_line_gradients_horner_q4<8>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        case 9:
-            evaluate_line_gradients_horner_q4<9>(points, output_stride, d_coeffs, gradients_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_hessians_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_hessians_product_q4<5>(points, output_stride, hessians_out);
-            return true;
-        case 6:
-            evaluate_line_hessians_product_q4<6>(points, output_stride, hessians_out);
-            return true;
-        case 7:
-            evaluate_line_hessians_product_q4<7>(points, output_stride, hessians_out);
-            return true;
-        case 8:
-            evaluate_line_hessians_product_q4<8>(points, output_stride, hessians_out);
-            return true;
-        case 9:
-            evaluate_line_hessians_product_q4<9>(points, output_stride, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_line_all_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_line_all_product_q4<5>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 6:
-            evaluate_line_all_product_q4<6>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 7:
-            evaluate_line_all_product_q4<7>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 8:
-            evaluate_line_all_product_q4<8>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 9:
-            evaluate_line_all_product_q4<9>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_values_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_quad_values_product_q4<5>(points, output_stride, values_out);
-            return true;
-        case 6:
-            evaluate_quad_values_product_q4<6>(points, output_stride, values_out);
-            return true;
-        case 7:
-            evaluate_quad_values_product_q4<7>(points, output_stride, values_out);
-            return true;
-        case 8:
-            evaluate_quad_values_product_q4<8>(points, output_stride, values_out);
-            return true;
-        case 9:
-            evaluate_quad_values_product_q4<9>(points, output_stride, values_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-SVMP_LAGRANGE_NOINLINE bool try_evaluate_quad_derivatives_product_q4(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (n_axis) {
-        case 5:
-            evaluate_quad_derivatives_product_q4<5>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 6:
-            evaluate_quad_derivatives_product_q4<6>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 7:
-            evaluate_quad_derivatives_product_q4<7>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 8:
-            evaluate_quad_derivatives_product_q4<8>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        case 9:
-            evaluate_quad_derivatives_product_q4<9>(
-                points, output_stride, values_out, gradients_out, hessians_out);
-            return true;
-        default:
-            return false;
-    }
-}
-
-void evaluate_tensor_product_points_strided(
-    LagrangeTopology topology,
-    const std::vector<std::array<std::size_t, 3>>& tensor_indices,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* v_coeffs,
-    const Real* d_coeffs,
-    const Real* d2_coeffs,
-    const Real* barycentric_weights,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_qpts = points.size();
-    if (num_qpts == 0 || tensor_indices.empty()) {
-        return;
-    }
-
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    const bool values_only = values_out != nullptr && !need_grad && !need_hess;
-    const bool gradients_only = values_out == nullptr && need_grad && !need_hess;
-    const bool hessians_only = values_out == nullptr && gradients_out == nullptr && need_hess;
-    const bool all_outputs = values_out != nullptr && need_grad && need_hess;
-    const AxisDeriv level = need_hess
-        ? AxisDeriv::ValuesAndFirstAndSecond
-        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
-
-    if (topology == LagrangeTopology::Line && num_qpts == 4u) {
-        if (values_only &&
-            try_evaluate_line_values_horner_q4(
-                points, output_stride, v_coeffs, n_axis, values_out)) {
-            return;
-        }
-        if (gradients_only &&
-            try_evaluate_line_gradients_horner_q4(
-                points, output_stride, d_coeffs, n_axis, gradients_out)) {
-            return;
-        }
-        if (hessians_only &&
-            try_evaluate_line_hessians_product_q4(
-                points, output_stride, n_axis, hessians_out)) {
-            return;
-        }
-        if (all_outputs &&
-            try_evaluate_line_all_product_q4(
-                points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
-            return;
-        }
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        values_only &&
-        num_qpts == 4u &&
-        try_evaluate_quad_values_product_q4(points, output_stride, n_axis, values_out)) {
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis == 5) {
-        evaluate_quad_order4_gradients_q4(points, output_stride, gradients_out);
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis == 9) {
-        evaluate_quad_order8_gradients_product_q4(points, output_stride, gradients_out);
-        return;
-    }
-    if (topology == LagrangeTopology::Quadrilateral &&
-        (gradients_only || hessians_only || all_outputs) &&
-        num_qpts == 4u &&
-        try_evaluate_quad_derivatives_product_q4(
-            points, output_stride, n_axis, values_out, gradients_out, hessians_out)) {
-        return;
-    }
-
-    auto& scratch = evaluate_scratch();
-    AxisBatchScratch& x_batch = scratch.axis_x_batch;
-    AxisBatchScratch& y_batch = scratch.axis_y_batch;
-    AxisBatchScratch& z_batch = scratch.axis_z_batch;
-
-    const bool has_y = topology != LagrangeTopology::Line;
-    const bool has_z = topology == LagrangeTopology::Hexahedron;
-    const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-    const bool use_product_axis_batch =
-        has_z &&
-        gradients_only &&
-        num_qpts == 4u &&
-        n_axis >= 5 &&
-        n_axis <= 9;
-    auto fill_tensor_axis_batch = [&](AxisBatchScratch& batch, std::size_t component) {
-        if (use_product_axis_batch &&
-            try_fill_axis_batch_product_q4(batch, points, component, n_axis, level)) {
-            return;
-        }
-        fill_axis_batch(batch, points, component, v_coeffs, d_coeffs, d2_coeffs,
-                        barycentric_weights, n_axis, level);
-    };
-
-    fill_tensor_axis_batch(x_batch, 0u);
-    if (!has_y) {
-        if (values_only) {
-            if (num_qpts == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const std::size_t i = tensor_indices[node][0];
-                    Real* value_row = values_out + node * output_stride;
-                    value_row[0] = x_batch.values[i];
-                    value_row[1] = x_batch.values[axis_stride + i];
-                    value_row[2] = x_batch.values[2u * axis_stride + i];
-                    value_row[3] = x_batch.values[3u * axis_stride + i];
-                }
-                return;
-            }
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const std::size_t i = tensor_indices[node][0];
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    value_row[q] = x_batch.values[q * axis_stride + i];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const std::size_t i = tensor_indices[node][0];
-                Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    grad_row[0u * output_stride + q] =
-                        x_batch.first[q * axis_stride + i];
-                    grad_row[1u * output_stride + q] = Real(0);
-                    grad_row[2u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-            const std::size_t i = tensor_indices[node][0];
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t q_axis = q * axis_stride + i;
-                if (value_row != nullptr) {
-                    value_row[q] = x_batch.values[q_axis];
-                }
-                if (need_grad) {
-                    grad_row[0u * output_stride + q] = x_batch.first[q_axis];
-                    grad_row[1u * output_stride + q] = Real(0);
-                    grad_row[2u * output_stride + q] = Real(0);
-                }
-                if (need_hess) {
-                    hess_row[0u * output_stride + q] = x_batch.second[q_axis];
-                    hess_row[1u * output_stride + q] = Real(0);
-                    hess_row[2u * output_stride + q] = Real(0);
-                    hess_row[3u * output_stride + q] = Real(0);
-                    hess_row[4u * output_stride + q] = Real(0);
-                    hess_row[5u * output_stride + q] = Real(0);
-                    hess_row[6u * output_stride + q] = Real(0);
-                    hess_row[7u * output_stride + q] = Real(0);
-                    hess_row[8u * output_stride + q] = Real(0);
-                }
-            }
-        }
-        return;
-    }
-    const bool use_tensor_tables =
-        has_z ||
-        (axis_stride == 8u && !(need_hess && values_out == nullptr && gradients_out == nullptr));
-    if (use_tensor_tables) {
-        fill_tensor_axis_batch(y_batch, 1u);
-    } else if (has_y) {
-        fill_tensor_axis_batch(y_batch, 1u);
-    }
-    if (has_z) {
-        fill_tensor_axis_batch(z_batch, 2u);
-    }
-
-    if (use_tensor_tables) {
-        const std::size_t ny = axis_stride;
-        const std::size_t nz = has_z ? axis_stride : 1u;
-        const std::size_t nyz = ny * nz;
-        const std::size_t table_count = num_qpts * nyz;
-
-        if (has_z && num_qpts == 4u && output_stride == 4u) {
-            if (values_only &&
-                evaluate_tensor_product_values_stride4_q4_transposed(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, values_out)) {
-                return;
-            }
-            if (gradients_only &&
-                evaluate_tensor_product_gradients_stride4_q4_transposed(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch, gradients_out)) {
-                return;
-            }
-            if (hessians_only &&
-                evaluate_tensor_product_second_stride4_q4_transposed<false>(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
-                    nullptr, nullptr, hessians_out)) {
-                return;
-            }
-            if (all_outputs &&
-                evaluate_tensor_product_second_stride4_q4_transposed<true>(
-                    tensor_indices, axis_stride, x_batch, y_batch, z_batch,
-                    values_out, gradients_out, hessians_out)) {
-                return;
-            }
-        }
-
-        Real Mvv_stack[kMaxStackYZ];
-        Real Mdv_stack[kMaxStackYZ];
-        Real Mvd_stack[kMaxStackYZ];
-        Real Md2v_stack[kMaxStackYZ];
-        Real Mvd2_stack[kMaxStackYZ];
-        Real Mdd_stack[kMaxStackYZ];
-
-        Real* Mvv;
-        Real* Mdv;
-        Real* Mvd;
-        Real* Md2v;
-        Real* Mvd2;
-        Real* Mdd;
-        if (table_count <= kMaxStackYZ) {
-            Mvv = Mvv_stack;
-            Mdv = Mdv_stack;
-            Mvd = Mvd_stack;
-            Md2v = Md2v_stack;
-            Mvd2 = Mvd2_stack;
-            Mdd = Mdd_stack;
-        } else {
-            auto& tables = scratch.tensor_tables;
-            tables.resizeFor(table_count);
-            Mvv = tables.vv.data();
-            Mdv = tables.dv.data();
-            Mvd = tables.vd.data();
-            Md2v = tables.d2v.data();
-            Mvd2 = tables.vd2.data();
-            Mdd = tables.dd.data();
-        }
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const std::size_t q_axis = q * axis_stride;
-            const std::size_t q_table = q * nyz;
-            for (std::size_t j = 0; j < ny; ++j) {
-                const Real yv = y_batch.values[q_axis + j];
-                const Real yd = (need_grad || need_hess) ? y_batch.first[q_axis + j] : Real(0);
-                const Real y2 = need_hess ? y_batch.second[q_axis + j] : Real(0);
-                for (std::size_t k = 0; k < nz; ++k) {
-                    const std::size_t slot = q_table + j * nz + k;
-                    const Real zv = has_z ? z_batch.values[q_axis + k] : Real(1);
-                    Mvv[slot] = yv * zv;
-                    if (need_grad || need_hess) {
-                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
-                        Mdv[slot] = yd * zv;
-                        Mvd[slot] = yv * zd;
-                    }
-                    if (need_hess) {
-                        const Real zd = has_z ? z_batch.first[q_axis + k] : Real(0);
-                        const Real z2 = has_z ? z_batch.second[q_axis + k] : Real(0);
-                        Md2v[slot] = y2 * zv;
-                        Mvd2[slot] = yv * z2;
-                        Mdd[slot] = yd * zd;
-                    }
-                }
-            }
-        }
-
-        if (values_only) {
-            if (has_z && num_qpts == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-
-                    write_tensor_product_value_strided_q<0>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<1>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<2>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                    write_tensor_product_value_strided_q<3>(
-                        axis_stride, nyz, i, jk, x_batch, Mvv, value_row);
-                }
-                return;
-            }
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const auto& idx = tensor_indices[node];
-                const std::size_t i = idx[0];
-                const std::size_t jk = idx[1] * nz + idx[2];
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t q_axis = q * axis_stride;
-                    const std::size_t slot = q * nyz + jk;
-                    value_row[q] = x_batch.values[q_axis + i] * Mvv[slot];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            if (has_z && num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                        const auto& idx = tensor_indices[node];
-                        const std::size_t i = idx[0];
-                        const std::size_t jk = idx[1] * nz + idx[2];
-                        Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                        write_tensor_product_gradient_stride4_q<0>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<1>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<2>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_stride4_q<3>(
-                            axis_stride, nyz, i, jk, x_batch, Mvv, Mdv, Mvd, grad_row);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                        const auto& idx = tensor_indices[node];
-                        const std::size_t i = idx[0];
-                        const std::size_t jk = idx[1] * nz + idx[2];
-                        Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                        write_tensor_product_gradient_strided_q<0>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<1>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<2>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                        write_tensor_product_gradient_strided_q<3>(
-                            axis_stride, nyz, i, jk, output_stride, x_batch,
-                            Mvv, Mdv, Mvd, grad_row);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                const auto& idx = tensor_indices[node];
-                const std::size_t i = idx[0];
-                const std::size_t jk = idx[1] * nz + idx[2];
-                Real* grad_row = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t q_axis = q * axis_stride;
-                    const std::size_t slot = q * nyz + jk;
-                    const Real xv = x_batch.values[q_axis + i];
-                    const Real xd = x_batch.first[q_axis + i];
-                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
-                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
-                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
-                }
-            }
-            return;
-        }
-
-        if (has_z && num_qpts == 4u && hessians_only) {
-            if (output_stride == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_hessian_stride4_q<0>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<1>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<2>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_stride4_q<3>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                }
-            } else {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_hessian_strided_q<0>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<1>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<2>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                    write_tensor_product_hessian_strided_q<3>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, hess_row);
-                }
-            }
-            return;
-        }
-
-        if (has_z && num_qpts == 4u && all_outputs) {
-            if (output_stride == 4u) {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-                    Real* grad_row = gradients_out + node * 3u * output_stride;
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_all_stride4_q<0>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<1>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<2>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_stride4_q<3>(
-                        axis_stride, nyz, i, jk, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                }
-            } else {
-                for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-                    const auto& idx = tensor_indices[node];
-                    const std::size_t i = idx[0];
-                    const std::size_t jk = idx[1] * nz + idx[2];
-                    Real* value_row = values_out + node * output_stride;
-                    Real* grad_row = gradients_out + node * 3u * output_stride;
-                    Real* hess_row = hessians_out + node * 9u * output_stride;
-
-                    write_tensor_product_all_strided_q<0>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<1>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<2>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                    write_tensor_product_all_strided_q<3>(
-                        axis_stride, nyz, i, jk, output_stride, x_batch,
-                        Mvv, Mdv, Mvd, Md2v, Mvd2, Mdd, value_row, grad_row, hess_row);
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-            const auto& idx = tensor_indices[node];
-            const std::size_t i = idx[0];
-            const std::size_t jk = idx[1] * nz + idx[2];
-
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t q_axis = q * axis_stride;
-                const std::size_t slot = q * nyz + jk;
-                const Real xv = x_batch.values[q_axis + i];
-
-                if (value_row != nullptr) {
-                    value_row[q] = xv * Mvv[slot];
-                }
-
-                if (need_grad) {
-                    const Real xd = x_batch.first[q_axis + i];
-                    grad_row[0u * output_stride + q] = xd * Mvv[slot];
-                    grad_row[1u * output_stride + q] = xv * Mdv[slot];
-                    grad_row[2u * output_stride + q] = xv * Mvd[slot];
-                }
-
-                if (need_hess) {
-                    const Real xd = x_batch.first[q_axis + i];
-                    const Real x2 = x_batch.second[q_axis + i];
-                    const Real hxy = xd * Mdv[slot];
-                    const Real hxz = xd * Mvd[slot];
-                    const Real hyz = xv * Mdd[slot];
-                    hess_row[0u * output_stride + q] = x2 * Mvv[slot];
-                    hess_row[4u * output_stride + q] = xv * Md2v[slot];
-                    hess_row[8u * output_stride + q] = xv * Mvd2[slot];
-                    hess_row[1u * output_stride + q] = hxy;
-                    hess_row[3u * output_stride + q] = hxy;
-                    hess_row[2u * output_stride + q] = hxz;
-                    hess_row[6u * output_stride + q] = hxz;
-                    hess_row[5u * output_stride + q] = hyz;
-                    hess_row[7u * output_stride + q] = hyz;
-                }
-            }
-        }
-        return;
-    }
-
-    for (std::size_t node = 0; node < tensor_indices.size(); ++node) {
-        const auto& idx = tensor_indices[node];
-        const std::size_t i = idx[0];
-        const std::size_t j = idx[1];
-
-        Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* grad_row = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* hess_row = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const std::size_t q_axis = q * axis_stride;
-            const Real xv = x_batch.values[q_axis + i];
-            const Real yv = y_batch.values[q_axis + j];
-
-            if (value_row != nullptr) {
-                value_row[q] = xv * yv;
-            }
-
-            if (need_grad) {
-                const Real xd = x_batch.first[q_axis + i];
-                const Real yd = y_batch.first[q_axis + j];
-                grad_row[0u * output_stride + q] = xd * yv;
-                grad_row[1u * output_stride + q] = xv * yd;
-                grad_row[2u * output_stride + q] = Real(0);
-            }
-
-            if (need_hess) {
-                const Real xd = x_batch.first[q_axis + i];
-                const Real yd = y_batch.first[q_axis + j];
-                const Real x2 = x_batch.second[q_axis + i];
-                const Real y2 = y_batch.second[q_axis + j];
-                const Real hxy = xd * yd;
-
-                hess_row[0u * output_stride + q] = x2 * yv;
-                hess_row[4u * output_stride + q] = xv * y2;
-                hess_row[8u * output_stride + q] = Real(0);
-                hess_row[1u * output_stride + q] = hxy;
-                hess_row[3u * output_stride + q] = hxy;
-                hess_row[2u * output_stride + q] = Real(0);
-                hess_row[6u * output_stride + q] = Real(0);
-                hess_row[5u * output_stride + q] = Real(0);
-                hess_row[7u * output_stride + q] = Real(0);
-            }
-        }
-    }
-}
-
-void evaluate_wedge_points_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<std::array<std::size_t, 2>>& wedge_indices,
-    const std::vector<std::size_t>& wedge_node_by_tri_z,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    const Real* v_coeffs,
-    const Real* d_coeffs,
-    const Real* d2_coeffs,
-    const Real* barycentric_weights,
-    int n_axis,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    if (points.empty() || wedge_indices.empty()) {
-        return;
-    }
-
-    const bool want_values = values_out != nullptr;
-    const bool need_grad = gradients_out != nullptr;
-    const bool need_hess = hessians_out != nullptr;
-    const bool values_only = want_values && !need_grad && !need_hess;
-    const bool gradients_only = !want_values && need_grad && !need_hess;
-    const bool hessians_only = !want_values && !need_grad && need_hess;
-    const bool all_outputs = want_values && need_grad && need_hess;
-    const bool use_batched_wedge =
-        (values_only && order <= 3) ||
-        (gradients_only && order >= 2) ||
-        (hessians_only && order >= 3) ||
-        (all_outputs && order >= 3);
-    if (values_only &&
-        order >= 4 &&
-        order <= 8 &&
-        try_evaluate_wedge_values_product_q4(
-            simplex_exponents, wedge_indices, order, points, output_stride, values_out)) {
-        return;
-    }
-    const AxisDeriv level = need_hess
-        ? AxisDeriv::ValuesAndFirstAndSecond
-        : (need_grad ? AxisDeriv::ValuesAndFirst : AxisDeriv::ValuesOnly);
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const std::size_t tri_count = simplex_exponents.size();
-    if (use_batched_wedge) {
-        const std::size_t num_qpts = points.size();
-        const std::size_t tri_stride = num_qpts;
-        if (num_qpts == 4u &&
-            output_stride == 4u &&
-            (gradients_only || hessians_only || all_outputs) &&
-            order >= 3 &&
-            order <= 8 &&
-            wedge_node_by_tri_z.size() == tri_count * static_cast<std::size_t>(n_axis)) {
-            const bool use_product_axis_batch =
-                gradients_only &&
-                n_axis >= 5 &&
-                n_axis <= 9;
-            if (!use_product_axis_batch ||
-                !try_fill_axis_batch_product_q4(
-                    scratch.axis_z_batch, points, 2u, n_axis, level)) {
-                fill_axis_batch(scratch.axis_z_batch,
-                                points,
-                                2u,
-                                v_coeffs,
-                                d_coeffs,
-                                d2_coeffs,
-                                barycentric_weights,
-                                n_axis,
-                                level);
-            }
-            if (need_hess) {
-                if (try_evaluate_wedge_fused_stride4_q4<true>(
-                        simplex_exponents, wedge_node_by_tri_z, order, points,
-                        scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
-                    return;
-                }
-            } else if (try_evaluate_wedge_fused_stride4_q4<false>(
-                           simplex_exponents, wedge_node_by_tri_z, order, points,
-                           scratch.axis_z_batch, n_axis, values_out, gradients_out, hessians_out)) {
-                return;
-            }
-        }
-
-        const std::size_t tri_values_size = tri_count * tri_stride;
-        scratch.wedge_tri_values_batch.resize(tri_values_size);
-        if (need_grad || need_hess) {
-            scratch.wedge_tri_gradient_batch.resize(tri_count * 2u * tri_stride);
-        }
-        if (need_hess) {
-            scratch.wedge_tri_hessian_batch.resize(tri_count * 3u * tri_stride);
-        }
-
-        detail::evaluate_triangle_simplex_basis_wedge_components_strided(
-            simplex_exponents,
-            order,
-            points,
-            tri_stride,
-            scratch.wedge_tri_values_batch.data(),
-            (need_grad || need_hess) ? scratch.wedge_tri_gradient_batch.data() : nullptr,
-            need_hess ? scratch.wedge_tri_hessian_batch.data() : nullptr);
-
-        const bool use_product_axis_batch =
-            gradients_only &&
-            points.size() == 4u &&
-            n_axis >= 5 &&
-            n_axis <= 9;
-        if (!use_product_axis_batch ||
-            !try_fill_axis_batch_product_q4(
-                scratch.axis_z_batch, points, 2u, n_axis, level)) {
-            fill_axis_batch(scratch.axis_z_batch,
-                            points,
-                            2u,
-                            v_coeffs,
-                            d_coeffs,
-                            d2_coeffs,
-                            barycentric_weights,
-                            n_axis,
-                            level);
-        }
-
-        const std::size_t axis_stride = static_cast<std::size_t>(n_axis);
-        if (all_outputs) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* value_row = values_out + node * output_stride;
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_all_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* value_row = values_out + node * output_stride;
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_all_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                        write_wedge_all_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, value_row, g, H);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* value_row = values_out + node * output_stride;
-                Real* g = gradients_out + node * 3u * output_stride;
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real tri_gx = tri_g[0u * tri_stride + q];
-                    const Real tri_gy = tri_g[1u * tri_stride + q];
-                    const Real tri_hxx = tri_H[0u * tri_stride + q];
-                    const Real tri_hxy = tri_H[1u * tri_stride + q];
-                    const Real tri_hyy = tri_H[2u * tri_stride + q];
-                    const Real hxz = tri_gx * zd;
-                    const Real hxy = tri_hxy * zv;
-                    const Real hyz = tri_gy * zd;
-
-                    value_row[q] = tri_v * zv;
-                    g[0u * output_stride + q] = tri_gx * zv;
-                    g[1u * output_stride + q] = tri_gy * zv;
-                    g[2u * output_stride + q] = tri_v * zd;
-                    H[0u * output_stride + q] = tri_hxx * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_hyy * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-            return;
-        }
-
-        if (hessians_only) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_hessian_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* H = hessians_out + node * 9u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_hessian_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                        write_wedge_hessian_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, tri_H, scratch.axis_z_batch, H);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real tri_gx = tri_g[0u * tri_stride + q];
-                    const Real tri_gy = tri_g[1u * tri_stride + q];
-                    const Real tri_hxx = tri_H[0u * tri_stride + q];
-                    const Real tri_hxy = tri_H[1u * tri_stride + q];
-                    const Real tri_hyy = tri_H[2u * tri_stride + q];
-                    const Real hxz = tri_gx * zd;
-                    const Real hxy = tri_hxy * zv;
-                    const Real hyz = tri_gy * zd;
-
-                    H[0u * output_stride + q] = tri_hxx * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_hyy * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-            return;
-        }
-
-        if (gradients_only) {
-            if (num_qpts == 4u) {
-                if (output_stride == 4u) {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_gradient_stride4_q<0>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<1>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<2>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_stride4_q<3>(
-                            tri_stride, axis_stride, tri, z,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                    }
-                } else {
-                    for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                        const auto& index = wedge_indices[node];
-                        const std::size_t tri = index[0];
-                        const std::size_t z = index[1];
-                        Real* g = gradients_out + node * 3u * output_stride;
-                        const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                        const Real* tri_values = scratch.wedge_tri_values_batch.data();
-
-                        write_wedge_gradient_strided_q<0>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<1>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<2>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                        write_wedge_gradient_strided_q<3>(
-                            tri_stride, axis_stride, tri, z, output_stride,
-                            tri_values, tri_g, scratch.axis_z_batch, g);
-                    }
-                }
-                return;
-            }
-
-            for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-                const auto& index = wedge_indices[node];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                Real* g = gradients_out + node * 3u * output_stride;
-                const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t tri_q = tri * tri_stride + q;
-                    const std::size_t z_q = q * axis_stride + z;
-                    const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                    const Real zv = scratch.axis_z_batch.values[z_q];
-                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
-                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
-                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
-                }
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t tri_q = tri * tri_stride + q;
-                const std::size_t z_q = q * axis_stride + z;
-                const Real tri_v = scratch.wedge_tri_values_batch[tri_q];
-                const Real zv = scratch.axis_z_batch.values[z_q];
-                if (values_out != nullptr) {
-                    value_row[q] = tri_v * zv;
-                }
-
-                if (need_grad) {
-                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                    g[0u * output_stride + q] = tri_g[0u * tri_stride + q] * zv;
-                    g[1u * output_stride + q] = tri_g[1u * tri_stride + q] * zv;
-                    g[2u * output_stride + q] = tri_v * scratch.axis_z_batch.first[z_q];
-                }
-
-                if (need_hess) {
-                    const Real* tri_g = scratch.wedge_tri_gradient_batch.data() + tri * 2u * tri_stride;
-                    const Real* tri_H = scratch.wedge_tri_hessian_batch.data() + tri * 3u * tri_stride;
-                    const Real zd = scratch.axis_z_batch.first[z_q];
-                    const Real hxz = tri_g[0u * tri_stride + q] * zd;
-                    const Real hxy = tri_H[1u * tri_stride + q] * zv;
-                    const Real hyz = tri_g[1u * tri_stride + q] * zd;
-                    H[0u * output_stride + q] = tri_H[0u * tri_stride + q] * zv;
-                    H[1u * output_stride + q] = hxy;
-                    H[2u * output_stride + q] = hxz;
-                    H[3u * output_stride + q] = hxy;
-                    H[4u * output_stride + q] = tri_H[2u * tri_stride + q] * zv;
-                    H[5u * output_stride + q] = hyz;
-                    H[6u * output_stride + q] = hxz;
-                    H[7u * output_stride + q] = hyz;
-                    H[8u * output_stride + q] = tri_v * scratch.axis_z_batch.second[z_q];
-                }
-            }
-        }
-
-        return;
-    }
-
-    scratch.tri_values.resize(tri_count);
-    if (need_grad || need_hess) {
-        scratch.tri_gradient_components.resize(tri_count * 3u);
-    }
-    if (need_hess) {
-        scratch.tri_hessian_components.resize(tri_count * 9u);
-    }
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const AxisBasisEvaluations z_axis =
-            fill_axis_scratch(scratch.axis_z,
-                              v_coeffs,
-                              d_coeffs,
-                              d2_coeffs,
-                              barycentric_weights,
-                              n_axis,
-                              xi[2],
-                              level);
-        detail::evaluate_triangle_simplex_basis_to(
-            simplex_exponents,
-            order,
-            xi,
-            scratch.tri_values.data(),
-            (need_grad || need_hess) ? scratch.tri_gradient_components.data() : nullptr,
-            need_hess ? scratch.tri_hessian_components.data() : nullptr);
-
-        for (std::size_t node = 0; node < wedge_indices.size(); ++node) {
-            const auto& index = wedge_indices[node];
-            const std::size_t tri = index[0];
-            const std::size_t z = index[1];
-            const Real tri_v = scratch.tri_values[tri];
-            const Real zv = z_axis.values[z];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = tri_v * zv;
-            }
-
-            if (need_grad) {
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = tri_g[0] * zv;
-                g[1u * output_stride + q] = tri_g[1] * zv;
-                g[2u * output_stride + q] = tri_v * z_axis.first[z];
-            }
-
-            if (need_hess) {
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real zd = z_axis.first[z];
-                const Real hxz = tri_g[0] * zd;
-                const Real hxy = tri_H[1] * zv;
-                const Real hyz = tri_g[1] * zd;
-                Real* H = hessians_out + node * 9u * output_stride;
-                H[0u * output_stride + q] = tri_H[0] * zv;
-                H[1u * output_stride + q] = hxy;
-                H[2u * output_stride + q] = hxz;
-                H[3u * output_stride + q] = hxy;
-                H[4u * output_stride + q] = tri_H[4] * zv;
-                H[5u * output_stride + q] = hyz;
-                H[6u * output_stride + q] = hxz;
-                H[7u * output_stride + q] = hyz;
-                H[8u * output_stride + q] = tri_v * z_axis.second[z];
-            }
-        }
-    }
-}
-
-NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
-    switch (element_type) {
-        case ElementType::Line3:
-            return {ElementType::Line2, std::max(order, 2)};
-        case ElementType::Triangle6:
-            return {ElementType::Triangle3, std::max(order, 2)};
-        case ElementType::Quad9:
-            return {ElementType::Quad4, std::max(order, 2)};
-        case ElementType::Quad8:
-            throw BasisElementCompatibilityException(
-                "Quad8 is a serendipity element; use SerendipityBasis for Quad8",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Tetra10:
-            return {ElementType::Tetra4, std::max(order, 2)};
-        case ElementType::Hex27:
-            return {ElementType::Hex8, std::max(order, 2)};
-        case ElementType::Hex20:
-            throw BasisElementCompatibilityException(
-                "Hex20 is a serendipity element; use SerendipityBasis for Hex20",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Wedge18:
-            return {ElementType::Wedge6, std::max(order, 2)};
-        case ElementType::Wedge15:
-            throw BasisElementCompatibilityException(
-                "Wedge15 is a serendipity element; use SerendipityBasis for Wedge15",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Pyramid13:
-            throw BasisElementCompatibilityException(
-                "Pyramid13 is a serendipity variant; use SerendipityBasis (Pyramid13) or the complete-family Lagrange path via LagrangeBasis (Pyramid5, order >= 2)",
-                __FILE__, __LINE__, __func__);
-        case ElementType::Pyramid14:
-            return {ElementType::Pyramid5, std::max(order, 2)};
-        default:
-            return {element_type, order};
-    }
-}
-
-} // namespace
-
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
-    evaluate_scratch().prewarm(max_order, max_qpts);
-}
-
-LagrangeBasis::LagrangeBasis(ElementType type, int order)
-    : element_type_(type), dimension_(0), order_(order) {
-    const NormalizedLagrangeRequest normalized = normalize_lagrange_request(element_type_, order_);
-    element_type_ = normalized.element_type;
-    order_ = normalized.order;
-
-    if (order_ < 0) {
-        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
-                                          __FILE__, __LINE__, __func__);
-    }
-
-    dimension_ = lagrange_topology_traits(element_type_).dimension;
-
-    init_nodes();
-    init_evaluation_dispatch();
-}
-
-void LagrangeBasis::init_nodes() {
-    nodes_.clear();
-    nodes_1d_.clear();
-    tensor_indices_.clear();
-    simplex_exponents_.clear();
-    wedge_indices_.clear();
-    wedge_node_by_tri_z_.clear();
-    axis_v_coeffs_.clear();
-    axis_d_coeffs_.clear();
-    axis_d2_coeffs_.clear();
-    axis_barycentric_weights_.clear();
-    const auto topology = lagrange_topology_traits(element_type_).topology;
-    topology_id_ = static_cast<int>(topology);
-    switch (topology) {
-        case LagrangeTopology::Point:
-            build_point_nodes();
-            return;
-        case LagrangeTopology::Line:
-            build_tensor_product_nodes(1);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Quadrilateral:
-            build_tensor_product_nodes(2);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Hexahedron:
-            build_tensor_product_nodes(3);
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Triangle:
-        case LagrangeTopology::Tetrahedron:
-            build_simplex_nodes();
-            return;
-        case LagrangeTopology::Wedge:
-            build_wedge_nodes();
-            compute_axis_monomial_coefficients();
-            return;
-        case LagrangeTopology::Pyramid:
-            build_pyramid_nodes();
-            return;
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
-                                             __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::init_evaluation_dispatch() {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    switch (topology) {
-        case LagrangeTopology::Point:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_point_vectors;
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tensor_product_vectors;
-            return;
-        case LagrangeTopology::Triangle:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_triangle_vectors;
-            return;
-        case LagrangeTopology::Tetrahedron:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_tetrahedron_vectors;
-            return;
-        case LagrangeTopology::Wedge:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_wedge_vectors;
-            return;
-        case LagrangeTopology::Pyramid:
-            vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_pyramid_vectors;
-            return;
-        case LagrangeTopology::Unknown:
-            break;
-    }
-    vector_evaluation_dispatch_ = &LagrangeBasis::evaluate_unsupported_vectors;
-}
-
-void LagrangeBasis::compute_axis_monomial_coefficients() {
-    const int N = static_cast<int>(nodes_1d_.size());
-    if (N == 0) return;
-
-    axis_barycentric_weights_.resize(static_cast<std::size_t>(N));
-    fill_equispaced_barycentric_weights(N, axis_barycentric_weights_.data());
-
-    if (assign_precomputed_axis_coefficients(N, axis_v_coeffs_, axis_d_coeffs_, axis_d2_coeffs_)) {
-        return;
-    }
-
-    axis_v_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N), Real(0));
-    if (N >= 2) {
-        axis_d_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 1), Real(0));
-    }
-    if (N >= 3) {
-        axis_d2_coeffs_.assign(static_cast<std::size_t>(N) * static_cast<std::size_t>(N - 2), Real(0));
-    }
-
-    if (N == 1) {
-        axis_v_coeffs_[0] = Real(1);
-        return;
-    }
-
-    // For each L_i, compute monomial coefficients of P_i(x) = prod_{j != i} (x - x_j),
-    // then divide by w_i = prod_{j != i} (x_i - x_j).
-    std::vector<Real> coeffs;
-    coeffs.reserve(static_cast<std::size_t>(N));
-    for (int i = 0; i < N; ++i) {
-        coeffs.assign(1, Real(1));  // start with constant polynomial 1
-        for (int j = 0; j < N; ++j) {
-            if (j == i) continue;
-            // Multiply (x - x_j) into coeffs (in-place via temp).
-            std::vector<Real> next(coeffs.size() + 1, Real(0));
-            for (std::size_t k = 0; k < coeffs.size(); ++k) {
-                next[k]     -= nodes_1d_[static_cast<std::size_t>(j)] * coeffs[k];
-                next[k + 1] += coeffs[k];
-            }
-            coeffs.swap(next);
-        }
-        // Divide by w_i.
-        Real denom = Real(1);
-        for (int j = 0; j < N; ++j) {
-            if (j == i) continue;
-            denom *= (nodes_1d_[static_cast<std::size_t>(i)] - nodes_1d_[static_cast<std::size_t>(j)]);
-        }
-        const Real inv_denom = Real(1) / denom;
-        for (int k = 0; k < N; ++k) {
-            axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N) + static_cast<std::size_t>(k)]
-                = coeffs[static_cast<std::size_t>(k)] * inv_denom;
-        }
-
-        // First derivative coefficients: d/dx (sum_k c_ik * x^k) = sum_{k>=1} k*c_ik * x^(k-1).
-        if (N >= 2) {
-            for (int k = 1; k < N; ++k) {
-                axis_d_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 1)
-                              + static_cast<std::size_t>(k - 1)]
-                    = static_cast<Real>(k)
-                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
-                                       + static_cast<std::size_t>(k)];
-            }
-        }
-
-        // Second derivative coefficients: d^2/dx^2 = sum_{k>=2} k*(k-1)*c_ik * x^(k-2).
-        if (N >= 3) {
-            for (int k = 2; k < N; ++k) {
-                axis_d2_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N - 2)
-                              + static_cast<std::size_t>(k - 2)]
-                    = static_cast<Real>(k * (k - 1))
-                      * axis_v_coeffs_[static_cast<std::size_t>(i) * static_cast<std::size_t>(N)
-                                       + static_cast<std::size_t>(k)];
-            }
-        }
-    }
-}
-
-void LagrangeBasis::build_point_nodes() {
-    nodes_.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(0)});
-}
-
-void LagrangeBasis::init_equispaced_1d_nodes() {
-    nodes_1d_.clear();
-    for (int i = 0; i <= std::max(order_, 0); ++i) {
-        nodes_1d_.push_back(detail::equispaced_pm_one_coord(i, order_));
-    }
-}
-
-void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
-    init_equispaced_1d_nodes();
-
-    if (dimensions < 1 || dimensions > 3) {
-        throw BasisConfigurationException("LagrangeBasis::build_tensor_product_nodes requires dimension 1, 2, or 3",
-                                          __FILE__, __LINE__, __func__);
-    }
-
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    tensor_indices_.resize(nodes_.size(), TensorNodeIndex{0u, 0u, 0u});
-    for (std::size_t n = 0; n < nodes_.size(); ++n) {
-        tensor_indices_[n][0] = lattice_index_pm_one(
-            nodes_[n][0], order_,
-            "LagrangeBasis: invalid tensor-product x-coordinate in public node ordering");
-        if (dimensions >= 2) {
-            tensor_indices_[n][1] = lattice_index_pm_one(
-                nodes_[n][1], order_,
-                "LagrangeBasis: invalid tensor-product y-coordinate in public node ordering");
-        }
-        if (dimensions == 3) {
-            tensor_indices_[n][2] = lattice_index_pm_one(
-                nodes_[n][2], order_,
-                "LagrangeBasis: invalid tensor-product z-coordinate in public node ordering");
-        }
-    }
-}
-
-void LagrangeBasis::build_simplex_nodes() {
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    simplex_exponents_.clear();
-    simplex_exponents_.reserve(nodes_.size());
-    for (const auto& node : nodes_) {
-        switch (topology) {
-            case LagrangeTopology::Triangle:
-                simplex_exponents_.push_back(triangle_exponents_from_public_node(node, order_));
-                break;
-            case LagrangeTopology::Tetrahedron:
-                simplex_exponents_.push_back(tetrahedron_exponents_from_public_node(node, order_));
-                break;
-            default:
-                throw BasisElementCompatibilityException("LagrangeBasis::build_simplex_nodes requires simplex topology",
-                                                         __FILE__, __LINE__, __func__);
-        }
-    }
-}
-
-void LagrangeBasis::build_wedge_nodes() {
-    init_equispaced_1d_nodes();
-    const auto triangle_nodes = ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Triangle3, order_);
-    simplex_exponents_.clear();
-    simplex_exponents_.reserve(triangle_nodes.size());
-    std::unordered_map<std::array<int, 4>, std::size_t, SimplexExponentHash> triangle_index_by_exponent;
-    triangle_index_by_exponent.reserve(triangle_nodes.size());
-    for (std::size_t tri = 0; tri < triangle_nodes.size(); ++tri) {
-        const auto exponents = triangle_exponents_from_public_node(triangle_nodes[tri], order_);
-        simplex_exponents_.push_back(exponents);
-        const auto inserted = triangle_index_by_exponent.emplace(exponents, tri);
-        if (!inserted.second) {
-            throw BasisNodeOrderingException("LagrangeBasis: duplicate wedge triangle descriptor",
-                                             __FILE__, __LINE__, __func__);
-        }
-    }
-
-    nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
-    wedge_indices_.clear();
-    wedge_indices_.reserve(nodes_.size());
-    const std::size_t z_count = static_cast<std::size_t>(order_ + 1);
-    const std::size_t missing_node = nodes_.size();
-    wedge_node_by_tri_z_.assign(triangle_nodes.size() * z_count, missing_node);
-    for (std::size_t node_index = 0; node_index < nodes_.size(); ++node_index) {
-        const auto& node = nodes_[node_index];
-        const auto exponents = triangle_exponents_from_public_node(node, order_);
-        const auto found = triangle_index_by_exponent.find(exponents);
-        if (found == triangle_index_by_exponent.end()) {
-            throw BasisNodeOrderingException("LagrangeBasis: failed to resolve wedge triangle descriptor in public ordering",
-                                             __FILE__, __LINE__, __func__);
-        }
-        const std::size_t tri = found->second;
-        const std::size_t z =
-            lattice_index_pm_one(node[2], order_,
-                                 "LagrangeBasis: invalid wedge z-coordinate in public node ordering");
-        wedge_indices_.push_back(WedgeNodeIndex{tri, z});
-        wedge_node_by_tri_z_[tri * z_count + z] = node_index;
-    }
-    for (std::size_t entry = 0; entry < wedge_node_by_tri_z_.size(); ++entry) {
-        if (wedge_node_by_tri_z_[entry] == missing_node) {
-            throw BasisNodeOrderingException("LagrangeBasis: incomplete wedge tensor-product node map",
-                                             __FILE__, __LINE__, __func__);
-        }
-    }
-}
-
-void LagrangeBasis::build_pyramid_nodes() {
-    nodes_ = detail::lagrange_pyramid::nodes(order_);
-}
-
-void LagrangeBasis::evaluate_point_vectors(const math::Vector<Real, 3>&,
-                                           std::vector<Real>* values,
-                                           std::vector<Gradient>* gradients,
-                                           std::vector<Hessian>* hessians) const {
-    if (values != nullptr) {
-        values->resize(1u);
-        (*values)[0] = Real(1);
-    }
-    if (gradients != nullptr) {
-        gradients->resize(1u);
-        (*gradients)[0] = Gradient{};
-    }
-    if (hessians != nullptr) {
-        hessians->resize(1u);
-        (*hessians)[0] = Hessian{};
-    }
-}
-
-void LagrangeBasis::evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
-                                                    std::vector<Real>* values,
-                                                    std::vector<Gradient>* gradients,
-                                                    std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
-                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
-                                                  : AxisDeriv::ValuesOnly;
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const AxisBasisEvaluations x_axis =
-        fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], level);
-    AxisBasisEvaluations y_axis = constant_axis_basis();
-    AxisBasisEvaluations z_axis = constant_axis_basis();
-
-    if (topology != LagrangeTopology::Line) {
-        y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], level);
-    }
-    if (topology == LagrangeTopology::Hexahedron) {
-        z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
-    }
-
-    evaluate_tensor_product_factorized(tensor_indices_, x_axis, y_axis, z_axis,
-                                       values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
-                                              std::vector<Real>* values,
-                                              std::vector<Gradient>* gradients,
-                                              std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-    detail::evaluate_triangle_simplex_basis(simplex_exponents_, order_, xi,
-                                            values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
-                                                 std::vector<Real>* values,
-                                                 std::vector<Gradient>* gradients,
-                                                 std::vector<Hessian>* hessians) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast(topology, order_, xi, values, gradients, hessians)) {
-        return;
-    }
-    detail::evaluate_tetrahedron_simplex_basis(simplex_exponents_, order_, xi,
-                                               values, gradients, hessians);
-}
-
-void LagrangeBasis::evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
-                                           std::vector<Real>* values,
-                                           std::vector<Gradient>* gradients,
-                                           std::vector<Hessian>* hessians) const {
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    const AxisDeriv level = hessians != nullptr ? AxisDeriv::ValuesAndFirstAndSecond
-                           : gradients != nullptr ? AxisDeriv::ValuesAndFirst
-                                                  : AxisDeriv::ValuesOnly;
-
-    LagrangeEvaluateScratch& scratch = evaluate_scratch();
-    const AxisBasisEvaluations z_axis =
-        fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], level);
-
-    if (hessians != nullptr) {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, &scratch.tri_gradients, &scratch.tri_hessians);
-    } else if (gradients != nullptr) {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, &scratch.tri_gradients, nullptr);
-    } else {
-        detail::evaluate_triangle_simplex_basis(
-            simplex_exponents_, order_, xi,
-            &scratch.tri_values, nullptr, nullptr);
-    }
-
-    const std::size_t n_nodes = wedge_indices_.size();
-    if (values != nullptr) {
-        values->resize(n_nodes);
-    }
-    if (gradients != nullptr) {
-        gradients->resize(n_nodes);
-    }
-    if (hessians != nullptr) {
-        hessians->resize(n_nodes);
-    }
-
-    for (std::size_t n = 0; n < n_nodes; ++n) {
-        const auto& index = wedge_indices_[n];
-        const std::size_t tri_idx = index[0];
-        const std::size_t z_idx = index[1];
-        const Real zv = z_axis.values[z_idx];
-        const Real tri_v = scratch.tri_values[tri_idx];
-
-        if (values != nullptr) {
-            (*values)[n] = tri_v * zv;
-        }
-        if (gradients != nullptr) {
-            const Real zd = z_axis.first[z_idx];
-            (*gradients)[n][0] = scratch.tri_gradients[tri_idx][0] * zv;
-            (*gradients)[n][1] = scratch.tri_gradients[tri_idx][1] * zv;
-            (*gradients)[n][2] = tri_v * zd;
-        }
-        if (hessians != nullptr) {
-            const Real zd = z_axis.first[z_idx];
-            const Real zd2 = z_axis.second[z_idx];
-            Hessian H{};
-            H(0, 0) = scratch.tri_hessians[tri_idx](0, 0) * zv;
-            H(1, 1) = scratch.tri_hessians[tri_idx](1, 1) * zv;
-            H(0, 1) = scratch.tri_hessians[tri_idx](0, 1) * zv;
-            H(1, 0) = H(0, 1);
-            H(2, 2) = tri_v * zd2;
-            H(0, 2) = scratch.tri_gradients[tri_idx][0] * zd;
-            H(2, 0) = H(0, 2);
-            H(1, 2) = scratch.tri_gradients[tri_idx][1] * zd;
-            H(2, 1) = H(1, 2);
-            (*hessians)[n] = H;
-        }
-    }
-}
-
-void LagrangeBasis::evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
-                                             std::vector<Real>* values,
-                                             std::vector<Gradient>* gradients,
-                                             std::vector<Hessian>* hessians) const {
-    if (values != nullptr && gradients != nullptr && hessians != nullptr) {
-        detail::lagrange_pyramid::evaluate_all(order_, xi, *values, *gradients, *hessians);
-        return;
-    }
-    if (values != nullptr) {
-        detail::lagrange_pyramid::evaluate_values(order_, xi, *values);
-    }
-    if (gradients != nullptr) {
-        detail::lagrange_pyramid::evaluate_gradients(order_, xi, *gradients);
-    }
-    if (hessians != nullptr) {
-        detail::lagrange_pyramid::evaluate_hessians(order_, xi, *hessians);
-    }
-}
-
-void LagrangeBasis::evaluate_unsupported_vectors(const math::Vector<Real, 3>&,
-                                                 std::vector<Real>*,
-                                                 std::vector<Gradient>*,
-                                                 std::vector<Hessian>*) const {
-    throw BasisEvaluationException("Unsupported element in LagrangeBasis vector evaluation",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_values(const math::Vector<Real, 3>& xi,
-                                    std::vector<Real>& values) const {
-    (this->*vector_evaluation_dispatch_)(xi, &values, nullptr, nullptr);
-}
-
-void LagrangeBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                       std::vector<Gradient>& gradients) const {
-    (this->*vector_evaluation_dispatch_)(xi, nullptr, &gradients, nullptr);
-}
-
-void LagrangeBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                      std::vector<Hessian>& hessians) const {
-    (this->*vector_evaluation_dispatch_)(xi, nullptr, nullptr, &hessians);
-}
-
-void LagrangeBasis::evaluate_all(const math::Vector<Real, 3>& xi,
-                                 std::vector<Real>& values,
-                                 std::vector<Gradient>& gradients,
-                                 std::vector<Hessian>& hessians) const {
-    (this->*vector_evaluation_dispatch_)(xi, &values, &gradients, &hessians);
-}
-
-void LagrangeBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, nullptr, nullptr)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            values_out[0] = Real(1);
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesOnly);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesOnly);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  values_out, nullptr, nullptr);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       values_out, nullptr, nullptr);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          values_out, nullptr, nullptr);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesOnly);
-            scratch.tri_values.resize(simplex_exponents_.size());
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(), nullptr, nullptr);
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                values_out[n] = scratch.tri_values[index[0]] * z_axis.values[index[1]];
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_values_to(order_, xi, values_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_values_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, gradients_out, nullptr)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            gradients_out[0] = Real(0);
-            gradients_out[1] = Real(0);
-            gradients_out[2] = Real(0);
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirst);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirst);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  nullptr, gradients_out, nullptr);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       nullptr, gradients_out, nullptr);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          nullptr, gradients_out, nullptr);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirst);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       nullptr);
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                Real* g = gradients_out + n * 3u;
-                g[0] = tri_g[0] * z_axis.values[z];
-                g[1] = tri_g[1] * z_axis.values[z];
-                g[2] = scratch.tri_values[tri] * z_axis.first[z];
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_gradients_to(order_, xi, gradients_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_gradients_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
                                          Real* SVMP_RESTRICT hessians_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, nullptr, nullptr, hessians_out)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            for (std::size_t i = 0; i < 9; ++i) {
-                hessians_out[i] = Real(0);
-            }
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  nullptr, nullptr, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       nullptr, nullptr, hessians_out);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          nullptr, nullptr, hessians_out);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            scratch.tri_hessian_components.resize(tri_count * 9u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       scratch.tri_hessian_components.data());
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real zv = z_axis.values[z];
-                const Real zd = z_axis.first[z];
-                const Real zd2 = z_axis.second[z];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real hxy = tri_H[1] * zv;
-                const Real hxz = tri_g[0] * zd;
-                const Real hyz = tri_g[1] * zd;
-                Real* H = hessians_out + n * 9u;
-                H[0] = tri_H[0] * zv;
-                H[4] = tri_H[4] * zv;
-                H[1] = hxy;
-                H[3] = hxy;
-                H[8] = scratch.tri_values[tri] * zd2;
-                H[2] = hxz;
-                H[6] = hxz;
-                H[5] = hyz;
-                H[7] = hyz;
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_hessians_to(order_, xi, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_hessians_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_to(topology, order_, xi, values_out, gradients_out, hessians_out)) {
-        return;
-    }
-
-    const int n_axis = static_cast<int>(nodes_1d_.size());
-    const Real* vc = axis_v_coeffs_.data();
-    const Real* dc = axis_d_coeffs_.data();
-    const Real* d2c = axis_d2_coeffs_.data();
-    const Real* bw = axis_barycentric_weights_.data();
-    switch (topology) {
-        case LagrangeTopology::Point:
-            values_out[0] = Real(1);
-            gradients_out[0] = Real(0);
-            gradients_out[1] = Real(0);
-            gradients_out[2] = Real(0);
-            for (std::size_t i = 0; i < 9; ++i) {
-                hessians_out[i] = Real(0);
-            }
-            return;
-        case LagrangeTopology::Line:
-        case LagrangeTopology::Quadrilateral:
-        case LagrangeTopology::Hexahedron: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations x_axis =
-                fill_axis_scratch(scratch.axis_x, vc, dc, d2c, bw, n_axis, xi[0], AxisDeriv::ValuesAndFirstAndSecond);
-            AxisBasisEvaluations y_axis = constant_axis_basis();
-            AxisBasisEvaluations z_axis = constant_axis_basis();
-            if (topology != LagrangeTopology::Line) {
-                y_axis = fill_axis_scratch(scratch.axis_y, vc, dc, d2c, bw, n_axis, xi[1], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            if (topology == LagrangeTopology::Hexahedron) {
-                z_axis = fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            }
-            evaluate_tensor_product_factorized_to(tensor_indices_, x_axis, y_axis, z_axis,
-                                                  values_out, gradients_out, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Triangle:
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       values_out, gradients_out, hessians_out);
-            return;
-        case LagrangeTopology::Tetrahedron:
-            detail::evaluate_tetrahedron_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                          values_out, gradients_out, hessians_out);
-            return;
-        case LagrangeTopology::Wedge: {
-            LagrangeEvaluateScratch& scratch = evaluate_scratch();
-            const AxisBasisEvaluations z_axis =
-                fill_axis_scratch(scratch.axis_z, vc, dc, d2c, bw, n_axis, xi[2], AxisDeriv::ValuesAndFirstAndSecond);
-            const std::size_t tri_count = simplex_exponents_.size();
-            scratch.tri_values.resize(tri_count);
-            scratch.tri_gradient_components.resize(tri_count * 3u);
-            scratch.tri_hessian_components.resize(tri_count * 9u);
-            detail::evaluate_triangle_simplex_basis_to(simplex_exponents_, order_, xi,
-                                                       scratch.tri_values.data(),
-                                                       scratch.tri_gradient_components.data(),
-                                                       scratch.tri_hessian_components.data());
-            for (std::size_t n = 0; n < wedge_indices_.size(); ++n) {
-                const auto& index = wedge_indices_[n];
-                const std::size_t tri = index[0];
-                const std::size_t z = index[1];
-                const Real zv = z_axis.values[z];
-                const Real zd = z_axis.first[z];
-                const Real zd2 = z_axis.second[z];
-                const Real tri_v = scratch.tri_values[tri];
-                const Real* tri_g = scratch.tri_gradient_components.data() + tri * 3u;
-                const Real* tri_H = scratch.tri_hessian_components.data() + tri * 9u;
-                const Real hxy = tri_H[1] * zv;
-                const Real hxz = tri_g[0] * zd;
-                const Real hyz = tri_g[1] * zd;
-
-                values_out[n] = tri_v * zv;
-
-                Real* g = gradients_out + n * 3u;
-                g[0] = tri_g[0] * zv;
-                g[1] = tri_g[1] * zv;
-                g[2] = tri_v * zd;
-
-                Real* H = hessians_out + n * 9u;
-                H[0] = tri_H[0] * zv;
-                H[4] = tri_H[4] * zv;
-                H[1] = hxy;
-                H[3] = hxy;
-                H[8] = tri_v * zd2;
-                H[2] = hxz;
-                H[6] = hxz;
-                H[5] = hyz;
-                H[7] = hyz;
-            }
-            return;
-        }
-        case LagrangeTopology::Pyramid: {
-            detail::lagrange_pyramid::evaluate_all_to(
-                order_, xi, values_out, gradients_out, hessians_out);
-            return;
-        }
-        case LagrangeTopology::Unknown:
-            break;
-    }
-
-    throw BasisEvaluationException("Unsupported element in evaluate_all_to",
-                                   __FILE__, __LINE__, __func__);
-}
-
-void LagrangeBasis::evaluate_at_quadrature_points(
-    const std::vector<math::Vector<Real, 3>>& points,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_at_quadrature_points_strided(points, points.size(), values_out, gradients_out, hessians_out);
-}
-
-void LagrangeBasis::evaluate_at_quadrature_points_strided(
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) const {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = size();
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException("LagrangeBasis strided evaluation requires output_stride >= points.size()",
-                                          __FILE__, __LINE__, __func__);
-    }
-    if (values_out == nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-        return;
-    }
-
-    const auto topology = static_cast<LagrangeTopology>(topology_id_);
-    if (evaluate_fixed_lagrange_fast_strided(topology,
-                                             order_,
-                                             points,
-                                             output_stride,
-                                             values_out,
-                                             gradients_out,
-                                             hessians_out)) {
-        return;
-    }
-
-    if (topology == LagrangeTopology::Line ||
-        topology == LagrangeTopology::Quadrilateral ||
-        topology == LagrangeTopology::Hexahedron) {
-        evaluate_tensor_product_points_strided(topology,
-                                               tensor_indices_,
-                                               points,
-                                               output_stride,
-                                               axis_v_coeffs_.data(),
-                                               axis_d_coeffs_.data(),
-                                               axis_d2_coeffs_.data(),
-                                               axis_barycentric_weights_.data(),
-                                               static_cast<int>(nodes_1d_.size()),
-                                               values_out,
-                                               gradients_out,
-                                               hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Triangle) {
-        detail::evaluate_triangle_simplex_basis_strided(
-            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Tetrahedron) {
-        detail::evaluate_tetrahedron_simplex_basis_strided(
-            simplex_exponents_, order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Wedge &&
-        evaluate_wedge_fast_strided(order_,
-                                    wedge_indices_,
-                                    points,
-                                    output_stride,
-                                    values_out,
-                                    gradients_out,
-                                    hessians_out)) {
-        return;
-    }
-
-    const bool wedge_scalar_hessian_fallback =
-        topology == LagrangeTopology::Wedge &&
-        values_out == nullptr &&
-        gradients_out == nullptr &&
-        hessians_out != nullptr &&
-        order_ <= 2;
-    if (topology == LagrangeTopology::Wedge && !wedge_scalar_hessian_fallback) {
-        evaluate_wedge_points_strided(simplex_exponents_,
-                                      wedge_indices_,
-                                      wedge_node_by_tri_z_,
-                                      order_,
-                                      points,
-                                      output_stride,
-                                      axis_v_coeffs_.data(),
-                                      axis_d_coeffs_.data(),
-                                      axis_d2_coeffs_.data(),
-                                      axis_barycentric_weights_.data(),
-                                      static_cast<int>(nodes_1d_.size()),
-                                      values_out,
-                                      gradients_out,
-                                      hessians_out);
-        return;
-    }
-
-    if (topology == LagrangeTopology::Pyramid) {
-        detail::lagrange_pyramid::evaluate_at_quadrature_points_strided(
-            order_, points, output_stride, values_out, gradients_out, hessians_out);
-        return;
-    }
-
-    auto& scratch = evaluate_scratch();
-    auto& v_tmp = scratch.strided_values_tmp;
-    auto& g_tmp = scratch.strided_gradients_tmp;
-    auto& h_tmp = scratch.strided_hessians_tmp;
-
-    if (values_out)    v_tmp.resize(num_dofs);
-    if (gradients_out) g_tmp.resize(num_dofs * 3u);
-    if (hessians_out)  h_tmp.resize(num_dofs * 9u);
-
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out && gradients_out && hessians_out) {
-            evaluate_all_to(points[q], v_tmp.data(), g_tmp.data(), h_tmp.data());
-        } else {
-            if (values_out)    evaluate_values_to(points[q], v_tmp.data());
-            if (gradients_out) evaluate_gradients_to(points[q], g_tmp.data());
-            if (hessians_out)  evaluate_hessians_to(points[q], h_tmp.data());
-        }
-
-        if (values_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                values_out[d * output_stride + q] = v_tmp[d];
-            }
-        }
-        if (gradients_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                gradients_out[(d * 3u + 0u) * output_stride + q] = g_tmp[d * 3u + 0u];
-                gradients_out[(d * 3u + 1u) * output_stride + q] = g_tmp[d * 3u + 1u];
-                gradients_out[(d * 3u + 2u) * output_stride + q] = g_tmp[d * 3u + 2u];
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t d = 0; d < num_dofs; ++d) {
-                scatter_hessian_components_strided(
-                    h_tmp.data() + d * 9u,
-                    hessians_out + d * 9u * output_stride,
-                    output_stride,
-                    q);
-            }
-        }
-    }
+    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 91f7e379c..a5fe8e0fa 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -8,12 +8,9 @@
 #ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
 #define SVMP_FE_BASIS_LAGRANGEBASIS_H
 
-/**
- * @file LagrangeBasis.h
- * @brief Nodal Lagrange polynomial basis on reference elements
- */
-
 #include "BasisFunction.h"
+#include "BasisTraits.h"
+
 #include <array>
 #include <cstddef>
 
@@ -23,33 +20,12 @@ namespace basis {
 
 void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);
 
-/**
- * @brief Complete nodal H1 Lagrange basis on canonical reference topologies
- *
- * Supports arbitrary polynomial order on the canonical complete families:
- * `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`, `Wedge6`, and `Pyramid5`.
- * Low-order complete-family aliases (`Line3`, `Triangle6`, `Quad9`,
- * `Tetra10`, `Hex27`, `Wedge18`, `Pyramid14`) normalize to their canonical
- * topology plus order. Serendipity variants remain intentionally excluded.
- *
- * Node locations are generated on canonical reference elements using
- * equispaced coordinates on tensor-product elements, barycentric grids on
- * simplices, tensorized triangle-line grids on wedges, and a rational nodal
- * pyramid construction on `Pyramid5`.
- *
- * The evaluator is numerically stabilized for those nodes, but the
- * interpolation problem itself remains the equispaced Lagrange problem. For
- * high-order interpolation, especially order >= 4, prefer `SpectralBasis`
- * (GLL / Warp & Blend nodes) unless exact equispaced nodal placement is part
- * of the requested discretization.
- *
- * For the rational pyramid family, basis values remain exact at the apex.
- * Gradients and Hessians are analytic on the supported interior reference
- * domain, but the exact-apex nodal derivative limit is not unique and those
- * derivative queries throw at the exact apex.
- */
 class LagrangeBasis : public BasisFunction {
 public:
+    using TensorNodeIndex = std::array<std::size_t, 3>;
+    using SimplexExponent = std::array<int, 4>;
+    using WedgeNodeIndex = std::array<std::size_t, 2>;
+
     LagrangeBasis(ElementType type, int order);
 
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
@@ -57,7 +33,6 @@ class LagrangeBasis : public BasisFunction {
     int dimension() const noexcept override { return dimension_; }
     int order() const noexcept override { return order_; }
     std::size_t size() const noexcept override { return nodes_.size(); }
-    bool cache_identity_is_structural() const noexcept override { return true; }
 
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
@@ -72,96 +47,32 @@ class LagrangeBasis : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    void evaluate_at_quadrature_points(
-        const std::vector<math::Vector<Real, 3>>& points,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const final;
-    void evaluate_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) const final;
-
-    // Raw-pointer output API. Caller must pre-size buffers to size().
-    void evaluate_values_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT values_out) const final;
-    void evaluate_gradients_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT gradients_out) const final;
-    void evaluate_hessians_to(const math::Vector<Real, 3>& xi, Real* SVMP_RESTRICT hessians_out) const final;
+    void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                            Real* SVMP_RESTRICT values_out) const final;
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                               Real* SVMP_RESTRICT gradients_out) const final;
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT hessians_out) const final;
 
 private:
-    using TensorNodeIndex = std::array<std::size_t, 3>;
-    using WedgeNodeIndex = std::array<std::size_t, 2>;
-    using VectorEvaluationDispatch = void (LagrangeBasis::*)(
-        const math::Vector<Real, 3>&,
-        std::vector<Real>*,
-        std::vector<Gradient>*,
-        std::vector<Hessian>*) const;
-
-    // Cached topology encoded as int because the topology enum lives in
-    // the .cpp anon namespace. Set once in init_nodes.
-    int topology_id_ = 0;
-
     ElementType element_type_;
-    int dimension_;
-    int order_;
+    BasisTopology topology_{BasisTopology::Unknown};
+    int dimension_{0};
+    int order_{0};
 
     std::vector<Real> nodes_1d_;
     std::vector<math::Vector<Real, 3>> nodes_;
     std::vector<TensorNodeIndex> tensor_indices_;
-    std::vector<std::array<int, 4>> simplex_exponents_;
+    std::vector<SimplexExponent> simplex_exponents_;
     std::vector<WedgeNodeIndex> wedge_indices_;
-    std::vector<std::size_t> wedge_node_by_tri_z_;
-
-    // Precomputed Horner-form coefficients of the 1D Lagrange basis.
-    // Layout per axis (n_axis = nodes_1d_.size() = order_+1):
-    //   axis_v_coeffs_[i * n_axis + k] = coeff of x^k in L_i(x), 0 <= i,k < n_axis
-    //   axis_d_coeffs_[i * (n_axis - 1) + k] = coeff of x^k in L_i'(x)
-    //   axis_d2_coeffs_[i * (n_axis - 2) + k] = coeff of x^k in L_i''(x)  (only if n_axis >= 3)
-    // Populated by build_tensor_product_nodes / build_wedge_nodes.
-    std::vector<Real> axis_v_coeffs_;
-    std::vector<Real> axis_d_coeffs_;
-    std::vector<Real> axis_d2_coeffs_;
-    std::vector<Real> axis_barycentric_weights_;
-    VectorEvaluationDispatch vector_evaluation_dispatch_{nullptr};
 
     void init_nodes();
-    void init_evaluation_dispatch();
     void build_point_nodes();
     void build_tensor_product_nodes(int dimensions);
     void build_simplex_nodes();
     void build_wedge_nodes();
-    void build_pyramid_nodes();
     void init_equispaced_1d_nodes();
-    void compute_axis_monomial_coefficients();
-    void evaluate_point_vectors(const math::Vector<Real, 3>& xi,
-                                std::vector<Real>* values,
-                                std::vector<Gradient>* gradients,
-                                std::vector<Hessian>* hessians) const;
-    void evaluate_tensor_product_vectors(const math::Vector<Real, 3>& xi,
-                                         std::vector<Real>* values,
-                                         std::vector<Gradient>* gradients,
-                                         std::vector<Hessian>* hessians) const;
-    void evaluate_triangle_vectors(const math::Vector<Real, 3>& xi,
-                                   std::vector<Real>* values,
-                                   std::vector<Gradient>* gradients,
-                                   std::vector<Hessian>* hessians) const;
-    void evaluate_tetrahedron_vectors(const math::Vector<Real, 3>& xi,
-                                      std::vector<Real>* values,
-                                      std::vector<Gradient>* gradients,
-                                      std::vector<Hessian>* hessians) const;
-    void evaluate_wedge_vectors(const math::Vector<Real, 3>& xi,
-                                std::vector<Real>* values,
-                                std::vector<Gradient>* gradients,
-                                std::vector<Hessian>* hessians) const;
-    void evaluate_pyramid_vectors(const math::Vector<Real, 3>& xi,
-                                  std::vector<Real>* values,
-                                  std::vector<Gradient>* gradients,
-                                  std::vector<Hessian>* hessians) const;
-    void evaluate_unsupported_vectors(const math::Vector<Real, 3>& xi,
-                                      std::vector<Real>* values,
-                                      std::vector<Gradient>* gradients,
-                                      std::vector<Hessian>* hessians) const;
+
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
                          Real* SVMP_RESTRICT values_out,
                          Real* SVMP_RESTRICT gradients_out,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h b/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
deleted file mode 100644
index 5b9faae04..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisFast.h
+++ /dev/null
@@ -1,1378 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISFAST_H
-#define SVMP_FE_BASIS_LAGRANGEBASISFAST_H
-
-/**
- * @file LagrangeBasisFast.h
- * @brief Header-only zero-overhead specializations of the Lagrange basis
- *
- * Provides templated static methods for the common nodal Lagrange families
- * with compile-time-known polynomial order. Callers that know their basis
- * type and order at compile time use these directly — there is no virtual
- * dispatch, no std::vector allocation, no scratch lookup, and no topology
- * switch. The output buffers are stack-allocated std::array, sized at
- * compile time. The compiler fully unrolls and constant-folds.
- *
- * These specializations are an alternative entry point to the runtime path
- * provided by `LagrangeBasis`. The runtime path remains the canonical API
- * for generic callers; these specializations serve hot loops that know the
- * element type.
- *
- * Node orderings match `ReferenceNodeLayout::get_lagrange_node_coords(...)` (VTK).
- */
-
-#include "Types.h"
-#include "Math/Vector.h"
-#include "Math/Matrix.h"
-#include <array>
-#include <cstddef>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-using Gradient = math::Vector<Real, 3>;
-using Hessian  = math::Matrix<Real, 3, 3>;
-
-namespace detail {
-
-constexpr Gradient scaled_gradient(const Gradient& gradient, Real scale) {
-    return Gradient{scale * gradient[0], scale * gradient[1], scale * gradient[2]};
-}
-
-constexpr Gradient p2_edge_gradient(Real left,
-                                    const Gradient& left_gradient,
-                                    Real right,
-                                    const Gradient& right_gradient) {
-    return Gradient{
-        Real(4) * (left_gradient[0] * right + right_gradient[0] * left),
-        Real(4) * (left_gradient[1] * right + right_gradient[1] * left),
-        Real(4) * (left_gradient[2] * right + right_gradient[2] * left),
-    };
-}
-
-constexpr Hessian p2_vertex_hessian(const Gradient& gradient) {
-    Hessian hessian{};
-    for (std::size_t row = 0; row < 3u; ++row) {
-        for (std::size_t col = 0; col < 3u; ++col) {
-            hessian(row, col) = Real(4) * gradient[row] * gradient[col];
-        }
-    }
-    return hessian;
-}
-
-constexpr Hessian p2_edge_hessian(const Gradient& left_gradient,
-                                  const Gradient& right_gradient) {
-    Hessian hessian{};
-    for (std::size_t row = 0; row < 3u; ++row) {
-        for (std::size_t col = 0; col < 3u; ++col) {
-            hessian(row, col) = Real(4) * (
-                left_gradient[row] * right_gradient[col] +
-                right_gradient[row] * left_gradient[col]);
-        }
-    }
-    return hessian;
-}
-
-constexpr std::size_t public_axis_index(int lattice, int order) noexcept {
-    return lattice == 0 ? 0u :
-           lattice == order ? 1u :
-           static_cast<std::size_t>(lattice + 1);
-}
-
-template<int Order>
-constexpr Real public_axis_coord(std::size_t public_index) noexcept {
-    const int lattice = public_index == 0u ? 0 :
-                        public_index == 1u ? Order :
-                        static_cast<int>(public_index) - 1;
-    return Real(-1) + Real(2) * static_cast<Real>(lattice) / static_cast<Real>(Order);
-}
-
-template<int Order>
-constexpr std::array<Real, Order + 1> make_public_axis_nodes() {
-    std::array<Real, Order + 1> nodes{};
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        nodes[i] = public_axis_coord<Order>(i);
-    }
-    return nodes;
-}
-
-template<int Order>
-constexpr std::array<Real, Order + 1> make_public_axis_inverse_denominators() {
-    constexpr auto nodes = make_public_axis_nodes<Order>();
-    std::array<Real, Order + 1> inv_denominators{};
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        Real denominator = Real(1);
-        for (std::size_t j = 0; j < nodes.size(); ++j) {
-            if (j != i) {
-                denominator *= nodes[i] - nodes[j];
-            }
-        }
-        inv_denominators[i] = Real(1) / denominator;
-    }
-    return inv_denominators;
-}
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-void fill_axis_lagrange(Real x,
-                        std::array<Real, Order + 1>& values,
-                        std::array<Real, Order + 1>* first,
-                        std::array<Real, Order + 1>* second) {
-    constexpr auto nodes = make_public_axis_nodes<Order>();
-    constexpr auto inv_denominators = make_public_axis_inverse_denominators<Order>();
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        Real product = Real(1);
-        for (std::size_t j = 0; j < nodes.size(); ++j) {
-            if (j != i) {
-                product *= x - nodes[j];
-            }
-        }
-        values[i] = product * inv_denominators[i];
-
-        if constexpr (NeedFirst) {
-            Real derivative = Real(0);
-            for (std::size_t m = 0; m < nodes.size(); ++m) {
-                if (m == i) {
-                    continue;
-                }
-                Real term = Real(1);
-                for (std::size_t j = 0; j < nodes.size(); ++j) {
-                    if (j != i && j != m) {
-                        term *= x - nodes[j];
-                    }
-                }
-                derivative += term;
-            }
-            (*first)[i] = derivative * inv_denominators[i];
-        }
-
-        if constexpr (NeedSecond) {
-            Real curvature = Real(0);
-            for (std::size_t m = 0; m < nodes.size(); ++m) {
-                if (m == i) {
-                    continue;
-                }
-                for (std::size_t l = 0; l < nodes.size(); ++l) {
-                    if (l == i || l == m) {
-                        continue;
-                    }
-                    Real term = Real(1);
-                    for (std::size_t j = 0; j < nodes.size(); ++j) {
-                        if (j != i && j != m && j != l) {
-                            term *= x - nodes[j];
-                        }
-                    }
-                    curvature += term;
-                }
-            }
-            (*second)[i] = curvature * inv_denominators[i];
-        }
-    }
-}
-
-template<int Order>
-void fill_axis_values(Real x, std::array<Real, Order + 1>& values) {
-    fill_axis_lagrange<Order, false, false>(x, values, nullptr, nullptr);
-}
-
-template<int Order>
-void fill_axis_values_first(Real x,
-                            std::array<Real, Order + 1>& values,
-                            std::array<Real, Order + 1>& first) {
-    fill_axis_lagrange<Order, true, false>(x, values, &first, nullptr);
-}
-
-template<int Order>
-void fill_axis_values_first_second(Real x,
-                                   std::array<Real, Order + 1>& values,
-                                   std::array<Real, Order + 1>& first,
-                                   std::array<Real, Order + 1>& second) {
-    fill_axis_lagrange<Order, true, true>(x, values, &first, &second);
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)>
-make_quad_tensor_node_axes() {
-    std::array<std::array<std::size_t, 2>, (Order + 1) * (Order + 1)> axes{};
-    std::size_t n = 0;
-
-    axes[n++] = {{0u, 0u}};
-    axes[n++] = {{1u, 0u}};
-    axes[n++] = {{1u, 1u}};
-    axes[n++] = {{0u, 1u}};
-
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order)}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order)}};
-    }
-
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order)}};
-        }
-    }
-
-    return axes;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)>
-make_hex_tensor_node_axes() {
-    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 1) * (Order + 1)> axes{};
-    std::size_t n = 0;
-
-    axes[n++] = {{0u, 0u, 0u}};
-    axes[n++] = {{1u, 0u, 0u}};
-    axes[n++] = {{1u, 1u, 0u}};
-    axes[n++] = {{0u, 1u, 0u}};
-    axes[n++] = {{0u, 0u, 1u}};
-    axes[n++] = {{1u, 0u, 1u}};
-    axes[n++] = {{1u, 1u, 1u}};
-    axes[n++] = {{0u, 1u, 1u}};
-
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u, 0u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order), 0u}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u, 0u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order), 0u}};
-    }
-    for (int i = 1; i < Order; ++i) {
-        axes[n++] = {{public_axis_index(i, Order), 0u, 1u}};
-    }
-    for (int j = 1; j < Order; ++j) {
-        axes[n++] = {{1u, public_axis_index(j, Order), 1u}};
-    }
-    for (int i = Order - 1; i >= 1; --i) {
-        axes[n++] = {{public_axis_index(i, Order), 1u, 1u}};
-    }
-    for (int j = Order - 1; j >= 1; --j) {
-        axes[n++] = {{0u, public_axis_index(j, Order), 1u}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{0u, 0u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{1u, 0u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{1u, 1u, public_axis_index(k, Order)}};
-    }
-    for (int k = 1; k < Order; ++k) {
-        axes[n++] = {{0u, 1u, public_axis_index(k, Order)}};
-    }
-
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 0u}};
-        }
-    }
-    for (int j = 1; j < Order; ++j) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), public_axis_index(j, Order), 1u}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int i = 1; i < Order; ++i) {
-            axes[n++] = {{public_axis_index(i, Order), 0u, public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int j = 1; j < Order; ++j) {
-            axes[n++] = {{1u, public_axis_index(j, Order), public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int i = Order - 1; i >= 1; --i) {
-            axes[n++] = {{public_axis_index(i, Order), 1u, public_axis_index(k, Order)}};
-        }
-    }
-    for (int k = 1; k < Order; ++k) {
-        for (int j = Order - 1; j >= 1; --j) {
-            axes[n++] = {{0u, public_axis_index(j, Order), public_axis_index(k, Order)}};
-        }
-    }
-
-    for (int k = 1; k < Order; ++k) {
-        for (int j = 1; j < Order; ++j) {
-            for (int i = 1; i < Order; ++i) {
-                axes[n++] = {{public_axis_index(i, Order),
-                              public_axis_index(j, Order),
-                              public_axis_index(k, Order)}};
-            }
-        }
-    }
-
-    return axes;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2>
-make_triangle_simplex_exponents() {
-    std::array<std::array<std::size_t, 3>, (Order + 1) * (Order + 2) / 2> exponents{};
-    std::size_t n = 0;
-
-    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u}};
-    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u}};
-    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order)}};
-
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m), 0u}};
-    }
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{0u, static_cast<std::size_t>(Order - m), static_cast<std::size_t>(m)}};
-    }
-    for (int m = 1; m < Order; ++m) {
-        exponents[n++] = {{static_cast<std::size_t>(m), 0u, static_cast<std::size_t>(Order - m)}};
-    }
-
-    for (int c = 1; c <= Order - 2; ++c) {
-        for (int b = 1; b <= Order - c - 1; ++b) {
-            const int a = Order - b - c;
-            exponents[n++] = {{static_cast<std::size_t>(a),
-                               static_cast<std::size_t>(b),
-                               static_cast<std::size_t>(c)}};
-        }
-    }
-
-    return exponents;
-}
-
-template<int Order>
-constexpr std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6>
-make_tetrahedron_simplex_exponents() {
-    std::array<std::array<std::size_t, 4>, (Order + 1) * (Order + 2) * (Order + 3) / 6> exponents{};
-    std::size_t n = 0;
-
-    exponents[n++] = {{static_cast<std::size_t>(Order), 0u, 0u, 0u}};
-    exponents[n++] = {{0u, static_cast<std::size_t>(Order), 0u, 0u}};
-    exponents[n++] = {{0u, 0u, static_cast<std::size_t>(Order), 0u}};
-    exponents[n++] = {{0u, 0u, 0u, static_cast<std::size_t>(Order)}};
-
-    constexpr int edges[6][2] = {
-        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
-    };
-    for (const auto& edge : edges) {
-        for (int m = 1; m < Order; ++m) {
-            std::array<std::size_t, 4> e{};
-            e[static_cast<std::size_t>(edge[0])] = static_cast<std::size_t>(Order - m);
-            e[static_cast<std::size_t>(edge[1])] = static_cast<std::size_t>(m);
-            exponents[n++] = e;
-        }
-    }
-
-    constexpr int faces[4][3] = {
-        {0, 1, 2},
-        {0, 1, 3},
-        {1, 2, 3},
-        {0, 2, 3},
-    };
-    for (const auto& face : faces) {
-        for (int c = 1; c <= Order - 2; ++c) {
-            for (int b = 1; b <= Order - c - 1; ++b) {
-                const int a = Order - b - c;
-                std::array<std::size_t, 4> e{};
-                e[static_cast<std::size_t>(face[0])] = static_cast<std::size_t>(a);
-                e[static_cast<std::size_t>(face[1])] = static_cast<std::size_t>(b);
-                e[static_cast<std::size_t>(face[2])] = static_cast<std::size_t>(c);
-                exponents[n++] = e;
-            }
-        }
-    }
-
-    for (int l = 1; l <= Order - 3; ++l) {
-        for (int k = 1; k <= Order - l - 2; ++k) {
-            for (int j = 1; j <= Order - l - k - 1; ++j) {
-                const int i = Order - j - k - l;
-                exponents[n++] = {{static_cast<std::size_t>(i),
-                                   static_cast<std::size_t>(j),
-                                   static_cast<std::size_t>(k),
-                                   static_cast<std::size_t>(l)}};
-            }
-        }
-    }
-
-    return exponents;
-}
-
-template<int Order, bool NeedFirst, bool NeedSecond>
-void fill_simplex_factor_sequence(Real lambda,
-                                  std::array<Real, Order + 1>& phi,
-                                  std::array<Real, Order + 1>* dphi,
-                                  std::array<Real, Order + 1>* d2phi) {
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        (*dphi)[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        (*d2phi)[0] = Real(0);
-    }
-
-    const Real t = static_cast<Real>(Order) * lambda;
-    constexpr Real dt_dlambda = static_cast<Real>(Order);
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-
-    for (int a = 1; a <= Order; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_prev;
-            (*dphi)[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_prev + s * d2phi_dt2_prev;
-                (*d2phi)[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-template<int Order>
-void fill_simplex_factor_values(Real lambda, std::array<Real, Order + 1>& phi) {
-    fill_simplex_factor_sequence<Order, false, false>(lambda, phi, nullptr, nullptr);
-}
-
-template<int Order>
-void fill_simplex_factor_values_first(Real lambda,
-                                      std::array<Real, Order + 1>& phi,
-                                      std::array<Real, Order + 1>& dphi) {
-    fill_simplex_factor_sequence<Order, true, false>(lambda, phi, &dphi, nullptr);
-}
-
-template<int Order>
-void fill_simplex_factor_values_first_second(Real lambda,
-                                             std::array<Real, Order + 1>& phi,
-                                             std::array<Real, Order + 1>& dphi,
-                                             std::array<Real, Order + 1>& d2phi) {
-    fill_simplex_factor_sequence<Order, true, true>(lambda, phi, &dphi, &d2phi);
-}
-
-} // namespace detail
-
-// ---------------------------------------------------------------------------
-// LagrangeLineFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeLineFast;
-
-template<>
-struct LagrangeLineFast<1> {
-    static constexpr int n_dofs = 2;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = (Real(1) - xi[0]) * Real(0.5);
-        out[1] = (Real(1) + xi[0]) * Real(0.5);
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-0.5), Real(0), Real(0)};
-        out[1] = Gradient{Real( 0.5), Real(0), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeLineFast<2> {
-    static constexpr int n_dofs = 3;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real x = xi[0];
-        out[0] = x * (x - Real(1)) * Real(0.5);
-        out[1] = x * (x + Real(1)) * Real(0.5);
-        out[2] = (Real(1) - x) * (Real(1) + x);
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real x = xi[0];
-        out[0] = Gradient{x - Real(0.5), Real(0), Real(0)};
-        out[1] = Gradient{x + Real(0.5), Real(0), Real(0)};
-        out[2] = Gradient{Real(-2) * x, Real(0), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[0](0, 0) = Real(1);
-        out[1](0, 0) = Real(1);
-        out[2](0, 0) = Real(-2);
-    }
-};
-
-template<>
-struct LagrangeLineFast<3> {
-    static constexpr int n_dofs = 4;
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        detail::fill_axis_values<3>(xi[0], out);
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, n_dofs> values{};
-        std::array<Real, n_dofs> first{};
-        detail::fill_axis_values_first<3>(xi[0], values, first);
-        for (std::size_t i = 0; i < first.size(); ++i) {
-            out[i] = Gradient{first[i], Real(0), Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, n_dofs> values{};
-        std::array<Real, n_dofs> first{};
-        std::array<Real, n_dofs> second{};
-        detail::fill_axis_values_first_second<3>(xi[0], values, first, second);
-        for (std::size_t i = 0; i < second.size(); ++i) {
-            Hessian H{};
-            H(0, 0) = second[i];
-            out[i] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeQuadFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeQuadFast;
-
-template<>
-struct LagrangeQuadFast<1> {
-    static constexpr int n_dofs = 4;
-
-    // VTK Quad4 corner ordering: (-,-), (+,-), (+,+), (-,+).
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        out[0] = lx * ly;
-        out[1] = ux * ly;
-        out[2] = ux * uy;
-        out[3] = lx * uy;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        out[0] = Gradient{Real(-0.5) * ly, Real(-0.5) * lx, Real(0)};
-        out[1] = Gradient{Real( 0.5) * ly, Real(-0.5) * ux, Real(0)};
-        out[2] = Gradient{Real( 0.5) * uy, Real( 0.5) * ux, Real(0)};
-        out[3] = Gradient{Real(-0.5) * uy, Real( 0.5) * lx, Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[3] = Hessian{};
-        constexpr Real qrt = Real(0.25);
-        out[0](0, 1) = qrt;  out[0](1, 0) = qrt;
-        out[1](0, 1) = -qrt; out[1](1, 0) = -qrt;
-        out[2](0, 1) = qrt;  out[2](1, 0) = qrt;
-        out[3](0, 1) = -qrt; out[3](1, 0) = -qrt;
-    }
-};
-
-template<>
-struct LagrangeQuadFast<2> {
-    static constexpr int n_dofs = 9;
-
-    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes = {{
-        {{0u, 0u}}, {{1u, 0u}}, {{1u, 1u}}, {{0u, 1u}},
-        {{2u, 0u}}, {{1u, 2u}}, {{2u, 1u}}, {{0u, 2u}},
-        {{2u, 2u}},
-    }};
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            out[n] = Gradient{gx[i][0] * ly[j], lx[i] * gy[j][0], Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
-        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            Hessian H{};
-            H(0, 0) = hx[i](0, 0) * ly[j];
-            H(1, 1) = lx[i] * hy[j](0, 0);
-            H(0, 1) = gx[i][0] * gy[j][0];
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-template<>
-struct LagrangeQuadFast<3> {
-    static constexpr int n_dofs = 16;
-
-    static constexpr std::array<std::array<std::size_t, 2>, n_dofs> node_axes =
-        detail::make_quad_tensor_node_axes<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        detail::fill_axis_values<3>(xi[0], lx);
-        detail::fill_axis_values<3>(xi[1], ly);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        detail::fill_axis_values_first<3>(xi[0], lx, gx);
-        detail::fill_axis_values_first<3>(xi[1], ly, gy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            out[n] = Gradient{gx[i] * ly[j], lx[i] * gy[j], Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
-        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
-        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            Hessian H{};
-            H(0, 0) = hx[i] * ly[j];
-            H(1, 1) = lx[i] * hy[j];
-            H(0, 1) = gx[i] * gy[j];
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeHexFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeHexFast;
-
-template<>
-struct LagrangeHexFast<1> {
-    static constexpr int n_dofs = 8;
-
-    // VTK Hex8 corner ordering: (-,-,-), (+,-,-), (+,+,-), (-,+,-),
-    //                           (-,-,+), (+,-,+), (+,+,+), (-,+,+).
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        // Precompute z-plane partial products (sum factorization).
-        const Real lxly = lx * ly;
-        const Real uxly = ux * ly;
-        const Real uxuy = ux * uy;
-        const Real lxuy = lx * uy;
-        out[0] = lxly * lz;
-        out[1] = uxly * lz;
-        out[2] = uxuy * lz;
-        out[3] = lxuy * lz;
-        out[4] = lxly * uz;
-        out[5] = uxly * uz;
-        out[6] = uxuy * uz;
-        out[7] = lxuy * uz;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        // dL_0(x)/dx = -0.5, dL_1(x)/dx = +0.5 along each axis.
-        out[0] = Gradient{Real(-0.5) * ly * lz, Real(-0.5) * lx * lz, Real(-0.5) * lx * ly};
-        out[1] = Gradient{Real( 0.5) * ly * lz, Real(-0.5) * ux * lz, Real(-0.5) * ux * ly};
-        out[2] = Gradient{Real( 0.5) * uy * lz, Real( 0.5) * ux * lz, Real(-0.5) * ux * uy};
-        out[3] = Gradient{Real(-0.5) * uy * lz, Real( 0.5) * lx * lz, Real(-0.5) * lx * uy};
-        out[4] = Gradient{Real(-0.5) * ly * uz, Real(-0.5) * lx * uz, Real( 0.5) * lx * ly};
-        out[5] = Gradient{Real( 0.5) * ly * uz, Real(-0.5) * ux * uz, Real( 0.5) * ux * ly};
-        out[6] = Gradient{Real( 0.5) * uy * uz, Real( 0.5) * ux * uz, Real( 0.5) * ux * uy};
-        out[7] = Gradient{Real(-0.5) * uy * uz, Real( 0.5) * lx * uz, Real( 0.5) * lx * uy};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                            std::array<Hessian, n_dofs>& out) {
-        const Real lx = (Real(1) - xi[0]) * Real(0.5);
-        const Real ly = (Real(1) - xi[1]) * Real(0.5);
-        const Real lz = (Real(1) - xi[2]) * Real(0.5);
-        const Real ux = (Real(1) + xi[0]) * Real(0.5);
-        const Real uy = (Real(1) + xi[1]) * Real(0.5);
-        const Real uz = (Real(1) + xi[2]) * Real(0.5);
-        const Real ax[8] = {lx, ux, ux, lx, lx, ux, ux, lx};
-        const Real ay[8] = {ly, ly, uy, uy, ly, ly, uy, uy};
-        const Real az[8] = {lz, lz, lz, lz, uz, uz, uz, uz};
-        const int sx[8] = {-1, 1, 1, -1, -1, 1, 1, -1};
-        const int sy[8] = {-1, -1, 1, 1, -1, -1, 1, 1};
-        const int sz[8] = {-1, -1, -1, -1, 1, 1, 1, 1};
-        constexpr Real qrt = Real(0.25);
-        for (std::size_t n = 0; n < static_cast<std::size_t>(n_dofs); ++n) {
-            out[n] = Hessian{};
-            out[n](0, 1) = static_cast<Real>(sx[n] * sy[n]) * qrt * az[n];
-            out[n](1, 0) = out[n](0, 1);
-            out[n](0, 2) = static_cast<Real>(sx[n] * sz[n]) * qrt * ay[n];
-            out[n](2, 0) = out[n](0, 2);
-            out[n](1, 2) = static_cast<Real>(sy[n] * sz[n]) * qrt * ax[n];
-            out[n](2, 1) = out[n](1, 2);
-        }
-    }
-};
-
-template<>
-struct LagrangeHexFast<2> {
-    static constexpr int n_dofs = 27;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes = {{
-        {{0u, 0u, 0u}}, {{1u, 0u, 0u}}, {{1u, 1u, 0u}}, {{0u, 1u, 0u}},
-        {{0u, 0u, 1u}}, {{1u, 0u, 1u}}, {{1u, 1u, 1u}}, {{0u, 1u, 1u}},
-        {{2u, 0u, 0u}}, {{1u, 2u, 0u}}, {{2u, 1u, 0u}}, {{0u, 2u, 0u}},
-        {{2u, 0u, 1u}}, {{1u, 2u, 1u}}, {{2u, 1u, 1u}}, {{0u, 2u, 1u}},
-        {{0u, 0u, 2u}}, {{1u, 0u, 2u}}, {{1u, 1u, 2u}}, {{0u, 1u, 2u}},
-        {{2u, 2u, 0u}}, {{2u, 2u, 1u}}, {{2u, 0u, 2u}}, {{1u, 2u, 2u}},
-        {{2u, 1u, 2u}}, {{0u, 2u, 2u}}, {{2u, 2u, 2u}},
-    }};
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            out[n] = Gradient{
-                gx[i][0] * ly[j] * lz[k],
-                lx[i] * gy[j][0] * lz[k],
-                lx[i] * ly[j] * gz[k][0],
-            };
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<2>::n_dofs> lz{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gx{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gy{};
-        std::array<Gradient, LagrangeLineFast<2>::n_dofs> gz{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hx{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hy{};
-        std::array<Hessian, LagrangeLineFast<2>::n_dofs> hz{};
-        LagrangeLineFast<2>::evaluate({xi[0], Real(0), Real(0)}, lx);
-        LagrangeLineFast<2>::evaluate({xi[1], Real(0), Real(0)}, ly);
-        LagrangeLineFast<2>::evaluate({xi[2], Real(0), Real(0)}, lz);
-        LagrangeLineFast<2>::evaluate_gradients({xi[0], Real(0), Real(0)}, gx);
-        LagrangeLineFast<2>::evaluate_gradients({xi[1], Real(0), Real(0)}, gy);
-        LagrangeLineFast<2>::evaluate_gradients({xi[2], Real(0), Real(0)}, gz);
-        LagrangeLineFast<2>::evaluate_hessians({xi[0], Real(0), Real(0)}, hx);
-        LagrangeLineFast<2>::evaluate_hessians({xi[1], Real(0), Real(0)}, hy);
-        LagrangeLineFast<2>::evaluate_hessians({xi[2], Real(0), Real(0)}, hz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            Hessian H{};
-            H(0, 0) = hx[i](0, 0) * ly[j] * lz[k];
-            H(1, 1) = lx[i] * hy[j](0, 0) * lz[k];
-            H(2, 2) = lx[i] * ly[j] * hz[k](0, 0);
-            H(0, 1) = gx[i][0] * gy[j][0] * lz[k];
-            H(1, 0) = H(0, 1);
-            H(0, 2) = gx[i][0] * ly[j] * gz[k][0];
-            H(2, 0) = H(0, 2);
-            H(1, 2) = lx[i] * gy[j][0] * gz[k][0];
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-template<>
-struct LagrangeHexFast<3> {
-    static constexpr int n_dofs = 64;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> node_axes =
-        detail::make_hex_tensor_node_axes<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        detail::fill_axis_values<3>(xi[0], lx);
-        detail::fill_axis_values<3>(xi[1], ly);
-        detail::fill_axis_values<3>(xi[2], lz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            out[n] = lx[node_axes[n][0]] * ly[node_axes[n][1]] * lz[node_axes[n][2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
-        detail::fill_axis_values_first<3>(xi[0], lx, gx);
-        detail::fill_axis_values_first<3>(xi[1], ly, gy);
-        detail::fill_axis_values_first<3>(xi[2], lz, gz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            out[n] = Gradient{
-                gx[i] * ly[j] * lz[k],
-                lx[i] * gy[j] * lz[k],
-                lx[i] * ly[j] * gz[k],
-            };
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> ly{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> lz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> gz{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hx{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hy{};
-        std::array<Real, LagrangeLineFast<3>::n_dofs> hz{};
-        detail::fill_axis_values_first_second<3>(xi[0], lx, gx, hx);
-        detail::fill_axis_values_first_second<3>(xi[1], ly, gy, hy);
-        detail::fill_axis_values_first_second<3>(xi[2], lz, gz, hz);
-        for (std::size_t n = 0; n < node_axes.size(); ++n) {
-            const auto i = node_axes[n][0];
-            const auto j = node_axes[n][1];
-            const auto k = node_axes[n][2];
-            Hessian H{};
-            H(0, 0) = hx[i] * ly[j] * lz[k];
-            H(1, 1) = lx[i] * hy[j] * lz[k];
-            H(2, 2) = lx[i] * ly[j] * hz[k];
-            H(0, 1) = gx[i] * gy[j] * lz[k];
-            H(1, 0) = H(0, 1);
-            H(0, 2) = gx[i] * ly[j] * gz[k];
-            H(2, 0) = H(0, 2);
-            H(1, 2) = lx[i] * gy[j] * gz[k];
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeTriFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeTriFast;
-
-template<>
-struct LagrangeTriFast<1> {
-    static constexpr int n_dofs = 3;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = Real(1) - xi[0] - xi[1];
-        out[1] = xi[0];
-        out[2] = xi[1];
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-1), Real(-1), Real(0)};
-        out[1] = Gradient{Real( 1), Real( 0), Real(0)};
-        out[2] = Gradient{Real( 0), Real( 1), Real(0)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeTriFast<2> {
-    static constexpr int n_dofs = 6;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        out[0] = l0 * (Real(2) * l0 - Real(1));
-        out[1] = l1 * (Real(2) * l1 - Real(1));
-        out[2] = l2 * (Real(2) * l2 - Real(1));
-        out[3] = Real(4) * l0 * l1;
-        out[4] = Real(4) * l1 * l2;
-        out[5] = Real(4) * l0 * l2;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
-
-        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
-        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
-        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
-        out[3] = detail::p2_edge_gradient(l0, g0, l1, g1);
-        out[4] = detail::p2_edge_gradient(l1, g1, l2, g2);
-        out[5] = detail::p2_edge_gradient(l0, g0, l2, g2);
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        constexpr Gradient g0{Real(-1), Real(-1), Real(0)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real(0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real(0)};
-
-        out[0] = detail::p2_vertex_hessian(g0);
-        out[1] = detail::p2_vertex_hessian(g1);
-        out[2] = detail::p2_vertex_hessian(g2);
-        out[3] = detail::p2_edge_hessian(g0, g1);
-        out[4] = detail::p2_edge_hessian(g1, g2);
-        out[5] = detail::p2_edge_hessian(g0, g2);
-    }
-};
-
-template<>
-struct LagrangeTriFast<3> {
-    static constexpr int n_dofs = 10;
-
-    static constexpr std::array<std::array<std::size_t, 3>, n_dofs> exponents =
-        detail::make_triangle_simplex_exponents<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        detail::fill_simplex_factor_values<3>(l0, phi0);
-        detail::fill_simplex_factor_values<3>(l1, phi1);
-        detail::fill_simplex_factor_values<3>(l2, phi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
-        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
-        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real dl0 = dphi0[e[0]] * v1 * v2;
-            const Real dl1 = v0 * dphi1[e[1]] * v2;
-            const Real dl2 = v0 * v1 * dphi2[e[2]];
-            out[n] = Gradient{dl1 - dl0, dl2 - dl0, Real(0)};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> d2phi0{};
-        std::array<Real, 4> d2phi1{};
-        std::array<Real, 4> d2phi2{};
-        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
-        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
-        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real D0 = dphi0[e[0]];
-            const Real D1 = dphi1[e[1]];
-            const Real D2 = dphi2[e[2]];
-            const Real H00 = d2phi0[e[0]] * v1 * v2;
-            const Real H11 = v0 * d2phi1[e[1]] * v2;
-            const Real H22 = v0 * v1 * d2phi2[e[2]];
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-
-            Hessian H{};
-            H(0, 0) = H00 - Real(2) * H01 + H11;
-            H(1, 1) = H00 - Real(2) * H02 + H22;
-            H(0, 1) = H00 - H01 - H02 + H12;
-            H(1, 0) = H(0, 1);
-            out[n] = H;
-        }
-    }
-};
-
-// ---------------------------------------------------------------------------
-// LagrangeTetFast<Order>
-// ---------------------------------------------------------------------------
-template<int Order>
-struct LagrangeTetFast;
-
-template<>
-struct LagrangeTetFast<1> {
-    static constexpr int n_dofs = 4;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        out[0] = Real(1) - xi[0] - xi[1] - xi[2];
-        out[1] = xi[0];
-        out[2] = xi[1];
-        out[3] = xi[2];
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& /*xi*/,
-                                             std::array<Gradient, n_dofs>& out) {
-        out[0] = Gradient{Real(-1), Real(-1), Real(-1)};
-        out[1] = Gradient{Real( 1), Real( 0), Real( 0)};
-        out[2] = Gradient{Real( 0), Real( 1), Real( 0)};
-        out[3] = Gradient{Real( 0), Real( 0), Real( 1)};
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        out[0] = Hessian{};
-        out[1] = Hessian{};
-        out[2] = Hessian{};
-        out[3] = Hessian{};
-    }
-};
-
-template<>
-struct LagrangeTetFast<2> {
-    static constexpr int n_dofs = 10;
-
-    static constexpr void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-
-        out[0] = l0 * (Real(2) * l0 - Real(1));
-        out[1] = l1 * (Real(2) * l1 - Real(1));
-        out[2] = l2 * (Real(2) * l2 - Real(1));
-        out[3] = l3 * (Real(2) * l3 - Real(1));
-        out[4] = Real(4) * l0 * l1;
-        out[5] = Real(4) * l1 * l2;
-        out[6] = Real(4) * l0 * l2;
-        out[7] = Real(4) * l0 * l3;
-        out[8] = Real(4) * l1 * l3;
-        out[9] = Real(4) * l2 * l3;
-    }
-
-    static constexpr void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                             std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
-        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
-
-        out[0] = detail::scaled_gradient(g0, Real(4) * l0 - Real(1));
-        out[1] = detail::scaled_gradient(g1, Real(4) * l1 - Real(1));
-        out[2] = detail::scaled_gradient(g2, Real(4) * l2 - Real(1));
-        out[3] = detail::scaled_gradient(g3, Real(4) * l3 - Real(1));
-        out[4] = detail::p2_edge_gradient(l0, g0, l1, g1);
-        out[5] = detail::p2_edge_gradient(l1, g1, l2, g2);
-        out[6] = detail::p2_edge_gradient(l0, g0, l2, g2);
-        out[7] = detail::p2_edge_gradient(l0, g0, l3, g3);
-        out[8] = detail::p2_edge_gradient(l1, g1, l3, g3);
-        out[9] = detail::p2_edge_gradient(l2, g2, l3, g3);
-    }
-
-    static constexpr void evaluate_hessians(const math::Vector<Real, 3>& /*xi*/,
-                                            std::array<Hessian, n_dofs>& out) {
-        constexpr Gradient g0{Real(-1), Real(-1), Real(-1)};
-        constexpr Gradient g1{Real( 1), Real( 0), Real( 0)};
-        constexpr Gradient g2{Real( 0), Real( 1), Real( 0)};
-        constexpr Gradient g3{Real( 0), Real( 0), Real( 1)};
-
-        out[0] = detail::p2_vertex_hessian(g0);
-        out[1] = detail::p2_vertex_hessian(g1);
-        out[2] = detail::p2_vertex_hessian(g2);
-        out[3] = detail::p2_vertex_hessian(g3);
-        out[4] = detail::p2_edge_hessian(g0, g1);
-        out[5] = detail::p2_edge_hessian(g1, g2);
-        out[6] = detail::p2_edge_hessian(g0, g2);
-        out[7] = detail::p2_edge_hessian(g0, g3);
-        out[8] = detail::p2_edge_hessian(g1, g3);
-        out[9] = detail::p2_edge_hessian(g2, g3);
-    }
-};
-
-template<>
-struct LagrangeTetFast<3> {
-    static constexpr int n_dofs = 20;
-
-    static constexpr std::array<std::array<std::size_t, 4>, n_dofs> exponents =
-        detail::make_tetrahedron_simplex_exponents<3>();
-
-    static void evaluate(const math::Vector<Real, 3>& xi, std::array<Real, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        detail::fill_simplex_factor_values<3>(l0, phi0);
-        detail::fill_simplex_factor_values<3>(l1, phi1);
-        detail::fill_simplex_factor_values<3>(l2, phi2);
-        detail::fill_simplex_factor_values<3>(l3, phi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            out[n] = phi0[e[0]] * phi1[e[1]] * phi2[e[2]] * phi3[e[3]];
-        }
-    }
-
-    static void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                   std::array<Gradient, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> dphi3{};
-        detail::fill_simplex_factor_values_first<3>(l0, phi0, dphi0);
-        detail::fill_simplex_factor_values_first<3>(l1, phi1, dphi1);
-        detail::fill_simplex_factor_values_first<3>(l2, phi2, dphi2);
-        detail::fill_simplex_factor_values_first<3>(l3, phi3, dphi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real v3 = phi3[e[3]];
-            const Real dl0 = dphi0[e[0]] * v1 * v2 * v3;
-            const Real dl1 = v0 * dphi1[e[1]] * v2 * v3;
-            const Real dl2 = v0 * v1 * dphi2[e[2]] * v3;
-            const Real dl3 = v0 * v1 * v2 * dphi3[e[3]];
-            out[n] = Gradient{dl1 - dl0, dl2 - dl0, dl3 - dl0};
-        }
-    }
-
-    static void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                  std::array<Hessian, n_dofs>& out) {
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        std::array<Real, 4> phi0{};
-        std::array<Real, 4> phi1{};
-        std::array<Real, 4> phi2{};
-        std::array<Real, 4> phi3{};
-        std::array<Real, 4> dphi0{};
-        std::array<Real, 4> dphi1{};
-        std::array<Real, 4> dphi2{};
-        std::array<Real, 4> dphi3{};
-        std::array<Real, 4> d2phi0{};
-        std::array<Real, 4> d2phi1{};
-        std::array<Real, 4> d2phi2{};
-        std::array<Real, 4> d2phi3{};
-        detail::fill_simplex_factor_values_first_second<3>(l0, phi0, dphi0, d2phi0);
-        detail::fill_simplex_factor_values_first_second<3>(l1, phi1, dphi1, d2phi1);
-        detail::fill_simplex_factor_values_first_second<3>(l2, phi2, dphi2, d2phi2);
-        detail::fill_simplex_factor_values_first_second<3>(l3, phi3, dphi3, d2phi3);
-
-        for (std::size_t n = 0; n < exponents.size(); ++n) {
-            const auto& e = exponents[n];
-            const Real v0 = phi0[e[0]];
-            const Real v1 = phi1[e[1]];
-            const Real v2 = phi2[e[2]];
-            const Real v3 = phi3[e[3]];
-            const Real D0 = dphi0[e[0]];
-            const Real D1 = dphi1[e[1]];
-            const Real D2 = dphi2[e[2]];
-            const Real D3 = dphi3[e[3]];
-
-            const Real H00 = d2phi0[e[0]] * v1 * v2 * v3;
-            const Real H11 = v0 * d2phi1[e[1]] * v2 * v3;
-            const Real H22 = v0 * v1 * d2phi2[e[2]] * v3;
-            const Real H33 = v0 * v1 * v2 * d2phi3[e[3]];
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-
-            Hessian H{};
-            H(0, 0) = H00 - Real(2) * H01 + H11;
-            H(1, 1) = H00 - Real(2) * H02 + H22;
-            H(2, 2) = H00 - Real(2) * H03 + H33;
-            H(0, 1) = H00 - H01 - H02 + H12;
-            H(1, 0) = H(0, 1);
-            H(0, 2) = H00 - H01 - H03 + H13;
-            H(2, 0) = H(0, 2);
-            H(1, 2) = H00 - H02 - H03 + H23;
-            H(2, 1) = H(1, 2);
-            out[n] = H;
-        }
-    }
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISFAST_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
deleted file mode 100644
index 4a332621e..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.cpp
+++ /dev/null
@@ -1,2069 +0,0 @@
-#include "LagrangeBasisPyramid.h"
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "Basis/BasisExceptions.h"
-#include "BasisTolerance.h"
-#include "Math/DenseLinearAlgebra.h"
-#include "Math/DenseTransformKernels.h"
-#include "LagrangeBasisUtility.h"
-#include "PyramidModalBasis.h"
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-class PyramidLagrangeCache {
-public:
-    using ModalTerm = pyramid_modal::Term;
-
-    struct UvPolynomial {
-        using Power = std::pair<int, int>;
-        std::vector<std::pair<Power, Real>> coeffs;
-
-        void add_term(int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
-            if (std::abs(coeff) <= tol) {
-                return;
-            }
-            const auto key = std::make_pair(pu, pv);
-            const auto found = std::lower_bound(
-                coeffs.begin(),
-                coeffs.end(),
-                key,
-                [](const auto& entry, const Power& value) { return entry.first < value; });
-            if (found == coeffs.end() || found->first != key) {
-                coeffs.insert(found, {key, coeff});
-                return;
-            }
-
-            found->second += coeff;
-            if (std::abs(found->second) <= tol) {
-                coeffs.erase(found);
-            }
-        }
-
-        void add_scaled(const UvPolynomial& other, Real scale, Real tol = Real(1e-14)) {
-            if (std::abs(scale) <= tol) {
-                return;
-            }
-            for (const auto& [powers, coeff] : other.coeffs) {
-                add_term(powers.first, powers.second, scale * coeff, tol);
-            }
-        }
-
-        bool empty(Real tol = Real(1e-12)) const {
-            for (const auto& [powers, coeff] : coeffs) {
-                (void)powers;
-                if (std::abs(coeff) > tol) {
-                    return false;
-                }
-            }
-            return true;
-        }
-
-        bool is_constant(Real tol = Real(1e-12)) const {
-            for (const auto& [powers, coeff] : coeffs) {
-                if (std::abs(coeff) <= tol) {
-                    continue;
-                }
-                if (powers.first != 0 || powers.second != 0) {
-                    return false;
-                }
-            }
-            return true;
-        }
-
-        Real constant_value(Real tol = Real(1e-12)) const {
-            Real value = Real(0);
-            for (const auto& [powers, coeff] : coeffs) {
-                if (std::abs(coeff) <= tol) {
-                    continue;
-                }
-                if (powers.first == 0 && powers.second == 0) {
-                    value += coeff;
-                }
-            }
-            return value;
-        }
-    };
-
-    struct ApexSeries {
-        std::vector<std::pair<int, UvPolynomial>> by_power;
-
-        void add_term(int beta, int pu, int pv, Real coeff, Real tol = Real(1e-14)) {
-            const auto found = find_or_insert(beta);
-            found->second.add_term(pu, pv, coeff, tol);
-            if (found->second.empty(tol)) {
-                by_power.erase(found);
-            }
-        }
-
-        void add_scaled(const ApexSeries& other, Real scale, Real tol = Real(1e-14)) {
-            if (std::abs(scale) <= tol) {
-                return;
-            }
-            for (const auto& [beta, poly] : other.by_power) {
-                const auto found = find_or_insert(beta);
-                found->second.add_scaled(poly, scale, tol);
-                if (found->second.empty(tol)) {
-                    by_power.erase(found);
-                }
-            }
-        }
-
-    private:
-        std::vector<std::pair<int, UvPolynomial>>::iterator find_or_insert(int beta) {
-            const auto found = std::lower_bound(
-                by_power.begin(),
-                by_power.end(),
-                beta,
-                [](const auto& entry, int value) { return entry.first < value; });
-            if (found != by_power.end() && found->first == beta) {
-                return found;
-            }
-            return by_power.insert(found, {beta, UvPolynomial{}});
-        }
-    };
-
-    using GradientSeries = std::array<ApexSeries, 3>;
-    using HessianSeries = std::array<std::array<ApexSeries, 3>, 3>;
-
-    enum class ApexLimitKind {
-        Constant,
-        DirectionDependent,
-        Singular,
-    };
-
-    enum class ApexRankStatus {
-        Exact,
-        DirectionDependent,
-        Singular,
-    };
-
-    struct ApexClassification {
-        ApexLimitKind kind{ApexLimitKind::Constant};
-        Real constant_value{0};
-        int leading_power{1};
-    };
-
-    struct ApexData {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        ApexRankStatus gradient_status{ApexRankStatus::Exact};
-        ApexRankStatus hessian_status{ApexRankStatus::Exact};
-    };
-
-    struct OrderData {
-        int order{0};
-        std::vector<math::Vector<Real, 3>> nodes;
-        std::vector<ModalTerm> modal_terms;
-        std::vector<Real> modal_to_nodal;
-        ApexData apex;
-    };
-
-    struct EvaluationScratch {
-        std::vector<Real> modal_values;
-        std::vector<Real> modal_gradient_components;
-        std::vector<Real> modal_hessian_components;
-        std::vector<Gradient> modal_gradients;
-        std::vector<Hessian> modal_hessians;
-        pyramid_modal::EvaluationPoint modal_point;
-
-        void prewarm(std::size_t max_size, std::size_t max_qpts) {
-            const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
-            modal_values.reserve(batched_size);
-            modal_gradient_components.reserve(batched_size * 3u);
-            modal_hessian_components.reserve(batched_size * 9u);
-            modal_gradients.reserve(max_size);
-            modal_hessians.reserve(max_size);
-        }
-    };
-
-    static EvaluationScratch& evaluation_scratch() {
-        // Scratch is intentionally thread-local: production assembly uses a
-        // persistent worker-thread team, so buffers stay warm on each worker.
-        static thread_local EvaluationScratch scratch;
-        return scratch;
-    }
-
-    static void prewarm_scratch(std::size_t max_size, std::size_t max_qpts) {
-        evaluation_scratch().prewarm(max_size, max_qpts);
-    }
-
-    static bool is_apex_point(const math::Vector<Real, 3>& xi) {
-        const Real tol = apex_coord_tolerance();
-        return std::abs(xi[0]) <= tol &&
-               std::abs(xi[1]) <= tol &&
-               std::abs(Real(1) - xi[2]) <= tol;
-    }
-
-    static bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi) {
-        return basis_near_zero(Real(1) - xi[2]);
-    }
-
-    static void validate_top_plane_query(const math::Vector<Real, 3>& xi) {
-        if (on_degenerate_top_plane(xi) && !is_apex_point(xi)) [[unlikely]] {
-            throw BasisEvaluationException(
-                "Pyramid reference evaluation on the degenerate z=1 plane is only defined at the apex",
-                __FILE__, __LINE__, __func__);
-        }
-    }
-
-    static OrderData build_order_data(int order) {
-        OrderData data;
-        data.order = order;
-
-        data.nodes = build_public_nodes(order);
-        data.modal_terms = pyramid_modal::build_terms(order);
-
-        const std::size_t n = data.nodes.size();
-        if (data.modal_terms.size() != n) {
-            throw BasisConstructionException("LagrangeBasis pyramid modal basis size mismatch",
-                                             __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> vandermonde(n * n, Real(0));
-        for (std::size_t row = 0; row < n; ++row) {
-            pyramid_modal::EvaluationPoint modal_point;
-            pyramid_modal::prepare_evaluation_point(
-                data.modal_terms, data.nodes[row], modal_point);
-            for (std::size_t col = 0; col < n; ++col) {
-                Real value = Real(0);
-                pyramid_modal::evaluate_term(data.modal_terms[col], modal_point, value);
-                vandermonde[row * n + col] = value;
-            }
-        }
-
-        const auto inverse_result = math::invert_dense_matrix_with_diagnostics(
-            std::move(vandermonde),
-            n,
-            "LagrangeBasis pyramid Vandermonde");
-        math::validate_dense_inverse_diagnostics(
-            inverse_result,
-            n,
-            "LagrangeBasis pyramid Vandermonde");
-        const std::vector<Real>& inverse = inverse_result.inverse;
-
-        data.modal_to_nodal.assign(n * n, Real(0));
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                data.modal_to_nodal[basis_i * n + modal_j] =
-                    inverse[modal_j * n + basis_i];
-            }
-        }
-        data.apex = build_apex_data(data);
-        return data;
-    }
-
-    static bool has_low_order_fast_modal_to_nodal(const OrderData& data) noexcept {
-        return data.order == 1 || data.order == 2;
-    }
-
-    static const OrderData& get(int order) {
-        constexpr int kMaxOnceCachedOrder = 12;
-        if (order >= 0 && order <= kMaxOnceCachedOrder) {
-            static std::array<std::once_flag, kMaxOnceCachedOrder + 1> flags;
-            static std::array<std::unique_ptr<OrderData>, kMaxOnceCachedOrder + 1> cache;
-            const auto idx = static_cast<std::size_t>(order);
-            std::call_once(flags[idx], [idx, order]() {
-                cache[idx] = std::make_unique<OrderData>(build_order_data(order));
-            });
-            return *cache[idx];
-        }
-
-        static std::mutex fallback_mutex;
-        static std::map<int, std::unique_ptr<OrderData>> fallback_cache;
-
-        std::lock_guard<std::mutex> lock(fallback_mutex);
-        const auto found = fallback_cache.find(order);
-        if (found != fallback_cache.end()) {
-            return *found->second;
-        }
-
-        auto data = std::make_unique<OrderData>(build_order_data(order));
-        const auto [it, inserted] = fallback_cache.emplace(order, std::move(data));
-        (void)inserted;
-        return *it->second;
-    }
-
-    static void evaluate_values(const OrderData& data,
-                                const math::Vector<Real, 3>& xi,
-                                std::vector<Real>& values) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            values = data.apex.values;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal = scratch.modal_values;
-        auto& modal_point = scratch.modal_point;
-        modal.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal, values);
-        } else {
-            apply_modal_to_nodal(data, modal, values);
-        }
-    }
-
-    static void evaluate_gradients(const OrderData& data,
-                                   const math::Vector<Real, 3>& xi,
-                                   std::vector<Gradient>& gradients) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            gradients = data.apex.gradients;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_point = scratch.modal_point;
-        modal_gradients.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal_gradients, gradients);
-        } else {
-            apply_modal_to_nodal(data, modal_gradients, gradients);
-        }
-    }
-
-    static void evaluate_hessians(const OrderData& data,
-                                  const math::Vector<Real, 3>& xi,
-                                  std::vector<Hessian>& hessians) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            hessians = data.apex.hessians;
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_hessians.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal(data, modal_hessians, hessians);
-        } else {
-            apply_modal_to_nodal(data, modal_hessians, hessians);
-        }
-    }
-
-    static void evaluate_all(const OrderData& data,
-                             const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& values,
-                             std::vector<Gradient>& gradients,
-                             std::vector<Hessian>& hessians) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            values = data.apex.values;
-            gradients = data.apex.gradients;
-            hessians = data.apex.hessians;
-            return;
-        }
-
-        const std::size_t n = data.modal_terms.size();
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_values.resize(n);
-        modal_gradients.resize(n);
-        modal_hessians.resize(n);
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-
-        for (std::size_t m = 0; m < n; ++m) {
-            pyramid_modal::evaluate_term(
-                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
-        }
-
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_all(
-                data, modal_values, modal_gradients, modal_hessians, values, gradients, hessians);
-            return;
-        }
-
-        values.resize(n);
-        gradients.resize(n);
-        hessians.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Gradient gradient{};
-            Hessian hessian{};
-            Real value = Real(0);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                value += coeff * modal_values[modal_j];
-
-                const Real* modal_gradient = modal_gradients[modal_j].data();
-                gradient[0] += coeff * modal_gradient[0];
-                gradient[1] += coeff * modal_gradient[1];
-                gradient[2] += coeff * modal_gradient[2];
-
-                const Real* modal_hessian = modal_hessians[modal_j].data();
-                Real* hessian_data = hessian.data();
-                hessian_data[0] += coeff * modal_hessian[0];
-                hessian_data[1] += coeff * modal_hessian[1];
-                hessian_data[2] += coeff * modal_hessian[2];
-                hessian_data[4] += coeff * modal_hessian[4];
-                hessian_data[5] += coeff * modal_hessian[5];
-                hessian_data[8] += coeff * modal_hessian[8];
-            }
-            values[basis_i] = value;
-            gradients[basis_i] = gradient;
-            Real* hessian_data = hessian.data();
-            hessian_data[3] = hessian_data[1];
-            hessian_data[6] = hessian_data[2];
-            hessian_data[7] = hessian_data[5];
-            hessians[basis_i] = hessian;
-        }
-    }
-
-    static void evaluate_values_to(const OrderData& data,
-                                   const math::Vector<Real, 3>& xi,
-                                   Real* SVMP_RESTRICT values_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal = scratch.modal_values;
-        auto& modal_point = scratch.modal_point;
-        modal.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, modal[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal, values_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal, values_out);
-        }
-    }
-
-    static void evaluate_gradients_to(const OrderData& data,
-                                      const math::Vector<Real, 3>& xi,
-                                      Real* SVMP_RESTRICT gradients_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
-                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
-                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
-                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
-            }
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_point = scratch.modal_point;
-        modal_gradients.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, &modal_gradients[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal_gradients, gradients_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal_gradients, gradients_out);
-        }
-    }
-
-    static void evaluate_hessians_to(const OrderData& data,
-                                     const math::Vector<Real, 3>& xi,
-                                     Real* SVMP_RESTRICT hessians_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
-                store_hessian(data.apex.hessians[i], hessians_out + i * 9u);
-            }
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_hessians.resize(data.modal_terms.size());
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-        for (std::size_t m = 0; m < data.modal_terms.size(); ++m) {
-            Real value = Real(0);
-            pyramid_modal::evaluate_term(data.modal_terms[m], modal_point, value, nullptr, &modal_hessians[m]);
-        }
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_to(data, modal_hessians, hessians_out);
-        } else {
-            apply_modal_to_nodal_to(data, modal_hessians, hessians_out);
-        }
-    }
-
-    static void evaluate_all_to(const OrderData& data,
-                                const math::Vector<Real, 3>& xi,
-                                Real* SVMP_RESTRICT values_out,
-                                Real* SVMP_RESTRICT gradients_out,
-                                Real* SVMP_RESTRICT hessians_out) {
-        validate_top_plane_query(xi);
-        if (is_apex_point(xi)) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            std::copy(data.apex.values.begin(), data.apex.values.end(), values_out);
-            for (std::size_t i = 0; i < data.apex.gradients.size(); ++i) {
-                gradients_out[i * 3u + 0u] = data.apex.gradients[i][0];
-                gradients_out[i * 3u + 1u] = data.apex.gradients[i][1];
-                gradients_out[i * 3u + 2u] = data.apex.gradients[i][2];
-            }
-            for (std::size_t i = 0; i < data.apex.hessians.size(); ++i) {
-                const Real* hessian = data.apex.hessians[i].data();
-                std::copy(hessian, hessian + 9u, hessians_out + i * 9u);
-            }
-            return;
-        }
-
-        const std::size_t n = data.modal_terms.size();
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        modal_values.resize(n);
-        modal_gradients.resize(n);
-        modal_hessians.resize(n);
-        pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-
-        for (std::size_t m = 0; m < n; ++m) {
-            pyramid_modal::evaluate_term(
-                data.modal_terms[m], modal_point, modal_values[m], &modal_gradients[m], &modal_hessians[m]);
-        }
-
-        if (has_low_order_fast_modal_to_nodal(data)) {
-            apply_sparse_basis_to_nodal_all_to(
-                data, modal_values, modal_gradients, modal_hessians, values_out, gradients_out, hessians_out);
-            return;
-        }
-
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Real value = Real(0);
-            Real gradient[3] = {Real(0), Real(0), Real(0)};
-            Real hessian[9] = {};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                value += coeff * modal_values[modal_j];
-
-                const Real* modal_gradient = modal_gradients[modal_j].data();
-                gradient[0] += coeff * modal_gradient[0];
-                gradient[1] += coeff * modal_gradient[1];
-                gradient[2] += coeff * modal_gradient[2];
-
-                const Real* modal_hessian = modal_hessians[modal_j].data();
-                hessian[0] += coeff * modal_hessian[0];
-                hessian[1] += coeff * modal_hessian[1];
-                hessian[2] += coeff * modal_hessian[2];
-                hessian[4] += coeff * modal_hessian[4];
-                hessian[5] += coeff * modal_hessian[5];
-                hessian[8] += coeff * modal_hessian[8];
-            }
-
-            values_out[basis_i] = value;
-            Real* gradient_out = gradients_out + basis_i * 3u;
-            gradient_out[0] = gradient[0];
-            gradient_out[1] = gradient[1];
-            gradient_out[2] = gradient[2];
-
-            Real* hessian_out = hessians_out + basis_i * 9u;
-            hessian_out[0] = hessian[0];
-            hessian_out[1] = hessian[1];
-            hessian_out[2] = hessian[2];
-            hessian_out[3] = hessian[1];
-            hessian_out[4] = hessian[4];
-            hessian_out[5] = hessian[5];
-            hessian_out[6] = hessian[2];
-            hessian_out[7] = hessian[5];
-            hessian_out[8] = hessian[8];
-        }
-    }
-
-    static void evaluate_at_quadrature_points_strided(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        const unsigned mask = (values_out != nullptr ? 1u : 0u) |
-                              (gradients_out != nullptr ? 2u : 0u) |
-                              (hessians_out != nullptr ? 4u : 0u);
-        switch (mask) {
-            case 0u:
-                validate_strided_points(points);
-                return;
-            case 1u:
-                evaluate_at_quadrature_points_strided_impl<true, false, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 2u:
-                evaluate_at_quadrature_points_strided_impl<false, true, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 3u:
-                evaluate_at_quadrature_points_strided_impl<true, true, false>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 4u:
-                evaluate_at_quadrature_points_strided_impl<false, false, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 5u:
-                evaluate_at_quadrature_points_strided_impl<true, false, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 6u:
-                evaluate_at_quadrature_points_strided_impl<false, true, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            case 7u:
-                evaluate_at_quadrature_points_strided_impl<true, true, true>(
-                    data, points, output_stride, values_out, gradients_out, hessians_out);
-                return;
-            default:
-                return;
-        }
-    }
-
-private:
-    static void validate_strided_points(const std::vector<math::Vector<Real, 3>>& points) {
-        for (const auto& xi : points) {
-            validate_top_plane_query(xi);
-        }
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void write_apex_strided(const OrderData& data,
-                                   std::size_t q,
-                                   std::size_t output_stride,
-                                   Real* SVMP_RESTRICT values_out,
-                                   Real* SVMP_RESTRICT gradients_out,
-                                   Real* SVMP_RESTRICT hessians_out) {
-        const std::size_t n = data.modal_terms.size();
-        if constexpr (NeedValues) {
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                values_out[basis_i * output_stride + q] = data.apex.values[basis_i];
-            }
-        }
-        if constexpr (NeedGradients) {
-            if (data.apex.gradient_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("gradient", data.apex.gradient_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                Real* g = gradients_out + basis_i * 3u * output_stride;
-                g[0u * output_stride + q] = data.apex.gradients[basis_i][0];
-                g[1u * output_stride + q] = data.apex.gradients[basis_i][1];
-                g[2u * output_stride + q] = data.apex.gradients[basis_i][2];
-            }
-        }
-        if constexpr (NeedHessians) {
-            if (data.apex.hessian_status != ApexRankStatus::Exact) {
-                throw BasisEvaluationException(
-                    apex_status_message("Hessian", data.apex.hessian_status),
-                    __FILE__, __LINE__, __func__);
-            }
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                const Real* hessian = data.apex.hessians[basis_i].data();
-                Real* H = hessians_out + basis_i * 9u * output_stride;
-                for (std::size_t component = 0; component < 9u; ++component) {
-                    H[component * output_stride + q] = hessian[component];
-                }
-            }
-        }
-    }
-
-    template <int Px,
-              int Py,
-              int Pz,
-              int DenomPower,
-              bool NeedValues,
-              bool NeedGradients,
-              bool NeedHessians>
-    static void fill_low_order_modal_jet(std::size_t modal_i,
-                                         const Real* SVMP_RESTRICT xp,
-                                         const Real* SVMP_RESTRICT yp,
-                                         const Real* SVMP_RESTRICT zp,
-                                         const Real* SVMP_RESTRICT inv_tp,
-                                         Real* SVMP_RESTRICT modal_values,
-                                         Real (*SVMP_RESTRICT modal_gradients)[3],
-                                         Real (*SVMP_RESTRICT modal_hessians)[9]) {
-        const Real xy_base = xp[Px] * yp[Py];
-        const Real base = xy_base * zp[Pz];
-        const Real inv_denom = inv_tp[DenomPower];
-        const Real value = base * inv_denom;
-
-        if constexpr (NeedValues) {
-            modal_values[modal_i] = value;
-        }
-        if constexpr (NeedGradients) {
-            Real* g = modal_gradients[modal_i];
-            if constexpr (Px > 0) {
-                g[0] = static_cast<Real>(Px) * xp[Px - 1] * yp[Py] * zp[Pz] * inv_denom;
-            } else {
-                g[0] = Real(0);
-            }
-            if constexpr (Py > 0) {
-                g[1] = static_cast<Real>(Py) * xp[Px] * yp[Py - 1] * zp[Pz] * inv_denom;
-            } else {
-                g[1] = Real(0);
-            }
-            Real gz = Real(0);
-            if constexpr (Pz > 0) {
-                gz += static_cast<Real>(Pz) * xy_base * zp[Pz - 1] * inv_denom;
-            }
-            if constexpr (DenomPower > 0) {
-                gz += static_cast<Real>(DenomPower) * base * inv_tp[DenomPower + 1];
-            }
-            g[2] = gz;
-        }
-        if constexpr (NeedHessians) {
-            Real* H = modal_hessians[modal_i];
-            if constexpr (Px > 1) {
-                H[0] = static_cast<Real>(Px * (Px - 1)) *
-                       xp[Px - 2] * yp[Py] * zp[Pz] * inv_denom;
-            } else {
-                H[0] = Real(0);
-            }
-            if constexpr (Py > 1) {
-                H[4] = static_cast<Real>(Py * (Py - 1)) *
-                       xp[Px] * yp[Py - 2] * zp[Pz] * inv_denom;
-            } else {
-                H[4] = Real(0);
-            }
-            Real hxy = Real(0);
-            if constexpr (Px > 0 && Py > 0) {
-                hxy = static_cast<Real>(Px * Py) *
-                      xp[Px - 1] * yp[Py - 1] * zp[Pz] * inv_denom;
-            }
-            H[1] = hxy;
-            H[3] = hxy;
-
-            Real hxz = Real(0);
-            if constexpr (Px > 0) {
-                constexpr Real px_real = static_cast<Real>(Px);
-                const Real x_deriv_y = px_real * xp[Px - 1] * yp[Py];
-                if constexpr (Pz > 0) {
-                    hxz += x_deriv_y * static_cast<Real>(Pz) *
-                           zp[Pz - 1] * inv_denom;
-                }
-                if constexpr (DenomPower > 0) {
-                    hxz += x_deriv_y * static_cast<Real>(DenomPower) *
-                           zp[Pz] * inv_tp[DenomPower + 1];
-                }
-            }
-            H[2] = hxz;
-            H[6] = hxz;
-
-            Real hyz = Real(0);
-            if constexpr (Py > 0) {
-                constexpr Real py_real = static_cast<Real>(Py);
-                const Real x_y_deriv = py_real * xp[Px] * yp[Py - 1];
-                if constexpr (Pz > 0) {
-                    hyz += x_y_deriv * static_cast<Real>(Pz) *
-                           zp[Pz - 1] * inv_denom;
-                }
-                if constexpr (DenomPower > 0) {
-                    hyz += x_y_deriv * static_cast<Real>(DenomPower) *
-                           zp[Pz] * inv_tp[DenomPower + 1];
-                }
-            }
-            H[5] = hyz;
-            H[7] = hyz;
-
-            Real hzz = Real(0);
-            if constexpr (Pz > 1) {
-                hzz += static_cast<Real>(Pz * (Pz - 1)) *
-                       xy_base * zp[Pz - 2] * inv_denom;
-            }
-            if constexpr (Pz > 0 && DenomPower > 0) {
-                hzz += static_cast<Real>(2 * Pz * DenomPower) * xy_base *
-                       zp[Pz - 1] * inv_tp[DenomPower + 1];
-            }
-            if constexpr (DenomPower > 0) {
-                hzz += static_cast<Real>(DenomPower * (DenomPower + 1)) *
-                       base * inv_tp[DenomPower + 2];
-            }
-            H[8] = hzz;
-        }
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void evaluate_low_order_modal_jets(const OrderData& data,
-                                              const math::Vector<Real, 3>& xi,
-                                              Real* SVMP_RESTRICT modal_values,
-                                              Real (*SVMP_RESTRICT modal_gradients)[3],
-                                              Real (*SVMP_RESTRICT modal_hessians)[9]) {
-        const Real x = xi[0];
-        const Real y = xi[1];
-        const Real z = xi[2];
-        const Real inv_t = Real(1) / (Real(1) - z);
-        const Real xp[3] = {Real(1), x, x * x};
-        const Real yp[3] = {Real(1), y, y * y};
-        const Real zp[3] = {Real(1), z, z * z};
-        Real inv_tp[5] = {Real(1), inv_t, Real(0), Real(0), Real(0)};
-        inv_tp[2] = inv_tp[1] * inv_t;
-        inv_tp[3] = inv_tp[2] * inv_t;
-        inv_tp[4] = inv_tp[3] * inv_t;
-
-        fill_low_order_modal_jet<0, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            0u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            1u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        if (data.order == 1) {
-            fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-                2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-                3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-                4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-            return;
-        }
-
-        fill_low_order_modal_jet<2, 0, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            2u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 1, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            3u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            4u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<2, 1, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            5u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 2, 0, 0, NeedValues, NeedGradients, NeedHessians>(
-            6u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 2, 0, 1, NeedValues, NeedGradients, NeedHessians>(
-            7u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<2, 2, 0, 2, NeedValues, NeedGradients, NeedHessians>(
-            8u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            9u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 0, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            10u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 1, 1, 0, NeedValues, NeedGradients, NeedHessians>(
-            11u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<1, 1, 1, 1, NeedValues, NeedGradients, NeedHessians>(
-            12u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-        fill_low_order_modal_jet<0, 0, 2, 0, NeedValues, NeedGradients, NeedHessians>(
-            13u, xp, yp, zp, inv_tp, modal_values, modal_gradients, modal_hessians);
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static bool try_evaluate_low_order_strided(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        if (!has_low_order_fast_modal_to_nodal(data)) {
-            return false;
-        }
-        for (const auto& xi : points) {
-            validate_top_plane_query(xi);
-            if (is_apex_point(xi)) {
-                return false;
-            }
-        }
-
-        Real modal_values[14];
-        Real modal_gradients[14][3];
-        Real modal_hessians[14][9];
-        for (std::size_t q = 0; q < points.size(); ++q) {
-            evaluate_low_order_modal_jets<NeedValues, NeedGradients, NeedHessians>(
-                data, points[q], modal_values, modal_gradients, modal_hessians);
-            if constexpr (NeedValues) {
-                apply_low_order_combination(
-                    data,
-                    1u,
-                    [&](std::size_t modal_i, std::size_t) {
-                        return modal_values[modal_i];
-                    },
-                    [&](std::size_t basis_i, std::size_t, Real value) {
-                        values_out[basis_i * output_stride + q] = value;
-                    });
-            }
-            if constexpr (NeedGradients) {
-                apply_low_order_combination(
-                    data,
-                    3u,
-                    [&](std::size_t modal_i, std::size_t component) {
-                        return modal_gradients[modal_i][component];
-                    },
-                    [&](std::size_t basis_i, std::size_t component, Real value) {
-                        gradients_out[basis_i * 3u * output_stride +
-                                      component * output_stride + q] = value;
-                    });
-            }
-            if constexpr (NeedHessians) {
-                apply_low_order_combination(
-                    data,
-                    9u,
-                    [&](std::size_t modal_i, std::size_t component) {
-                        return modal_hessians[modal_i][component];
-                    },
-                    [&](std::size_t basis_i, std::size_t component, Real value) {
-                        hessians_out[basis_i * 9u * output_stride +
-                                     component * output_stride + q] = value;
-                    });
-            }
-        }
-        return true;
-    }
-
-    template <bool NeedValues, bool NeedGradients, bool NeedHessians>
-    static void evaluate_at_quadrature_points_strided_impl(
-        const OrderData& data,
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT gradients_out,
-        Real* SVMP_RESTRICT hessians_out) {
-        const std::size_t n = data.modal_terms.size();
-        if (points.empty() || n == 0u) {
-            return;
-        }
-        if (try_evaluate_low_order_strided<NeedValues, NeedGradients, NeedHessians>(
-                data, points, output_stride, values_out, gradients_out, hessians_out)) {
-            return;
-        }
-
-        auto& scratch = evaluation_scratch();
-        auto& modal_values = scratch.modal_values;
-        auto& modal_gradients = scratch.modal_gradients;
-        auto& modal_hessians = scratch.modal_hessians;
-        auto& modal_point = scratch.modal_point;
-        if constexpr (NeedValues) {
-            modal_values.resize(n);
-        }
-        if constexpr (NeedGradients) {
-            modal_gradients.resize(n);
-        }
-        if constexpr (NeedHessians) {
-            modal_hessians.resize(n);
-        }
-        const bool use_fast_modal_to_nodal = has_low_order_fast_modal_to_nodal(data);
-
-        if (!use_fast_modal_to_nodal) {
-            bool has_apex_query = false;
-            for (const auto& xi : points) {
-                validate_top_plane_query(xi);
-                has_apex_query = has_apex_query || is_apex_point(xi);
-            }
-
-            if (!has_apex_query) {
-                const std::size_t num_qpts = points.size();
-                if constexpr (NeedValues) {
-                    modal_values.resize(n * num_qpts);
-                }
-                if constexpr (NeedGradients) {
-                    scratch.modal_gradient_components.resize(n * 3u * num_qpts);
-                }
-                if constexpr (NeedHessians) {
-                    scratch.modal_hessian_components.resize(n * 9u * num_qpts);
-                }
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const auto& xi = points[q];
-                    pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-                    for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                        Real modal_value = Real(0);
-                        Gradient modal_gradient{};
-                        Hessian modal_hessian{};
-                        pyramid_modal::evaluate_term(
-                            data.modal_terms[modal_j],
-                            modal_point,
-                            modal_value,
-                            NeedGradients ? &modal_gradient : nullptr,
-                            NeedHessians ? &modal_hessian : nullptr);
-                        if constexpr (NeedValues) {
-                            modal_values[modal_j * num_qpts + q] = modal_value;
-                        }
-                        if constexpr (NeedGradients) {
-                            for (std::size_t component = 0; component < 3u; ++component) {
-                                scratch.modal_gradient_components[
-                                    (modal_j * 3u + component) * num_qpts + q] =
-                                    modal_gradient[component];
-                            }
-                        }
-                        if constexpr (NeedHessians) {
-                            for (std::size_t component = 0; component < 9u; ++component) {
-                                scratch.modal_hessian_components[
-                                    (modal_j * 9u + component) * num_qpts + q] =
-                                    modal_hessian.data()[component];
-                            }
-                        }
-                    }
-                }
-
-                const Real* transform = data.modal_to_nodal.data();
-                if constexpr (NeedValues) {
-                    math::dense_transform_batched_row_major(
-                        transform,
-                        n,
-                        n,
-                        modal_values.data(),
-                        num_qpts,
-                        values_out,
-                        output_stride,
-                        num_qpts);
-                }
-                if constexpr (NeedGradients) {
-                    for (std::size_t component = 0; component < 3u; ++component) {
-                        math::dense_transform_batched_row_major(
-                            transform,
-                            n,
-                            n,
-                            scratch.modal_gradient_components.data() + component * num_qpts,
-                            3u * num_qpts,
-                            gradients_out + component * output_stride,
-                            3u * output_stride,
-                            num_qpts);
-                    }
-                }
-                if constexpr (NeedHessians) {
-                    for (std::size_t component = 0; component < 9u; ++component) {
-                        math::dense_transform_batched_row_major(
-                            transform,
-                            n,
-                            n,
-                            scratch.modal_hessian_components.data() + component * num_qpts,
-                            9u * num_qpts,
-                            hessians_out + component * output_stride,
-                            9u * output_stride,
-                            num_qpts);
-                    }
-                }
-                return;
-            }
-        }
-
-        for (std::size_t q = 0; q < points.size(); ++q) {
-            const auto& xi = points[q];
-            validate_top_plane_query(xi);
-
-            if (is_apex_point(xi)) {
-                write_apex_strided<NeedValues, NeedGradients, NeedHessians>(
-                    data, q, output_stride, values_out, gradients_out, hessians_out);
-                continue;
-            }
-
-            pyramid_modal::prepare_evaluation_point(data.modal_terms, xi, modal_point);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                Gradient* gradient_out = nullptr;
-                Hessian* hessian_out = nullptr;
-                if constexpr (NeedGradients) {
-                    gradient_out = &modal_gradients[modal_j];
-                }
-                if constexpr (NeedHessians) {
-                    hessian_out = &modal_hessians[modal_j];
-                }
-                if constexpr (NeedValues) {
-                    pyramid_modal::evaluate_term(
-                        data.modal_terms[modal_j],
-                        modal_point,
-                        modal_values[modal_j],
-                        gradient_out,
-                        hessian_out);
-                } else {
-                    Real value = Real(0);
-                    pyramid_modal::evaluate_term(
-                        data.modal_terms[modal_j],
-                        modal_point,
-                        value,
-                        gradient_out,
-                        hessian_out);
-                }
-            }
-
-            if (use_fast_modal_to_nodal) {
-                if constexpr (NeedValues) {
-                    apply_low_order_combination(
-                        data,
-                        1u,
-                        [&](std::size_t modal_i, std::size_t) {
-                            return modal_values[modal_i];
-                        },
-                        [&](std::size_t basis_i, std::size_t, Real value) {
-                            values_out[basis_i * output_stride + q] = value;
-                        });
-                }
-                if constexpr (NeedGradients) {
-                    apply_low_order_combination(
-                        data,
-                        3u,
-                        [&](std::size_t modal_i, std::size_t component) {
-                            return modal_gradients[modal_i][component];
-                        },
-                        [&](std::size_t basis_i, std::size_t component, Real value) {
-                            gradients_out[basis_i * 3u * output_stride +
-                                          component * output_stride + q] = value;
-                        });
-                }
-                if constexpr (NeedHessians) {
-                    apply_low_order_combination(
-                        data,
-                        9u,
-                        [&](std::size_t modal_i, std::size_t component) {
-                            return modal_hessians[modal_i].data()[component];
-                        },
-                        [&](std::size_t basis_i, std::size_t component, Real value) {
-                            hessians_out[basis_i * 9u * output_stride +
-                                         component * output_stride + q] = value;
-                        });
-                }
-                continue;
-            }
-
-            for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-                const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
-                [[maybe_unused]] Real value = Real(0);
-                [[maybe_unused]] std::array<Real, 3> gradient{};
-                [[maybe_unused]] std::array<Real, 9> hessian{};
-
-                for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                    const Real coeff = matrix_row[modal_j];
-                    if constexpr (NeedValues) {
-                        value += coeff * modal_values[modal_j];
-                    }
-                    if constexpr (NeedGradients) {
-                        const Real* modal_gradient = modal_gradients[modal_j].data();
-                        gradient[0] += coeff * modal_gradient[0];
-                        gradient[1] += coeff * modal_gradient[1];
-                        gradient[2] += coeff * modal_gradient[2];
-                    }
-                    if constexpr (NeedHessians) {
-                        const Real* modal_hessian = modal_hessians[modal_j].data();
-                        for (std::size_t component = 0; component < 9u; ++component) {
-                            hessian[component] += coeff * modal_hessian[component];
-                        }
-                    }
-                }
-
-                if constexpr (NeedValues) {
-                    values_out[basis_i * output_stride + q] = value;
-                }
-                if constexpr (NeedGradients) {
-                    Real* g = gradients_out + basis_i * 3u * output_stride;
-                    g[0u * output_stride + q] = gradient[0];
-                    g[1u * output_stride + q] = gradient[1];
-                    g[2u * output_stride + q] = gradient[2];
-                }
-                if constexpr (NeedHessians) {
-                    Real* H = hessians_out + basis_i * 9u * output_stride;
-                    for (std::size_t component = 0; component < 9u; ++component) {
-                        H[component * output_stride + q] = hessian[component];
-                    }
-                }
-            }
-        }
-    }
-
-    static Real apex_coord_tolerance() noexcept {
-        return basis_scaled_tolerance();
-    }
-
-    // Coefficient pruning for symbolic apex series, not a reference-coordinate
-    // roundoff test. Keep this strict and separate from BasisTolerance.
-    static constexpr Real kSeriesTolerance = Real(1e-12);
-
-    static Real binomial_coeff(int n, int k) {
-        if (k < 0 || k > n) {
-            return Real(0);
-        }
-        if (k == 0 || k == n) {
-            return Real(1);
-        }
-        k = std::min(k, n - k);
-        Real coeff = Real(1);
-        for (int i = 1; i <= k; ++i) {
-            coeff *= static_cast<Real>(n - (k - i));
-            coeff /= static_cast<Real>(i);
-        }
-        return coeff;
-    }
-
-    static void add_z_expansion(ApexSeries& series,
-                                int z_power,
-                                int beta0,
-                                int pu,
-                                int pv,
-                                Real coeff) {
-        for (int q = 0; q <= z_power; ++q) {
-            const Real z_coeff = coeff * binomial_coeff(z_power, q) *
-                                 ((q % 2 == 0) ? Real(1) : Real(-1));
-            series.add_term(beta0 + q, pu, pv, z_coeff, kSeriesTolerance);
-        }
-    }
-
-    static ApexSeries modal_value_asymptotic(const ModalTerm& term) {
-        ApexSeries series;
-        add_z_expansion(series,
-                        term.pz,
-                        term.px + term.py - term.denom_power,
-                        term.px,
-                        term.py,
-                        Real(1));
-        return series;
-    }
-
-    static GradientSeries modal_gradient_asymptotic(const ModalTerm& term) {
-        GradientSeries gradient_series{};
-
-        if (term.px > 0) {
-            add_z_expansion(gradient_series[0],
-                            term.pz,
-                            term.px - 1 + term.py - term.denom_power,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px));
-        }
-
-        if (term.py > 0) {
-            add_z_expansion(gradient_series[1],
-                            term.pz,
-                            term.px + term.py - 1 - term.denom_power,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py));
-        }
-
-        if (term.pz > 0) {
-            add_z_expansion(gradient_series[2],
-                            term.pz - 1,
-                            term.px + term.py - term.denom_power,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.pz));
-        }
-        if (term.denom_power > 0) {
-            add_z_expansion(gradient_series[2],
-                            term.pz,
-                            term.px + term.py - term.denom_power - 1,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.denom_power));
-        }
-
-        return gradient_series;
-    }
-
-    static HessianSeries modal_hessian_asymptotic(const ModalTerm& term) {
-        HessianSeries hessian_series{};
-
-        if (term.px > 1) {
-            add_z_expansion(hessian_series[0][0],
-                            term.pz,
-                            term.px - 2 + term.py - term.denom_power,
-                            term.px - 2,
-                            term.py,
-                            static_cast<Real>(term.px * (term.px - 1)));
-        }
-
-        if (term.py > 1) {
-            add_z_expansion(hessian_series[1][1],
-                            term.pz,
-                            term.px + term.py - 2 - term.denom_power,
-                            term.px,
-                            term.py - 2,
-                            static_cast<Real>(term.py * (term.py - 1)));
-        }
-
-        if (term.px > 0 && term.py > 0) {
-            add_z_expansion(hessian_series[0][1],
-                            term.pz,
-                            term.px + term.py - 2 - term.denom_power,
-                            term.px - 1,
-                            term.py - 1,
-                            static_cast<Real>(term.px * term.py));
-            hessian_series[1][0] = hessian_series[0][1];
-        }
-
-        if (term.px > 0 && term.pz > 0) {
-            add_z_expansion(hessian_series[0][2],
-                            term.pz - 1,
-                            term.px - 1 + term.py - term.denom_power,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px * term.pz));
-        }
-        if (term.px > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[0][2],
-                            term.pz,
-                            term.px - 1 + term.py - term.denom_power - 1,
-                            term.px - 1,
-                            term.py,
-                            static_cast<Real>(term.px * term.denom_power));
-        }
-        hessian_series[2][0] = hessian_series[0][2];
-
-        if (term.py > 0 && term.pz > 0) {
-            add_z_expansion(hessian_series[1][2],
-                            term.pz - 1,
-                            term.px + term.py - 1 - term.denom_power,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py * term.pz));
-        }
-        if (term.py > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[1][2],
-                            term.pz,
-                            term.px + term.py - 1 - term.denom_power - 1,
-                            term.px,
-                            term.py - 1,
-                            static_cast<Real>(term.py * term.denom_power));
-        }
-        hessian_series[2][1] = hessian_series[1][2];
-
-        if (term.pz > 1) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz - 2,
-                            term.px + term.py - term.denom_power,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.pz * (term.pz - 1)));
-        }
-        if (term.pz > 0 && term.denom_power > 0) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz - 1,
-                            term.px + term.py - term.denom_power - 1,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(2 * term.pz * term.denom_power));
-        }
-        if (term.denom_power > 0) {
-            add_z_expansion(hessian_series[2][2],
-                            term.pz,
-                            term.px + term.py - term.denom_power - 2,
-                            term.px,
-                            term.py,
-                            static_cast<Real>(term.denom_power * (term.denom_power + 1)));
-        }
-
-        return hessian_series;
-    }
-
-    static ApexClassification classify_series(const ApexSeries& series) {
-        for (const auto& [beta, poly] : series.by_power) {
-            if (poly.empty(kSeriesTolerance)) {
-                continue;
-            }
-            if (beta < 0) {
-                return {ApexLimitKind::Singular, Real(0), beta};
-            }
-            if (beta > 0) {
-                return {ApexLimitKind::Constant, Real(0), beta};
-            }
-            if (poly.is_constant(kSeriesTolerance)) {
-                return {ApexLimitKind::Constant, poly.constant_value(kSeriesTolerance), beta};
-            }
-            return {ApexLimitKind::DirectionDependent, Real(0), beta};
-        }
-        return {ApexLimitKind::Constant, Real(0), 1};
-    }
-
-    static void accumulate_rank_status(ApexRankStatus& status,
-                                       const ApexClassification& classification) {
-        if (classification.kind == ApexLimitKind::Singular) {
-            status = ApexRankStatus::Singular;
-            return;
-        }
-        if (classification.kind == ApexLimitKind::DirectionDependent &&
-            status != ApexRankStatus::Singular) {
-            status = ApexRankStatus::DirectionDependent;
-        }
-    }
-
-    static std::string apex_status_message(const char* rank,
-                                           ApexRankStatus status) {
-        switch (status) {
-            case ApexRankStatus::DirectionDependent:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " at the exact apex is not uniquely defined under admissible interior approaches";
-            case ApexRankStatus::Singular:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " at the exact apex is singular for this basis family";
-            case ApexRankStatus::Exact:
-                return std::string("Pyramid rational nodal ") + rank +
-                       " apex evaluation unexpectedly reported non-exact status";
-        }
-        return std::string("Pyramid rational nodal ") + rank +
-               " apex evaluation is not available";
-    }
-
-    static ApexData build_apex_data(const OrderData& data) {
-        const std::size_t n = data.modal_terms.size();
-
-        std::vector<ApexSeries> modal_values(n);
-        std::vector<GradientSeries> modal_gradients(n);
-        std::vector<HessianSeries> modal_hessians(n);
-        for (std::size_t m = 0; m < n; ++m) {
-            modal_values[m] = modal_value_asymptotic(data.modal_terms[m]);
-            modal_gradients[m] = modal_gradient_asymptotic(data.modal_terms[m]);
-            modal_hessians[m] = modal_hessian_asymptotic(data.modal_terms[m]);
-        }
-
-        std::vector<ApexSeries> nodal_values(n);
-        std::vector<GradientSeries> nodal_gradients(n);
-        std::vector<HessianSeries> nodal_hessians(n);
-        for (std::size_t i = 0; i < n; ++i) {
-            for (std::size_t m = 0; m < n; ++m) {
-                const Real coeff = data.modal_to_nodal[i * n + m];
-                nodal_values[i].add_scaled(modal_values[m], coeff, kSeriesTolerance);
-                for (int d = 0; d < 3; ++d) {
-                    nodal_gradients[i][static_cast<std::size_t>(d)].add_scaled(
-                        modal_gradients[m][static_cast<std::size_t>(d)], coeff, kSeriesTolerance);
-                }
-                for (int r = 0; r < 3; ++r) {
-                    for (int c = 0; c < 3; ++c) {
-                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]
-                            .add_scaled(
-                                modal_hessians[m][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)],
-                                coeff,
-                                kSeriesTolerance);
-                    }
-                }
-            }
-        }
-
-        ApexData apex;
-        apex.values.assign(n, Real(0));
-        apex.gradients.assign(n, Gradient{});
-        apex.hessians.assign(n, Hessian{});
-
-        for (std::size_t i = 0; i < n; ++i) {
-            const ApexClassification value_class = classify_series(nodal_values[i]);
-            if (value_class.kind != ApexLimitKind::Constant) {
-                throw BasisConstructionException(
-                    "Pyramid nodal value at apex is not uniquely defined for basis index " +
-                    std::to_string(i),
-                    __FILE__, __LINE__, __func__);
-            }
-            apex.values[i] = value_class.constant_value;
-
-            for (int d = 0; d < 3; ++d) {
-                const ApexClassification grad_class = classify_series(
-                    nodal_gradients[i][static_cast<std::size_t>(d)]);
-                accumulate_rank_status(apex.gradient_status, grad_class);
-                if (grad_class.kind == ApexLimitKind::Constant) {
-                    apex.gradients[i][static_cast<std::size_t>(d)] = grad_class.constant_value;
-                }
-            }
-
-            for (int r = 0; r < 3; ++r) {
-                for (int c = 0; c < 3; ++c) {
-                    const ApexClassification hess_class = classify_series(
-                        nodal_hessians[i][static_cast<std::size_t>(r)][static_cast<std::size_t>(c)]);
-                    accumulate_rank_status(apex.hessian_status, hess_class);
-                    if (hess_class.kind == ApexLimitKind::Constant) {
-                        apex.hessians[i](static_cast<std::size_t>(r),
-                                         static_cast<std::size_t>(c)) = hess_class.constant_value;
-                    }
-                }
-            }
-        }
-
-        if (apex.gradient_status != ApexRankStatus::Exact) {
-            apex.gradients.clear();
-        }
-        if (apex.hessian_status != ApexRankStatus::Exact) {
-            apex.hessians.clear();
-        }
-
-        return apex;
-    }
-
-    static std::vector<math::Vector<Real, 3>> build_public_nodes(int order) {
-        if (order == 0) {
-            return {math::Vector<Real, 3>{Real(0), Real(0), Real(0.25)}};
-        }
-
-        std::vector<math::Vector<Real, 3>> nodes;
-        nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
-
-        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(-1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(-1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(1), Real(1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(-1), Real(1), Real(0)});
-        nodes.push_back(math::Vector<Real, 3>{Real(0), Real(0), Real(1)});
-
-        for (int m = 1; m < order; ++m) {
-            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(-1), Real(0)});
-        }
-        for (int m = 1; m < order; ++m) {
-            nodes.push_back(math::Vector<Real, 3>{Real(1), equispaced_pm_one_coord(m, order), Real(0)});
-        }
-        for (int m = order - 1; m >= 1; --m) {
-            nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(m, order), Real(1), Real(0)});
-        }
-        for (int m = order - 1; m >= 1; --m) {
-            nodes.push_back(math::Vector<Real, 3>{Real(-1), equispaced_pm_one_coord(m, order), Real(0)});
-        }
-
-        for (int level = 1; level < order; ++level) {
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-            nodes.push_back(math::Vector<Real, 3>{-scale, -scale, z});
-            nodes.push_back(math::Vector<Real, 3>{scale, -scale, z});
-            nodes.push_back(math::Vector<Real, 3>{scale, scale, z});
-            nodes.push_back(math::Vector<Real, 3>{-scale, scale, z});
-        }
-
-        for (int j = 1; j < order; ++j) {
-            for (int i = 1; i < order; ++i) {
-                nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, order),
-                                                      equispaced_pm_one_coord(j, order),
-                                                      Real(0)});
-            }
-        }
-
-        for (int level = 1; level < order - 1; ++level) {
-            const int n = order - level;
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-
-            for (int m = 1; m < n; ++m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{s, -scale, z});
-            }
-            for (int m = 1; m < n; ++m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{scale, s, z});
-            }
-            for (int m = n - 1; m >= 1; --m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{s, scale, z});
-            }
-            for (int m = n - 1; m >= 1; --m) {
-                const Real s = equispaced_pm_one_coord(m, n) * scale;
-                nodes.push_back(math::Vector<Real, 3>{-scale, s, z});
-            }
-        }
-
-        for (int level = 1; level < order - 1; ++level) {
-            const int n = order - level;
-            const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-            const Real scale = Real(1) - z;
-            for (int j = 1; j < n; ++j) {
-                for (int i = 1; i < n; ++i) {
-                    nodes.push_back(math::Vector<Real, 3>{equispaced_pm_one_coord(i, n) * scale,
-                                                          equispaced_pm_one_coord(j, n) * scale,
-                                                          z});
-                }
-            }
-        }
-
-        return nodes;
-    }
-
-    struct VectorValueSink {
-        std::vector<Real>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, Real value) const { output[i] = value; }
-    };
-
-    struct RawValueSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, Real value) const { output[i] = value; }
-    };
-
-    struct VectorGradientSink {
-        std::vector<Gradient>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, const Gradient& value) const { output[i] = value; }
-    };
-
-    struct RawGradientSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, const Gradient& value) const {
-            Real* dst = output + i * 3u;
-            dst[0] = value[0];
-            dst[1] = value[1];
-            dst[2] = value[2];
-        }
-    };
-
-    struct VectorHessianSink {
-        std::vector<Hessian>& output;
-        void resize(std::size_t n) const { output.resize(n); }
-        void write(std::size_t i, const Hessian& value) const { output[i] = value; }
-    };
-
-    struct RawHessianSink {
-        Real* output;
-        void resize(std::size_t) const {}
-        void write(std::size_t i, const Hessian& value) const {
-            store_hessian(value, output + i * 9u);
-        }
-    };
-
-    template <typename Get, typename Set>
-    static void apply_order1_combination(std::size_t components,
-                                         const Get& get,
-                                         const Set& set) {
-        for (std::size_t c = 0; c < components; ++c) {
-            const Real m0 = get(0u, c);
-            const Real m1 = get(1u, c);
-            const Real m2 = get(2u, c);
-            const Real m3 = get(3u, c);
-            const Real m4 = get(4u, c);
-            set(0u, c, Real(0.25) * (m0 - m1 - m2 + m3 - m4));
-            set(1u, c, Real(0.25) * (m0 + m1 - m2 - m3 - m4));
-            set(2u, c, Real(0.25) * (m0 + m1 + m2 + m3 - m4));
-            set(3u, c, Real(0.25) * (m0 - m1 + m2 - m3 - m4));
-            set(4u, c, m4);
-        }
-    }
-
-    template <typename Get, typename Set>
-    static void apply_order2_combination(std::size_t components,
-                                         const Get& get,
-                                         const Set& set) {
-        for (std::size_t c = 0; c < components; ++c) {
-            const Real m0 = get(0u, c);
-            const Real m1 = get(1u, c);
-            const Real m2 = get(2u, c);
-            const Real m3 = get(3u, c);
-            const Real m4 = get(4u, c);
-            const Real m5 = get(5u, c);
-            const Real m6 = get(6u, c);
-            const Real m7 = get(7u, c);
-            const Real m8 = get(8u, c);
-            const Real m9 = get(9u, c);
-            const Real m10 = get(10u, c);
-            const Real m11 = get(11u, c);
-            const Real m12 = get(12u, c);
-            const Real m13 = get(13u, c);
-            set(0u, c, Real(0.25) * (m4 - m5 - m7 + m8 - m9 + m10 + m11 - Real(2) * m12 + m13));
-            set(1u, c, Real(0.25) * (-m4 - m5 + m7 + m8 - m9 - m10 + m11 + Real(2) * m12 + m13));
-            set(2u, c, Real(0.25) * (m4 + m5 + m7 + m8 - m9 - m10 - m11 - Real(2) * m12 + m13));
-            set(3u, c, Real(0.25) * (-m4 + m5 - m7 + m8 - m9 + m10 - m11 + Real(2) * m12 + m13));
-            set(4u, c, -m9 + Real(2) * m13);
-            set(5u, c, Real(0.5) * (-m3 + m5 + m6 - m8 + m11));
-            set(6u, c, Real(0.5) * (m1 + m2 - m7 - m8 - m10));
-            set(7u, c, Real(0.5) * (m3 - m5 + m6 - m8 - m11));
-            set(8u, c, Real(0.5) * (-m1 + m2 + m7 - m8 + m10));
-            set(9u, c, m9 - m10 - m11 + m12 - m13);
-            set(10u, c, m9 + m10 - m11 - m12 - m13);
-            set(11u, c, m9 + m10 + m11 + m12 - m13);
-            set(12u, c, m9 - m10 + m11 - m12 - m13);
-            set(13u, c, m0 - m2 - m6 + m8 - Real(2) * m9 + m13);
-        }
-    }
-
-    template <typename Get, typename Set>
-    static void apply_low_order_combination(const OrderData& data,
-                                            std::size_t components,
-                                            const Get& get,
-                                            const Set& set) {
-        if (data.order == 1) {
-            apply_order1_combination(components, get, set);
-            return;
-        }
-        apply_order2_combination(components, get, set);
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Real>& modal_values,
-                                            std::vector<Real>& nodal_values) {
-        const std::size_t n = modal_values.size();
-        nodal_values.resize(n);
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Real>& modal_values,
-                                               Real* SVMP_RESTRICT nodal_values) {
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Gradient>& modal_gradients,
-                                            std::vector<Gradient>& nodal_gradients) {
-        const std::size_t n = modal_gradients.size();
-        nodal_gradients.resize(n);
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i][component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Gradient>& modal_gradients,
-                                               Real* SVMP_RESTRICT nodal_gradients) {
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i * 3u + component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal(const OrderData& data,
-                                            const std::vector<Hessian>& modal_hessians,
-                                            std::vector<Hessian>& nodal_hessians) {
-        const std::size_t n = modal_hessians.size();
-        nodal_hessians.resize(n);
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i].data()[component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_to(const OrderData& data,
-                                               const std::vector<Hessian>& modal_hessians,
-                                               Real* SVMP_RESTRICT nodal_hessians) {
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i * 9u + component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_all(
-        const OrderData& data,
-        const std::vector<Real>& modal_values,
-        const std::vector<Gradient>& modal_gradients,
-        const std::vector<Hessian>& modal_hessians,
-        std::vector<Real>& nodal_values,
-        std::vector<Gradient>& nodal_gradients,
-        std::vector<Hessian>& nodal_hessians) {
-        const std::size_t n = modal_values.size();
-        nodal_values.resize(n);
-        nodal_gradients.resize(n);
-        nodal_hessians.resize(n);
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i][component] = value;
-            });
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i].data()[component] = value;
-            });
-    }
-
-    static void apply_sparse_basis_to_nodal_all_to(
-        const OrderData& data,
-        const std::vector<Real>& modal_values,
-        const std::vector<Gradient>& modal_gradients,
-        const std::vector<Hessian>& modal_hessians,
-        Real* SVMP_RESTRICT nodal_values,
-        Real* SVMP_RESTRICT nodal_gradients,
-        Real* SVMP_RESTRICT nodal_hessians) {
-        apply_low_order_combination(
-            data,
-            1u,
-            [&](std::size_t modal_i, std::size_t) { return modal_values[modal_i]; },
-            [&](std::size_t basis_i, std::size_t, Real value) { nodal_values[basis_i] = value; });
-        apply_low_order_combination(
-            data,
-            3u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_gradients[modal_i][component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_gradients[basis_i * 3u + component] = value;
-            });
-        apply_low_order_combination(
-            data,
-            9u,
-            [&](std::size_t modal_i, std::size_t component) {
-                return modal_hessians[modal_i].data()[component];
-            },
-            [&](std::size_t basis_i, std::size_t component, Real value) {
-                nodal_hessians[basis_i * 9u + component] = value;
-            });
-    }
-
-    template <typename Sink>
-    // Keep modal transform helpers free of forced-inline attributes unless
-    // compiler-versioned benchmarks and LLVM IR checks show a stable benefit.
-    static void apply_modal_values_to_nodal(const OrderData& data,
-                                            const std::vector<Real>& modal_values,
-                                            const Sink& sink) {
-        const std::size_t n = modal_values.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Real value = Real(0);
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                value += row[modal_j] * modal_values[modal_j];
-            }
-            sink.write(basis_i, value);
-        }
-    }
-
-    template <typename Sink>
-    static void apply_modal_gradients_to_nodal(const OrderData& data,
-                                               const std::vector<Gradient>& modal_gradients,
-                                               const Sink& sink) {
-        const std::size_t n = modal_gradients.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* row = data.modal_to_nodal.data() + basis_i * n;
-            Gradient gradient{};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = row[modal_j];
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    gradient[component] += coeff * modal_gradients[modal_j][component];
-                }
-            }
-            sink.write(basis_i, gradient);
-        }
-    }
-
-    template <typename Sink>
-    static void apply_modal_hessians_to_nodal(const OrderData& data,
-                                              const std::vector<Hessian>& modal_hessians,
-                                              const Sink& sink) {
-        const std::size_t n = modal_hessians.size();
-        sink.resize(n);
-        for (std::size_t basis_i = 0; basis_i < n; ++basis_i) {
-            const Real* matrix_row = data.modal_to_nodal.data() + basis_i * n;
-            Hessian hessian{};
-            for (std::size_t modal_j = 0; modal_j < n; ++modal_j) {
-                const Real coeff = matrix_row[modal_j];
-                for (std::size_t row = 0; row < 3u; ++row) {
-                    for (std::size_t col = 0; col < 3u; ++col) {
-                        hessian(row, col) += coeff * modal_hessians[modal_j](row, col);
-                    }
-                }
-            }
-            sink.write(basis_i, hessian);
-        }
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Real>& modal_values,
-                                     std::vector<Real>& nodal_values) {
-        apply_modal_values_to_nodal(data, modal_values, VectorValueSink{nodal_values});
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Gradient>& modal_gradients,
-                                     std::vector<Gradient>& nodal_gradients) {
-        apply_modal_gradients_to_nodal(data, modal_gradients, VectorGradientSink{nodal_gradients});
-    }
-
-    static void apply_modal_to_nodal(const OrderData& data,
-                                     const std::vector<Hessian>& modal_hessians,
-                                     std::vector<Hessian>& nodal_hessians) {
-        apply_modal_hessians_to_nodal(data, modal_hessians, VectorHessianSink{nodal_hessians});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Real>& modal_values,
-                                        Real* nodal_values) {
-        apply_modal_values_to_nodal(data, modal_values, RawValueSink{nodal_values});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Gradient>& modal_gradients,
-                                        Real* nodal_gradients) {
-        apply_modal_gradients_to_nodal(data, modal_gradients, RawGradientSink{nodal_gradients});
-    }
-
-    static void apply_modal_to_nodal_to(const OrderData& data,
-                                        const std::vector<Hessian>& modal_hessians,
-                                        Real* nodal_hessians) {
-        apply_modal_hessians_to_nodal(data, modal_hessians, RawHessianSink{nodal_hessians});
-    }
-};
-
-namespace lagrange_pyramid {
-
-const std::vector<math::Vector<Real, 3>>& nodes(int order) {
-    return PyramidLagrangeCache::get(order).nodes;
-}
-
-void prewarm_scratch(int order, std::size_t max_qpts) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::prewarm_scratch(data.modal_terms.size(), max_qpts);
-}
-
-void evaluate_values(int order,
-                     const math::Vector<Real, 3>& xi,
-                     std::vector<Real>& values) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_values(data, xi, values);
-}
-
-void evaluate_gradients(int order,
-                        const math::Vector<Real, 3>& xi,
-                        std::vector<Gradient>& gradients) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_gradients(data, xi, gradients);
-}
-
-void evaluate_hessians(int order,
-                       const math::Vector<Real, 3>& xi,
-                       std::vector<Hessian>& hessians) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_hessians(data, xi, hessians);
-}
-
-void evaluate_all(int order,
-                  const math::Vector<Real, 3>& xi,
-                  std::vector<Real>& values,
-                  std::vector<Gradient>& gradients,
-                  std::vector<Hessian>& hessians) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_all(data, xi, values, gradients, hessians);
-}
-
-void evaluate_values_to(int order,
-                        const math::Vector<Real, 3>& xi,
-                        Real* SVMP_RESTRICT values_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_values_to(data, xi, values_out);
-}
-
-void evaluate_gradients_to(int order,
-                           const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT gradients_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_gradients_to(data, xi, gradients_out);
-}
-
-void evaluate_hessians_to(int order,
-                          const math::Vector<Real, 3>& xi,
-                          Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_hessians_to(data, xi, hessians_out);
-}
-
-void evaluate_all_to(int order,
-                     const math::Vector<Real, 3>& xi,
-                     Real* SVMP_RESTRICT values_out,
-                     Real* SVMP_RESTRICT gradients_out,
-                     Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_all_to(data, xi, values_out, gradients_out, hessians_out);
-}
-
-void evaluate_at_quadrature_points_strided(
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const auto& data = PyramidLagrangeCache::get(order);
-    PyramidLagrangeCache::evaluate_at_quadrature_points_strided(
-        data, points, output_stride, values_out, gradients_out, hessians_out);
-}
-
-} // namespace lagrange_pyramid
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h b/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
deleted file mode 100644
index 76859501c..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisPyramid.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
-#define SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
-
-// Private declarations for the rational pyramid Lagrange helper implemented in
-// LagrangeBasisPyramid.cpp. This header is intentionally small so the large
-// construction and apex-classification code stays out of LagrangeBasis.cpp.
-
-#include "BasisFunction.h"
-
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace lagrange_pyramid {
-
-const std::vector<math::Vector<Real, 3>>& nodes(int order);
-
-void prewarm_scratch(int order, std::size_t max_qpts = 0);
-
-void evaluate_values(int order,
-                     const math::Vector<Real, 3>& xi,
-                     std::vector<Real>& values);
-void evaluate_gradients(int order,
-                        const math::Vector<Real, 3>& xi,
-                        std::vector<Gradient>& gradients);
-void evaluate_hessians(int order,
-                       const math::Vector<Real, 3>& xi,
-                       std::vector<Hessian>& hessians);
-void evaluate_all(int order,
-                  const math::Vector<Real, 3>& xi,
-                  std::vector<Real>& values,
-                  std::vector<Gradient>& gradients,
-                  std::vector<Hessian>& hessians);
-
-void evaluate_values_to(int order,
-                        const math::Vector<Real, 3>& xi,
-                        Real* SVMP_RESTRICT values_out);
-void evaluate_gradients_to(int order,
-                           const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT gradients_out);
-void evaluate_hessians_to(int order,
-                          const math::Vector<Real, 3>& xi,
-                          Real* SVMP_RESTRICT hessians_out);
-void evaluate_all_to(int order,
-                     const math::Vector<Real, 3>& xi,
-                     Real* SVMP_RESTRICT values_out,
-                     Real* SVMP_RESTRICT gradients_out,
-                     Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_at_quadrature_points_strided(
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-} // namespace lagrange_pyramid
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISPYRAMID_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
deleted file mode 100644
index 36325576a..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.cpp
+++ /dev/null
@@ -1,2457 +0,0 @@
-#include "LagrangeBasisSimplex.h"
-
-#include <array>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-// Falling-factorial (equispaced barycentric) Lagrange factors for simplex nodes.
-//
-// For a fixed polynomial order p and barycentric coordinate lambda in [0, 1],
-// define
-//   phi_a(lambda) = product_{m=0}^{a-1} (p * lambda - m) / (a - m), a = 0..p
-// Then for a multi-index (i0, i1, ..., id) with sum i_k = p, the simplex
-// Lagrange basis function is product_k phi_{i_k}(lambda_k), nodal on the
-// barycentric lattice.
-//
-// Output buffers must each be sized to at least p+1 entries; the function
-// writes every output slot (no pre-zero required by the caller).
-template <bool NeedFirst, bool NeedSecond>
-void simplex_lagrange_factor_sequence_impl(int p,
-                                           Real lambda,
-                                           Real* phi,
-                                           Real* dphi,
-                                           Real* d2phi) {
-    static_assert(!NeedSecond || NeedFirst,
-                  "second derivative factors require first-derivative recurrence state");
-
-    phi[0] = Real(1);
-    if constexpr (NeedFirst) {
-        dphi[0] = Real(0);
-    }
-    if constexpr (NeedSecond) {
-        d2phi[0] = Real(0);
-    }
-    if (p == 0) {
-        return;
-    }
-
-    const Real t = static_cast<Real>(p) * lambda;
-    const Real dt_dlambda = static_cast<Real>(p);
-
-    Real dphi_dt_prev = Real(0);
-    Real d2phi_dt2_prev = Real(0);
-
-    for (int a = 1; a <= p; ++a) {
-        const std::size_t au = static_cast<std::size_t>(a);
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        const Real s = (t - static_cast<Real>(a - 1)) * inv_a;
-
-        phi[au] = s * phi[au - 1];
-
-        if constexpr (NeedFirst) {
-            const Real dphi_dt_old = dphi_dt_prev;
-            const Real dphi_dt = inv_a * phi[au - 1] + s * dphi_dt_old;
-            dphi[au] = dt_dlambda * dphi_dt;
-
-            if constexpr (NeedSecond) {
-                const Real d2phi_dt2 = Real(2) * inv_a * dphi_dt_old + s * d2phi_dt2_prev;
-                d2phi[au] = dt_dlambda * dt_dlambda * d2phi_dt2;
-                d2phi_dt2_prev = d2phi_dt2;
-            }
-
-            dphi_dt_prev = dphi_dt;
-        }
-    }
-}
-
-void simplex_lagrange_factor_sequence(int p,
-                                      Real lambda,
-                                      Real* phi,
-                                      Real* dphi,
-                                      Real* d2phi) {
-    if (d2phi != nullptr) {
-        simplex_lagrange_factor_sequence_impl<true, true>(p, lambda, phi, dphi, d2phi);
-    } else if (dphi != nullptr) {
-        simplex_lagrange_factor_sequence_impl<true, false>(p, lambda, phi, dphi, nullptr);
-    } else {
-        simplex_lagrange_factor_sequence_impl<false, false>(p, lambda, phi, nullptr, nullptr);
-    }
-}
-
-constexpr int kFixedSimplexAxisOrder = 12;
-constexpr std::size_t kFixedSimplexAxisSize =
-    static_cast<std::size_t>(kFixedSimplexAxisOrder + 1);
-constexpr std::size_t kFixedSimplexBatchEntries = 512;
-
-template <int Order>
-inline void simplex_lagrange_factor_values_product(Real lambda,
-                                                   Real* SVMP_RESTRICT values) {
-    static_assert(Order >= 0, "simplex order must be non-negative");
-    values[0] = Real(1);
-    const Real t = static_cast<Real>(Order) * lambda;
-    for (int a = 1; a <= Order; ++a) {
-        const Real inv_a = Real(1) / static_cast<Real>(a);
-        values[a] = values[a - 1] * (t - static_cast<Real>(a - 1)) * inv_a;
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
-        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
-        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2];
-        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2];
-        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2];
-        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2];
-    }
-}
-
-bool try_evaluate_triangle_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-    case 4:
-        evaluate_triangle_simplex_values_q4<4>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_values_q4<5>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_values_q4<6>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_values_q4<7>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_values_q4<8>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    static_assert(Order >= 4 && Order <= 8, "specialized simplex path covers orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_values_product<Order>(l0, phi0[q]);
-        simplex_lagrange_factor_values_product<Order>(l1, phi1[q]);
-        simplex_lagrange_factor_values_product<Order>(l2, phi2[q]);
-        simplex_lagrange_factor_values_product<Order>(l3, phi3[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real* SVMP_RESTRICT row = values_out + node * output_stride;
-        row[0] = phi0[0][i0] * phi1[0][i1] * phi2[0][i2] * phi3[0][i3];
-        row[1] = phi0[1][i0] * phi1[1][i1] * phi2[1][i2] * phi3[1][i3];
-        row[2] = phi0[2][i0] * phi1[2][i1] * phi2[2][i2] * phi3[2][i3];
-        row[3] = phi0[3][i0] * phi1[3][i1] * phi2[3][i2] * phi3[3][i3];
-    }
-}
-
-bool try_evaluate_tetrahedron_simplex_values_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out) {
-    switch (order) {
-    case 4:
-        evaluate_tetrahedron_simplex_values_q4<4>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 5:
-        evaluate_tetrahedron_simplex_values_q4<5>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 6:
-        evaluate_tetrahedron_simplex_values_q4<6>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 7:
-        evaluate_tetrahedron_simplex_values_q4<7>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    case 8:
-        evaluate_tetrahedron_simplex_values_q4<8>(
-            simplex_exponents, points, output_stride, values_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    static_assert(Order >= 3 && Order <= 8,
-                  "specialized tetrahedron gradient path covers orders 3..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real dphi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l0, phi0[q], dphi0[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l1, phi1[q], dphi1[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l2, phi2[q], dphi2[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l3, phi3[q], dphi3[q], nullptr);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real gx[4];
-        Real gy[4];
-        Real gz[4];
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real v3 = phi3[q][i3];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real D3 = dphi3[q][i3];
-            const Real v23 = v2 * v3;
-            const Real v01 = v0 * v1;
-            const Real dl0 = D0 * v1 * v23;
-            gx[q] = v0 * D1 * v23 - dl0;
-            gy[q] = v01 * D2 * v3 - dl0;
-            gz[q] = v01 * v2 * D3 - dl0;
-        }
-
-        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-        g[0u] = gx[0];
-        g[1u] = gx[1];
-        g[2u] = gx[2];
-        g[3u] = gx[3];
-        g[output_stride + 0u] = gy[0];
-        g[output_stride + 1u] = gy[1];
-        g[output_stride + 2u] = gy[2];
-        g[output_stride + 3u] = gy[3];
-        g[2u * output_stride + 0u] = gz[0];
-        g[2u * output_stride + 1u] = gz[1];
-        g[2u * output_stride + 2u] = gz[2];
-        g[2u * output_stride + 3u] = gz[3];
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    static_assert((Order == 2) || (Order >= 4 && Order <= 8),
-                  "specialized simplex path covers order 2 and orders 4..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l0, phi0[q], dphi0[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l1, phi1[q], dphi1[q], nullptr);
-        simplex_lagrange_factor_sequence_impl<true, false>(
-            Order, l2, phi2[q], dphi2[q], nullptr);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real dl0 = D0 * v1 * v2;
-            g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-            g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-            g[2u * output_stride + q] = Real(0);
-        }
-    }
-}
-
-bool try_evaluate_triangle_simplex_gradients_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT gradients_out) {
-    switch (order) {
-    case 2:
-        evaluate_triangle_simplex_gradients_q4<2>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 4:
-        evaluate_triangle_simplex_gradients_q4<4>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_gradients_q4<5>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_gradients_q4<6>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_gradients_q4<7>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_gradients_q4<8>(
-            simplex_exponents, points, output_stride, gradients_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order>
-void evaluate_triangle_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-        H[2u * output_stride + 0u] = Real(0);
-        H[2u * output_stride + 1u] = Real(0);
-        H[2u * output_stride + 2u] = Real(0);
-        H[2u * output_stride + 3u] = Real(0);
-        H[5u * output_stride + 0u] = Real(0);
-        H[5u * output_stride + 1u] = Real(0);
-        H[5u * output_stride + 2u] = Real(0);
-        H[5u * output_stride + 3u] = Real(0);
-        H[6u * output_stride + 0u] = Real(0);
-        H[6u * output_stride + 1u] = Real(0);
-        H[6u * output_stride + 2u] = Real(0);
-        H[6u * output_stride + 3u] = Real(0);
-        H[7u * output_stride + 0u] = Real(0);
-        H[7u * output_stride + 1u] = Real(0);
-        H[7u * output_stride + 2u] = Real(0);
-        H[7u * output_stride + 3u] = Real(0);
-        H[8u * output_stride + 0u] = Real(0);
-        H[8u * output_stride + 1u] = Real(0);
-        H[8u * output_stride + 2u] = Real(0);
-        H[8u * output_stride + 3u] = Real(0);
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            if (value_row != nullptr) {
-                value_row[q] = v0 * v1 * v2;
-            }
-
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            if (g != nullptr) {
-                const Real dl0 = D0 * v1 * v2;
-                g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                g[2u * output_stride + q] = Real(0);
-            }
-
-            const Real DD0 = d2phi0[q][i0];
-            const Real DD1 = d2phi1[q][i1];
-            const Real DD2 = d2phi2[q][i2];
-            const Real H00 = DD0 * v1 * v2;
-            const Real H11 = v0 * DD1 * v2;
-            const Real H22 = v0 * v1 * DD2;
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-            const Real h01 = H00 - H01 - H02 + H12;
-            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-            H[1u * output_stride + q] = h01;
-            H[3u * output_stride + q] = h01;
-            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-        }
-    }
-}
-
-bool try_evaluate_triangle_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-    case 2:
-        evaluate_triangle_simplex_hessian_outputs_q4<2>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 3:
-        evaluate_triangle_simplex_hessian_outputs_q4<3>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 4:
-        evaluate_triangle_simplex_hessian_outputs_q4<4>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 5:
-        evaluate_triangle_simplex_hessian_outputs_q4<5>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 6:
-        evaluate_triangle_simplex_hessian_outputs_q4<6>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 7:
-        evaluate_triangle_simplex_hessian_outputs_q4<7>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 8:
-        evaluate_triangle_simplex_hessian_outputs_q4<8>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_hessian_q4(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real H00 = DD0 * v1 * v2 * v3;
-    const Real H11 = v0 * DD1 * v2 * v3;
-    const Real H22 = v0 * v1 * DD2 * v3;
-    const Real H33 = v0 * v1 * v2 * DD3;
-    const Real H01 = D0 * D1 * v2 * v3;
-    const Real H02 = D0 * v1 * D2 * v3;
-    const Real H03 = D0 * v1 * v2 * D3;
-    const Real H12 = v0 * D1 * D2 * v3;
-    const Real H13 = v0 * D1 * v2 * D3;
-    const Real H23 = v0 * v1 * D2 * D3;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-    H[0u * output_stride + Q] = H00 - Real(2) * H01 + H11;
-    H[1u * output_stride + Q] = h01;
-    H[2u * output_stride + Q] = h02;
-    H[3u * output_stride + Q] = h01;
-    H[4u * output_stride + Q] = H00 - Real(2) * H02 + H22;
-    H[5u * output_stride + Q] = h12;
-    H[6u * output_stride + Q] = h02;
-    H[7u * output_stride + Q] = h12;
-    H[8u * output_stride + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_hessian_stride4_q(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real v12 = v1 * v2;
-    const Real v13 = v1 * v3;
-    const Real v23 = v2 * v3;
-    const Real v123 = v1 * v23;
-    const Real v023 = v0 * v23;
-    const Real v013 = v0 * v13;
-    const Real v012 = v0 * v12;
-    const Real H00 = DD0 * v123;
-    const Real H11 = DD1 * v023;
-    const Real H22 = DD2 * v013;
-    const Real H33 = DD3 * v012;
-    const Real H01 = D0 * D1 * v23;
-    const Real H02 = D0 * D2 * v13;
-    const Real H03 = D0 * D3 * v12;
-    const Real H12 = D1 * D2 * v0 * v3;
-    const Real H13 = D1 * D3 * v0 * v2;
-    const Real H23 = D2 * D3 * v0 * v1;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-    H[Q] = H00 - Real(2) * H01 + H11;
-    H[4u + Q] = h01;
-    H[8u + Q] = h02;
-    H[12u + Q] = h01;
-    H[16u + Q] = H00 - Real(2) * H02 + H22;
-    H[20u + Q] = h12;
-    H[24u + Q] = h02;
-    H[28u + Q] = h12;
-    H[32u + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order, std::size_t Q>
-inline void write_tetrahedron_simplex_all_stride4_q(
-    const Real (&phi0)[4][Order + 1],
-    const Real (&phi1)[4][Order + 1],
-    const Real (&phi2)[4][Order + 1],
-    const Real (&phi3)[4][Order + 1],
-    const Real (&dphi0)[4][Order + 1],
-    const Real (&dphi1)[4][Order + 1],
-    const Real (&dphi2)[4][Order + 1],
-    const Real (&dphi3)[4][Order + 1],
-    const Real (&d2phi0)[4][Order + 1],
-    const Real (&d2phi1)[4][Order + 1],
-    const Real (&d2phi2)[4][Order + 1],
-    const Real (&d2phi3)[4][Order + 1],
-    std::size_t i0,
-    std::size_t i1,
-    std::size_t i2,
-    std::size_t i3,
-    Real* SVMP_RESTRICT value_row,
-    Real* SVMP_RESTRICT g,
-    Real* SVMP_RESTRICT H) {
-    const Real v0 = phi0[Q][i0];
-    const Real v1 = phi1[Q][i1];
-    const Real v2 = phi2[Q][i2];
-    const Real v3 = phi3[Q][i3];
-    const Real D0 = dphi0[Q][i0];
-    const Real D1 = dphi1[Q][i1];
-    const Real D2 = dphi2[Q][i2];
-    const Real D3 = dphi3[Q][i3];
-    const Real DD0 = d2phi0[Q][i0];
-    const Real DD1 = d2phi1[Q][i1];
-    const Real DD2 = d2phi2[Q][i2];
-    const Real DD3 = d2phi3[Q][i3];
-    const Real v12 = v1 * v2;
-    const Real v13 = v1 * v3;
-    const Real v23 = v2 * v3;
-    const Real v123 = v1 * v23;
-    const Real v023 = v0 * v23;
-    const Real v013 = v0 * v13;
-    const Real v012 = v0 * v12;
-    const Real dl0 = D0 * v123;
-    const Real H00 = DD0 * v123;
-    const Real H11 = DD1 * v023;
-    const Real H22 = DD2 * v013;
-    const Real H33 = DD3 * v012;
-    const Real H01 = D0 * D1 * v23;
-    const Real H02 = D0 * D2 * v13;
-    const Real H03 = D0 * D3 * v12;
-    const Real H12 = D1 * D2 * v0 * v3;
-    const Real H13 = D1 * D3 * v0 * v2;
-    const Real H23 = D2 * D3 * v0 * v1;
-    const Real h01 = H00 - H01 - H02 + H12;
-    const Real h02 = H00 - H01 - H03 + H13;
-    const Real h12 = H00 - H02 - H03 + H23;
-
-    value_row[Q] = v0 * v123;
-    g[Q] = D1 * v023 - dl0;
-    g[4u + Q] = D2 * v013 - dl0;
-    g[8u + Q] = D3 * v012 - dl0;
-    H[Q] = H00 - Real(2) * H01 + H11;
-    H[4u + Q] = h01;
-    H[8u + Q] = h02;
-    H[12u + Q] = h01;
-    H[16u + Q] = H00 - Real(2) * H02 + H22;
-    H[20u + Q] = h12;
-    H[24u + Q] = h02;
-    H[28u + Q] = h12;
-    H[32u + Q] = H00 - Real(2) * H03 + H33;
-}
-
-template <int Order>
-void evaluate_tetrahedron_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    static_assert(Order >= 2 && Order <= 8, "specialized simplex path covers orders 2..8");
-
-    Real phi0[4][Order + 1];
-    Real phi1[4][Order + 1];
-    Real phi2[4][Order + 1];
-    Real phi3[4][Order + 1];
-    Real dphi0[4][Order + 1];
-    Real dphi1[4][Order + 1];
-    Real dphi2[4][Order + 1];
-    Real dphi3[4][Order + 1];
-    Real d2phi0[4][Order + 1];
-    Real d2phi1[4][Order + 1];
-    Real d2phi2[4][Order + 1];
-    Real d2phi3[4][Order + 1];
-
-    for (std::size_t q = 0; q < 4u; ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l0, phi0[q], dphi0[q], d2phi0[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l1, phi1[q], dphi1[q], d2phi1[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l2, phi2[q], dphi2[q], d2phi2[q]);
-        simplex_lagrange_factor_sequence_impl<true, true>(
-            Order, l3, phi3[q], dphi3[q], d2phi3[q]);
-    }
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (values_out == nullptr && gradients_out == nullptr) {
-        if (output_stride == 4u) {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT H = hessians_out + node * 36u;
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-                write_tetrahedron_simplex_hessian_stride4_q<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, H);
-            }
-        } else {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                write_tetrahedron_simplex_hessian_q4<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-                write_tetrahedron_simplex_hessian_q4<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, output_stride, H);
-            }
-        }
-        return;
-    }
-
-    if (values_out != nullptr && gradients_out != nullptr) {
-        if (output_stride == 4u) {
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-                Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-                Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-                write_tetrahedron_simplex_all_stride4_q<Order, 0>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 1>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 2>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-                write_tetrahedron_simplex_all_stride4_q<Order, 3>(
-                    phi0, phi1, phi2, phi3, dphi0, dphi1, dphi2, dphi3,
-                    d2phi0, d2phi1, d2phi2, d2phi3, i0, i1, i2, i3, value_row, g, H);
-            }
-            return;
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-            Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-            Real* SVMP_RESTRICT g = gradients_out + node * 3u * output_stride;
-            Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-
-            for (std::size_t q = 0; q < 4u; ++q) {
-                const Real v0 = phi0[q][i0];
-                const Real v1 = phi1[q][i1];
-                const Real v2 = phi2[q][i2];
-                const Real v3 = phi3[q][i3];
-                const Real D0 = dphi0[q][i0];
-                const Real D1 = dphi1[q][i1];
-                const Real D2 = dphi2[q][i2];
-                const Real D3 = dphi3[q][i3];
-                const Real DD0 = d2phi0[q][i0];
-                const Real DD1 = d2phi1[q][i1];
-                const Real DD2 = d2phi2[q][i2];
-                const Real DD3 = d2phi3[q][i3];
-                const Real v12 = v1 * v2;
-                const Real v13 = v1 * v3;
-                const Real v23 = v2 * v3;
-                const Real v123 = v1 * v23;
-                const Real v023 = v0 * v23;
-                const Real v013 = v0 * v13;
-                const Real v012 = v0 * v12;
-                const Real dl0 = D0 * v123;
-                const Real H00 = DD0 * v123;
-                const Real H11 = DD1 * v023;
-                const Real H22 = DD2 * v013;
-                const Real H33 = DD3 * v012;
-                const Real H01 = D0 * D1 * v23;
-                const Real H02 = D0 * D2 * v13;
-                const Real H03 = D0 * D3 * v12;
-                const Real H12 = D1 * D2 * v0 * v3;
-                const Real H13 = D1 * D3 * v0 * v2;
-                const Real H23 = D2 * D3 * v0 * v1;
-                const Real h01 = H00 - H01 - H02 + H12;
-                const Real h02 = H00 - H01 - H03 + H13;
-                const Real h12 = H00 - H02 - H03 + H23;
-
-                value_row[q] = v0 * v123;
-                g[0u * output_stride + q] = D1 * v023 - dl0;
-                g[1u * output_stride + q] = D2 * v013 - dl0;
-                g[2u * output_stride + q] = D3 * v012 - dl0;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = h02;
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = h12;
-                H[6u * output_stride + q] = h02;
-                H[7u * output_stride + q] = h12;
-                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-            }
-        }
-        return;
-    }
-
-    for (std::size_t node = 0; node < num_nodes; ++node) {
-        const auto& e = simplex_exponents[node];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-        Real* SVMP_RESTRICT value_row = values_out ? values_out + node * output_stride : nullptr;
-        Real* SVMP_RESTRICT g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-        Real* SVMP_RESTRICT H = hessians_out + node * 9u * output_stride;
-
-        for (std::size_t q = 0; q < 4u; ++q) {
-            const Real v0 = phi0[q][i0];
-            const Real v1 = phi1[q][i1];
-            const Real v2 = phi2[q][i2];
-            const Real v3 = phi3[q][i3];
-            if (value_row != nullptr) {
-                value_row[q] = v0 * v1 * v2 * v3;
-            }
-
-            const Real D0 = dphi0[q][i0];
-            const Real D1 = dphi1[q][i1];
-            const Real D2 = dphi2[q][i2];
-            const Real D3 = dphi3[q][i3];
-            if (g != nullptr) {
-                const Real dl0 = D0 * v1 * v2 * v3;
-                g[0u * output_stride + q] = v0 * D1 * v2 * v3 - dl0;
-                g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
-                g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
-            }
-
-            const Real DD0 = d2phi0[q][i0];
-            const Real DD1 = d2phi1[q][i1];
-            const Real DD2 = d2phi2[q][i2];
-            const Real DD3 = d2phi3[q][i3];
-            const Real H00 = DD0 * v1 * v2 * v3;
-            const Real H11 = v0 * DD1 * v2 * v3;
-            const Real H22 = v0 * v1 * DD2 * v3;
-            const Real H33 = v0 * v1 * v2 * DD3;
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-            const Real h01 = H00 - H01 - H02 + H12;
-            const Real h02 = H00 - H01 - H03 + H13;
-            const Real h12 = H00 - H02 - H03 + H23;
-            H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-            H[1u * output_stride + q] = h01;
-            H[2u * output_stride + q] = h02;
-            H[3u * output_stride + q] = h01;
-            H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-            H[5u * output_stride + q] = h12;
-            H[6u * output_stride + q] = h02;
-            H[7u * output_stride + q] = h12;
-            H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-        }
-    }
-}
-
-bool try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    switch (order) {
-    case 2:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<2>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 3:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<3>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 4:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<4>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 5:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<5>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 6:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<6>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 7:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<7>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    case 8:
-        evaluate_tetrahedron_simplex_hessian_outputs_q4<8>(
-            simplex_exponents, points, output_stride, values_out, gradients_out, hessians_out);
-        return true;
-    default:
-        return false;
-    }
-}
-
-// Per-thread scratch space for simplex factor sequences. Common low orders use
-// fixed storage; higher orders fall back to dynamic vectors.
-struct SimplexAxisScratch {
-    std::size_t size{0};
-    std::array<Real, kFixedSimplexAxisSize> phi_fixed{};
-    std::array<Real, kFixedSimplexAxisSize> dphi_fixed{};
-    std::array<Real, kFixedSimplexAxisSize> d2phi_fixed{};
-    std::vector<Real> phi_dynamic;
-    std::vector<Real> dphi_dynamic;
-    std::vector<Real> d2phi_dynamic;
-
-    void reserveFor(std::size_t n) {
-        size = n;
-        if (n <= kFixedSimplexAxisSize) {
-            return;
-        }
-        if (phi_dynamic.size() < n) phi_dynamic.resize(n);
-        if (dphi_dynamic.size() < n) dphi_dynamic.resize(n);
-        if (d2phi_dynamic.size() < n) d2phi_dynamic.resize(n);
-    }
-
-    Real* phi() noexcept {
-        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
-    }
-
-    Real* dphi() noexcept {
-        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
-    }
-
-    Real* d2phi() noexcept {
-        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
-    }
-
-    const Real* phi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? phi_fixed.data() : phi_dynamic.data();
-    }
-
-    const Real* dphi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? dphi_fixed.data() : dphi_dynamic.data();
-    }
-
-    const Real* d2phi() const noexcept {
-        return size <= kFixedSimplexAxisSize ? d2phi_fixed.data() : d2phi_dynamic.data();
-    }
-};
-
-SimplexAxisScratch& simplex_axis_scratch_slot(int slot) {
-    thread_local SimplexAxisScratch s[4];
-    return s[slot];
-}
-
-struct SimplexVectorSink {
-    std::vector<Real>* values;
-    std::vector<Gradient>* gradients;
-    std::vector<Hessian>* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t n_nodes) const {
-        if (values)    values->resize(n_nodes);
-        if (gradients) gradients->resize(n_nodes);
-        if (hessians)  hessians->resize(n_nodes);
-    }
-
-    void write_value(std::size_t n, Real value) const {
-        (*values)[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
-        auto& gradient = (*gradients)[n];
-        gradient[0] = x;
-        gradient[1] = y;
-        gradient[2] = z;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Hessian hessian{};
-        hessian(0, 0) = xx;
-        hessian(1, 1) = yy;
-        hessian(2, 2) = zz;
-        hessian(0, 1) = xy; hessian(1, 0) = xy;
-        hessian(0, 2) = xz; hessian(2, 0) = xz;
-        hessian(1, 2) = yz; hessian(2, 1) = yz;
-        (*hessians)[n] = hessian;
-    }
-};
-
-struct SimplexRawSink {
-    Real* values;
-    Real* gradients;
-    Real* hessians;
-
-    bool wants_values() const noexcept { return values != nullptr; }
-    bool wants_gradients() const noexcept { return gradients != nullptr; }
-    bool wants_hessians() const noexcept { return hessians != nullptr; }
-
-    void prepare(std::size_t) const {}
-
-    void write_value(std::size_t n, Real value) const {
-        values[n] = value;
-    }
-
-    void write_gradient(std::size_t n, Real x, Real y, Real z) const {
-        Real* gradient = gradients + n * 3u;
-        gradient[0] = x;
-        gradient[1] = y;
-        gradient[2] = z;
-    }
-
-    void write_hessian(std::size_t n,
-                       Real xx,
-                       Real yy,
-                       Real zz,
-                       Real xy,
-                       Real xz,
-                       Real yz) const {
-        Real* hessian = hessians + n * 9u;
-        hessian[0] = xx;
-        hessian[1] = xy;
-        hessian[2] = xz;
-        hessian[3] = xy;
-        hessian[4] = yy;
-        hessian[5] = yz;
-        hessian[6] = xz;
-        hessian[7] = yz;
-        hessian[8] = zz;
-    }
-};
-
-template <typename Sink>
-void evaluate_triangle_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                          int order,
-                                          const math::Vector<Real, 3>& xi,
-                                          const Sink& sink) {
-    const Real l1 = xi[0];
-    const Real l2 = xi[1];
-    const Real l0 = Real(1) - l1 - l2;
-
-    const std::size_t n = static_cast<std::size_t>(order + 1);
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(n);
-    s1.reserveFor(n);
-    s2.reserveFor(n);
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    sink.prepare(num_nodes);
-    const bool need_values = sink.wants_values();
-    const bool need_gradients = sink.wants_gradients();
-    const bool need_hessians = sink.wants_hessians();
-    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-
-    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-    const Real* phi0 = s0.phi();
-    const Real* phi1 = s1.phi();
-    const Real* phi2 = s2.phi();
-    const Real* dphi0 = s0.dphi();
-    const Real* dphi1 = s1.dphi();
-    const Real* dphi2 = s2.dphi();
-    const Real* d2phi0 = s0.d2phi();
-    const Real* d2phi1 = s1.d2phi();
-    const Real* d2phi2 = s2.d2phi();
-
-    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
-        const auto& e = simplex_exponents[n_idx];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-        const Real v0 = phi0[i0];
-        const Real v1 = phi1[i1];
-        const Real v2 = phi2[i2];
-        if (need_values) {
-            sink.write_value(n_idx, v0 * v1 * v2);
-        }
-        if (!need_gradients && !need_hessians) {
-            continue;
-        }
-
-        const Real D0 = dphi0[i0];
-        const Real D1 = dphi1[i1];
-        const Real D2 = dphi2[i2];
-
-        if (need_gradients) {
-            const Real dl0 = D0 * v1 * v2;
-            const Real dl1 = v0 * D1 * v2;
-            const Real dl2 = v0 * v1 * D2;
-            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, Real(0));
-        }
-
-        if (need_hessians) {
-            const Real DD0 = d2phi0[i0];
-            const Real DD1 = d2phi1[i1];
-            const Real DD2 = d2phi2[i2];
-
-            const Real H00 = DD0 * v1 * v2;
-            const Real H11 = v0 * DD1 * v2;
-            const Real H22 = v0 * v1 * DD2;
-            const Real H01 = D0 * D1 * v2;
-            const Real H02 = D0 * v1 * D2;
-            const Real H12 = v0 * D1 * D2;
-
-            sink.write_hessian(n_idx,
-                               H00 - Real(2) * H01 + H11,
-                               H00 - Real(2) * H02 + H22,
-                               Real(0),
-                               H00 - H01 - H02 + H12,
-                               Real(0),
-                               Real(0));
-        }
-    }
-}
-
-void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>* values,
-                                     std::vector<Gradient>* gradients,
-                                     std::vector<Hessian>* hessians) {
-    const SimplexVectorSink sink{values, gradients, hessians};
-    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out) {
-    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_triangle_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_triangle_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_out != nullptr;
-    const bool need_hessians = hessians_out != nullptr;
-    if (num_qpts == 4u &&
-        values_out != nullptr &&
-        !need_gradients &&
-        !need_hessians &&
-        try_evaluate_triangle_simplex_values_q4(
-            simplex_exponents, order, points, output_stride, values_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        values_out == nullptr &&
-        need_gradients &&
-        !need_hessians &&
-        try_evaluate_triangle_simplex_gradients_q4(
-            simplex_exponents, order, points, output_stride, gradients_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        need_hessians &&
-        try_evaluate_triangle_simplex_hessian_outputs_q4(
-            simplex_exponents, order, points, output_stride,
-            values_out, gradients_out, hessians_out)) {
-        return;
-    }
-    const std::size_t batch_entries = sequence_size * num_qpts;
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    value_row[q] =
-                        phi0_batch[offset + i0] *
-                        phi1_batch[offset + i1] *
-                        phi2_batch[offset + i2];
-                }
-            }
-            return;
-        }
-
-        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* g = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                    g[2u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        if (order >= 4 &&
-            values_out == nullptr &&
-            gradients_out == nullptr &&
-            hessians_out != nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset,
-                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset,
-                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset,
-                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* H = hessians_out + node * 9u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    const Real h01 = H00 - H01 - H02 + H12;
-
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = Real(0);
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = Real(0);
-                    H[6u * output_stride + q] = Real(0);
-                    H[7u * output_stride + q] = Real(0);
-                    H[8u * output_stride + q] = Real(0);
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-
-                if (gradients_out != nullptr) {
-                    const Real dl0 = D0 * v1 * v2;
-                    const Real dl1 = v0 * D1 * v2;
-                    const Real dl2 = v0 * v1 * D2;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    g[2u * output_stride + q] = Real(0);
-                }
-
-                if (hessians_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    const Real h01 = H00 - H01 - H02 + H12;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = Real(0);
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = Real(0);
-                    H[6u * output_stride + q] = Real(0);
-                    H[7u * output_stride + q] = Real(0);
-                    H[8u * output_stride + q] = Real(0);
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-            const Real value = v0 * v1 * v2;
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = value;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-
-            if (gradients_out != nullptr) {
-                const Real dl0 = D0 * v1 * v2;
-                const Real dl1 = v0 * D1 * v2;
-                const Real dl2 = v0 * v1 * D2;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-                g[2u * output_stride + q] = Real(0);
-            }
-
-            if (hessians_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-
-                Real* H = hessians_out + node * 9u * output_stride;
-                const Real h01 = H00 - H01 - H02 + H12;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = Real(0);
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = Real(0);
-                H[6u * output_stride + q] = Real(0);
-                H[7u * output_stride + q] = Real(0);
-                H[8u * output_stride + q] = Real(0);
-            }
-        }
-    }
-}
-
-void evaluate_triangle_simplex_basis_wedge_components_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_xy_out,
-    Real* SVMP_RESTRICT hessians_xx_xy_yy_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_xy_out != nullptr;
-    const bool need_hessians = hessians_xx_xy_yy_out != nullptr;
-    const std::size_t batch_entries = sequence_size * num_qpts;
-
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr &&
-            gradients_xy_out != nullptr &&
-            hessians_xx_xy_yy_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* value_row = values_out + node * output_stride;
-                Real* g = gradients_xy_out + node * 2u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    value_row[q] = v0 * v1 * v2;
-                    g[0u * output_stride + q] = v0 * D1 * v2 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 - dl0;
-                }
-            }
-            return;
-        }
-
-        if (values_out != nullptr &&
-            gradients_xy_out != nullptr &&
-            hessians_xx_xy_yy_out != nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l0 = Real(1) - l1 - l2;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l0, phi0_batch.data() + offset,
-                    dphi0_batch.data() + offset, d2phi0_batch.data() + offset);
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l1, phi1_batch.data() + offset,
-                    dphi1_batch.data() + offset, d2phi1_batch.data() + offset);
-                simplex_lagrange_factor_sequence_impl<true, true>(
-                    order, l2, phi2_batch.data() + offset,
-                    dphi2_batch.data() + offset, d2phi2_batch.data() + offset);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                Real* SVMP_RESTRICT value_row = values_out + node * output_stride;
-                Real* SVMP_RESTRICT g = gradients_xy_out + node * 2u * output_stride;
-                Real* SVMP_RESTRICT H = hessians_xx_xy_yy_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real dl0 = D0 * v1 * v2;
-                    const Real dl1 = v0 * D1 * v2;
-                    const Real dl2 = v0 * v1 * D2;
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-
-                    value_row[q] = v0 * v1 * v2;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l0 = Real(1) - l1 - l2;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_xy_out ? gradients_xy_out + node * 2u * output_stride : nullptr;
-            Real* H = hessians_xx_xy_yy_out ? hessians_xx_xy_yy_out + node * 3u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-                const Real dl0 = D0 * v1 * v2;
-                const Real dl1 = v0 * D1 * v2;
-                const Real dl2 = v0 * v1 * D2;
-
-                if (gradients_xy_out != nullptr) {
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                }
-
-                if (hessians_xx_xy_yy_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real H00 = DD0 * v1 * v2;
-                    const Real H11 = v0 * DD1 * v2;
-                    const Real H22 = v0 * v1 * DD2;
-                    const Real H01 = D0 * D1 * v2;
-                    const Real H02 = D0 * v1 * D2;
-                    const Real H12 = v0 * D1 * D2;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                    H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l0 = Real(1) - l1 - l2;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = v0 * v1 * v2;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-            const Real dl0 = D0 * v1 * v2;
-            const Real dl1 = v0 * D1 * v2;
-            const Real dl2 = v0 * v1 * D2;
-
-            if (gradients_xy_out != nullptr) {
-                Real* g = gradients_xy_out + node * 2u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-            }
-
-            if (hessians_xx_xy_yy_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-                const Real H00 = DD0 * v1 * v2;
-                const Real H11 = v0 * DD1 * v2;
-                const Real H22 = v0 * v1 * DD2;
-                const Real H01 = D0 * D1 * v2;
-                const Real H02 = D0 * v1 * D2;
-                const Real H12 = v0 * D1 * D2;
-                Real* H = hessians_xx_xy_yy_out + node * 3u * output_stride;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = H00 - H01 - H02 + H12;
-                H[2u * output_stride + q] = H00 - Real(2) * H02 + H22;
-            }
-        }
-    }
-}
-
-template <typename Sink>
-void evaluate_tetrahedron_simplex_basis_impl(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                             int order,
-                                             const math::Vector<Real, 3>& xi,
-                                             const Sink& sink) {
-    const Real l1 = xi[0];
-    const Real l2 = xi[1];
-    const Real l3 = xi[2];
-    const Real l0 = Real(1) - l1 - l2 - l3;
-
-    const std::size_t n = static_cast<std::size_t>(order + 1);
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
-    s0.reserveFor(n);
-    s1.reserveFor(n);
-    s2.reserveFor(n);
-    s3.reserveFor(n);
-
-    const std::size_t num_nodes = simplex_exponents.size();
-    sink.prepare(num_nodes);
-    const bool need_values = sink.wants_values();
-    const bool need_gradients = sink.wants_gradients();
-    const bool need_hessians = sink.wants_hessians();
-    Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-    Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-    Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-    Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
-    Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-    Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-    Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-    Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
-
-    simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-    simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-    simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-    simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
-    const Real* phi0 = s0.phi();
-    const Real* phi1 = s1.phi();
-    const Real* phi2 = s2.phi();
-    const Real* phi3 = s3.phi();
-    const Real* dphi0 = s0.dphi();
-    const Real* dphi1 = s1.dphi();
-    const Real* dphi2 = s2.dphi();
-    const Real* dphi3 = s3.dphi();
-    const Real* d2phi0 = s0.d2phi();
-    const Real* d2phi1 = s1.d2phi();
-    const Real* d2phi2 = s2.d2phi();
-    const Real* d2phi3 = s3.d2phi();
-
-    for (std::size_t n_idx = 0; n_idx < num_nodes; ++n_idx) {
-        const auto& e = simplex_exponents[n_idx];
-        const std::size_t i0 = static_cast<std::size_t>(e[0]);
-        const std::size_t i1 = static_cast<std::size_t>(e[1]);
-        const std::size_t i2 = static_cast<std::size_t>(e[2]);
-        const std::size_t i3 = static_cast<std::size_t>(e[3]);
-
-        const Real v0 = phi0[i0];
-        const Real v1 = phi1[i1];
-        const Real v2 = phi2[i2];
-        const Real v3 = phi3[i3];
-        if (need_values) {
-            sink.write_value(n_idx, v0 * v1 * v2 * v3);
-        }
-        if (!need_gradients && !need_hessians) {
-            continue;
-        }
-
-        const Real D0 = dphi0[i0];
-        const Real D1 = dphi1[i1];
-        const Real D2 = dphi2[i2];
-        const Real D3 = dphi3[i3];
-
-        if (need_gradients) {
-            const Real dl0 = D0 * v1 * v2 * v3;
-            const Real dl1 = v0 * D1 * v2 * v3;
-            const Real dl2 = v0 * v1 * D2 * v3;
-            const Real dl3 = v0 * v1 * v2 * D3;
-            sink.write_gradient(n_idx, dl1 - dl0, dl2 - dl0, dl3 - dl0);
-        }
-
-        if (need_hessians) {
-            const Real DD0 = d2phi0[i0];
-            const Real DD1 = d2phi1[i1];
-            const Real DD2 = d2phi2[i2];
-            const Real DD3 = d2phi3[i3];
-
-            const Real H00 = DD0 * v1 * v2 * v3;
-            const Real H11 = v0 * DD1 * v2 * v3;
-            const Real H22 = v0 * v1 * DD2 * v3;
-            const Real H33 = v0 * v1 * v2 * DD3;
-
-            const Real H01 = D0 * D1 * v2 * v3;
-            const Real H02 = D0 * v1 * D2 * v3;
-            const Real H03 = D0 * v1 * v2 * D3;
-            const Real H12 = v0 * D1 * D2 * v3;
-            const Real H13 = v0 * D1 * v2 * D3;
-            const Real H23 = v0 * v1 * D2 * D3;
-
-            sink.write_hessian(n_idx,
-                               H00 - Real(2) * H01 + H11,
-                               H00 - Real(2) * H02 + H22,
-                               H00 - Real(2) * H03 + H33,
-                               H00 - H01 - H02 + H12,
-                               H00 - H01 - H03 + H13,
-                               H00 - H02 - H03 + H23);
-        }
-    }
-}
-
-void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians) {
-    const SimplexVectorSink sink{values, gradients, hessians};
-    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                           int order,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out) {
-    const SimplexRawSink sink{values_out, gradients_out, hessians_out};
-    evaluate_tetrahedron_simplex_basis_impl(simplex_exponents, order, xi, sink);
-}
-
-void evaluate_tetrahedron_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out) {
-    const std::size_t num_nodes = simplex_exponents.size();
-    if (points.empty() || num_nodes == 0u) {
-        return;
-    }
-
-    const std::size_t sequence_size = static_cast<std::size_t>(order + 1);
-    const std::size_t num_qpts = points.size();
-    const bool need_gradients = gradients_out != nullptr;
-    const bool need_hessians = hessians_out != nullptr;
-    if (num_qpts == 4u &&
-        values_out != nullptr &&
-        !need_gradients &&
-        !need_hessians &&
-        try_evaluate_tetrahedron_simplex_values_q4(
-            simplex_exponents, order, points, output_stride, values_out)) {
-        return;
-    }
-    if (num_qpts == 4u &&
-        values_out == nullptr &&
-        need_gradients &&
-        !need_hessians) {
-        switch (order) {
-        case 3:
-            evaluate_tetrahedron_simplex_gradients_q4<3>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 4:
-            evaluate_tetrahedron_simplex_gradients_q4<4>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 5:
-            evaluate_tetrahedron_simplex_gradients_q4<5>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 6:
-            evaluate_tetrahedron_simplex_gradients_q4<6>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 7:
-            evaluate_tetrahedron_simplex_gradients_q4<7>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        case 8:
-            evaluate_tetrahedron_simplex_gradients_q4<8>(
-                simplex_exponents, points, output_stride, gradients_out);
-            return;
-        default:
-            break;
-        }
-    }
-    if (num_qpts == 4u &&
-        need_hessians &&
-        try_evaluate_tetrahedron_simplex_hessian_outputs_q4(
-            simplex_exponents, order, points, output_stride,
-            values_out, gradients_out, hessians_out)) {
-        return;
-    }
-    const std::size_t batch_entries = sequence_size * num_qpts;
-    if (batch_entries <= kFixedSimplexBatchEntries) {
-        if (values_out != nullptr && gradients_out == nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l3 = xi[2];
-                const Real l0 = Real(1) - l1 - l2 - l3;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, nullptr, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l3, phi3_batch.data() + offset, nullptr, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* value_row = values_out + node * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    value_row[q] =
-                        phi0_batch[offset + i0] *
-                        phi1_batch[offset + i1] *
-                        phi2_batch[offset + i2] *
-                        phi3_batch[offset + i3];
-                }
-            }
-            return;
-        }
-
-        if (values_out == nullptr && gradients_out != nullptr && hessians_out == nullptr) {
-            std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-            std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const auto& xi = points[q];
-                const Real l1 = xi[0];
-                const Real l2 = xi[1];
-                const Real l3 = xi[2];
-                const Real l0 = Real(1) - l1 - l2 - l3;
-                const std::size_t offset = q * sequence_size;
-                simplex_lagrange_factor_sequence(
-                    order, l0, phi0_batch.data() + offset, dphi0_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l1, phi1_batch.data() + offset, dphi1_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l2, phi2_batch.data() + offset, dphi2_batch.data() + offset, nullptr);
-                simplex_lagrange_factor_sequence(
-                    order, l3, phi3_batch.data() + offset, dphi3_batch.data() + offset, nullptr);
-            }
-
-            for (std::size_t node = 0; node < num_nodes; ++node) {
-                const auto& e = simplex_exponents[node];
-                const std::size_t i0 = static_cast<std::size_t>(e[0]);
-                const std::size_t i1 = static_cast<std::size_t>(e[1]);
-                const std::size_t i2 = static_cast<std::size_t>(e[2]);
-                const std::size_t i3 = static_cast<std::size_t>(e[3]);
-                Real* g = gradients_out + node * 3u * output_stride;
-
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const std::size_t offset = q * sequence_size;
-                    const Real v0 = phi0_batch[offset + i0];
-                    const Real v1 = phi1_batch[offset + i1];
-                    const Real v2 = phi2_batch[offset + i2];
-                    const Real v3 = phi3_batch[offset + i3];
-                    const Real D0 = dphi0_batch[offset + i0];
-                    const Real D1 = dphi1_batch[offset + i1];
-                    const Real D2 = dphi2_batch[offset + i2];
-                    const Real D3 = dphi3_batch[offset + i3];
-                    const Real v23 = v2 * v3;
-                    const Real dl0 = D0 * v1 * v23;
-                    g[0u * output_stride + q] = v0 * D1 * v23 - dl0;
-                    g[1u * output_stride + q] = v0 * v1 * D2 * v3 - dl0;
-                    g[2u * output_stride + q] = v0 * v1 * v2 * D3 - dl0;
-                }
-            }
-            return;
-        }
-
-        std::array<Real, kFixedSimplexBatchEntries> phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> phi3_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> dphi3_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi0_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi1_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi2_batch;
-        std::array<Real, kFixedSimplexBatchEntries> d2phi3_batch;
-
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            const auto& xi = points[q];
-            const Real l1 = xi[0];
-            const Real l2 = xi[1];
-            const Real l3 = xi[2];
-            const Real l0 = Real(1) - l1 - l2 - l3;
-            const std::size_t offset = q * sequence_size;
-            Real* d0_out = (need_gradients || need_hessians) ? dphi0_batch.data() + offset : nullptr;
-            Real* d1_out = (need_gradients || need_hessians) ? dphi1_batch.data() + offset : nullptr;
-            Real* d2_out = (need_gradients || need_hessians) ? dphi2_batch.data() + offset : nullptr;
-            Real* d3_out = (need_gradients || need_hessians) ? dphi3_batch.data() + offset : nullptr;
-            Real* d20_out = need_hessians ? d2phi0_batch.data() + offset : nullptr;
-            Real* d21_out = need_hessians ? d2phi1_batch.data() + offset : nullptr;
-            Real* d22_out = need_hessians ? d2phi2_batch.data() + offset : nullptr;
-            Real* d23_out = need_hessians ? d2phi3_batch.data() + offset : nullptr;
-            simplex_lagrange_factor_sequence(order, l0, phi0_batch.data() + offset, d0_out, d20_out);
-            simplex_lagrange_factor_sequence(order, l1, phi1_batch.data() + offset, d1_out, d21_out);
-            simplex_lagrange_factor_sequence(order, l2, phi2_batch.data() + offset, d2_out, d22_out);
-            simplex_lagrange_factor_sequence(order, l3, phi3_batch.data() + offset, d3_out, d23_out);
-        }
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-            Real* value_row = values_out ? values_out + node * output_stride : nullptr;
-            Real* g = gradients_out ? gradients_out + node * 3u * output_stride : nullptr;
-            Real* H = hessians_out ? hessians_out + node * 9u * output_stride : nullptr;
-
-            for (std::size_t q = 0; q < num_qpts; ++q) {
-                const std::size_t offset = q * sequence_size;
-                const Real v0 = phi0_batch[offset + i0];
-                const Real v1 = phi1_batch[offset + i1];
-                const Real v2 = phi2_batch[offset + i2];
-                const Real v3 = phi3_batch[offset + i3];
-                if (value_row != nullptr) {
-                    value_row[q] = v0 * v1 * v2 * v3;
-                }
-                if (!need_gradients && !need_hessians) {
-                    continue;
-                }
-
-                const Real D0 = dphi0_batch[offset + i0];
-                const Real D1 = dphi1_batch[offset + i1];
-                const Real D2 = dphi2_batch[offset + i2];
-                const Real D3 = dphi3_batch[offset + i3];
-
-                if (gradients_out != nullptr) {
-                    const Real dl0 = D0 * v1 * v2 * v3;
-                    const Real dl1 = v0 * D1 * v2 * v3;
-                    const Real dl2 = v0 * v1 * D2 * v3;
-                    const Real dl3 = v0 * v1 * v2 * D3;
-                    g[0u * output_stride + q] = dl1 - dl0;
-                    g[1u * output_stride + q] = dl2 - dl0;
-                    g[2u * output_stride + q] = dl3 - dl0;
-                }
-
-                if (hessians_out != nullptr) {
-                    const Real DD0 = d2phi0_batch[offset + i0];
-                    const Real DD1 = d2phi1_batch[offset + i1];
-                    const Real DD2 = d2phi2_batch[offset + i2];
-                    const Real DD3 = d2phi3_batch[offset + i3];
-                    const Real H00 = DD0 * v1 * v2 * v3;
-                    const Real H11 = v0 * DD1 * v2 * v3;
-                    const Real H22 = v0 * v1 * DD2 * v3;
-                    const Real H33 = v0 * v1 * v2 * DD3;
-                    const Real H01 = D0 * D1 * v2 * v3;
-                    const Real H02 = D0 * v1 * D2 * v3;
-                    const Real H03 = D0 * v1 * v2 * D3;
-                    const Real H12 = v0 * D1 * D2 * v3;
-                    const Real H13 = v0 * D1 * v2 * D3;
-                    const Real H23 = v0 * v1 * D2 * D3;
-                    const Real h01 = H00 - H01 - H02 + H12;
-                    const Real h02 = H00 - H01 - H03 + H13;
-                    const Real h12 = H00 - H02 - H03 + H23;
-                    H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                    H[1u * output_stride + q] = h01;
-                    H[2u * output_stride + q] = h02;
-                    H[3u * output_stride + q] = h01;
-                    H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                    H[5u * output_stride + q] = h12;
-                    H[6u * output_stride + q] = h02;
-                    H[7u * output_stride + q] = h12;
-                    H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-                }
-            }
-        }
-        return;
-    }
-
-    SimplexAxisScratch& s0 = simplex_axis_scratch_slot(0);
-    SimplexAxisScratch& s1 = simplex_axis_scratch_slot(1);
-    SimplexAxisScratch& s2 = simplex_axis_scratch_slot(2);
-    SimplexAxisScratch& s3 = simplex_axis_scratch_slot(3);
-    s0.reserveFor(sequence_size);
-    s1.reserveFor(sequence_size);
-    s2.reserveFor(sequence_size);
-    s3.reserveFor(sequence_size);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        const auto& xi = points[q];
-        const Real l1 = xi[0];
-        const Real l2 = xi[1];
-        const Real l3 = xi[2];
-        const Real l0 = Real(1) - l1 - l2 - l3;
-
-        Real* d0_out = (need_gradients || need_hessians) ? s0.dphi() : nullptr;
-        Real* d1_out = (need_gradients || need_hessians) ? s1.dphi() : nullptr;
-        Real* d2_out = (need_gradients || need_hessians) ? s2.dphi() : nullptr;
-        Real* d3_out = (need_gradients || need_hessians) ? s3.dphi() : nullptr;
-        Real* d20_out = need_hessians ? s0.d2phi() : nullptr;
-        Real* d21_out = need_hessians ? s1.d2phi() : nullptr;
-        Real* d22_out = need_hessians ? s2.d2phi() : nullptr;
-        Real* d23_out = need_hessians ? s3.d2phi() : nullptr;
-
-        simplex_lagrange_factor_sequence(order, l0, s0.phi(), d0_out, d20_out);
-        simplex_lagrange_factor_sequence(order, l1, s1.phi(), d1_out, d21_out);
-        simplex_lagrange_factor_sequence(order, l2, s2.phi(), d2_out, d22_out);
-        simplex_lagrange_factor_sequence(order, l3, s3.phi(), d3_out, d23_out);
-        const Real* phi0 = s0.phi();
-        const Real* phi1 = s1.phi();
-        const Real* phi2 = s2.phi();
-        const Real* phi3 = s3.phi();
-        const Real* dphi0 = s0.dphi();
-        const Real* dphi1 = s1.dphi();
-        const Real* dphi2 = s2.dphi();
-        const Real* dphi3 = s3.dphi();
-        const Real* d2phi0 = s0.d2phi();
-        const Real* d2phi1 = s1.d2phi();
-        const Real* d2phi2 = s2.d2phi();
-        const Real* d2phi3 = s3.d2phi();
-
-        for (std::size_t node = 0; node < num_nodes; ++node) {
-            const auto& e = simplex_exponents[node];
-            const std::size_t i0 = static_cast<std::size_t>(e[0]);
-            const std::size_t i1 = static_cast<std::size_t>(e[1]);
-            const std::size_t i2 = static_cast<std::size_t>(e[2]);
-            const std::size_t i3 = static_cast<std::size_t>(e[3]);
-
-            const Real v0 = phi0[i0];
-            const Real v1 = phi1[i1];
-            const Real v2 = phi2[i2];
-            const Real v3 = phi3[i3];
-            if (values_out != nullptr) {
-                values_out[node * output_stride + q] = v0 * v1 * v2 * v3;
-            }
-            if (!need_gradients && !need_hessians) {
-                continue;
-            }
-
-            const Real D0 = dphi0[i0];
-            const Real D1 = dphi1[i1];
-            const Real D2 = dphi2[i2];
-            const Real D3 = dphi3[i3];
-
-            if (gradients_out != nullptr) {
-                const Real dl0 = D0 * v1 * v2 * v3;
-                const Real dl1 = v0 * D1 * v2 * v3;
-                const Real dl2 = v0 * v1 * D2 * v3;
-                const Real dl3 = v0 * v1 * v2 * D3;
-                Real* g = gradients_out + node * 3u * output_stride;
-                g[0u * output_stride + q] = dl1 - dl0;
-                g[1u * output_stride + q] = dl2 - dl0;
-                g[2u * output_stride + q] = dl3 - dl0;
-            }
-
-            if (hessians_out != nullptr) {
-                const Real DD0 = d2phi0[i0];
-                const Real DD1 = d2phi1[i1];
-                const Real DD2 = d2phi2[i2];
-                const Real DD3 = d2phi3[i3];
-
-                const Real H00 = DD0 * v1 * v2 * v3;
-                const Real H11 = v0 * DD1 * v2 * v3;
-                const Real H22 = v0 * v1 * DD2 * v3;
-                const Real H33 = v0 * v1 * v2 * DD3;
-
-                const Real H01 = D0 * D1 * v2 * v3;
-                const Real H02 = D0 * v1 * D2 * v3;
-                const Real H03 = D0 * v1 * v2 * D3;
-                const Real H12 = v0 * D1 * D2 * v3;
-                const Real H13 = v0 * D1 * v2 * D3;
-                const Real H23 = v0 * v1 * D2 * D3;
-
-                const Real h01 = H00 - H01 - H02 + H12;
-                const Real h02 = H00 - H01 - H03 + H13;
-                const Real h12 = H00 - H02 - H03 + H23;
-
-                Real* H = hessians_out + node * 9u * output_stride;
-                H[0u * output_stride + q] = H00 - Real(2) * H01 + H11;
-                H[1u * output_stride + q] = h01;
-                H[2u * output_stride + q] = h02;
-                H[3u * output_stride + q] = h01;
-                H[4u * output_stride + q] = H00 - Real(2) * H02 + H22;
-                H[5u * output_stride + q] = h12;
-                H[6u * output_stride + q] = h02;
-                H[7u * output_stride + q] = h12;
-                H[8u * output_stride + q] = H00 - Real(2) * H03 + H33;
-            }
-        }
-    }
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h b/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
deleted file mode 100644
index 19cf725bd..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisSimplex.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
-#define SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
-
-// Private declarations for simplex Lagrange evaluation helpers implemented in
-// LagrangeBasisSimplex.cpp.
-
-#include "BasisFunction.h"
-
-#include <array>
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-void evaluate_triangle_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                     int order,
-                                     const math::Vector<Real, 3>& xi,
-                                     std::vector<Real>* values,
-                                     std::vector<Gradient>* gradients,
-                                     std::vector<Hessian>* hessians);
-
-void evaluate_triangle_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_triangle_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_triangle_simplex_basis_wedge_components_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_xy_out,
-    Real* SVMP_RESTRICT hessians_xx_xy_yy_out);
-
-void evaluate_tetrahedron_simplex_basis(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                        int order,
-                                        const math::Vector<Real, 3>& xi,
-                                        std::vector<Real>* values,
-                                        std::vector<Gradient>* gradients,
-                                        std::vector<Hessian>* hessians);
-
-void evaluate_tetrahedron_simplex_basis_to(const std::vector<std::array<int, 4>>& simplex_exponents,
-                                           int order,
-                                           const math::Vector<Real, 3>& xi,
-                                           Real* SVMP_RESTRICT values_out,
-                                           Real* SVMP_RESTRICT gradients_out,
-                                           Real* SVMP_RESTRICT hessians_out);
-
-void evaluate_tetrahedron_simplex_basis_strided(
-    const std::vector<std::array<int, 4>>& simplex_exponents,
-    int order,
-    const std::vector<math::Vector<Real, 3>>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT gradients_out,
-    Real* SVMP_RESTRICT hessians_out);
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISSIMPLEX_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h b/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
deleted file mode 100644
index e622de1c6..000000000
--- a/Code/Source/solver/FE/Basis/LagrangeBasisUtility.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
-#define SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
-
-// Private helper for LagrangeBasis internals.
-// This header is only intended to be included after the FE basis scalar types
-// are already available.
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-
-inline constexpr Real equispaced_pm_one_coord(int i, int order) {
-    if (order <= 0) {
-        return Real(0);
-    }
-    return Real(-1) + Real(2) * static_cast<Real>(i) / static_cast<Real>(order);
-}
-
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_LAGRANGEBASISUTILITY_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 20f743916..ae3ea8ed3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -6,8 +6,8 @@
  */
 
 #include "NodeOrderingConventions.h"
-#include "Basis/BasisExceptions.h"
-#include "Basis/BasisTraits.h"
+#include "BasisExceptions.h"
+#include "BasisTraits.h"
 
 #include <array>
 
@@ -18,160 +18,7 @@ namespace basis {
 namespace {
 
 using Point = math::Vector<Real, 3>;
-using RawPoint = std::array<Real, 3>;
 
-template<std::size_t N>
-using NodeTable = std::array<RawPoint, N>;
-
-struct NodeTableView {
-    const RawPoint* data{nullptr};
-    std::size_t size{0};
-};
-
-inline constexpr NodeTable<2> kLine2Nodes = {{
-    {Real(-1), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<3> kLine3Nodes = {{
-    {Real(-1), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<3> kTriangle3Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<6> kTriangle6Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-}};
-
-inline constexpr NodeTable<4> kQuad4Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<9> kQuad9Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<8> kQuad8Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<4> kTetra4Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-}};
-
-inline constexpr NodeTable<10> kTetra10Nodes = {{
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-    {Real(0), Real(0), Real(0.5)},
-    {Real(0.5), Real(0), Real(0.5)},
-    {Real(0), Real(0.5), Real(0.5)},
-}};
-
-inline constexpr NodeTable<8> kHex8Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-}};
-
-inline constexpr NodeTable<27> kHex27Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-    {Real(0), Real(-1), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(-1), Real(0), Real(-1)},
-    {Real(0), Real(-1), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(-1), Real(0), Real(1)},
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<20> kHex20Nodes = {{
-    {Real(-1), Real(-1), Real(-1)},
-    {Real(1), Real(-1), Real(-1)},
-    {Real(1), Real(1), Real(-1)},
-    {Real(-1), Real(1), Real(-1)},
-    {Real(-1), Real(-1), Real(1)},
-    {Real(1), Real(-1), Real(1)},
-    {Real(1), Real(1), Real(1)},
-    {Real(-1), Real(1), Real(1)},
-    {Real(0), Real(-1), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(-1), Real(0), Real(-1)},
-    {Real(0), Real(-1), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(-1), Real(0), Real(1)},
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-}};
-
-// Mesh uses conventional Hex20 ordering: corners first, then edge midpoints in
-// {bottom, top, vertical} groups. The quadratic Hex20 serendipity polynomial
-// table uses an axis-grouped edge order. This maps public mesh/reference index
-// to the internal polynomial-table index.
 constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     0, 1, 2, 3, 4, 5, 6, 7,
     8, 13, 10, 12,
@@ -179,157 +26,6 @@ constexpr std::array<std::size_t, 20> kHex20MeshToBasisOrder = {
     16, 17, 19, 18
 };
 
-inline constexpr NodeTable<6> kWedge6Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-}};
-
-inline constexpr NodeTable<18> kWedge18Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(0.5), Real(0), Real(-1)},
-    {Real(0.5), Real(0.5), Real(-1)},
-    {Real(0), Real(0.5), Real(-1)},
-    {Real(0.5), Real(0), Real(1)},
-    {Real(0.5), Real(0.5), Real(1)},
-    {Real(0), Real(0.5), Real(1)},
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(0.5), Real(0), Real(0)},
-    {Real(0.5), Real(0.5), Real(0)},
-    {Real(0), Real(0.5), Real(0)},
-}};
-
-inline constexpr NodeTable<15> kWedge15Nodes = {{
-    {Real(0), Real(0), Real(-1)},
-    {Real(1), Real(0), Real(-1)},
-    {Real(0), Real(1), Real(-1)},
-    {Real(0), Real(0), Real(1)},
-    {Real(1), Real(0), Real(1)},
-    {Real(0), Real(1), Real(1)},
-    {Real(0.5), Real(0), Real(-1)},
-    {Real(0.5), Real(0.5), Real(-1)},
-    {Real(0), Real(0.5), Real(-1)},
-    {Real(0.5), Real(0), Real(1)},
-    {Real(0.5), Real(0.5), Real(1)},
-    {Real(0), Real(0.5), Real(1)},
-    {Real(0), Real(0), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-}};
-
-inline constexpr NodeTable<5> kPyramid5Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-}};
-
-inline constexpr NodeTable<14> kPyramid14Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(-0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(0.5), Real(0.5)},
-    {Real(-0.5), Real(0.5), Real(0.5)},
-    {Real(0), Real(0), Real(0)},
-}};
-
-inline constexpr NodeTable<13> kPyramid13Nodes = {{
-    {Real(-1), Real(-1), Real(0)},
-    {Real(1), Real(-1), Real(0)},
-    {Real(1), Real(1), Real(0)},
-    {Real(-1), Real(1), Real(0)},
-    {Real(0), Real(0), Real(1)},
-    {Real(0), Real(-1), Real(0)},
-    {Real(1), Real(0), Real(0)},
-    {Real(0), Real(1), Real(0)},
-    {Real(-1), Real(0), Real(0)},
-    {Real(-0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(-0.5), Real(0.5)},
-    {Real(0.5), Real(0.5), Real(0.5)},
-    {Real(-0.5), Real(0.5), Real(0.5)},
-}};
-
-template<std::size_t N>
-constexpr NodeTableView view(const NodeTable<N>& table) noexcept {
-    return NodeTableView{table.data(), table.size()};
-}
-
-Point to_point(const RawPoint& raw) {
-    return Point{raw[0], raw[1], raw[2]};
-}
-
-constexpr NodeTableView fixed_node_table(ElementType elem_type) noexcept {
-    switch (elem_type) {
-        case ElementType::Line2:     return view(kLine2Nodes);
-        case ElementType::Line3:     return view(kLine3Nodes);
-        case ElementType::Triangle3: return view(kTriangle3Nodes);
-        case ElementType::Triangle6: return view(kTriangle6Nodes);
-        case ElementType::Quad4:     return view(kQuad4Nodes);
-        case ElementType::Quad8:     return view(kQuad8Nodes);
-        case ElementType::Quad9:     return view(kQuad9Nodes);
-        case ElementType::Tetra4:    return view(kTetra4Nodes);
-        case ElementType::Tetra10:   return view(kTetra10Nodes);
-        case ElementType::Hex8:      return view(kHex8Nodes);
-        case ElementType::Hex20:     return view(kHex20Nodes);
-        case ElementType::Hex27:     return view(kHex27Nodes);
-        case ElementType::Wedge6:    return view(kWedge6Nodes);
-        case ElementType::Wedge15:   return view(kWedge15Nodes);
-        case ElementType::Wedge18:   return view(kWedge18Nodes);
-        case ElementType::Pyramid5:  return view(kPyramid5Nodes);
-        case ElementType::Pyramid13: return view(kPyramid13Nodes);
-        case ElementType::Pyramid14: return view(kPyramid14Nodes);
-        default:                     return {};
-    }
-}
-
-constexpr NodeTableView fixed_complete_lagrange_table(ElementType canonical_type,
-                                                      int order) noexcept {
-    switch (canonical_type) {
-        case ElementType::Line2:
-            return order == 1 ? view(kLine2Nodes) :
-                   order == 2 ? view(kLine3Nodes) : NodeTableView{};
-        case ElementType::Triangle3:
-            return order == 1 ? view(kTriangle3Nodes) :
-                   order == 2 ? view(kTriangle6Nodes) : NodeTableView{};
-        case ElementType::Quad4:
-            return order == 1 ? view(kQuad4Nodes) :
-                   order == 2 ? view(kQuad9Nodes) : NodeTableView{};
-        case ElementType::Tetra4:
-            return order == 1 ? view(kTetra4Nodes) :
-                   order == 2 ? view(kTetra10Nodes) : NodeTableView{};
-        case ElementType::Hex8:
-            return order == 1 ? view(kHex8Nodes) :
-                   order == 2 ? view(kHex27Nodes) : NodeTableView{};
-        case ElementType::Wedge6:
-            return order == 1 ? view(kWedge6Nodes) :
-                   order == 2 ? view(kWedge18Nodes) : NodeTableView{};
-        case ElementType::Pyramid5:
-            return order == 1 ? view(kPyramid5Nodes) :
-                   order == 2 ? view(kPyramid14Nodes) : NodeTableView{};
-        default:
-            return {};
-    }
-}
-
 Real line_coord_pm_one(int i, int order) {
     if (order <= 0) {
         return Real(0);
@@ -352,10 +48,10 @@ void append_triangle_face_interior(std::vector<Point>& nodes,
     for (int c = 1; c <= order - 2; ++c) {
         for (int b = 1; b <= order - c - 1; ++b) {
             const int a = order - b - c;
-            const Real la = static_cast<Real>(a) / static_cast<Real>(order);
-            const Real lb = static_cast<Real>(b) / static_cast<Real>(order);
-            const Real lc = static_cast<Real>(c) / static_cast<Real>(order);
-            nodes.push_back(v0 * la + v1 * lb + v2 * lc);
+            const Real inv = Real(1) / Real(order);
+            nodes.push_back(v0 * (Real(a) * inv) +
+                            v1 * (Real(b) * inv) +
+                            v2 * (Real(c) * inv));
         }
     }
 }
@@ -382,7 +78,6 @@ std::vector<Point> generate_triangle_nodes(int order) {
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) / 2));
-
     nodes.push_back(Point{Real(0), Real(0), Real(0)});
     nodes.push_back(Point{Real(1), Real(0), Real(0)});
     nodes.push_back(Point{Real(0), Real(1), Real(0)});
@@ -398,13 +93,11 @@ std::vector<Point> generate_triangle_nodes(int order) {
         nodes.push_back(Point{Real(0), line_coord_zero_one(order - m, order), Real(0)});
     }
 
-    append_triangle_face_interior(
-        nodes,
-        Point{Real(0), Real(0), Real(0)},
-        Point{Real(1), Real(0), Real(0)},
-        Point{Real(0), Real(1), Real(0)},
-        order);
-
+    append_triangle_face_interior(nodes,
+                                  Point{Real(0), Real(0), Real(0)},
+                                  Point{Real(1), Real(0), Real(0)},
+                                  Point{Real(0), Real(1), Real(0)},
+                                  order);
     return nodes;
 }
 
@@ -415,7 +108,6 @@ std::vector<Point> generate_quad_nodes(int order) {
 
     std::vector<Point> nodes;
     nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1)));
-
     nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
     nodes.push_back(Point{Real(1), Real(-1), Real(0)});
     nodes.push_back(Point{Real(1), Real(1), Real(0)});
@@ -433,13 +125,12 @@ std::vector<Point> generate_quad_nodes(int order) {
     for (int j = order - 1; j >= 1; --j) {
         nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), Real(0)});
     }
-
     for (int j = 1; j < order; ++j) {
         for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
+            nodes.push_back(Point{line_coord_pm_one(i, order),
+                                  line_coord_pm_one(j, order), Real(0)});
         }
     }
-
     return nodes;
 }
 
@@ -448,22 +139,20 @@ std::vector<Point> generate_tetra_nodes(int order) {
         return {Point{Real(0.25), Real(0.25), Real(0.25)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
-
     const Point verts[] = {
         Point{Real(0), Real(0), Real(0)},
         Point{Real(1), Real(0), Real(0)},
         Point{Real(0), Real(1), Real(0)},
         Point{Real(0), Real(0), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (order + 3) / 6));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
 
-    const int edges[6][2] = {
-        {0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}
-    };
+    const int edges[6][2] = {{0, 1}, {1, 2}, {2, 0}, {0, 3}, {1, 3}, {2, 3}};
     for (const auto& edge : edges) {
         for (int m = 1; m < order; ++m) {
             const Real t = static_cast<Real>(m) / static_cast<Real>(order);
@@ -471,32 +160,24 @@ std::vector<Point> generate_tetra_nodes(int order) {
         }
     }
 
-    const int faces[4][3] = {
-        {0, 1, 2},
-        {0, 1, 3},
-        {1, 2, 3},
-        {0, 2, 3},
-    };
+    const int faces[4][3] = {{0, 1, 2}, {0, 1, 3}, {1, 2, 3}, {0, 2, 3}};
     for (const auto& face : faces) {
-        append_triangle_face_interior(
-            nodes,
-            verts[face[0]],
-            verts[face[1]],
-            verts[face[2]],
-            order);
+        append_triangle_face_interior(nodes,
+                                      verts[face[0]],
+                                      verts[face[1]],
+                                      verts[face[2]],
+                                      order);
     }
 
     for (int l = 1; l <= order - 3; ++l) {
         for (int k = 1; k <= order - l - 2; ++k) {
             for (int j = 1; j <= order - l - k - 1; ++j) {
-                const Real x = static_cast<Real>(j) / static_cast<Real>(order);
-                const Real y = static_cast<Real>(k) / static_cast<Real>(order);
-                const Real z = static_cast<Real>(l) / static_cast<Real>(order);
-                nodes.push_back(Point{x, y, z});
+                nodes.push_back(Point{Real(j) / Real(order),
+                                      Real(k) / Real(order),
+                                      Real(l) / Real(order)});
             }
         }
     }
-
     return nodes;
 }
 
@@ -505,9 +186,6 @@ std::vector<Point> generate_hex_nodes(int order) {
         return {Point{Real(0), Real(0), Real(0)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
-
     const Point verts[] = {
         Point{Real(-1), Real(-1), Real(-1)},
         Point{Real(1), Real(-1), Real(-1)},
@@ -518,6 +196,9 @@ std::vector<Point> generate_hex_nodes(int order) {
         Point{Real(1), Real(1), Real(1)},
         Point{Real(-1), Real(1), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 1)));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
@@ -564,7 +245,6 @@ std::vector<Point> generate_hex_nodes(int order) {
             nodes.push_back(Point{Real(-1), line_coord_pm_one(j, order), line_coord_pm_one(k, order)});
         }
     }
-
     for (int k = 1; k < order; ++k) {
         for (int j = 1; j < order; ++j) {
             for (int i = 1; i < order; ++i) {
@@ -574,7 +254,6 @@ std::vector<Point> generate_hex_nodes(int order) {
             }
         }
     }
-
     return nodes;
 }
 
@@ -583,9 +262,6 @@ std::vector<Point> generate_wedge_nodes(int order) {
         return {Point{Real(1) / Real(3), Real(1) / Real(3), Real(0)}};
     }
 
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
-
     const Point verts[] = {
         Point{Real(0), Real(0), Real(-1)},
         Point{Real(1), Real(0), Real(-1)},
@@ -594,6 +270,9 @@ std::vector<Point> generate_wedge_nodes(int order) {
         Point{Real(1), Real(0), Real(1)},
         Point{Real(0), Real(1), Real(1)},
     };
+
+    std::vector<Point> nodes;
+    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 1) * (order + 2) / 2));
     for (const auto& v : verts) {
         nodes.push_back(v);
     }
@@ -610,10 +289,8 @@ std::vector<Point> generate_wedge_nodes(int order) {
         }
     }
 
-    append_triangle_face_interior(
-        nodes, verts[0], verts[1], verts[2], order);
-    append_triangle_face_interior(
-        nodes, verts[3], verts[4], verts[5], order);
+    append_triangle_face_interior(nodes, verts[0], verts[1], verts[2], order);
+    append_triangle_face_interior(nodes, verts[3], verts[4], verts[5], order);
 
     for (int r = 1; r < order; ++r) {
         const Real z = line_coord_pm_one(r, order);
@@ -635,138 +312,21 @@ std::vector<Point> generate_wedge_nodes(int order) {
         const Real z = line_coord_pm_one(r, order);
         for (int c = 1; c <= order - 2; ++c) {
             for (int b = 1; b <= order - c - 1; ++b) {
-                const Real x = static_cast<Real>(b) / static_cast<Real>(order);
-                const Real y = static_cast<Real>(c) / static_cast<Real>(order);
-                nodes.push_back(Point{x, y, z});
-            }
-        }
-    }
-
-    return nodes;
-}
-
-std::vector<Point> generate_pyramid_nodes(int order) {
-    if (order == 0) {
-        return {Point{Real(0), Real(0), Real(0.25)}};
-    }
-
-    std::vector<Point> nodes;
-    nodes.reserve(static_cast<std::size_t>((order + 1) * (order + 2) * (2 * order + 3) / 6));
-
-    nodes.push_back(Point{Real(-1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(-1), Real(0)});
-    nodes.push_back(Point{Real(1), Real(1), Real(0)});
-    nodes.push_back(Point{Real(-1), Real(1), Real(0)});
-    nodes.push_back(Point{Real(0), Real(0), Real(1)});
-
-    for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{line_coord_pm_one(m, order), Real(-1), Real(0)});
-    }
-    for (int m = 1; m < order; ++m) {
-        nodes.push_back(Point{Real(1), line_coord_pm_one(m, order), Real(0)});
-    }
-    for (int m = order - 1; m >= 1; --m) {
-        nodes.push_back(Point{line_coord_pm_one(m, order), Real(1), Real(0)});
-    }
-    for (int m = order - 1; m >= 1; --m) {
-        nodes.push_back(Point{Real(-1), line_coord_pm_one(m, order), Real(0)});
-    }
-
-    for (int level = 1; level < order; ++level) {
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-        nodes.push_back(Point{-scale, -scale, z});
-        nodes.push_back(Point{scale, -scale, z});
-        nodes.push_back(Point{scale, scale, z});
-        nodes.push_back(Point{-scale, scale, z});
-    }
-
-    for (int j = 1; j < order; ++j) {
-        for (int i = 1; i < order; ++i) {
-            nodes.push_back(Point{line_coord_pm_one(i, order), line_coord_pm_one(j, order), Real(0)});
-        }
-    }
-
-    for (int level = 1; level < order - 1; ++level) {
-        const int n = order - level;
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-
-        for (int m = 1; m < n; ++m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{s, -scale, z});
-        }
-        for (int m = 1; m < n; ++m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{scale, s, z});
-        }
-        for (int m = n - 1; m >= 1; --m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{s, scale, z});
-        }
-        for (int m = n - 1; m >= 1; --m) {
-            const Real s = line_coord_pm_one(m, n) * scale;
-            nodes.push_back(Point{-scale, s, z});
-        }
-    }
-
-    for (int level = 1; level < order - 1; ++level) {
-        const int n = order - level;
-        const Real z = static_cast<Real>(level) / static_cast<Real>(order);
-        const Real scale = Real(1) - z;
-        for (int j = 1; j < n; ++j) {
-            for (int i = 1; i < n; ++i) {
-                nodes.push_back(Point{line_coord_pm_one(i, n) * scale,
-                                      line_coord_pm_one(j, n) * scale,
+                nodes.push_back(Point{Real(b) / Real(order),
+                                      Real(c) / Real(order),
                                       z});
             }
         }
     }
-
     return nodes;
 }
 
-} // namespace
-
-math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
-                                                     std::size_t local_node) {
-    const auto table = fixed_node_table(elem_type);
-    if (table.data != nullptr && local_node < table.size) {
-        return to_point(table.data[local_node]);
-    }
-
-    throw BasisNodeOrderingException("Invalid element type or node index in ReferenceNodeLayout::get_node_coords",
-                                     __FILE__, __LINE__, __func__);
-}
-
-std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {
-    const auto table = fixed_node_table(elem_type);
-    if (table.data != nullptr) {
-        return table.size;
-    }
-
-    throw BasisNodeOrderingException("Unknown element type in ReferenceNodeLayout::num_nodes",
-                                     __FILE__, __LINE__, __func__);
-}
-
-std::vector<math::Vector<Real, 3>>
-ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
     if (order < 0) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords requires non-negative order",
+        throw BasisNodeOrderingException("ReferenceNodeLayout requires non-negative Lagrange order",
                                          __FILE__, __LINE__, __func__);
     }
-
     const ElementType type = canonical_lagrange_type(canonical_type);
-    const auto fixed_table = fixed_complete_lagrange_table(type, order);
-    if (fixed_table.data != nullptr) {
-        std::vector<Point> nodes;
-        nodes.reserve(fixed_table.size);
-        for (std::size_t i = 0; i < fixed_table.size; ++i) {
-            nodes.push_back(to_point(fixed_table.data[i]));
-        }
-        return nodes;
-    }
-
     switch (type) {
         case ElementType::Point1:
             return {Point{Real(0), Real(0), Real(0)}};
@@ -783,24 +343,70 @@ ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int or
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            return generate_pyramid_nodes(order);
-        case ElementType::Quad8:
-        case ElementType::Hex20:
-        case ElementType::Wedge15:
+            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
+                                             __FILE__, __LINE__, __func__);
+        default:
+            throw BasisNodeOrderingException("ReferenceNodeLayout: unsupported Lagrange topology",
+                                             __FILE__, __LINE__, __func__);
+    }
+}
+
+std::vector<Point> element_nodes(ElementType elem_type) {  // reference nodes for one element enum; throws BasisNodeOrderingException when unsupported
+    const int order = complete_lagrange_alias_order(elem_type);
+    if (order >= 0) {  // non-negative order: delegate to the complete-family Lagrange generator
+        return complete_lagrange_nodes(elem_type, order);
+    }
+
+    switch (elem_type) {  // remaining enums: built by truncating an order-2 node list
+        case ElementType::Quad8: {
+            auto nodes = generate_quad_nodes(2);
+            nodes.resize(8u);  // keep the first 8 nodes; generate_quad_nodes appends interior nodes last
+            return nodes;
+        }
+        case ElementType::Hex20: {
+            auto nodes = generate_hex_nodes(2);
+            nodes.resize(20u);  // keep the first 20 nodes (presumably 8 corners + 12 mid-edge nodes; confirm against generate_hex_nodes)
+            return nodes;
+        }
+        case ElementType::Wedge15: {
+            auto nodes = generate_wedge_nodes(2);
+            nodes.resize(15u);  // keep the first 15 nodes (presumably 6 corners + 9 mid-edge nodes; confirm against generate_wedge_nodes)
+            return nodes;
+        }
         case ElementType::Pyramid13:
-            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords does not support serendipity topologies",
+            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
                                              __FILE__, __LINE__, __func__);
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout::get_lagrange_node_coords: unsupported topology",
+            throw BasisNodeOrderingException("ReferenceNodeLayout: unknown element type",
                                              __FILE__, __LINE__, __func__);
     }
 }
 
+} // namespace
+
+math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
+                                                           std::size_t local_node) {
+    const auto nodes = element_nodes(elem_type);  // NOTE(review): regenerates the full node list on every call; element_nodes throws for unsupported types
+    if (local_node >= nodes.size()) {  // reject indices past the end of the generated list
+        throw BasisNodeOrderingException("ReferenceNodeLayout::get_node_coords: node index out of range",
+                                         __FILE__, __LINE__, __func__);
+    }
+    return nodes[local_node];  // returns a copy of the stored coordinates
+}
+
+std::size_t ReferenceNodeLayout::num_nodes(ElementType elem_type) {  // node count for elem_type; propagates element_nodes' exception for unsupported types
+    return element_nodes(elem_type).size();  // NOTE(review): builds the whole node vector just to read its size
+}
+
+std::vector<math::Vector<Real, 3>>
+ReferenceNodeLayout::get_lagrange_node_coords(ElementType canonical_type, int order) {
+    return complete_lagrange_nodes(canonical_type, order);  // public forwarder; the helper throws BasisNodeOrderingException for order < 0
+}
+
 std::span<const std::size_t> ReferenceNodeLayout::mesh_to_basis_ordering(ElementType elem_type) {
     if (elem_type == ElementType::Hex20) {
-        return std::span<const std::size_t>(
-            kHex20MeshToBasisOrder.data(),
-            kHex20MeshToBasisOrder.size());
+        return std::span<const std::size_t>(kHex20MeshToBasisOrder.data(),
+                                            kHex20MeshToBasisOrder.size());  // Hex20 is the only type with a mapping here; all others get an empty span
     }
     return {};
 }
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 52af4d932..8a43cc4e3 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -8,526 +8,28 @@
 #ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 #define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 
-#include "Types.h"
 #include "Math/Vector.h"
-#include <cstddef>
-#include <vector>
-
-/**
- * @file NodeOrderingConventions.h
- * @brief Documentation of node ordering conventions for all element types
- *
- * This file provides comprehensive documentation of the node ordering
- * conventions used throughout the FE library. These orderings are consistent
- * with VTK conventions and must be matched exactly when interfacing with
- * the Mesh library.
- *
- * IMPORTANT: The FE library (Basis, Quadrature, Geometry) uses "node" to refer
- * to degrees of freedom locations on reference elements. The Mesh library uses
- * "vertex" for geometry vertices and "cell" for mesh elements. When interfacing
- * between the two, ensure consistent ordering.
- *
- * Reference Element Conventions:
- * - Line:       xi in [-1, 1]
- * - Quad:       (xi, eta) in [-1, 1] x [-1, 1]
- * - Hex:        (xi, eta, zeta) in [-1, 1]^3
- * - Triangle:   (xi, eta) in simplex with vertices (0,0), (1,0), (0,1)
- * - Tetrahedron: (xi, eta, zeta) in simplex with vertices
- *                (0,0,0), (1,0,0), (0,1,0), (0,0,1)
- * - Wedge:      Triangle base x line height, zeta in [-1, 1]
- * - Pyramid:    Quad base at z=0, apex at (0, 0, 1)
- *
- *
- * =============================================================================
- * 1D ELEMENTS
- * =============================================================================
- *
- * Line2 (Linear Line)
- * -------------------
- *   0---------1
- *   |         |
- *  xi=-1     xi=+1
- *
- * Node 0: xi = -1
- * Node 1: xi = +1
- *
- *
- * Line3 (Quadratic Line)
- * ----------------------
- *   0----2----1
- *   |    |    |
- *  xi=-1 0   xi=+1
- *
- * Node 0: xi = -1
- * Node 1: xi = +1
- * Node 2: xi =  0 (mid-edge)
- *
- *
- * =============================================================================
- * 2D QUADRILATERAL ELEMENTS
- * =============================================================================
- *
- * Quad4 (Bilinear Quadrilateral)
- * ------------------------------
- *
- *   3-----------2
- *   |           |
- *   |           |
- *   |           |
- *   0-----------1
- *
- * Node 0: (xi, eta) = (-1, -1)
- * Node 1: (xi, eta) = (+1, -1)
- * Node 2: (xi, eta) = (+1, +1)
- * Node 3: (xi, eta) = (-1, +1)
- *
- *
- * Quad8 (Serendipity Quadrilateral)
- * ---------------------------------
- *
- *   3-----6-----2
- *   |           |
- *   7           5
- *   |           |
- *   0-----4-----1
- *
- * Corners (same as Quad4):
- *   Node 0: (-1, -1)
- *   Node 1: (+1, -1)
- *   Node 2: (+1, +1)
- *   Node 3: (-1, +1)
- *
- * Mid-edge nodes:
- *   Node 4: ( 0, -1)  (edge 0-1)
- *   Node 5: (+1,  0)  (edge 1-2)
- *   Node 6: ( 0, +1)  (edge 2-3)
- *   Node 7: (-1,  0)  (edge 3-0)
- *
- *
- * Quad9 (Biquadratic Quadrilateral)
- * ---------------------------------
- *
- *   3-----6-----2
- *   |           |
- *   7     8     5
- *   |           |
- *   0-----4-----1
- *
- * Same as Quad8 plus:
- *   Node 8: (0, 0)  (center)
- *
- *
- * =============================================================================
- * 3D HEXAHEDRAL ELEMENTS
- * =============================================================================
- *
- * Hex8 (Trilinear Hexahedron)
- * ---------------------------
- *
- *        7-----------6
- *       /|          /|
- *      / |         / |
- *     4-----------5  |
- *     |  |        |  |
- *     |  3--------|--2
- *     | /         | /
- *     |/          |/
- *     0-----------1
- *
- * Bottom face (zeta = -1):
- *   Node 0: (xi, eta, zeta) = (-1, -1, -1)
- *   Node 1: (xi, eta, zeta) = (+1, -1, -1)
- *   Node 2: (xi, eta, zeta) = (+1, +1, -1)
- *   Node 3: (xi, eta, zeta) = (-1, +1, -1)
- *
- * Top face (zeta = +1):
- *   Node 4: (xi, eta, zeta) = (-1, -1, +1)
- *   Node 5: (xi, eta, zeta) = (+1, -1, +1)
- *   Node 6: (xi, eta, zeta) = (+1, +1, +1)
- *   Node 7: (xi, eta, zeta) = (-1, +1, +1)
- *
- *
- * Hex20 (Serendipity Hexahedron)
- * ------------------------------
- *
- *        7-----14-----6
- *       /|           /|
- *     15 |         13 |
- *     /  19        /  18
- *    4-----12-----5   |
- *    |   |        |   |
- *    |   3-----10-|---2
- *   16  /        17  /
- *    | 11         | 9
- *    |/           |/
- *    0------8-----1
- *
- * Corners (same as Hex8): Nodes 0-7
- *
- * Mid-edge nodes on bottom face (zeta = -1):
- *   Node 8:  ( 0, -1, -1)  (edge 0-1)
- *   Node 9:  (+1,  0, -1)  (edge 1-2)
- *   Node 10: ( 0, +1, -1)  (edge 2-3)
- *   Node 11: (-1,  0, -1)  (edge 3-0)
- *
- * Mid-edge nodes on top face (zeta = +1):
- *   Node 12: ( 0, -1, +1)  (edge 4-5)
- *   Node 13: (+1,  0, +1)  (edge 5-6)
- *   Node 14: ( 0, +1, +1)  (edge 6-7)
- *   Node 15: (-1,  0, +1)  (edge 7-4)
- *
- * Mid-edge nodes on vertical edges:
- *   Node 16: (-1, -1,  0)  (edge 0-4)
- *   Node 17: (+1, -1,  0)  (edge 1-5)
- *   Node 18: (+1, +1,  0)  (edge 2-6)
- *   Node 19: (-1, +1,  0)  (edge 3-7)
- *
- *
- * Hex27 (Triquadratic Hexahedron)
- * -------------------------------
- * Same as Hex20 plus face-center and body-center nodes:
- *
- * Face centers:
- *   Node 20: ( 0,  0, -1)  (bottom face)
- *   Node 21: ( 0,  0, +1)  (top face)
- *   Node 22: ( 0, -1,  0)  (front face)
- *   Node 23: (+1,  0,  0)  (right face)
- *   Node 24: ( 0, +1,  0)  (back face)
- *   Node 25: (-1,  0,  0)  (left face)
- *
- * Body center:
- *   Node 26: (0, 0, 0)
- *
- *
- * =============================================================================
- * 2D TRIANGULAR ELEMENTS
- * =============================================================================
- *
- * Triangle3 (Linear Triangle)
- * ---------------------------
- *
- *   2
- *   |\
- *   | \
- *   |  \
- *   |   \
- *   0----1
- *
- * Reference: (xi, eta) simplex with vertices at:
- *   Node 0: (xi, eta) = (0, 0)
- *   Node 1: (xi, eta) = (1, 0)
- *   Node 2: (xi, eta) = (0, 1)
- *
- *
- * Triangle6 (Quadratic Triangle)
- * ------------------------------
- *
- *   2
- *   |\
- *   | \
- *   5  4
- *   |   \
- *   0--3--1
- *
- * Corners: Nodes 0-2 (same as Triangle3)
- *
- * Mid-edge nodes:
- *   Node 3: (0.5,   0)  (edge 0-1)
- *   Node 4: (0.5, 0.5)  (edge 1-2)
- *   Node 5: (  0, 0.5)  (edge 2-0)
- *
- *
- * =============================================================================
- * 3D TETRAHEDRAL ELEMENTS
- * =============================================================================
- *
- * Tetrahedron4 (Linear Tetrahedron)
- * ---------------------------------
- *
- *             3
- *            /|\
- *           / | \
- *          /  |  \
- *         /   |   \
- *        /    |    \
- *       0-----|-----2
- *        \    |    /
- *         \   |   /
- *          \  |  /
- *           \ | /
- *            \|/
- *             1
- *
- * Reference: (xi, eta, zeta) simplex with vertices at:
- *   Node 0: (0, 0, 0)
- *   Node 1: (1, 0, 0)
- *   Node 2: (0, 1, 0)
- *   Node 3: (0, 0, 1)
- *
- *
- * Tetrahedron10 (Quadratic Tetrahedron)
- * -------------------------------------
- * Corners: Nodes 0-3 (same as Tet4)
- *
- * Mid-edge nodes:
- *   Node 4: (0.5,   0,   0)  (edge 0-1)
- *   Node 5: (0.5, 0.5,   0)  (edge 1-2)
- *   Node 6: (  0, 0.5,   0)  (edge 2-0)
- *   Node 7: (  0,   0, 0.5)  (edge 0-3)
- *   Node 8: (0.5,   0, 0.5)  (edge 1-3)
- *   Node 9: (  0, 0.5, 0.5)  (edge 2-3)
- *
- *
- * =============================================================================
- * 3D WEDGE (PRISM) ELEMENTS
- * =============================================================================
- *
- * Wedge6 (Linear Wedge)
- * ---------------------
- *
- *         5
- *        /|\
- *       / | \
- *      /  |  \
- *     3---|---4
- *     |   2   |
- *     |  / \  |
- *     | /   \ |
- *     |/     \|
- *     0-------1
- *
- * Reference: Triangle base at zeta = -1, top at zeta = +1
- *
- * Bottom face (zeta = -1):
- *   Node 0: (0, 0, -1)
- *   Node 1: (1, 0, -1)
- *   Node 2: (0, 1, -1)
- *
- * Top face (zeta = +1):
- *   Node 3: (0, 0, +1)
- *   Node 4: (1, 0, +1)
- *   Node 5: (0, 1, +1)
- *
- *
- * Wedge15 (Quadratic Wedge)
- * -------------------------
- * Corners: Nodes 0-5 (same as Wedge6)
- *
- * Mid-edge nodes on bottom face:
- *   Node 6:  (0.5,   0, -1)  (edge 0-1)
- *   Node 7:  (0.5, 0.5, -1)  (edge 1-2)
- *   Node 8:  (  0, 0.5, -1)  (edge 2-0)
- *
- * Mid-edge nodes on top face:
- *   Node 9:  (0.5,   0, +1)  (edge 3-4)
- *   Node 10: (0.5, 0.5, +1)  (edge 4-5)
- *   Node 11: (  0, 0.5, +1)  (edge 5-3)
- *
- * Mid-edge nodes on vertical edges:
- *   Node 12: (0, 0, 0)  (edge 0-3)
- *   Node 13: (1, 0, 0)  (edge 1-4)
- *   Node 14: (0, 1, 0)  (edge 2-5)
- *
- *
- * Wedge18 (Complete Quadratic Wedge)
- * ----------------------------------
- * Corners and mid-edges: Nodes 0-14 (same as Wedge15)
- *
- * Face-center nodes on quadrilateral faces:
- *   Node 15: (0.5, 0.0, 0.0)  (face with vertices 0-1-4-3, y = 0)
- *   Node 16: (0.5, 0.5, 0.0)  (face with vertices 1-2-5-4, x + y = 1)
- *   Node 17: (0.0, 0.5, 0.0)  (face with vertices 2-0-3-5, x = 0)
- *
- *
- * =============================================================================
- * 3D PYRAMID ELEMENTS
- * =============================================================================
- *
- * Pyramid5 (Linear Pyramid)
- * -------------------------
- *
- *           4
- *          /|\
- *         / | \
- *        /  |  \
- *       /   |   \
- *      3----|----2
- *      |    |    |
- *      |    +    |   (apex projects to center of base)
- *      |         |
- *      0---------1
- *
- * Reference: Quad base in xi-eta plane at zeta = 0, apex at zeta = 1
- *
- * Base (zeta = 0):
- *   Node 0: (-1, -1, 0)
- *   Node 1: (+1, -1, 0)
- *   Node 2: (+1, +1, 0)
- *   Node 3: (-1, +1, 0)
- *
- * Apex:
- *   Node 4: (0, 0, 1)
- *
- *
- * Pyramid13 (Quadratic Pyramid)
- * -----------------------------
- * Corners: Nodes 0-4 (same as Pyramid5)
- *
- * Mid-edge nodes on base:
- *   Node 5: ( 0, -1, 0)  (edge 0-1)
- *   Node 6: (+1,  0, 0)  (edge 1-2)
- *   Node 7: ( 0, +1, 0)  (edge 2-3)
- *   Node 8: (-1,  0, 0)  (edge 3-0)
- *
- * Mid-edge nodes to apex:
- *   Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
- *   Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
- *   Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
- *   Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
- *
- *
- * Pyramid14 (Quadratic Rational Pyramid)
- * --------------------------------------
- *
- * This retained low-order compatibility layout matches the generated
- * complete-family quadratic Lagrange ordering for the reference pyramid with
- * base (-1,-1,0)..(1,1,0) and apex at (0,0,1). Nodes 0-12 coincide with the
- * Pyramid13 layout; node 13 is the base center.
- *
- *   Base corners (same as Pyramid5):
- *     Node 0: (-1, -1, 0)
- *     Node 1: (+1, -1, 0)
- *     Node 2: (+1, +1, 0)
- *     Node 3: (-1, +1, 0)
- *
- *   Apex:
- *     Node 4: (0, 0, 1)
- *
- *   Base mid-edges (same as Pyramid13):
- *     Node 5:  ( 0, -1, 0)   (edge 0-1)
- *     Node 6:  (+1,  0, 0)   (edge 1-2)
- *     Node 7:  ( 0, +1, 0)   (edge 2-3)
- *     Node 8:  (-1,  0, 0)   (edge 3-0)
- *
- *   Mid-edges to apex (same as Pyramid13):
- *     Node 9:  (-0.5, -0.5, 0.5)  (edge 0-4)
- *     Node 10: (+0.5, -0.5, 0.5)  (edge 1-4)
- *     Node 11: (+0.5, +0.5, 0.5)  (edge 2-4)
- *     Node 12: (-0.5, +0.5, 0.5)  (edge 3-4)
- *
- *   Base center:
- *     Node 13: (0, 0, 0)
- *
- *
- * =============================================================================
- * NOTES ON VTK COMPATIBILITY
- * =============================================================================
- *
- * The node orderings above are consistent with VTK cell types:
- *
- *   VTK_LINE           (3)  -> Line2
- *   VTK_QUADRATIC_EDGE (21) -> Line3
- *   VTK_TRIANGLE       (5)  -> Triangle3
- *   VTK_QUADRATIC_TRIANGLE (22) -> Triangle6
- *   VTK_QUAD           (9)  -> Quad4
- *   VTK_QUADRATIC_QUAD (23) -> Quad8
- *   VTK_BIQUADRATIC_QUAD (28) -> Quad9
- *   VTK_TETRA          (10) -> Tetrahedron4
- *   VTK_QUADRATIC_TETRA (24) -> Tetrahedron10
- *   VTK_HEXAHEDRON     (12) -> Hex8
- *   VTK_QUADRATIC_HEXAHEDRON (25) -> Hex20
- *   VTK_TRIQUADRATIC_HEXAHEDRON (29) -> Hex27
- *   VTK_WEDGE          (13) -> Wedge6
- *   VTK_QUADRATIC_WEDGE (26) -> Wedge15
- *   VTK_BIQUADRATIC_QUADRATIC_WEDGE (32) -> Wedge18
- *   VTK_PYRAMID        (14) -> Pyramid5
- *   VTK_QUADRATIC_PYRAMID (27) -> Pyramid13
- *
- *
- * =============================================================================
- * BARYCENTRIC COORDINATES
- * =============================================================================
- *
- * For simplex elements, barycentric coordinates (lambda_0, ..., lambda_n)
- * satisfy sum(lambda_i) = 1.
- *
- * Triangle:
- *   lambda_0 = 1 - xi - eta
- *   lambda_1 = xi
- *   lambda_2 = eta
- *
- * Tetrahedron:
- *   lambda_0 = 1 - xi - eta - zeta
- *   lambda_1 = xi
- *   lambda_2 = eta
- *   lambda_3 = zeta
- *
- */
+#include "Types.h"
 
+#include <cstddef>
 #include <span>
+#include <vector>
 
 namespace svmp {
 namespace FE {
 namespace basis {
 
-/**
- * @brief Basis-side reference node coordinate queries
- *
- * This is intentionally named differently from `svmp::NodeOrdering` in Mesh,
- * which handles mesh-format permutations rather than reference basis layouts.
- */
 class ReferenceNodeLayout {
 public:
-    /**
-     * @brief Get reference coordinates for a node
-     * @param elem_type Element type
-     * @param local_node Local node index (0-based)
-     *
-     * Complete-family low-order Lagrange aliases (`Line2/3`, `Triangle3/6`,
-     * `Quad4/9`, `Tetra4/10`, `Hex8/27`, `Wedge6/18`, `Pyramid5/14`) are
-     * served by the generated arbitrary-order Lagrange ordering path. Explicit
-     * hard-coded tables remain only for serendipity-only enums such as
-     * `Quad8`, `Hex20`, `Wedge15`, and `Pyramid13`.
-     *
-     * @return Reference coordinates (xi, eta, zeta)
-     */
-    static math::Vector<Real, 3> get_node_coords(ElementType elem_type, std::size_t local_node);
-
-    /**
-     * @brief Get number of nodes for an element type
-     *
-     * The low-order complete-family Lagrange aliases share the same generated
-     * ordering path used by `get_node_coords`.
-     */
+    static math::Vector<Real, 3> get_node_coords(ElementType elem_type,
+                                                 std::size_t local_node);  // throws BasisNodeOrderingException for unsupported types or an out-of-range local_node
     static std::size_t num_nodes(ElementType elem_type);
 
-    /**
-     * @brief Generate complete-family Lagrange node coordinates for a canonical topology and order
-     *
-     * This covers arbitrary-order complete nodal Lagrange spaces on the
-     * canonical topologies `Line2`, `Triangle3`, `Quad4`, `Tetra4`, `Hex8`,
-     * `Wedge6`, and `Pyramid5`. Serendipity variants are intentionally
-     * excluded.
-     */
     static std::vector<math::Vector<Real, 3>>
     get_lagrange_node_coords(ElementType canonical_type, int order);
 
-    /**
-     * @brief Optional mapping from mesh/reference node order to internal basis order
-     *
-     * Returns an empty span when the public node order is already the basis
-     * table order or no special mapping is registered.
-     */
     static std::span<const std::size_t> mesh_to_basis_ordering(ElementType elem_type);
-
-    /**
-     * @brief Check if element is a simplex (triangle, tetrahedron)
-     */
     static bool is_simplex(ElementType elem_type);
-
-    /**
-     * @brief Check if element uses tensor-product topology
-     */
     static bool is_tensor_product(ElementType elem_type);
 };
 
diff --git a/Code/Source/solver/FE/Basis/PyramidModalBasis.h b/Code/Source/solver/FE/Basis/PyramidModalBasis.h
deleted file mode 100644
index 1ecdae282..000000000
--- a/Code/Source/solver/FE/Basis/PyramidModalBasis.h
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
-#define SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
-
-// Shared rational/modal pyramid helpers for scalar complete-family and spectral
-// pyramid bases. The degenerate z=1 top plane is evaluated by its apex limit;
-// callers that reject non-apex top-plane queries must validate before calling.
-
-#include "BasisFunction.h"
-#include "BasisTolerance.h"
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace pyramid_modal {
-
-struct Term {
-    int px{0};
-    int py{0};
-    int pz{0};
-    int denom_power{0};
-};
-
-struct EvaluationPoint {
-    Real x{Real(0)};
-    Real y{Real(0)};
-    Real z{Real(0)};
-    Real t{Real(1)};
-    bool top_plane{false};
-    std::vector<Real> x_powers;
-    std::vector<Real> y_powers;
-    std::vector<Real> z_powers;
-    std::vector<Real> t_powers;
-};
-
-inline std::vector<Term> build_terms(int order) {
-    std::vector<Term> terms;
-    terms.reserve(static_cast<std::size_t>((order + 1) * (order + 2) *
-                                           (2 * order + 3) / 6));
-    for (int pz = 0; pz <= order; ++pz) {
-        const int n = order - pz;
-        for (int py = 0; py <= n; ++py) {
-            for (int px = 0; px <= n; ++px) {
-                terms.push_back(Term{px, py, pz, std::min(px, py)});
-            }
-        }
-    }
-    return terms;
-}
-
-inline bool on_degenerate_top_plane(const math::Vector<Real, 3>& xi,
-                                    Real tolerance = detail::basis_scaled_tolerance()) {
-    return std::abs(Real(1) - xi[2]) <= tolerance;
-}
-
-inline void fill_powers(Real base, int max_power, std::vector<Real>& powers) {
-    powers.assign(static_cast<std::size_t>(max_power + 1), Real(1));
-    for (int p = 1; p <= max_power; ++p) {
-        powers[static_cast<std::size_t>(p)] =
-            powers[static_cast<std::size_t>(p - 1)] * base;
-    }
-}
-
-inline void prepare_evaluation_point(const math::Vector<Real, 3>& xi,
-                                     int max_px,
-                                     int max_py,
-                                     int max_pz,
-                                     int max_denom_power,
-                                     EvaluationPoint& point) {
-    point.x = xi[0];
-    point.y = xi[1];
-    point.z = xi[2];
-    point.t = Real(1) - point.z;
-    point.top_plane = on_degenerate_top_plane(xi);
-
-    fill_powers(point.x, std::max(max_px, 0), point.x_powers);
-    fill_powers(point.y, std::max(max_py, 0), point.y_powers);
-    fill_powers(point.z, std::max(max_pz, 0), point.z_powers);
-    if (point.top_plane) [[unlikely]] {
-        point.t_powers.assign(1u, Real(1));
-    } else {
-        fill_powers(point.t, std::max(max_denom_power + 2, 0), point.t_powers);
-    }
-}
-
-inline void prepare_evaluation_point(const std::vector<Term>& terms,
-                                     const math::Vector<Real, 3>& xi,
-                                     EvaluationPoint& point) {
-    int max_px = 0;
-    int max_py = 0;
-    int max_pz = 0;
-    int max_denom_power = 0;
-    for (const Term& term : terms) {
-        max_px = std::max(max_px, term.px);
-        max_py = std::max(max_py, term.py);
-        max_pz = std::max(max_pz, term.pz);
-        max_denom_power = std::max(max_denom_power, term.denom_power);
-    }
-    prepare_evaluation_point(xi, max_px, max_py, max_pz, max_denom_power, point);
-}
-
-inline void evaluate_term(const Term& term,
-                          const EvaluationPoint& point,
-                          Real& value,
-                          Gradient* gradient = nullptr,
-                          Hessian* hessian = nullptr) {
-    const auto pow_x = [&](int p) -> Real {
-        return point.x_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_y = [&](int p) -> Real {
-        return point.y_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_z = [&](int p) -> Real {
-        return point.z_powers[static_cast<std::size_t>(p)];
-    };
-    const auto pow_t = [&](int p) -> Real {
-        return point.t_powers[static_cast<std::size_t>(p)];
-    };
-
-    if (point.top_plane) [[unlikely]] {
-        if (term.px == 0 && term.py == 0) {
-            value = pow_z(term.pz);
-        } else {
-            value = Real(0);
-        }
-        if (gradient != nullptr) {
-            *gradient = Gradient{};
-            if (term.px == 0 && term.py == 0 && term.pz > 0) {
-                (*gradient)[2] = static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-            }
-        }
-        if (hessian != nullptr) {
-            *hessian = Hessian{};
-            if (term.px == 0 && term.py == 0 && term.pz > 1) {
-                (*hessian)(2, 2) =
-                    static_cast<Real>(term.pz * (term.pz - 1)) *
-                    pow_z(term.pz - 2);
-            }
-        }
-        return;
-    }
-
-    const Real base = pow_x(term.px) * pow_y(term.py) * pow_z(term.pz);
-    const Real denom = pow_t(term.denom_power);
-    value = base / denom;
-
-    if (gradient != nullptr) {
-        *gradient = Gradient{};
-        if (term.px > 0) {
-            (*gradient)[0] =
-                static_cast<Real>(term.px) * pow_x(term.px - 1) *
-                pow_y(term.py) * pow_z(term.pz) / denom;
-        }
-        if (term.py > 0) {
-            (*gradient)[1] =
-                static_cast<Real>(term.py) * pow_x(term.px) *
-                pow_y(term.py - 1) * pow_z(term.pz) / denom;
-        }
-
-        Real gz = Real(0);
-        if (term.pz > 0) {
-            gz += static_cast<Real>(term.pz) * pow_x(term.px) *
-                  pow_y(term.py) * pow_z(term.pz - 1) / denom;
-        }
-        if (term.denom_power > 0) {
-            gz += static_cast<Real>(term.denom_power) * base / pow_t(term.denom_power + 1);
-        }
-        (*gradient)[2] = gz;
-    }
-
-    if (hessian == nullptr) {
-        return;
-    }
-
-    *hessian = Hessian{};
-    if (term.px > 1) {
-        (*hessian)(0, 0) =
-            static_cast<Real>(term.px * (term.px - 1)) *
-            pow_x(term.px - 2) * pow_y(term.py) * pow_z(term.pz) / denom;
-    }
-    if (term.py > 1) {
-        (*hessian)(1, 1) =
-            static_cast<Real>(term.py * (term.py - 1)) *
-            pow_x(term.px) * pow_y(term.py - 2) * pow_z(term.pz) / denom;
-    }
-    if (term.px > 0 && term.py > 0) {
-        const Real hxy =
-            static_cast<Real>(term.px * term.py) *
-            pow_x(term.px - 1) * pow_y(term.py - 1) * pow_z(term.pz) / denom;
-        (*hessian)(0, 1) = hxy;
-        (*hessian)(1, 0) = hxy;
-    }
-
-    if (term.px > 0) {
-        Real hxz =
-            static_cast<Real>(term.px) * pow_x(term.px - 1) *
-            pow_y(term.py) / denom;
-        if (term.pz > 0) {
-            hxz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-        } else {
-            hxz = Real(0);
-        }
-        if (term.denom_power > 0) {
-            hxz += static_cast<Real>(term.px * term.denom_power) *
-                   pow_x(term.px - 1) * pow_y(term.py) *
-                   pow_z(term.pz) / pow_t(term.denom_power + 1);
-        }
-        (*hessian)(0, 2) = hxz;
-        (*hessian)(2, 0) = hxz;
-    }
-
-    if (term.py > 0) {
-        Real hyz =
-            static_cast<Real>(term.py) * pow_x(term.px) *
-            pow_y(term.py - 1) / denom;
-        if (term.pz > 0) {
-            hyz *= static_cast<Real>(term.pz) * pow_z(term.pz - 1);
-        } else {
-            hyz = Real(0);
-        }
-        if (term.denom_power > 0) {
-            hyz += static_cast<Real>(term.py * term.denom_power) *
-                   pow_x(term.px) * pow_y(term.py - 1) *
-                   pow_z(term.pz) / pow_t(term.denom_power + 1);
-        }
-        (*hessian)(1, 2) = hyz;
-        (*hessian)(2, 1) = hyz;
-    }
-
-    Real hzz = Real(0);
-    if (term.pz > 1) {
-        hzz += static_cast<Real>(term.pz * (term.pz - 1)) *
-               pow_x(term.px) * pow_y(term.py) * pow_z(term.pz - 2) / denom;
-    }
-    if (term.pz > 0 && term.denom_power > 0) {
-        hzz += static_cast<Real>(2 * term.pz * term.denom_power) *
-               pow_x(term.px) * pow_y(term.py) *
-               pow_z(term.pz - 1) / pow_t(term.denom_power + 1);
-    }
-    if (term.denom_power > 0) {
-        hzz += static_cast<Real>(term.denom_power * (term.denom_power + 1)) *
-               base / pow_t(term.denom_power + 2);
-    }
-    (*hessian)(2, 2) = hzz;
-}
-
-inline void evaluate_term(const Term& term,
-                          const math::Vector<Real, 3>& xi,
-                          Real& value,
-                          Gradient* gradient = nullptr,
-                          Hessian* hessian = nullptr) {
-    EvaluationPoint point;
-    prepare_evaluation_point(
-        xi, term.px, term.py, term.pz, term.denom_power, point);
-    evaluate_term(term, point, value, gradient, hessian);
-}
-
-} // namespace pyramid_modal
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_PYRAMIDMODALBASIS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 309fd18be..e6395cee4 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -157,13 +157,6 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
     const std::string label = "Quad order " + std::to_string(order);
     return invert_dense_matrix(std::move(vandermonde), n, label.c_str());
 }
-constexpr std::array<Real, 13> kPyramid13CenterRedistribution = {
-    Real(-0.25), Real(-0.25), Real(-0.25), Real(-0.25),
-    Real(0),
-    Real(0.5), Real(0.5), Real(0.5), Real(0.5),
-    Real(0), Real(0), Real(0), Real(0)
-};
-
 constexpr std::array<std::array<int, 3>, 15> kWedge15MonomialExponents = {{
     {{0, 0, 0}},
     {{0, 0, 1}},
@@ -497,20 +490,8 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
                 "SerendipityBasis supports up to quadratic on wedge15",
                 __FILE__, __LINE__, __func__);
         }
-    } else if (type == ElementType::Pyramid13) {
-        dimension_ = 3;
-        if (order_ < 2) {
-            order_ = 2;
-        }
-        if (order_ == 2) {
-            size_ = 13;
-        } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on pyramid13",
-                __FILE__, __LINE__, __func__);
-        }
     } else {
-        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, Wedge15, and Pyramid13 elements",
+        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements",
                                                  __FILE__, __LINE__, __func__);
     }
 
@@ -522,17 +503,6 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
     }
 }
 
-bool SerendipityBasis::cache_identity_words(std::vector<std::uint64_t>& words) const {
-    words.push_back(0x736572656e646970ULL);
-    words.push_back(static_cast<std::uint64_t>(basis_type()));
-    words.push_back(static_cast<std::uint64_t>(element_type_));
-    words.push_back(static_cast<std::uint64_t>(dimension_));
-    words.push_back(static_cast<std::uint64_t>(order_));
-    words.push_back(static_cast<std::uint64_t>(size_));
-    words.push_back(geometry_mode_ ? 1u : 0u);
-    return true;
-}
-
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
                                        std::vector<Real>& values) const {
     values.assign(size_, Real(0));
@@ -617,15 +587,6 @@ void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14> parent_values{};
-        parent.evaluate_values_to(xi, parent_values.data());
-        for (std::size_t i = 0; i < 13; ++i) {
-            values[i] = parent_values[i] + kPyramid13CenterRedistribution[i] * parent_values[13];
-        }
-        return;
-    }
 }
 
 void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -762,25 +723,6 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14u * 3u> parent_gradients{};
-        // Pyramid13 inherits the complete-family pyramid apex contract from the
-        // parent basis rather than introducing a separate regularized path.
-        parent.evaluate_gradients_to(xi, parent_gradients.data());
-        const auto parent_gradient = [&](std::size_t node, std::size_t component) {
-            return parent_gradients[node * 3u + component];
-        };
-        for (std::size_t i = 0; i < 13; ++i) {
-            for (std::size_t c = 0; c < 3u; ++c) {
-                gradients[i][c] =
-                    parent_gradient(i, c) +
-                    kPyramid13CenterRedistribution[i] * parent_gradient(13u, c);
-            }
-        }
-        return;
-    }
-
     throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
@@ -859,20 +801,6 @@ void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    if (element_type_ == ElementType::Pyramid13) {
-        static const LagrangeBasis parent(ElementType::Pyramid14, 2);
-        std::array<Real, 14u * 9u> parent_hessians{};
-        // Pyramid13 inherits the complete-family pyramid apex contract from the
-        // parent basis rather than introducing a separate regularized path.
-        parent.evaluate_hessians_to(xi, parent_hessians.data());
-        const Hessian center_hessian = load_hessian(parent_hessians.data() + 13u * 9u);
-        for (std::size_t i = 0; i < 13; ++i) {
-            hessians[i] = load_hessian(parent_hessians.data() + i * 9u);
-            add_scaled_hessian(hessians[i], center_hessian, kPyramid13CenterRedistribution[i]);
-        }
-        return;
-    }
-
     throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 98c01415a..10e426164 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -11,17 +11,11 @@
 /**
  * @file SerendipityBasis.h
  * @brief Reduced-degree-of-freedom serendipity bases
- *
- * `Pyramid13` inherits its apex contract from the complete-family rational
- * pyramid basis: values remain exact at the apex, while exact-apex gradient
- * and Hessian queries throw because the inherited nodal derivative limit is
- * not unique.
  */
 
 #include "BasisFunction.h"
 
 #include <array>
-#include <cstdint>
 
 namespace svmp {
 namespace FE {
@@ -37,7 +31,6 @@ class SerendipityBasis : public BasisFunction {
     int order() const noexcept override { return order_; }
     std::size_t size() const noexcept override { return size_; }
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
-    bool cache_identity_words(std::vector<std::uint64_t>& words) const override;
 
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const override;
diff --git a/Code/Source/solver/FE/Basis/VectorBasis.h b/Code/Source/solver/FE/Basis/VectorBasis.h
deleted file mode 100644
index d442c2160..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasis.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASIS_H
-#define SVMP_FE_BASIS_VECTORBASIS_H
-
-/**
- * @file VectorBasis.h
- * @brief Vector-valued bases for H(div) and H(curl) conforming spaces
- */
-
-#include "BasisFunction.h"
-#include "VectorBasisModalPolynomial.h"
-#include <array>
-#include <cstddef>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-/**
- * @brief DOF entity type for vector-valued basis functions
- */
-enum class DofEntity {
-    Vertex,   ///< DOF associated with a vertex
-    Edge,     ///< DOF associated with an edge (tangential moments for H(curl))
-    Face,     ///< DOF associated with a face (normal moments for H(div), tangential for H(curl))
-    Interior  ///< DOF associated with element interior
-};
-
-/**
- * @brief DOF association metadata for a single DOF
- */
-struct DofAssociation {
-    DofEntity entity_type{DofEntity::Interior};
-    int entity_id{-1};      ///< Local index of the entity (edge/face/vertex)
-    int moment_index{0};    ///< Index within the entity's moment space
-};
-
-struct SparseModalCoefficientMatrix {
-    std::size_t rows{0};
-    std::size_t cols{0};
-    std::vector<std::size_t> row_offsets;
-    std::vector<std::size_t> dofs;
-    std::vector<Real> coefficients;
-};
-
-class VectorBasisFunction : public BasisFunction {
-public:
-    bool is_vector_valued() const noexcept override { return true; }
-    bool supports_vector_jacobians() const noexcept override { return true; }
-    void evaluate_values(const math::Vector<Real, 3>&,
-                         std::vector<Real>&) const override {
-        throw BasisEvaluationException("Vector basis uses evaluate_vector_values",
-                                       __FILE__, __LINE__, __func__);
-    }
-
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /**
-     * @brief Get DOF association metadata for all basis functions
-     *
-     * Returns a vector of size(), where each entry describes which
-     * geometric entity (vertex/edge/face/interior) the corresponding
-     * DOF is associated with. This is essential for orientation-aware
-     * assembly of H(div) and H(curl) spaces.
-     */
-    virtual std::vector<DofAssociation> dof_associations() const {
-        // Default: all interior DOFs (subclasses should override)
-        std::vector<DofAssociation> result(size());
-        for (std::size_t i = 0; i < size(); ++i) {
-            result[i].entity_type = DofEntity::Interior;
-            result[i].entity_id = 0;
-            result[i].moment_index = static_cast<int>(i);
-        }
-        return result;
-    }
-};
-
-/**
- * @brief Raviart-Thomas H(div) basis on supported element families
- */
-class RaviartThomasBasis : public VectorBasisFunction {
-public:
-    RaviartThomasBasis(ElementType type, int order = 0);
-
-    BasisType basis_type() const noexcept override { return BasisType::RaviartThomas; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& divergence) const override;
-    bool supports_divergence() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (face/edge DOFs for 2D, face DOFs for 3D H(div))
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-    using SeedJacobianEvaluator = void (*)(
-        const math::Vector<Real, 3>&,
-        std::vector<VectorJacobian>&);
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-
-    bool nodal_generated_{false};
-    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid RT(k=1,2) transformed from direct seed functions
-    std::vector<int> transformed_seed_indices_;
-    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
-    std::vector<ModalPolynomial> monomials_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
-    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
-    // Sparse coefficients for nodal basis in modal monomial basis:
-    //   phi_j = sum_p c(p,j) * modal_p.
-    // Rows index modal functions; entries target nodal DOFs.
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
-};
-
-/**
- * @brief First-kind Nedelec H(curl) basis on supported element families
- */
-class NedelecBasis : public VectorBasisFunction {
-public:
-    NedelecBasis(ElementType type, int order = 0);
-
-    BasisType basis_type() const noexcept override { return BasisType::Nedelec; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_curl(const math::Vector<Real, 3>& xi,
-                       std::vector<math::Vector<Real, 3>>& curl) const override;
-    bool supports_curl() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (edge DOFs for H(curl), face DOFs for 3D interior)
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-    using SeedJacobianEvaluator = void (*)(
-        const math::Vector<Real, 3>&,
-        std::vector<VectorJacobian>&);
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-
-    bool nodal_generated_{false};
-    bool use_transformed_direct_seed_{false};  ///< True for wedge/pyramid ND(k=1,2) transformed from direct seed/candidate functions
-    std::vector<std::array<int, 4>> transformed_monomial_candidates_; ///< {component, px, py, pz}
-    std::vector<ModalPolynomial> monomials_;
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    SparseModalCoefficientMatrix transformed_sparse_coeffs_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-    std::array<int, 3> transformed_power_limits_{{0, 0, 0}};
-    SeedJacobianEvaluator transformed_seed_jacobian_evaluator_{nullptr};
-};
-
-/**
- * @brief Brezzi-Douglas-Marini basis (simple linear variant)
- */
-class BDMBasis : public VectorBasisFunction {
-public:
-    BDMBasis(ElementType type, int order = 1);
-
-    BasisType basis_type() const noexcept override { return BasisType::BDM; }
-    ElementType element_type() const noexcept override { return element_type_; }
-    int dimension() const noexcept override { return dimension_; }
-    int order() const noexcept override { return order_; }
-    std::size_t size() const noexcept override { return size_; }
-    bool cache_identity_is_structural() const noexcept override { return true; }
-
-    void evaluate_vector_values(const math::Vector<Real, 3>& xi,
-                                std::vector<math::Vector<Real, 3>>& values) const override;
-    void evaluate_vector_jacobians(const math::Vector<Real, 3>& xi,
-                                   std::vector<VectorJacobian>& jacobians) const override;
-    void evaluate_divergence(const math::Vector<Real, 3>& xi,
-                             std::vector<Real>& divergence) const override;
-    bool supports_divergence() const noexcept override { return true; }
-    void evaluate_vector_at_quadrature_points_strided(
-        const std::vector<math::Vector<Real, 3>>& points,
-        std::size_t output_stride,
-        Real* SVMP_RESTRICT values_out,
-        Real* SVMP_RESTRICT jacobians_out,
-        Real* SVMP_RESTRICT curls_out,
-        Real* SVMP_RESTRICT divergence_out) const override;
-
-    /// Get DOF associations (face/edge DOFs for H(div))
-    std::vector<DofAssociation> dof_associations() const override;
-
-private:
-    using ModalTerm = VectorBasisModalTerm;
-    using ModalPolynomial = VectorBasisModalPolynomial;
-
-    ElementType element_type_;
-    int dimension_;
-    int order_;
-    std::size_t size_{0};
-    bool nodal_generated_{false};
-    std::vector<ModalPolynomial> monomials_;
-    SparseModalCoefficientMatrix modal_sparse_coeffs_;
-    std::array<int, 3> modal_power_limits_{{0, 0, 0}};
-};
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASIS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
deleted file mode 100644
index 7ec848633..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#include "VectorBasisEvaluationHelpers.h"
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace vector_common {
-
-VectorBasisScratch& vector_basis_scratch() {
-    // Scratch is intentionally thread-local: production assembly uses a
-    // persistent worker-thread team, so buffers stay warm on each worker.
-    static thread_local VectorBasisScratch scratch;
-    return scratch;
-}
-
-void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts) {
-    vector_basis_scratch().prewarm(max_size, max_qpts);
-}
-
-void fill_powers(Real x, int max_p, std::vector<Real>& out) {
-    BASIS_CHECK_CONSTRUCTION(max_p >= 0, "powers: negative max_p");
-    out.assign(static_cast<std::size_t>(max_p + 1), Real(1));
-    for (int i = 1; i <= max_p; ++i) {
-        out[static_cast<std::size_t>(i)] =
-            out[static_cast<std::size_t>(i - 1)] * x;
-    }
-}
-
-void fill_power_tables(const Vec3& xi,
-                       const std::array<int, 3>& limits,
-                       VectorBasisScratch& scratch) {
-    fill_powers(xi[0], limits[0], scratch.px);
-    fill_powers(xi[1], limits[1], scratch.py);
-    fill_powers(xi[2], limits[2], scratch.pz);
-}
-
-namespace {
-
-constexpr Real kSparseCoefficientRelativeTolerance =
-    Real(256) * std::numeric_limits<Real>::epsilon();
-
-void fill_batched_axis_powers(const std::vector<Vec3>& points,
-                              std::size_t axis,
-                              int max_power,
-                              std::vector<Real>& out) {
-    BASIS_CHECK_CONSTRUCTION(max_power >= 0, "batched powers: negative max_p");
-    const std::size_t num_qpts = points.size();
-    out.assign(static_cast<std::size_t>(max_power + 1) * num_qpts, Real(1));
-    if (num_qpts == 0 || max_power == 0) {
-        return;
-    }
-
-    Real* first_power = out.data() + num_qpts;
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        first_power[q] = points[q][axis];
-    }
-    for (int power = 2; power <= max_power; ++power) {
-        const Real* previous =
-            out.data() + static_cast<std::size_t>(power - 1) * num_qpts;
-        Real* current = out.data() + static_cast<std::size_t>(power) * num_qpts;
-        for (std::size_t q = 0; q < num_qpts; ++q) {
-            current[q] = previous[q] * points[q][axis];
-        }
-    }
-}
-
-} // namespace
-
-void fill_batched_power_tables(const std::vector<Vec3>& points,
-                               const std::array<int, 3>& limits,
-                               VectorBasisScratch& scratch) {
-    fill_batched_axis_powers(points, 0u, limits[0], scratch.batched_px);
-    fill_batched_axis_powers(points, 1u, limits[1], scratch.batched_py);
-    fill_batched_axis_powers(points, 2u, limits[2], scratch.batched_pz);
-}
-
-void validate_vector_strided_outputs(std::size_t num_qpts,
-                                     std::size_t output_stride,
-                                     const char* family_name) {
-    if (output_stride < num_qpts) {
-        throw BasisConfigurationException(
-            std::string(family_name) +
-                " strided vector evaluation requires output_stride >= points.size()",
-            __FILE__, __LINE__, __func__);
-    }
-}
-
-void zero_active_strided_rows(Real* output,
-                              std::size_t rows,
-                              std::size_t output_stride,
-                              std::size_t num_qpts) {
-    for (std::size_t row = 0; row < rows; ++row) {
-        std::fill_n(output + row * output_stride, num_qpts, Real(0));
-    }
-}
-
-SparseModalCoefficientMatrix build_sparse_modal_coefficients(
-    const std::vector<Real>& dense_coefficients,
-    std::size_t rows,
-    std::size_t cols) {
-    BASIS_CHECK_CONSTRUCTION(dense_coefficients.size() == rows * cols,
-                 "build_sparse_modal_coefficients: dense coefficient size mismatch");
-
-    SparseModalCoefficientMatrix sparse;
-    sparse.rows = rows;
-    sparse.cols = cols;
-    sparse.row_offsets.reserve(rows + 1u);
-    sparse.row_offsets.push_back(0u);
-
-    Real max_abs = Real(0);
-    for (const Real coefficient : dense_coefficients) {
-        max_abs = std::max(max_abs, std::abs(coefficient));
-    }
-    const Real prune_threshold = kSparseCoefficientRelativeTolerance * max_abs;
-
-    for (std::size_t row = 0; row < rows; ++row) {
-        const Real* dense_row = dense_coefficients.data() + row * cols;
-        for (std::size_t col = 0; col < cols; ++col) {
-            const Real coefficient = dense_row[col];
-            if (std::abs(coefficient) > prune_threshold) {
-                sparse.dofs.push_back(col);
-                sparse.coefficients.push_back(coefficient);
-            }
-        }
-        sparse.row_offsets.push_back(sparse.dofs.size());
-    }
-
-    return sparse;
-}
-
-Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept {
-    return Vec3{J(2u, 1u) - J(1u, 2u),
-                J(0u, 2u) - J(2u, 0u),
-                J(1u, 0u) - J(0u, 1u)};
-}
-
-Real divergence_from_jacobian(const VectorJacobian& J) noexcept {
-    return J(0u, 0u) + J(1u, 1u) + J(2u, 2u);
-}
-
-void write_vector_values_strided(const std::vector<Vec3>& values,
-                                 std::size_t num_dofs,
-                                 std::size_t output_stride,
-                                 std::size_t q,
-                                 Real* SVMP_RESTRICT values_out) {
-    if (values_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(values.size() == num_dofs,
-                 "vector value evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            values_out[(dof * 3u + component) * output_stride + q] =
-                values[dof][component];
-        }
-    }
-}
-
-void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
-                                    std::size_t num_dofs,
-                                    std::size_t output_stride,
-                                    std::size_t q,
-                                    Real* SVMP_RESTRICT jacobians_out) {
-    if (jacobians_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
-                 "vector Jacobian evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        const auto& J = jacobians[dof];
-        for (std::size_t component = 0; component < 3u; ++component) {
-            for (std::size_t derivative = 0; derivative < 3u; ++derivative) {
-                jacobians_out[(dof * 9u + component * 3u + derivative) *
-                                  output_stride + q] = J(component, derivative);
-            }
-        }
-    }
-}
-
-void write_vector_curl_strided(const std::vector<Vec3>& curl,
-                               std::size_t num_dofs,
-                               std::size_t output_stride,
-                               std::size_t q,
-                               Real* SVMP_RESTRICT curls_out) {
-    if (curls_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(curl.size() == num_dofs,
-                 "vector curl evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            curls_out[(dof * 3u + component) * output_stride + q] =
-                curl[dof][component];
-        }
-    }
-}
-
-void write_vector_divergence_strided(const std::vector<Real>& divergence,
-                                     std::size_t num_dofs,
-                                     std::size_t output_stride,
-                                     std::size_t q,
-                                     Real* SVMP_RESTRICT divergence_out) {
-    if (divergence_out == nullptr) {
-        return;
-    }
-    BASIS_CHECK_CONSTRUCTION(divergence.size() == num_dofs,
-                 "vector divergence evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        divergence_out[dof * output_stride + q] = divergence[dof];
-    }
-}
-
-void write_curl_and_divergence_from_jacobians_strided(
-    const std::vector<VectorJacobian>& jacobians,
-    std::size_t num_dofs,
-    std::size_t output_stride,
-    std::size_t q,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out) {
-    BASIS_CHECK_CONSTRUCTION(jacobians.size() == num_dofs,
-                 "vector Jacobian evaluation returned the wrong number of DOFs");
-    for (std::size_t dof = 0; dof < num_dofs; ++dof) {
-        const auto& J = jacobians[dof];
-        if (curls_out != nullptr) {
-            const Vec3 curl = curl_from_jacobian(J);
-            for (std::size_t component = 0; component < 3u; ++component) {
-                curls_out[(dof * 3u + component) * output_stride + q] =
-                    curl[component];
-            }
-        }
-        if (divergence_out != nullptr) {
-            divergence_out[dof * output_stride + q] = divergence_from_jacobian(J);
-        }
-    }
-}
-
-Vec3 lerp(const Vec3& a, const Vec3& b, Real s) {
-    const Real t = (s + Real(1)) * Real(0.5);
-    return a * (Real(1) - t) + b * t;
-}
-
-Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w) {
-    const Real N0 = Real(0.25) * (Real(1) - u) * (Real(1) - w);
-    const Real N1 = Real(0.25) * (Real(1) + u) * (Real(1) - w);
-    const Real N2 = Real(0.25) * (Real(1) + u) * (Real(1) + w);
-    const Real N3 = Real(0.25) * (Real(1) - u) * (Real(1) + w);
-    return v[0] * N0 + v[1] * N1 + v[2] * N2 + v[3] * N3;
-}
-
-Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w) {
-    (void)u;
-    const Real dN0 = -Real(0.25) * (Real(1) - w);
-    const Real dN1 =  Real(0.25) * (Real(1) - w);
-    const Real dN2 =  Real(0.25) * (Real(1) + w);
-    const Real dN3 = -Real(0.25) * (Real(1) + w);
-    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
-}
-
-Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w) {
-    (void)w;
-    const Real dN0 = -Real(0.25) * (Real(1) - u);
-    const Real dN1 = -Real(0.25) * (Real(1) + u);
-    const Real dN2 =  Real(0.25) * (Real(1) + u);
-    const Real dN3 =  Real(0.25) * (Real(1) - u);
-    return v[0] * dN0 + v[1] * dN1 + v[2] * dN2 + v[3] * dN3;
-}
-
-Vec3 cross3(const Vec3& a, const Vec3& b) {
-    return Vec3{a[1] * b[2] - a[2] * b[1],
-                a[2] * b[0] - a[0] * b[2],
-                a[0] * b[1] - a[1] * b[0]};
-}
-
-Vec3 normalize3(const Vec3& v) {
-    const Real n = v.norm();
-    BASIS_CHECK_CONSTRUCTION(n > std::numeric_limits<Real>::epsilon(),
-                 "normalize3: zero-length vector");
-    return v / n;
-}
-
-std::array<int, 3> component_monomial_power_limits(
-    const std::vector<std::array<int, 4>>& candidates) {
-    std::array<int, 3> limits{{0, 0, 0}};
-    for (const auto& mono : candidates) {
-        limits[0] = std::max(limits[0], mono[1]);
-        limits[1] = std::max(limits[1], mono[2]);
-        limits[2] = std::max(limits[2], mono[3]);
-    }
-    return limits;
-}
-
-std::size_t triangle_poly_dim(std::size_t k) {
-    return (k + 1u) * (k + 2u) / 2u;
-}
-
-std::size_t tetra_poly_dim(std::size_t k) {
-    return (k + 1u) * (k + 2u) * (k + 3u) / 6u;
-}
-
-std::size_t rt_wedge_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t face_dofs =
-        2u * triangle_poly_dim(k) + 3u * (k + 1u) * (k + 1u);
-    const std::size_t interior_dofs =
-        (k >= 1u) ? (3u * k * (k + 1u) * (k + 1u) / 2u) : 0u;
-    return face_dofs + interior_dofs;
-}
-
-std::size_t rt_pyramid_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t face_dofs = (k + 1u) * (k + 1u) + 4u * triangle_poly_dim(k);
-    const std::size_t interior_dofs = (k >= 1u) ? (3u * k * k * k) : 0u;
-    return face_dofs + interior_dofs;
-}
-
-std::size_t nd_wedge_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t edge_dofs = 9u * (k + 1u);
-    const std::size_t face_dofs = (k >= 1u) ? (8u * k * (k + 1u)) : 0u;
-    const std::size_t interior_dofs =
-        (k >= 2u) ? (3u * k * (k - 1u) * (k + 1u) / 2u) : 0u;
-    return edge_dofs + face_dofs + interior_dofs;
-}
-
-std::size_t nd_pyramid_size(int order) {
-    const std::size_t k = static_cast<std::size_t>(order);
-    const std::size_t edge_dofs = 8u * (k + 1u);
-    const std::size_t face_dofs = (k >= 1u) ? (6u * k * (k + 1u)) : 0u;
-    const std::size_t interior_dofs =
-        (k >= 2u) ? (k * (k - 1u) * (k + 1u) / 2u) : 0u;
-    return edge_dofs + face_dofs + interior_dofs;
-}
-
-void ensure_supported_hybrid_vector_order(ElementType type,
-                                          int order,
-                                          const char* family_name) {
-    (void)type;
-    (void)order;
-    (void)family_name;
-}
-
-std::vector<std::array<int, 4>> make_component_monomial_candidates(
-    int max_total_degree) {
-    BASIS_CHECK_CONSTRUCTION(max_total_degree >= 0,
-                 "make_component_monomial_candidates: negative total degree");
-
-    std::vector<std::array<int, 4>> candidates;
-    for (int component = 0; component < 3; ++component) {
-        for (int total = 0; total <= max_total_degree; ++total) {
-            for (int pz = 0; pz <= total; ++pz) {
-                for (int py = 0; py <= total - pz; ++py) {
-                    const int px = total - py - pz;
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
-                                                                  int order) {
-    if (order >= 3) {
-        return make_component_monomial_candidates(3 * order);
-    }
-
-    std::vector<std::array<int, 4>> candidates;
-    if (!is_pyramid(type) || order != 2) {
-        return candidates;
-    }
-
-    for (int component = 0; component < 3; ++component) {
-        for (int pz = 0; pz <= 2; ++pz) {
-            for (int py = 0; py <= 2 - pz; ++py) {
-                for (int px = 0; px <= 2 - py - pz; ++px) {
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz) {
-    return px[static_cast<std::size_t>(mono[1])] *
-           py[static_cast<std::size_t>(mono[2])] *
-           pz[static_cast<std::size_t>(mono[3])];
-}
-
-Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
-                                             const std::vector<Real>& px,
-                                             const std::vector<Real>& py,
-                                             const std::vector<Real>& pz) {
-    const int component = mono[0];
-    const int px_pow = mono[1];
-    const int py_pow = mono[2];
-    const int pz_pow = mono[3];
-
-    if (component == 0) {
-        if (px_pow == 0) {
-            return Real(0);
-        }
-        return Real(px_pow) *
-               px[static_cast<std::size_t>(px_pow - 1)] *
-               py[static_cast<std::size_t>(py_pow)] *
-               pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (component == 1) {
-        if (py_pow == 0) {
-            return Real(0);
-        }
-        return Real(py_pow) *
-               px[static_cast<std::size_t>(px_pow)] *
-               py[static_cast<std::size_t>(py_pow - 1)] *
-               pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (pz_pow == 0) {
-        return Real(0);
-    }
-    return Real(pz_pow) *
-           px[static_cast<std::size_t>(px_pow)] *
-           py[static_cast<std::size_t>(py_pow)] *
-           pz[static_cast<std::size_t>(pz_pow - 1)];
-}
-
-void add_component_monomial_jacobian(VectorJacobian& J,
-                                     int component,
-                                     int px_pow,
-                                     int py_pow,
-                                     int pz_pow,
-                                     Real coefficient,
-                                     const std::vector<Real>& px,
-                                     const std::vector<Real>& py,
-                                     const std::vector<Real>& pz) {
-    const auto comp = static_cast<std::size_t>(component);
-    if (px_pow > 0) {
-        J(comp, 0) += coefficient * Real(px_pow) *
-                      px[static_cast<std::size_t>(px_pow - 1)] *
-                      py[static_cast<std::size_t>(py_pow)] *
-                      pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (py_pow > 0) {
-        J(comp, 1) += coefficient * Real(py_pow) *
-                      px[static_cast<std::size_t>(px_pow)] *
-                      py[static_cast<std::size_t>(py_pow - 1)] *
-                      pz[static_cast<std::size_t>(pz_pow)];
-    }
-    if (pz_pow > 0) {
-        J(comp, 2) += coefficient * Real(pz_pow) *
-                      px[static_cast<std::size_t>(px_pow)] *
-                      py[static_cast<std::size_t>(py_pow)] *
-                      pz[static_cast<std::size_t>(pz_pow - 1)];
-    }
-}
-
-VectorJacobian eval_transformed_component_monomial_jacobian(
-    const std::array<int, 4>& mono,
-    const std::vector<Real>& px,
-    const std::vector<Real>& py,
-    const std::vector<Real>& pz) {
-    VectorJacobian J{};
-    add_component_monomial_jacobian(
-        J, mono[0], mono[1], mono[2], mono[3], Real(1), px, py, pz);
-    return J;
-}
-
-void add_component_monomial_curl(Vec3& curl,
-                                 int component,
-                                 int px_pow,
-                                 int py_pow,
-                                 int pz_pow,
-                                 Real coefficient,
-                                 const std::vector<Real>& px,
-                                 const std::vector<Real>& py,
-                                 const std::vector<Real>& pz) {
-    const Real dphidx = (px_pow == 0)
-        ? Real(0)
-        : coefficient * Real(px_pow) *
-              px[static_cast<std::size_t>(px_pow - 1)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidy = (py_pow == 0)
-        ? Real(0)
-        : coefficient * Real(py_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow - 1)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidz = (pz_pow == 0)
-        ? Real(0)
-        : coefficient * Real(pz_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow - 1)];
-
-    if (component == 0) {
-        curl[1] += dphidz;
-        curl[2] -= dphidy;
-    } else if (component == 1) {
-        curl[0] -= dphidz;
-        curl[2] += dphidx;
-    } else {
-        curl[0] += dphidy;
-        curl[1] -= dphidx;
-    }
-}
-
-std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType,
-                                                                  int order) {
-    if (order >= 3) {
-        return make_component_monomial_candidates(3 * order);
-    }
-
-    std::vector<std::array<int, 4>> candidates;
-    const int max_total_degree = (order == 1) ? 4 : 5;
-    for (int component = 0; component < 3; ++component) {
-        for (int total = 0; total <= max_total_degree; ++total) {
-            for (int pz = 0; pz <= total; ++pz) {
-                for (int py = 0; py <= total - pz; ++py) {
-                    const int px = total - py - pz;
-                    candidates.push_back({component, px, py, pz});
-                }
-            }
-        }
-    }
-    return candidates;
-}
-
-Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz) {
-    return px[static_cast<std::size_t>(mono[1])] *
-           py[static_cast<std::size_t>(mono[2])] *
-           pz[static_cast<std::size_t>(mono[3])];
-}
-
-Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
-                                       const std::vector<Real>& px,
-                                       const std::vector<Real>& py,
-                                       const std::vector<Real>& pz) {
-    const int component = mono[0];
-    const int px_pow = mono[1];
-    const int py_pow = mono[2];
-    const int pz_pow = mono[3];
-
-    const Real dphidx = (px_pow == 0)
-        ? Real(0)
-        : Real(px_pow) *
-              px[static_cast<std::size_t>(px_pow - 1)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidy = (py_pow == 0)
-        ? Real(0)
-        : Real(py_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow - 1)] *
-              pz[static_cast<std::size_t>(pz_pow)];
-    const Real dphidz = (pz_pow == 0)
-        ? Real(0)
-        : Real(pz_pow) *
-              px[static_cast<std::size_t>(px_pow)] *
-              py[static_cast<std::size_t>(py_pow)] *
-              pz[static_cast<std::size_t>(pz_pow - 1)];
-
-    if (component == 0) {
-        return Vec3{Real(0), dphidz, -dphidy};
-    }
-    if (component == 1) {
-        return Vec3{-dphidz, Real(0), dphidx};
-    }
-    return Vec3{dphidy, -dphidx, Real(0)};
-}
-
-} // namespace vector_common
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h b/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
deleted file mode 100644
index e0e6daa10..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisEvaluationHelpers.h
+++ /dev/null
@@ -1,751 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
-#define SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
-
-#include "VectorBasis.h"
-#include "Basis/BasisTraits.h"
-
-#include <algorithm>
-#include <array>
-#include <limits>
-#include <string>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-namespace detail {
-namespace vector_common {
-
-using Vec3 = math::Vector<Real, 3>;
-
-struct VectorBasisScratch {
-    std::vector<Real> px;
-    std::vector<Real> py;
-    std::vector<Real> pz;
-    std::vector<Real> batched_px;
-    std::vector<Real> batched_py;
-    std::vector<Real> batched_pz;
-    std::vector<Real> candidate_values;
-    std::vector<Real> candidate_dx;
-    std::vector<Real> candidate_dy;
-    std::vector<Real> candidate_dz;
-    std::vector<Real> modal_values_batched;
-    std::vector<Real> modal_jacobians_batched;
-    std::vector<Real> modal_curls_batched;
-    std::vector<Real> modal_divergence_batched;
-    std::vector<Vec3> vector_values;
-    std::vector<VectorJacobian> vector_jacobians;
-    std::vector<Real> scalars;
-    std::vector<Vec3> api_values;
-    std::vector<VectorJacobian> api_jacobians;
-    std::vector<Vec3> api_curl;
-    std::vector<Real> api_divergence;
-
-    void prewarm(std::size_t max_size, std::size_t max_qpts) {
-        const std::size_t batched_size = max_size * std::max<std::size_t>(max_qpts, 1u);
-        px.reserve(max_size);
-        py.reserve(max_size);
-        pz.reserve(max_size);
-        batched_px.reserve(batched_size);
-        batched_py.reserve(batched_size);
-        batched_pz.reserve(batched_size);
-        candidate_values.reserve(max_size);
-        candidate_dx.reserve(max_size);
-        candidate_dy.reserve(max_size);
-        candidate_dz.reserve(max_size);
-        modal_values_batched.reserve(batched_size * 3u);
-        modal_jacobians_batched.reserve(batched_size * 9u);
-        modal_curls_batched.reserve(batched_size * 3u);
-        modal_divergence_batched.reserve(batched_size);
-        vector_values.reserve(max_size);
-        vector_jacobians.reserve(max_size);
-        scalars.reserve(max_size);
-        api_values.reserve(max_size);
-        api_jacobians.reserve(max_size);
-        api_curl.reserve(max_size);
-        api_divergence.reserve(max_size);
-    }
-};
-
-VectorBasisScratch& vector_basis_scratch();
-void prewarm_vector_basis_scratch(std::size_t max_size, std::size_t max_qpts = 0);
-
-void fill_powers(Real x, int max_p, std::vector<Real>& out);
-void fill_power_tables(const Vec3& xi,
-                       const std::array<int, 3>& limits,
-                       VectorBasisScratch& scratch);
-void fill_batched_power_tables(const std::vector<Vec3>& points,
-                               const std::array<int, 3>& limits,
-                               VectorBasisScratch& scratch);
-void validate_vector_strided_outputs(std::size_t num_qpts,
-                                     std::size_t output_stride,
-                                     const char* family_name);
-void zero_active_strided_rows(Real* output,
-                              std::size_t rows,
-                              std::size_t output_stride,
-                              std::size_t num_qpts);
-SparseModalCoefficientMatrix build_sparse_modal_coefficients(
-    const std::vector<Real>& dense_coefficients,
-    std::size_t rows,
-    std::size_t cols);
-Vec3 curl_from_jacobian(const VectorJacobian& J) noexcept;
-Real divergence_from_jacobian(const VectorJacobian& J) noexcept;
-
-inline Real batched_power_product(const std::vector<Real>& px,
-                                  const std::vector<Real>& py,
-                                  const std::vector<Real>& pz,
-                                  std::size_t stride,
-                                  int px_pow,
-                                  int py_pow,
-                                  int pz_pow,
-                                  std::size_t q) noexcept {
-    return px[static_cast<std::size_t>(px_pow) * stride + q] *
-           py[static_cast<std::size_t>(py_pow) * stride + q] *
-           pz[static_cast<std::size_t>(pz_pow) * stride + q];
-}
-
-inline Real batched_component_partial(const std::vector<Real>& px,
-                                      const std::vector<Real>& py,
-                                      const std::vector<Real>& pz,
-                                      std::size_t stride,
-                                      int px_pow,
-                                      int py_pow,
-                                      int pz_pow,
-                                      int derivative_axis,
-                                      std::size_t q) noexcept {
-    if (derivative_axis == 0) {
-        if (px_pow == 0) {
-            return Real(0);
-        }
-        return Real(px_pow) *
-               px[static_cast<std::size_t>(px_pow - 1) * stride + q] *
-               py[static_cast<std::size_t>(py_pow) * stride + q] *
-               pz[static_cast<std::size_t>(pz_pow) * stride + q];
-    }
-    if (derivative_axis == 1) {
-        if (py_pow == 0) {
-            return Real(0);
-        }
-        return Real(py_pow) *
-               px[static_cast<std::size_t>(px_pow) * stride + q] *
-               py[static_cast<std::size_t>(py_pow - 1) * stride + q] *
-               pz[static_cast<std::size_t>(pz_pow) * stride + q];
-    }
-    if (pz_pow == 0) {
-        return Real(0);
-    }
-    return Real(pz_pow) *
-           px[static_cast<std::size_t>(px_pow) * stride + q] *
-           py[static_cast<std::size_t>(py_pow) * stride + q] *
-           pz[static_cast<std::size_t>(pz_pow - 1) * stride + q];
-}
-
-inline Vec3 curl_from_component_gradient(int component,
-                                         Real dphidx,
-                                         Real dphidy,
-                                         Real dphidz) noexcept {
-    if (component == 0) {
-        return Vec3{Real(0), dphidz, -dphidy};
-    }
-    if (component == 1) {
-        return Vec3{-dphidz, Real(0), dphidx};
-    }
-    return Vec3{dphidy, -dphidx, Real(0)};
-}
-
-inline void axpy_qpoints(Real* target,
-                         const Real* source,
-                         Real coefficient,
-                         std::size_t num_qpts) noexcept {
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        target[q] += coefficient * source[q];
-    }
-}
-
-void write_vector_values_strided(const std::vector<Vec3>& values,
-                                 std::size_t num_dofs,
-                                 std::size_t output_stride,
-                                 std::size_t q,
-                                 Real* SVMP_RESTRICT values_out);
-void write_vector_jacobians_strided(const std::vector<VectorJacobian>& jacobians,
-                                    std::size_t num_dofs,
-                                    std::size_t output_stride,
-                                    std::size_t q,
-                                    Real* SVMP_RESTRICT jacobians_out);
-void write_vector_curl_strided(const std::vector<Vec3>& curl,
-                               std::size_t num_dofs,
-                               std::size_t output_stride,
-                               std::size_t q,
-                               Real* SVMP_RESTRICT curls_out);
-void write_vector_divergence_strided(const std::vector<Real>& divergence,
-                                     std::size_t num_dofs,
-                                     std::size_t output_stride,
-                                     std::size_t q,
-                                     Real* SVMP_RESTRICT divergence_out);
-void write_curl_and_divergence_from_jacobians_strided(
-    const std::vector<VectorJacobian>& jacobians,
-    std::size_t num_dofs,
-    std::size_t output_stride,
-    std::size_t q,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out);
-
-template <typename BasisLike>
-void evaluate_vector_public_api_strided(
-    const BasisLike& basis,
-    const std::vector<Vec3>& points,
-    std::size_t output_stride,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out,
-    bool use_direct_curl,
-    bool use_direct_divergence,
-    const char* family_name) {
-    const std::size_t num_qpts = points.size();
-    const std::size_t num_dofs = basis.size();
-    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
-
-    auto& scratch = vector_basis_scratch();
-    for (std::size_t q = 0; q < num_qpts; ++q) {
-        if (values_out != nullptr) {
-            basis.evaluate_vector_values(points[q], scratch.api_values);
-            write_vector_values_strided(
-                scratch.api_values, num_dofs, output_stride, q, values_out);
-        }
-
-        const bool needs_jacobians =
-            jacobians_out != nullptr ||
-            (curls_out != nullptr && !use_direct_curl) ||
-            (divergence_out != nullptr && !use_direct_divergence);
-
-        if (needs_jacobians) {
-            basis.evaluate_vector_jacobians(points[q], scratch.api_jacobians);
-            write_vector_jacobians_strided(
-                scratch.api_jacobians, num_dofs, output_stride, q, jacobians_out);
-            write_curl_and_divergence_from_jacobians_strided(
-                scratch.api_jacobians,
-                num_dofs,
-                output_stride,
-                q,
-                curls_out,
-                divergence_out);
-            continue;
-        }
-
-        if (curls_out != nullptr) {
-            basis.evaluate_curl(points[q], scratch.api_curl);
-            write_vector_curl_strided(
-                scratch.api_curl, num_dofs, output_stride, q, curls_out);
-        }
-        if (divergence_out != nullptr) {
-            basis.evaluate_divergence(points[q], scratch.api_divergence);
-            write_vector_divergence_strided(
-                scratch.api_divergence, num_dofs, output_stride, q, divergence_out);
-        }
-    }
-}
-
-Vec3 lerp(const Vec3& a, const Vec3& b, Real s);
-Vec3 bilinear(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 bilinear_du(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 bilinear_dw(const std::array<Vec3, 4>& v, Real u, Real w);
-Vec3 cross3(const Vec3& a, const Vec3& b);
-Vec3 normalize3(const Vec3& v);
-
-template <typename ModalPolynomials>
-std::array<int, 3> modal_power_limits(const ModalPolynomials& monomials) {
-    std::array<int, 3> limits{{0, 0, 0}};
-    for (const auto& poly : monomials) {
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            limits[0] = std::max(limits[0], m.px);
-            limits[1] = std::max(limits[1], m.py);
-            limits[2] = std::max(limits[2], m.pz);
-        }
-    }
-    return limits;
-}
-
-std::array<int, 3> component_monomial_power_limits(
-    const std::vector<std::array<int, 4>>& candidates);
-std::size_t triangle_poly_dim(std::size_t k);
-std::size_t tetra_poly_dim(std::size_t k);
-std::size_t rt_wedge_size(int order);
-std::size_t rt_pyramid_size(int order);
-std::size_t nd_wedge_size(int order);
-std::size_t nd_pyramid_size(int order);
-void ensure_supported_hybrid_vector_order(ElementType type,
-                                          int order,
-                                          const char* family_name);
-std::vector<std::array<int, 4>> make_component_monomial_candidates(int max_total_degree);
-std::vector<std::array<int, 4>> make_rt_extra_monomial_candidates(ElementType type,
-                                                                  int order);
-Real eval_transformed_rt_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz);
-Real eval_transformed_rt_monomial_divergence(const std::array<int, 4>& mono,
-                                             const std::vector<Real>& px,
-                                             const std::vector<Real>& py,
-                                             const std::vector<Real>& pz);
-
-void add_component_monomial_jacobian(VectorJacobian& J,
-                                     int component,
-                                     int px_pow,
-                                     int py_pow,
-                                     int pz_pow,
-                                     Real coefficient,
-                                     const std::vector<Real>& px,
-                                     const std::vector<Real>& py,
-                                     const std::vector<Real>& pz);
-VectorJacobian eval_transformed_component_monomial_jacobian(
-    const std::array<int, 4>& mono,
-    const std::vector<Real>& px,
-    const std::vector<Real>& py,
-    const std::vector<Real>& pz);
-void add_component_monomial_curl(Vec3& curl,
-                                 int component,
-                                 int px_pow,
-                                 int py_pow,
-                                 int pz_pow,
-                                 Real coefficient,
-                                 const std::vector<Real>& px,
-                                 const std::vector<Real>& py,
-                                 const std::vector<Real>& pz);
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_values_with_limits(const ModalPolynomials& monomials,
-                                                    const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                    std::size_t n,
-                                                    const Vec3& xi,
-                                                    const std::array<int, 3>& power_limits,
-                                                    std::vector<Vec3>& values) {
-    values.assign(n, Vec3{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_vals = scratch.vector_values;
-    modal_vals.assign(n, Vec3{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& v = modal_vals[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            const Real mv =
-                px[static_cast<std::size_t>(m.px)] *
-                py[static_cast<std::size_t>(m.py)] *
-                pz[static_cast<std::size_t>(m.pz)];
-            v[static_cast<std::size_t>(m.component)] += m.coefficient * mv;
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_values: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_values: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Vec3& mv = modal_vals[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            values[dof][0] += c * mv[0];
-            values[dof][1] += c * mv[1];
-            values[dof][2] += c * mv[2];
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_jacobians_with_limits(const ModalPolynomials& monomials,
-                                                       const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                       std::size_t n,
-                                                       const Vec3& xi,
-                                                       const std::array<int, 3>& power_limits,
-                                                       std::vector<VectorJacobian>& jacobians) {
-    jacobians.assign(n, VectorJacobian{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_jacs = scratch.vector_jacobians;
-    modal_jacs.assign(n, VectorJacobian{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& J = modal_jacs[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            add_component_monomial_jacobian(J, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_jacobians: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& Jp = modal_jacs[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            for (std::size_t r = 0; r < 3; ++r) {
-                for (std::size_t col = 0; col < 3; ++col) {
-                    jacobians[dof](r, col) += c * Jp(r, col);
-                }
-            }
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_curl_with_limits(const ModalPolynomials& monomials,
-                                                  const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                  std::size_t n,
-                                                  const Vec3& xi,
-                                                  const std::array<int, 3>& power_limits,
-                                                  std::vector<Vec3>& curl) {
-    curl.assign(n, Vec3{});
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_curl = scratch.vector_values;
-    modal_curl.assign(n, Vec3{});
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        auto& c = modal_curl[p];
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            add_component_monomial_curl(c, m.component, m.px, m.py, m.pz, m.coefficient, px, py, pz);
-        }
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_curl: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_curl: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Vec3& cm = modal_curl[p];
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            curl[dof][0] += c * cm[0];
-            curl[dof][1] += c * cm[1];
-            curl[dof][2] += c * cm[2];
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_divergence_with_limits(const ModalPolynomials& monomials,
-                                                 const SparseModalCoefficientMatrix& sparse_coeffs,
-                                                 std::size_t n,
-                                                 const Vec3& xi,
-                                                 const std::array<int, 3>& power_limits,
-                                                 std::vector<Real>& divergence) {
-    divergence.assign(n, Real(0));
-
-    auto& scratch = vector_basis_scratch();
-    fill_power_tables(xi, power_limits, scratch);
-    const auto& px = scratch.px;
-    const auto& py = scratch.py;
-    const auto& pz = scratch.pz;
-
-    auto& modal_divergence = scratch.scalars;
-    modal_divergence.assign(n, Real(0));
-    for (std::size_t p = 0; p < n; ++p) {
-        const auto& poly = monomials[p];
-        Real div = Real(0);
-        for (int t = 0; t < poly.num_terms; ++t) {
-            const auto& m = poly.terms[static_cast<std::size_t>(t)];
-            if (m.component == 0 && m.px > 0) {
-                div += m.coefficient * Real(m.px) *
-                       px[static_cast<std::size_t>(m.px - 1)] *
-                       py[static_cast<std::size_t>(m.py)] *
-                       pz[static_cast<std::size_t>(m.pz)];
-            } else if (m.component == 1 && m.py > 0) {
-                div += m.coefficient * Real(m.py) *
-                       px[static_cast<std::size_t>(m.px)] *
-                       py[static_cast<std::size_t>(m.py - 1)] *
-                       pz[static_cast<std::size_t>(m.pz)];
-            } else if (m.component == 2 && m.pz > 0) {
-                div += m.coefficient * Real(m.pz) *
-                       px[static_cast<std::size_t>(m.px)] *
-                       py[static_cast<std::size_t>(m.py)] *
-                       pz[static_cast<std::size_t>(m.pz - 1)];
-            }
-        }
-        modal_divergence[p] = div;
-    }
-
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_divergence: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_divergence: sparse coefficient entry mismatch");
-    for (std::size_t p = 0; p < n; ++p) {
-        const Real div = modal_divergence[p];
-        if (div == Real(0)) {
-            continue;
-        }
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            divergence[sparse_coeffs.dofs[entry]] +=
-                sparse_coeffs.coefficients[entry] * div;
-        }
-    }
-}
-
-template <typename ModalPolynomials>
-void evaluate_nodal_modal_vector_strided_with_limits(
-    const ModalPolynomials& monomials,
-    const SparseModalCoefficientMatrix& sparse_coeffs,
-    std::size_t n,
-    const std::vector<Vec3>& points,
-    std::size_t output_stride,
-    const std::array<int, 3>& power_limits,
-    Real* SVMP_RESTRICT values_out,
-    Real* SVMP_RESTRICT jacobians_out,
-    Real* SVMP_RESTRICT curls_out,
-    Real* SVMP_RESTRICT divergence_out,
-    const char* family_name) {
-    const std::size_t num_qpts = points.size();
-    validate_vector_strided_outputs(num_qpts, output_stride, family_name);
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.rows == n &&
-                     sparse_coeffs.cols == n &&
-                     sparse_coeffs.row_offsets.size() == n + 1u,
-                 "evaluate_nodal_modal_vector_strided: sparse coefficient size mismatch");
-    BASIS_CHECK_CONSTRUCTION(sparse_coeffs.dofs.size() == sparse_coeffs.coefficients.size(),
-                 "evaluate_nodal_modal_vector_strided: sparse coefficient entry mismatch");
-
-    auto& scratch = vector_basis_scratch();
-    const bool need_values = values_out != nullptr;
-    const bool need_jacobians = jacobians_out != nullptr;
-    const bool need_curls = curls_out != nullptr;
-    const bool need_divergence = divergence_out != nullptr;
-
-    if (need_values) {
-        zero_active_strided_rows(values_out, n * 3u, output_stride, num_qpts);
-    }
-    if (need_jacobians) {
-        zero_active_strided_rows(jacobians_out, n * 9u, output_stride, num_qpts);
-    }
-    if (need_curls) {
-        zero_active_strided_rows(curls_out, n * 3u, output_stride, num_qpts);
-    }
-    if (need_divergence) {
-        zero_active_strided_rows(divergence_out, n, output_stride, num_qpts);
-    }
-    if (num_qpts == 0 || n == 0) {
-        return;
-    }
-
-    fill_batched_power_tables(points, power_limits, scratch);
-    const auto& px = scratch.batched_px;
-    const auto& py = scratch.batched_py;
-    const auto& pz = scratch.batched_pz;
-    const std::size_t power_stride = num_qpts;
-    const bool need_modal_gradient = need_jacobians || need_curls || need_divergence;
-
-    auto& modal_values = scratch.modal_values_batched;
-    auto& modal_jacobians = scratch.modal_jacobians_batched;
-    auto& modal_curls = scratch.modal_curls_batched;
-    auto& modal_divergence = scratch.modal_divergence_batched;
-
-    for (std::size_t p = 0; p < n; ++p) {
-        if (need_values) {
-            modal_values.assign(3u * num_qpts, Real(0));
-        }
-        if (need_jacobians) {
-            modal_jacobians.assign(9u * num_qpts, Real(0));
-        }
-        if (need_curls) {
-            modal_curls.assign(3u * num_qpts, Real(0));
-        }
-        if (need_divergence) {
-            modal_divergence.assign(num_qpts, Real(0));
-        }
-
-        const auto& poly = monomials[p];
-        for (int term_index = 0; term_index < poly.num_terms; ++term_index) {
-            const auto& term = poly.terms[static_cast<std::size_t>(term_index)];
-            const std::size_t component = static_cast<std::size_t>(term.component);
-            Real* modal_value_row = need_values
-                ? modal_values.data() + component * num_qpts
-                : nullptr;
-            Real* modal_jacobian_row = need_jacobians
-                ? modal_jacobians.data() + component * 3u * num_qpts
-                : nullptr;
-            Real* modal_curl_rows = need_curls ? modal_curls.data() : nullptr;
-            Real* modal_divergence_row =
-                need_divergence ? modal_divergence.data() : nullptr;
-
-            if (need_values) {
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    modal_value_row[q] +=
-                        term.coefficient *
-                        batched_power_product(px,
-                                              py,
-                                              pz,
-                                              power_stride,
-                                              term.px,
-                                              term.py,
-                                              term.pz,
-                                              q);
-                }
-            }
-
-            if (need_modal_gradient) {
-                for (std::size_t q = 0; q < num_qpts; ++q) {
-                    const Real dphidx =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  0,
-                                                  q);
-                    const Real dphidy =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  1,
-                                                  q);
-                    const Real dphidz =
-                        term.coefficient *
-                        batched_component_partial(px,
-                                                  py,
-                                                  pz,
-                                                  power_stride,
-                                                  term.px,
-                                                  term.py,
-                                                  term.pz,
-                                                  2,
-                                                  q);
-
-                    if (need_jacobians) {
-                        modal_jacobian_row[q] += dphidx;
-                        modal_jacobian_row[num_qpts + q] += dphidy;
-                        modal_jacobian_row[2u * num_qpts + q] += dphidz;
-                    }
-                    if (need_curls) {
-                        const Vec3 curl =
-                            curl_from_component_gradient(term.component,
-                                                         dphidx,
-                                                         dphidy,
-                                                         dphidz);
-                        modal_curl_rows[q] += curl[0];
-                        modal_curl_rows[num_qpts + q] += curl[1];
-                        modal_curl_rows[2u * num_qpts + q] += curl[2];
-                    }
-                    if (need_divergence) {
-                        const Real div = term.component == 0 ? dphidx
-                                       : term.component == 1 ? dphidy
-                                                            : dphidz;
-                        modal_divergence_row[q] += div;
-                    }
-                }
-            }
-        }
-
-        const std::size_t row_begin = sparse_coeffs.row_offsets[p];
-        const std::size_t row_end = sparse_coeffs.row_offsets[p + 1u];
-        for (std::size_t entry = row_begin; entry < row_end; ++entry) {
-            const std::size_t dof = sparse_coeffs.dofs[entry];
-            const Real c = sparse_coeffs.coefficients[entry];
-            if (need_values) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    axpy_qpoints(values_out + (dof * 3u + component) * output_stride,
-                                 modal_values.data() + component * num_qpts,
-                                 c,
-                                 num_qpts);
-                }
-            }
-            if (need_jacobians) {
-                for (std::size_t row = 0; row < 3u; ++row) {
-                    for (std::size_t col = 0; col < 3u; ++col) {
-                        axpy_qpoints(jacobians_out +
-                                         (dof * 9u + row * 3u + col) * output_stride,
-                                     modal_jacobians.data() +
-                                         (row * 3u + col) * num_qpts,
-                                     c,
-                                     num_qpts);
-                    }
-                }
-            }
-            if (need_curls) {
-                for (std::size_t component = 0; component < 3u; ++component) {
-                    axpy_qpoints(curls_out + (dof * 3u + component) * output_stride,
-                                 modal_curls.data() + component * num_qpts,
-                                 c,
-                                 num_qpts);
-                }
-            }
-            if (need_divergence) {
-                axpy_qpoints(divergence_out + dof * output_stride,
-                             modal_divergence.data(),
-                             c,
-                             num_qpts);
-            }
-        }
-    }
-}
-
-std::vector<std::array<int, 4>> make_nd_extra_monomial_candidates(ElementType type,
-                                                                  int order);
-Real eval_transformed_nd_monomial_scalar(const std::array<int, 4>& mono,
-                                         const std::vector<Real>& px,
-                                         const std::vector<Real>& py,
-                                         const std::vector<Real>& pz);
-Vec3 eval_transformed_nd_monomial_curl(const std::array<int, 4>& mono,
-                                       const std::vector<Real>& px,
-                                       const std::vector<Real>& py,
-                                       const std::vector<Real>& pz);
-
-
-} // namespace vector_common
-} // namespace detail
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASISEVALUATIONHELPERS_H
diff --git a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h b/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
deleted file mode 100644
index 6e1a7202b..000000000
--- a/Code/Source/solver/FE/Basis/VectorBasisModalPolynomial.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
-#define SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
-
-#include "Types.h"
-
-#include <algorithm>
-#include <array>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace basis {
-
-struct VectorBasisModalTerm {
-    int component{0}; // 0=x, 1=y, 2=z
-    int px{0};
-    int py{0};
-    int pz{0};
-    Real coefficient{Real(1)};
-};
-
-struct VectorBasisModalPolynomial {
-    std::array<VectorBasisModalTerm, 4> terms{};
-    int num_terms{0};
-};
-
-inline bool modal_terms_equal(const VectorBasisModalTerm& lhs,
-                              const VectorBasisModalTerm& rhs) noexcept {
-    return lhs.component == rhs.component &&
-           lhs.px == rhs.px &&
-           lhs.py == rhs.py &&
-           lhs.pz == rhs.pz &&
-           lhs.coefficient == rhs.coefficient;
-}
-
-inline bool modal_polynomials_equal(const VectorBasisModalPolynomial& lhs,
-                                    const VectorBasisModalPolynomial& rhs) noexcept {
-    if (lhs.num_terms != rhs.num_terms) {
-        return false;
-    }
-    for (int term = 0; term < lhs.num_terms; ++term) {
-        const auto index = static_cast<std::size_t>(term);
-        if (!modal_terms_equal(lhs.terms[index], rhs.terms[index])) {
-            return false;
-        }
-    }
-    return true;
-}
-
-inline bool append_unique_modal_polynomial(
-    std::vector<VectorBasisModalPolynomial>& polynomials,
-    const VectorBasisModalPolynomial& polynomial) {
-    const auto found = std::find_if(
-        polynomials.begin(),
-        polynomials.end(),
-        [&](const VectorBasisModalPolynomial& existing) {
-            return modal_polynomials_equal(existing, polynomial);
-        });
-    if (found != polynomials.end()) {
-        return false;
-    }
-    polynomials.push_back(polynomial);
-    return true;
-}
-
-} // namespace basis
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_BASIS_VECTORBASISMODALPOLYNOMIAL_H
diff --git a/Code/Source/solver/FE/Common/Alignment.h b/Code/Source/solver/FE/Common/Alignment.h
deleted file mode 100644
index 8d33a7a7a..000000000
--- a/Code/Source/solver/FE/Common/Alignment.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef SVMP_FE_CORE_ALIGNMENT_H
-#define SVMP_FE_CORE_ALIGNMENT_H
-
-/**
- * @file Alignment.h
- * @brief Global alignment constants used across FE modules.
- */
-
-#include <cstddef>
-
-namespace svmp {
-namespace FE {
-
-/// Preferred cache-line/SIMD alignment for performance-critical arrays.
-inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
-
-/// Alignment for small fixed-size math objects that are commonly passed by value.
-inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
-
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_CORE_ALIGNMENT_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 60312a524..bb3f23bca 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -70,8 +70,9 @@ enum class CellFamily {
 #endif
 } // namespace svmp
 #endif
-#include <cstdint>
 #include <array>
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <type_traits>
 #include <limits>
@@ -174,6 +175,12 @@ constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
  */
 constexpr FieldId CURRENT_SOLUTION_FIELD_ID = std::numeric_limits<FieldId>::max();
 
+/// Preferred cache-line/SIMD alignment for performance-critical arrays.
+inline constexpr std::size_t kFEPreferredAlignmentBytes = 64u;
+
+/// Alignment for small fixed-size math objects that are commonly passed by value.
+inline constexpr std::size_t kFEFixedObjectAlignmentBytes = 32u;
+
 // ============================================================================
 // Field Value Entry (for point evaluation of field-dependent expressions)
 // ============================================================================
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 0b80091f9..6058ab943 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -14,7 +14,7 @@
 #include "MatrixExpr.h"
 #include "Vector.h"
 #include "MathConstants.h"
-#include "../Common/Alignment.h"
+#include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index e272bd6dd..76c7be152 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -12,7 +12,7 @@
 
 #include "VectorExpr.h"
 #include "MathConstants.h"
-#include "../Common/Alignment.h"
+#include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
diff --git a/Code/Source/solver/FE/Quadrature/QuadratureRule.h b/Code/Source/solver/FE/Quadrature/QuadratureRule.h
deleted file mode 100644
index f7d186891..000000000
--- a/Code/Source/solver/FE/Quadrature/QuadratureRule.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_QUADRATURE_RULE_H
-#define SVMP_FE_QUADRATURE_RULE_H
-
-/**
- * @file QuadratureRule.h
- * @brief Abstracted quadrature rule representation for FE integration
- *
- * This header defines the base class for all quadrature rules used by the
- * finite element infrastructure. Rules are expressed in reference element
- * space only; mapping to physical space is handled by the Geometry module.
- *
- * The interface is intentionally lightweight and header-only to avoid coupling
- * Quadrature to other modules while remaining compatible with the Mesh library
- * through shared type aliases provided by FE/Common/Types.h.
- */
-
-#include "Types.h"
-#include "FEException.h"
-#include "Math/Vector.h"
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <iomanip>
-#include <limits>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace svmp {
-namespace FE {
-namespace quadrature {
-
-/// Convenience alias for quadrature point representation in reference space
-using QuadPoint = math::Vector<Real, 3>;
-
-struct QuadraturePointFingerprint {
-    int dimension{0};
-    std::size_t num_points{0};
-    std::uint64_t points_hash_a{0};
-    std::uint64_t points_hash_b{0};
-};
-
-/**
- * @brief Base class for quadrature rules over reference elements
- *
- * Derived classes populate the point/weight data via the protected setters.
- * The class performs lightweight consistency checks (size agreement, basic
- * reference-measure validation) but leaves element-specific checks to callers.
- */
-class QuadratureRule {
-public:
-    virtual ~QuadratureRule() = default;
-
-    /// Number of quadrature points
-    std::size_t num_points() const noexcept { return points_.size(); }
-
-    /// Polynomial exactness degree reported by the rule
-    int order() const noexcept { return order_; }
-
-    /// Spatial dimension of the reference domain
-    int dimension() const noexcept { return dimension_; }
-
-    /// Cell family the rule integrates over (line, tri, quad, ...)
-    svmp::CellFamily cell_family() const noexcept { return cell_family_; }
-
-    /// Access a single quadrature point (no bounds checking)
-    QuadPoint point(std::size_t i) const noexcept { return points_[i]; }
-
-    /// Access a single quadrature weight (no bounds checking)
-    Real weight(std::size_t i) const noexcept { return weights_[i]; }
-
-    /// Bulk accessors
-    const std::vector<QuadPoint>& points() const noexcept { return points_; }
-    const std::vector<Real>& weights() const noexcept { return weights_; }
-
-    /// Cached coordinate-only fingerprint for consumers whose values depend on
-    /// reference points but not quadrature weights.
-    QuadraturePointFingerprint point_fingerprint() const noexcept { return point_fingerprint_; }
-
-    /// Stable semantic identity used by BasisCache
-    virtual std::string cache_identity() const;
-
-    /**
-     * @brief Validate rule data for basic consistency
-     * @param tol Relative tolerance for weight sum check
-     * @return True if rule passes size and weight checks
-     */
-    virtual bool is_valid(Real tol = 1e-12) const;
-
-    /**
-     * @brief Reference-domain measure for the element family
-     *
-     * Length/area/volume of the canonical reference element:
-     * - Line   [-1,1]            -> 2
-     * - Quad   [-1,1]^2          -> 4
-     * - Hex    [-1,1]^3          -> 8
-     * - Tri    (0,0)-(1,0)-(0,1) -> 0.5
-     * - Tet    simplex at origin -> 1/6
-     * - Wedge  (triangle x line) -> 1
-     * - Pyramid (x,y in [-1,1], z in [0,1]) -> 4/3
-     */
-    Real reference_measure() const noexcept;
-
-protected:
-    QuadratureRule(svmp::CellFamily family, int dimension, int order = 0)
-        : cell_family_(family), dimension_(dimension), order_(order) {}
-
-    /// Assign point and weight storage (sizes must match)
-    void set_data(std::vector<QuadPoint> pts, std::vector<Real> wts);
-
-    /// Override computed order in derived classes
-    void set_order(int ord) noexcept { order_ = ord; }
-
-private:
-    std::string build_cache_identity() const;
-    QuadraturePointFingerprint build_point_fingerprint() const noexcept;
-
-    svmp::CellFamily cell_family_;
-    int dimension_;
-    int order_;
-    std::vector<QuadPoint> points_;
-    std::vector<Real> weights_;
-    std::string cache_identity_;
-    QuadraturePointFingerprint point_fingerprint_;
-};
-
-// --------------------------------------------------------------------------------
-// Inline implementations
-// --------------------------------------------------------------------------------
-
-inline void QuadratureRule::set_data(std::vector<QuadPoint> pts, std::vector<Real> wts) {
-    if (pts.size() != wts.size()) {
-        throw FEException("QuadratureRule: points/weights size mismatch",
-                          StatusCode::InvalidArgument,
-                          __FILE__, __LINE__, __func__);
-    }
-    points_ = std::move(pts);
-    weights_ = std::move(wts);
-    point_fingerprint_ = build_point_fingerprint();
-    cache_identity_ = build_cache_identity();
-}
-
-inline bool QuadratureRule::is_valid(Real tol) const {
-    if (points_.empty() || points_.size() != weights_.size()) {
-        return false;
-    }
-    Real sum_w = Real(0);
-    for (Real w : weights_) {
-        if (!std::isfinite(w)) {
-            return false;
-        }
-        sum_w += w;
-    }
-    const Real ref = reference_measure();
-    const Real denom = std::max(Real(1), std::abs(ref));
-    return std::abs(sum_w - ref) <= tol * denom;
-}
-
-inline std::string QuadratureRule::cache_identity() const {
-    if (!cache_identity_.empty()) {
-        return cache_identity_;
-    }
-    return build_cache_identity();
-}
-
-inline std::string QuadratureRule::build_cache_identity() const {
-    std::ostringstream oss;
-    oss << "dim=" << dimension_
-        << "|npts=" << points_.size();
-
-    oss << std::setprecision(std::numeric_limits<Real>::max_digits10);
-    for (const auto& pt : points_) {
-        oss << "|pt=" << pt[0] << ',' << pt[1] << ',' << pt[2];
-    }
-    return oss.str();
-}
-
-inline QuadraturePointFingerprint QuadratureRule::build_point_fingerprint() const noexcept {
-    auto real_bits = [](Real value) noexcept {
-        static_assert(sizeof(Real) <= sizeof(std::uint64_t),
-                      "Quadrature point fingerprints assume Real fits in 64 bits");
-        std::uint64_t bits = 0;
-        std::memcpy(&bits, &value, sizeof(Real));
-        return bits;
-    };
-    auto mix_hash = [](std::uint64_t& seed, std::uint64_t value) noexcept {
-        seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6u) + (seed >> 2u);
-    };
-
-    QuadraturePointFingerprint fingerprint;
-    fingerprint.dimension = dimension_;
-    fingerprint.num_points = points_.size();
-    fingerprint.points_hash_a = 1469598103934665603ULL;
-    fingerprint.points_hash_b = 1099511628211ULL;
-
-    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.dimension));
-    mix_hash(fingerprint.points_hash_a, static_cast<std::uint64_t>(fingerprint.num_points));
-    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.num_points));
-    mix_hash(fingerprint.points_hash_b, static_cast<std::uint64_t>(fingerprint.dimension));
-    for (const auto& point : points_) {
-        for (std::size_t component = 0; component < 3u; ++component) {
-            const std::uint64_t bits = real_bits(point[component]);
-            mix_hash(fingerprint.points_hash_a, bits);
-            mix_hash(fingerprint.points_hash_b, bits ^ (0xbf58476d1ce4e5b9ULL + component));
-        }
-    }
-    return fingerprint;
-}
-
-inline Real QuadratureRule::reference_measure() const noexcept {
-    switch (cell_family_) {
-        case svmp::CellFamily::Line:      return Real(2);
-        case svmp::CellFamily::Quad:      return Real(4);
-        case svmp::CellFamily::Hex:       return Real(8);
-        case svmp::CellFamily::Triangle:  return Real(0.5);
-        case svmp::CellFamily::Tetra:     return Real(1.0 / 6.0);
-        case svmp::CellFamily::Wedge:     return Real(1.0);     // 0.5 area * length 2
-        case svmp::CellFamily::Pyramid:   return Real(4.0 / 3.0);
-        case svmp::CellFamily::Point:     return Real(1.0);
-        default:                          return Real(1.0);
-    }
-}
-
-} // namespace quadrature
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_QUADRATURE_RULE_H
diff --git a/Code/Source/solver/Timer.h b/Code/Source/solver/Timer.h
index 6810ae17c..b8ffa29df 100644
--- a/Code/Source/solver/Timer.h
+++ b/Code/Source/solver/Timer.h
@@ -4,28 +4,24 @@
 #ifndef TIMER_H 
 #define TIMER_H 
 
-#include <chrono>
-#include <iostream>
-#include <string>
+#include <sys/time.h>
 
 /// @brief Keep track of time
 class Timer 
 {
   public:
 
-    double get_elapsed_time()
+    double get_elapsed_time() const  // seconds between now (get_time()) and the stored current_time
     {
       return get_time() - current_time;
     }
 
-    double get_time()
+    double get_time() const  // current gettimeofday() reading as fractional seconds
     {
-      auto now = std::chrono::system_clock::now();
-      auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
-
-      auto value = now_ms.time_since_epoch();
-      auto duration = value.count() / 1000.0;
-      return static_cast<double>(duration);
+      timeval now{};  // value-initialized, so both fields start at zero
+      gettimeofday(&now, nullptr);  // fills tv_sec/tv_usec; the return status is not checked
+      return static_cast<double>(now.tv_sec) +
+             static_cast<double>(now.tv_usec) * 1.0e-6;  // microseconds -> seconds
     }
 
     void set_time()
@@ -33,8 +29,7 @@ class Timer
       current_time = get_time();
     }
 
-    double current_time;
+    double current_time{0.0};  // reference timestamp in seconds; 0.0 until set_time() stores get_time()
 };
 
 #endif
-
diff --git a/Code/Source/solver/load_msh.cpp b/Code/Source/solver/load_msh.cpp
index c7c5a62ba..50d0ca858 100644
--- a/Code/Source/solver/load_msh.cpp
+++ b/Code/Source/solver/load_msh.cpp
@@ -13,7 +13,6 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
-#include <chrono>
 #include <unordered_map>
 #include <string>
 #include <iomanip>
@@ -300,4 +299,3 @@ void read_sv(Simulation* simulation, mshType& mesh, const MeshParameters* mesh_p
         }
     }
 };
-
diff --git a/Code/Source/solver/utils.cpp b/Code/Source/solver/utils.cpp
index 4d5b847cd..233d35474 100644
--- a/Code/Source/solver/utils.cpp
+++ b/Code/Source/solver/utils.cpp
@@ -4,7 +4,6 @@
 #include "utils.h"
 
 #include <bitset>
-#include <chrono>
 #include <cmath> 
 #include <limits>
 
@@ -13,6 +12,7 @@
 #include <iostream>
 #include <fstream>
 #include <sys/resource.h>
+#include <sys/time.h>
 
 /* MacOS
 #include <mach/task.h>
@@ -35,12 +35,10 @@ int CountBits(int n)
 
 double cput()
 {
-  auto now = std::chrono::system_clock::now();
-  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
-
-  auto value = now_ms.time_since_epoch();
-  auto duration = value.count() / 1000.0;
-  return static_cast<double>(duration);
+  timeval now{};  // value-initialized, so both fields start at zero
+  gettimeofday(&now, nullptr);  // NOTE: despite the name, this reads wall-clock time, not CPU time
+  return static_cast<double>(now.tv_sec) +
+         static_cast<double>(now.tv_usec) * 1.0e-6;  // seconds plus microseconds scaled to seconds
 }
 
 Vector<double> 
@@ -386,4 +384,4 @@ void find_loc(const Array<int>& array, int value, std::array<int, 2>& ind)
   }
 }
 
-};
\ No newline at end of file
+};
diff --git a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
index 0938bb554..45b176fe7 100644
--- a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -34,7 +34,9 @@
   */
 
 #include <atomic>
+#ifdef EIGEN_USE_GPU
 #include <chrono>
+#endif
 #include <cmath>
 #include <cstddef>
 #include <cstring>
diff --git a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp b/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
deleted file mode 100644
index 216fd0401..000000000
--- a/tests/unitTests/FE/Basis/test_BasisCacheFactory.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/**
- * @file test_BasisCacheFactory.cpp
- * @brief Tests for the migrated Basis cache and factory subset.
- */
-
-#include <gtest/gtest.h>
-
-#include "FE/Basis/BasisCache.h"
-#include "FE/Basis/BasisFactory.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/SerendipityBasis.h"
-#include "FE/Quadrature/QuadratureRule.h"
-
-#include <memory>
-#include <vector>
-
-using namespace svmp::FE;
-using namespace svmp::FE::basis;
-using namespace svmp::FE::quadrature;
-
-namespace {
-
-class CustomQuadratureRule final : public QuadratureRule {
-public:
-    CustomQuadratureRule(svmp::CellFamily family,
-                         int dimension,
-                         int order,
-                         std::vector<QuadPoint> points,
-                         std::vector<Real> weights)
-        : QuadratureRule(family, dimension, order)
-    {
-        set_data(std::move(points), std::move(weights));
-    }
-};
-
-CustomQuadratureRule line_rule() {
-    return CustomQuadratureRule(
-        svmp::CellFamily::Line, 1, 3,
-        {
-            QuadPoint{Real(-0.5), Real(0), Real(0)},
-            QuadPoint{Real(0.5), Real(0), Real(0)}
-        },
-        {Real(1), Real(1)});
-}
-
-CustomQuadratureRule quad_rule(Real first_weight = Real(1)) {
-    return CustomQuadratureRule(
-        svmp::CellFamily::Quad, 2, 3,
-        {
-            QuadPoint{Real(-0.5), Real(-0.5), Real(0)},
-            QuadPoint{Real(0.5), Real(-0.25), Real(0)},
-            QuadPoint{Real(0.0), Real(0.5), Real(0)}
-        },
-        {first_weight, Real(1), Real(2)});
-}
-
-class TestCustomScalarBasis final : public BasisFunction {
-public:
-    explicit TestCustomScalarBasis(int tag)
-        : tag_(tag)
-    {
-    }
-
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
-    ElementType element_type() const noexcept override { return ElementType::Line2; }
-    int dimension() const noexcept override { return 1; }
-    int order() const noexcept override { return 1; }
-    std::size_t size() const noexcept override { return 2u; }
-
-    std::string cache_identity() const override {
-        return BasisFunction::cache_identity() + "|tag=" + std::to_string(tag_);
-    }
-
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
-    {
-        values.resize(2u);
-        const Real shift = Real(tag_) * Real(0.125);
-        values[0] = Real(0.5) * (Real(1) - xi[0]) + shift;
-        values[1] = Real(0.5) * (Real(1) + xi[0]) - shift;
-    }
-
-    void evaluate_gradients(const math::Vector<Real, 3>&,
-                            std::vector<Gradient>& gradients) const override
-    {
-        gradients.assign(2u, Gradient{});
-        gradients[0][0] = Real(-0.5);
-        gradients[1][0] = Real(0.5);
-    }
-
-private:
-    int tag_{0};
-};
-
-class StructuredIdentityScalarBasis final : public BasisFunction {
-public:
-    explicit StructuredIdentityScalarBasis(int tag)
-        : tag_(tag)
-    {
-    }
-
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
-    ElementType element_type() const noexcept override { return ElementType::Line2; }
-    int dimension() const noexcept override { return 1; }
-    int order() const noexcept override { return 1; }
-    std::size_t size() const noexcept override { return 2u; }
-
-    bool cache_identity_words(std::vector<std::uint64_t>& words) const override {
-        words.push_back(0x7374727563746964ULL);
-        words.push_back(static_cast<std::uint64_t>(tag_));
-        return true;
-    }
-
-    std::string cache_identity() const override {
-        ++string_identity_calls;
-        return BasisFunction::cache_identity() + "|structured-tag=" + std::to_string(tag_);
-    }
-
-    void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override
-    {
-        values.resize(2u);
-        values[0] = Real(1) - xi[0] + Real(tag_);
-        values[1] = xi[0] - Real(tag_);
-    }
-
-    mutable std::size_t string_identity_calls{0};
-
-private:
-    int tag_{0};
-};
-
-} // namespace
-
-TEST(BasisFactory, CreatesLagrangeAndSerendipityBases) {
-    auto lagrange = basis_factory::create(
-        BasisRequest{ElementType::Line2, BasisType::Lagrange, 2});
-    ASSERT_NE(lagrange, nullptr);
-    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
-    EXPECT_EQ(lagrange->element_type(), ElementType::Line2);
-    EXPECT_EQ(lagrange->order(), 2);
-
-    auto serendipity = basis_factory::create(
-        BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
-    ASSERT_NE(serendipity, nullptr);
-    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
-    EXPECT_EQ(serendipity->element_type(), ElementType::Quad8);
-    EXPECT_EQ(serendipity->size(), 8u);
-}
-
-TEST(BasisFactory, RejectsOutOfScopeAndInvalidRequests) {
-    EXPECT_THROW(
-        (void)basis_factory::create(BasisRequest{ElementType::Line2, BasisType::Lagrange}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2, BasisType::Lagrange, -1}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
-        BasisConfigurationException);
-    EXPECT_THROW(
-        (void)basis_factory::create(
-            BasisRequest{ElementType::Line2,
-                         BasisType::Lagrange,
-                         1,
-                         Continuity::H_div,
-                         FieldType::Vector}),
-        BasisConfigurationException);
-}
-
-TEST(BasisFactory, SupportsCustomFactoryRegistration) {
-    basis_factory::clear_custom_registry_for_tests();
-    basis_factory::register_custom(
-        "test-custom",
-        [](const BasisRequest& req) {
-            const int tag = req.order.value_or(0);
-            return std::make_shared<TestCustomScalarBasis>(tag);
-        });
-
-    BasisRequest req{ElementType::Line2, BasisType::Custom, 7};
-    req.custom_id = "test-custom";
-    auto custom = basis_factory::create(req);
-    ASSERT_NE(custom, nullptr);
-    EXPECT_EQ(custom->basis_type(), BasisType::Custom);
-    EXPECT_EQ(custom->size(), 2u);
-
-    basis_factory::unregister_custom("test-custom");
-    EXPECT_THROW((void)basis_factory::create(req), BasisConfigurationException);
-    basis_factory::clear_custom_registry_for_tests();
-}
-
-TEST(BasisCache, ReusesEntriesForSameBasisAndQuadratureCoordinates) {
-    LagrangeBasis basis(ElementType::Line2, 2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry1 = cache.get_or_compute(basis, quad, true, true);
-    const auto& entry2 = cache.get_or_compute(basis, quad, true, true);
-
-    EXPECT_EQ(&entry1, &entry2);
-    EXPECT_EQ(entry1.num_qpts, quad.num_points());
-    EXPECT_EQ(entry1.num_dofs, basis.size());
-    ASSERT_EQ(entry1.scalar_values.size(), basis.size() * quad.num_points());
-    ASSERT_EQ(entry1.gradients.size(), basis.size() * 3u * quad.num_points());
-    ASSERT_EQ(entry1.hessians.size(), basis.size() * 9u * quad.num_points());
-    EXPECT_EQ(cache.size(), 1u);
-}
-
-TEST(BasisCache, ReusesCoordinateIdenticalQuadratureRulesIgnoringWeights) {
-    SerendipityBasis basis(ElementType::Quad8, 2);
-    const auto quad_a = quad_rule(Real(1));
-    const auto quad_b = quad_rule(Real(0.25));
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(basis, quad_a, true, false);
-    const auto& entry_b = cache.get_or_compute(basis, quad_b, true, false);
-
-    EXPECT_EQ(&entry_a, &entry_b);
-    EXPECT_EQ(cache.size(), 1u);
-}
-
-TEST(BasisCache, SeparatesStringIdentityCustomBases) {
-    TestCustomScalarBasis custom_a(1);
-    TestCustomScalarBasis custom_b(2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
-    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
-
-    EXPECT_NE(&entry_a, &entry_b);
-    EXPECT_NE(entry_a.scalar_values, entry_b.scalar_values);
-    EXPECT_EQ(cache.size(), 2u);
-}
-
-TEST(BasisCache, StructuredIdentityAvoidsStringFallbackAndSeparatesBases) {
-    StructuredIdentityScalarBasis custom_a(1);
-    StructuredIdentityScalarBasis custom_b(2);
-    const auto quad = line_rule();
-
-    auto& cache = BasisCache::instance();
-    cache.clear();
-    const auto& entry_a = cache.get_or_compute(custom_a, quad, false, false);
-    const auto& entry_b = cache.get_or_compute(custom_b, quad, false, false);
-
-    EXPECT_NE(&entry_a, &entry_b);
-    EXPECT_EQ(custom_a.string_identity_calls, 0u);
-    EXPECT_EQ(custom_b.string_identity_calls, 0u);
-    EXPECT_EQ(cache.size(), 2u);
-}
-
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 967f078aa..7838702b0 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -21,7 +21,7 @@ namespace {
 
 class MinimalScalarBasis : public BasisFunction {
 public:
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     ElementType element_type() const noexcept override { return ElementType::Line2; }
     int dimension() const noexcept override { return 1; }
     int order() const noexcept override { return 1; }
@@ -36,7 +36,7 @@ class MinimalScalarBasis : public BasisFunction {
 
 class CompleteFallbackBasis : public BasisFunction {
 public:
-    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
     ElementType element_type() const noexcept override { return ElementType::Triangle3; }
     int dimension() const noexcept override { return 2; }
     int order() const noexcept override { return 1; }
@@ -90,6 +90,8 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3),
                  BasisConfigurationException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2),
+                 BasisElementCompatibilityException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2),
                  BasisElementCompatibilityException);
 }
@@ -104,6 +106,13 @@ TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Line2, BasisType::Bernstein, 1}),
                  BasisConfigurationException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
+                 BasisElementCompatibilityException);
+
+    BasisRequest vector_req{ElementType::Line2, BasisType::Lagrange, 1};
+    vector_req.field_type = FieldType::Vector;
+    EXPECT_THROW((void)basis_factory::create(vector_req), BasisConfigurationException);
 
     auto serendipity = basis_factory::create(
         BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
@@ -130,6 +139,8 @@ TEST(BasisErrorPaths, NodeOrderingInvalidNodeThrows) {
                  BasisNodeOrderingException);
     EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Quad8, 2),
                  BasisNodeOrderingException);
+    EXPECT_THROW((void)ReferenceNodeLayout::num_nodes(ElementType::Pyramid5),
+                 BasisNodeOrderingException);
 }
 
 TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
@@ -142,25 +153,22 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
     EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
 }
 
-TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
-    const std::vector<math::Vector<Real, 3>> points = {
-        {Real(0.25), Real(0.5), Real(-0.25)},
-        {Real(-0.5), Real(0.75), Real(0.125)}
-    };
-    prewarm_basis_function_scratch(basis.size(), points.size());
+    const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
+    prewarm_basis_function_scratch(basis.size());
 
     std::vector<Real> flat_values(basis.size());
     std::vector<Real> flat_gradients(basis.size() * 3u);
     std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(points.front(), flat_values.data());
-    basis.evaluate_gradients_to(points.front(), flat_gradients.data());
-    basis.evaluate_hessians_to(points.front(), flat_hessians.data());
+    basis.evaluate_values_to(point, flat_values.data());
+    basis.evaluate_gradients_to(point, flat_gradients.data());
+    basis.evaluate_hessians_to(point, flat_hessians.data());
 
     std::vector<Real> expected_values;
     std::vector<Gradient> expected_gradients;
     std::vector<Hessian> expected_hessians;
-    basis.evaluate_all(points.front(), expected_values, expected_gradients, expected_hessians);
+    basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
     for (std::size_t d = 0; d < basis.size(); ++d) {
         EXPECT_EQ(flat_values[d], expected_values[d]);
         for (std::size_t c = 0; c < 3u; ++c) {
@@ -172,32 +180,4 @@ TEST(BasisErrorPaths, BasisFunctionFallbackWritesFlatAndStridedLayouts) {
             }
         }
     }
-
-    constexpr std::size_t output_stride = 3u;
-    std::vector<Real> values(basis.size() * output_stride, Real(-99));
-    std::vector<Real> gradients(basis.size() * 3u * output_stride, Real(-99));
-    std::vector<Real> hessians(basis.size() * 9u * output_stride, Real(-99));
-    basis.evaluate_at_quadrature_points_strided(
-        points, output_stride, values.data(), gradients.data(), hessians.data());
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        basis.evaluate_all(points[q], expected_values, expected_gradients, expected_hessians);
-        for (std::size_t d = 0; d < basis.size(); ++d) {
-            EXPECT_EQ(values[d * output_stride + q], expected_values[d]);
-            for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_EQ(gradients[(d * 3u + c) * output_stride + q],
-                          expected_gradients[d][c]);
-            }
-            for (std::size_t r = 0; r < 3u; ++r) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_EQ(hessians[(d * 9u + r * 3u + c) * output_stride + q],
-                              expected_hessians[d](r, c));
-                }
-            }
-        }
-    }
-
-    for (std::size_t d = 0; d < basis.size(); ++d) {
-        EXPECT_EQ(values[d * output_stride + 2u], Real(-99));
-    }
 }
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index 0899ce358..f786b07cd 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -61,8 +61,6 @@ std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
             return {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
         case ElementType::Wedge6:
             return {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}};
-        case ElementType::Pyramid5:
-            return {{Real(0.0), Real(0.0), Real(0.2)}, {Real(0.12), Real(-0.08), Real(0.24)}};
         default:
             return {{Real(0), Real(0), Real(0)}};
     }
@@ -200,8 +198,6 @@ TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
         {ElementType::Tetra4, 2, Real(1e-6), Real(1e-5)},
         {ElementType::Hex8, 2, Real(1e-6), Real(1e-5)},
         {ElementType::Wedge6, 2, Real(1e-5), Real(1e-5)},
-        {ElementType::Pyramid5, 1, Real(2e-6), Real(1e-5)},
-        {ElementType::Pyramid5, 3, Real(4e-4), Real(2e-5)},
     };
 
     for (const auto& c : cases) {
@@ -223,7 +219,6 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
         {ElementType::Tetra4, 2, {Real(0.15), Real(0.2), Real(0.1)}, Real(1e-10)},
         {ElementType::Hex8, 2, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-12)},
         {ElementType::Wedge6, 2, {Real(0.2), Real(0.15), Real(-0.3)}, Real(1e-10)},
-        {ElementType::Pyramid5, 1, {Real(0.1), Real(-0.2), Real(0.3)}, Real(1e-8)},
     };
 
     for (const auto& c : cases) {
@@ -233,25 +228,6 @@ TEST(BasisHessians, LagrangeHessiansSumToZeroAndAreSymmetric) {
     }
 }
 
-TEST(BasisHessians, LagrangePyramidExactApexHessianThrows) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    const math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        std::vector<Hessian> hessians;
-        EXPECT_THROW(basis.evaluate_hessians(apex, hessians), BasisEvaluationException)
-            << "order " << c.order;
-    }
-}
-
 TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
     const struct Case {
         ElementType type;
@@ -262,7 +238,6 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
         {ElementType::Quad8, 2, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-10)},
         {ElementType::Hex20, 2, {Real(0.2), Real(-0.1), Real(0.3)}, Real(1e-10)},
         {ElementType::Wedge15, 2, {Real(0.2), Real(0.3), Real(0.1)}, Real(1e-10)},
-        {ElementType::Pyramid13, 2, {Real(0.1), Real(-0.2), Real(0.4)}, Real(1e-8)},
     };
 
     for (const auto& c : cases) {
@@ -272,13 +247,6 @@ TEST(BasisHessians, SerendipityHessiansSumToZeroAndAreSymmetric) {
     }
 }
 
-TEST(BasisHessians, SerendipityPyramidExactApexHessianThrows) {
-    SerendipityBasis basis(ElementType::Pyramid13, 2);
-    std::vector<Hessian> hessians;
-    EXPECT_THROW(basis.evaluate_hessians({Real(0), Real(0), Real(1)}, hessians),
-                 BasisEvaluationException);
-}
-
 TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
     const struct Case {
         ElementType type;
diff --git a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
index a1031fa76..44e588fdc 100644
--- a/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_ConstexprBasis.cpp
@@ -1,21 +1,16 @@
 /**
  * @file test_ConstexprBasis.cpp
- * @brief Compile-time and lightweight runtime checks for migrated Basis helpers.
+ * @brief Compile-time and lightweight runtime checks for reduced Basis helpers.
  */
 
-#include "FE/Basis/BasisTolerance.h"
+#include "FE/Basis/BasisExceptions.h"
 #include "FE/Basis/BasisTraits.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/LagrangeBasisFast.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 
 #include <gtest/gtest.h>
 
-#include <array>
 #include <limits>
 #include <tuple>
-#include <type_traits>
-#include <utility>
 #include <vector>
 
 namespace svmp {
@@ -30,26 +25,27 @@ static_assert(is_quadrilateral(ElementType::Quad8));
 static_assert(is_tetrahedron(ElementType::Tetra10));
 static_assert(is_hexahedron(ElementType::Hex20));
 static_assert(is_wedge(ElementType::Wedge18));
-static_assert(is_pyramid(ElementType::Pyramid14));
+static_assert(!is_pyramid(ElementType::Pyramid5));
+static_assert(!is_pyramid(ElementType::Pyramid14));
 static_assert(is_simplex(ElementType::Triangle3));
 static_assert(is_simplex(ElementType::Tetra4));
 static_assert(!is_simplex(ElementType::Wedge6));
 static_assert(is_tensor_product(ElementType::Line2));
 static_assert(is_tensor_product(ElementType::Quad9));
 static_assert(is_tensor_product(ElementType::Hex27));
-static_assert(!is_tensor_product(ElementType::Pyramid5));
-static_assert(reference_dimension(ElementType::Pyramid14) == 3);
+static_assert(!is_tensor_product(ElementType::Wedge6));
+static_assert(topology(ElementType::Pyramid5) == BasisTopology::Unknown);
 static_assert(canonical_lagrange_type(ElementType::Hex27) == ElementType::Hex8);
 static_assert(canonical_lagrange_type(ElementType::Pyramid13) == ElementType::Pyramid13);
 static_assert(complete_lagrange_alias_order(ElementType::Wedge18) == 2);
-static_assert(complete_lagrange_alias_order(ElementType::Hex20) == -1);
+static_assert(complete_lagrange_alias_order(ElementType::Pyramid14) == -1);
 static_assert(line_lagrange_size(2) == 3u);
 static_assert(triangle_lagrange_size(2) == 6u);
 static_assert(quad_lagrange_size(2) == 9u);
 static_assert(tetra_lagrange_size(2) == 10u);
 static_assert(hex_lagrange_size(2) == 27u);
 static_assert(wedge_lagrange_size(2) == 18u);
-static_assert(pyramid_lagrange_size(2) == 14u);
+static_assert(complete_lagrange_alias_size(ElementType::Pyramid14) == 0u);
 static_assert(detail::basis_abs(Real(-2)) == Real(2));
 static_assert(detail::basis_max(Real(2), Real(3)) == Real(3));
 static_assert(detail::basis_near_zero(std::numeric_limits<Real>::epsilon() * Real(32)));
@@ -57,73 +53,7 @@ static_assert(detail::basis_nearly_equal(
     Real(1),
     Real(1) + std::numeric_limits<Real>::epsilon() * Real(32)));
 
-constexpr auto kLineFastValues = [] {
-    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
-    std::array<Real, LagrangeLineFast<1>::n_dofs> values{};
-    LagrangeLineFast<1>::evaluate(xi, values);
-    return values;
-}();
-static_assert(kLineFastValues[0] == Real(0.5));
-static_assert(kLineFastValues[1] == Real(0.5));
-
-constexpr auto kLineP2FastHessians = [] {
-    math::Vector<Real, 3> xi{Real(0), Real(0), Real(0)};
-    std::array<Hessian, LagrangeLineFast<2>::n_dofs> hessians{};
-    LagrangeLineFast<2>::evaluate_hessians(xi, hessians);
-    return hessians;
-}();
-static_assert(kLineP2FastHessians[0](0, 0) == Real(1));
-static_assert(kLineP2FastHessians[1](0, 0) == Real(1));
-static_assert(kLineP2FastHessians[2](0, 0) == Real(-2));
-
-constexpr auto kTriP2FastValues = [] {
-    math::Vector<Real, 3> xi{Real(0.25), Real(0.25), Real(0)};
-    std::array<Real, LagrangeTriFast<2>::n_dofs> values{};
-    LagrangeTriFast<2>::evaluate(xi, values);
-    return values;
-}();
-static_assert(kTriP2FastValues[0] == Real(0));
-static_assert(kTriP2FastValues[3] == Real(0.5));
-static_assert(kTriP2FastValues[4] == Real(0.25));
-
-template<typename Basis>
-constexpr bool overrides_scalar_strided_v =
-    !std::is_same_v<decltype(&Basis::evaluate_at_quadrature_points_strided),
-                    decltype(&BasisFunction::evaluate_at_quadrature_points_strided)>;
-
-template<typename FastBasis>
-void expect_fast_matches_lagrange(ElementType type,
-                                  int order,
-                                  const std::vector<math::Vector<Real, 3>>& points)
-{
-    LagrangeBasis basis(type, order);
-    for (const auto& xi : points) {
-        std::vector<Real> expected_values;
-        std::vector<Gradient> expected_gradients;
-        std::vector<Hessian> expected_hessians;
-        basis.evaluate_all(xi, expected_values, expected_gradients, expected_hessians);
-
-        std::array<Real, FastBasis::n_dofs> values{};
-        std::array<Gradient, FastBasis::n_dofs> gradients{};
-        std::array<Hessian, FastBasis::n_dofs> hessians{};
-        FastBasis::evaluate(xi, values);
-        FastBasis::evaluate_gradients(xi, gradients);
-        FastBasis::evaluate_hessians(xi, hessians);
-
-        ASSERT_EQ(expected_values.size(), values.size());
-        for (std::size_t i = 0; i < values.size(); ++i) {
-            EXPECT_NEAR(values[i], expected_values[i], Real(1e-14));
-            for (std::size_t d = 0; d < 3u; ++d) {
-                EXPECT_NEAR(gradients[i][d], expected_gradients[i][d], Real(1e-14));
-                for (std::size_t e = 0; e < 3u; ++e) {
-                    EXPECT_NEAR(hessians[i](d, e), expected_hessians[i](d, e), Real(1e-14));
-                }
-            }
-        }
-    }
-}
-
-TEST(ConstexprBasis, FixedNodeTableSizes) {
+TEST(ConstexprBasis, FixedNodeTableSizesForSupportedLayouts) {
     const std::vector<std::pair<ElementType, std::size_t>> expected = {
         {ElementType::Line2, 2u},
         {ElementType::Line3, 3u},
@@ -140,9 +70,6 @@ TEST(ConstexprBasis, FixedNodeTableSizes) {
         {ElementType::Wedge6, 6u},
         {ElementType::Wedge15, 15u},
         {ElementType::Wedge18, 18u},
-        {ElementType::Pyramid5, 5u},
-        {ElementType::Pyramid13, 13u},
-        {ElementType::Pyramid14, 14u},
     };
 
     for (const auto& [type, size] : expected) {
@@ -150,7 +77,7 @@ TEST(ConstexprBasis, FixedNodeTableSizes) {
     }
 }
 
-TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {
+TEST(ConstexprBasis, TraitToleranceScalesWithRealPrecision) {
     const Real eps = std::numeric_limits<Real>::epsilon();
     EXPECT_GT(detail::basis_scaled_tolerance(), eps);
     EXPECT_TRUE(detail::basis_near_zero(eps * Real(32)));
@@ -159,37 +86,6 @@ TEST(ConstexprBasis, BasisToleranceScalesWithRealPrecision) {
     EXPECT_FALSE(detail::basis_nearly_equal(Real(1), Real(1) + eps * Real(128)));
 }
 
-TEST(ConstexprBasis, LagrangeOverridesStridedEvaluation) {
-    EXPECT_TRUE(overrides_scalar_strided_v<LagrangeBasis>);
-}
-
-TEST(ConstexprBasis, FastSidecarsMatchRuntimeLagrangeBasis) {
-    expect_fast_matches_lagrange<LagrangeLineFast<1>>(
-        ElementType::Line2, 1,
-        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeLineFast<2>>(
-        ElementType::Line2, 2,
-        {{Real(-0.2), Real(0), Real(0)}, {Real(0.35), Real(0), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeQuadFast<1>>(
-        ElementType::Quad4, 1,
-        {{Real(-0.2), Real(0.3), Real(0)}, {Real(0.35), Real(-0.45), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeHexFast<1>>(
-        ElementType::Hex8, 1,
-        {{Real(-0.2), Real(0.3), Real(0.1)}, {Real(0.35), Real(-0.45), Real(0.25)}});
-    expect_fast_matches_lagrange<LagrangeTriFast<1>>(
-        ElementType::Triangle3, 1,
-        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeTriFast<2>>(
-        ElementType::Triangle3, 2,
-        {{Real(0.2), Real(0.3), Real(0)}, {Real(0.1), Real(0.6), Real(0)}});
-    expect_fast_matches_lagrange<LagrangeTetFast<1>>(
-        ElementType::Tetra4, 1,
-        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
-    expect_fast_matches_lagrange<LagrangeTetFast<2>>(
-        ElementType::Tetra4, 2,
-        {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.1), Real(0.2), Real(0.4)}});
-}
-
 TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
     const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
         {ElementType::Line2, ElementType::Line2, 1},
@@ -204,8 +100,6 @@ TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
         {ElementType::Hex27, ElementType::Hex8, 2},
         {ElementType::Wedge6, ElementType::Wedge6, 1},
         {ElementType::Wedge18, ElementType::Wedge6, 2},
-        {ElementType::Pyramid5, ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
     };
 
     for (const auto& [alias, canonical_type, order] : aliases) {
@@ -220,6 +114,15 @@ TEST(ConstexprBasis, CompleteAliasTablesMatchGeneratedLagrangeNodes) {
     }
 }
 
+TEST(ConstexprBasis, PyramidNodeOrderingIsOutsideCurrentScope) {
+    // Node-count and Lagrange-node queries on pyramid layouts are expected to throw.
+    for (const ElementType pyramid : {ElementType::Pyramid5, ElementType::Pyramid13}) {
+        EXPECT_THROW((void)ReferenceNodeLayout::num_nodes(pyramid), BasisNodeOrderingException);
+    }
+    EXPECT_THROW((void)ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Pyramid5, 1),
+                 BasisNodeOrderingException);
+}
+
 } // namespace
 } // namespace basis
 } // namespace FE
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
similarity index 64%
rename from tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
rename to tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 26efc4070..3faffd9e0 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedgePyramid.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -1,6 +1,6 @@
 /**
- * @file test_HigherOrderWedgePyramid.cpp
- * @brief Focused higher-order wedge and pyramid checks for LagrangeBasis.
+ * @file test_HigherOrderWedge.cpp
+ * @brief Focused higher-order wedge checks for LagrangeBasis.
  */
 
 #include <gtest/gtest.h>
@@ -9,8 +9,6 @@
 #include "FE/Basis/NodeOrderingConventions.h"
 
 #include <cmath>
-#include <tuple>
-#include <utility>
 #include <vector>
 
 using namespace svmp::FE;
@@ -107,28 +105,18 @@ void expect_all_entries_finite(const LagrangeBasis& basis,
 
 } // namespace
 
-TEST(HigherOrderWedgePyramid, CompleteAliasesMatchGeneratedNodeLayouts) {
-    const std::vector<std::tuple<ElementType, ElementType, int>> cases = {
-        {ElementType::Wedge18, ElementType::Wedge6, 2},
-        {ElementType::Pyramid14, ElementType::Pyramid5, 2},
-    };
-
-    for (const auto& [alias, canonical, order] : cases) {
-        LagrangeBasis alias_basis(alias, order);
-        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
-        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
-        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
-
-        for (std::size_t i = 0; i < generated.size(); ++i) {
-            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
-            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node " << i;
-            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node " << i;
-            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node " << i;
-        }
-    }
+TEST(HigherOrderWedge, CompleteAliasMatchesGeneratedNodeLayout) {
+    // NOTE(review): requests order 1 but asserts order() == 2; confirm the alias overrides it.
+    LagrangeBasis alias_basis(ElementType::Wedge18, 1);
+    const auto generated =
+        ReferenceNodeLayout::get_lagrange_node_coords(ElementType::Wedge6, 2);
+    ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(ElementType::Wedge18));
+    EXPECT_EQ(alias_basis.element_type(), ElementType::Wedge6);
+    EXPECT_EQ(alias_basis.order(), 2);
+    expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));
+}
 
-TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
+TEST(HigherOrderWedge, OrderThreeIsNodalAndPartitionsUnity) {
     LagrangeBasis wedge(ElementType::Wedge6, 3);
 
     expect_kronecker_at_nodes(wedge, Real(2e-10));
@@ -143,31 +131,9 @@ TEST(HigherOrderWedgePyramid, WedgeOrderThreeIsNodalAndPartitionsUnity) {
         Real(1e-9));
 }
 
-TEST(HigherOrderWedgePyramid, PyramidOrderThreeIsNodalAndPartitionsUnity) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, 3);
+TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
+    LagrangeBasis wedge(ElementType::Wedge6, 4);
 
-    expect_kronecker_at_nodes(pyramid, Real(5e-8));
-    expect_partition_gradient_hessian_sums(
-        pyramid,
-        {
-            {Real(0), Real(0), Real(0.2)},
-            {Real(0.12), Real(-0.08), Real(0.24)},
-            {Real(-0.08), Real(0.1), Real(0.55)},
-        },
-        Real(1e-11),
-        Real(5e-7));
-}
-
-TEST(HigherOrderWedgePyramid, PyramidNearApexDerivativeQueriesRemainFinite) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    for (const auto& [type, order] : cases) {
-        LagrangeBasis basis(type, order);
-        expect_all_entries_finite(basis, {Real(0.01), Real(-0.005), Real(0.92)});
-        expect_all_entries_finite(basis, {Real(-0.004), Real(0.007), Real(0.98)});
-    }
+    expect_all_entries_finite(wedge, {Real(0.2), Real(0.1), Real(-0.6)});
+    expect_all_entries_finite(wedge, {Real(0.05), Real(0.8), Real(0.3)});
 }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index a88d860e9..9d93f8931 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -1,2249 +1,98 @@
 /**
  * @file test_LagrangeBasis.cpp
- * @brief Unit tests for Lagrange basis functions
+ * @brief Unit tests for the reduced scalar Lagrange basis implementation.
  */
 
 #include <gtest/gtest.h>
-#include "FE/Basis/BasisFactory.h"
-#include "FE/Basis/LagrangeBasis.h"
-#include "FE/Basis/NodeOrderingConventions.h"
-#include "FE/Basis/SerendipityBasis.h"
-#include "fs.h"
-#include "nn.h"
-#include <array>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include <map>
-#include <math.h>
-#include <numeric>
-#include <string>
-#include <vector>
-
-namespace legacy_solver_nn {
-using namespace consts;
-#include "nn_elem_gip.h"
-#include "nn_elem_gnn.h"
-#include "nn_elem_gnnxx.h"
-} // namespace legacy_solver_nn
-
-using svmp::FE::basis::LagrangeBasis;
-using svmp::FE::ElementType;
-using svmp::FE::Real;
-using svmp::FE::basis::Gradient;
-using svmp::FE::basis::Hessian;
-using svmp::FE::basis::ReferenceNodeLayout;
-
-namespace {
-
-using Point = svmp::FE::math::Vector<Real, 3>;
-
-struct SolverBasisAdapterCase {
-    consts::ElementType type;
-    consts::ElementType quadrature_type;
-    int insd;
-    int eNoN;
-    int nG;
-};
-
-std::vector<SolverBasisAdapterCase> solver_basis_adapter_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
-        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
-        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::QUD8, ElementType::QUD9, 2, 8, 9},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-        {ElementType::TET4, ElementType::TET4, 3, 4, 4},
-        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
-        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
-        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
-        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
-        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
-    };
-}
-
-std::vector<SolverBasisAdapterCase> solver_face_basis_adapter_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::LIN1, ElementType::LIN1, 1, 2, 2},
-        {ElementType::LIN2, ElementType::LIN2, 1, 3, 3},
-        {ElementType::TRI3, ElementType::TRI3, 2, 3, 3},
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::QUD8, ElementType::QUD8, 2, 8, 9},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-    };
-}
-
-std::vector<SolverBasisAdapterCase> solver_hessian_adapter_cases() {
-    return solver_basis_adapter_cases();
-}
-
-std::vector<SolverBasisAdapterCase> solver_legacy_hessian_parity_cases() {
-    using consts::ElementType;
-    return {
-        {ElementType::TRI6, ElementType::TRI6, 2, 6, 7},
-        {ElementType::QUD9, ElementType::QUD9, 2, 9, 9},
-        {ElementType::TET10, ElementType::TET10, 3, 10, 15},
-    };
-}
-
-int packed_hessian_components(int insd) {
-    if (insd == 1) {
-        return 1;
-    }
-    if (insd == 2) {
-        return 3;
-    }
-    return 6;
-}
-
-void fill_legacy_quadrature(const SolverBasisAdapterCase& c,
-                            Vector<double>& w,
-                            Array<double>& xi) {
-    mshType mesh;
-    mesh.eType = c.quadrature_type;
-    mesh.eNoN = c.eNoN;
-    mesh.nG = c.nG;
-    mesh.w.resize(c.nG);
-    mesh.xi.resize(c.insd, c.nG);
-    legacy_solver_nn::set_element_gauss_int_data.at(c.quadrature_type)(mesh);
-    w = mesh.w;
-    xi = mesh.xi;
-}
-
-faceType initialized_face_for_case(const SolverBasisAdapterCase& c) {
-    faceType face;
-    face.eType = c.type;
-    face.eNoN = c.eNoN;
-    face.nG = c.nG;
-    face.w.resize(c.nG);
-    face.xi.resize(c.insd, c.nG);
-    legacy_solver_nn::set_face_gauss_int_data.at(c.quadrature_type)(face);
-    face.N.resize(c.eNoN, c.nG);
-    face.Nx.resize(c.insd, c.eNoN, c.nG);
-    return face;
-}
-
-void expect_arrays_near(const Array<double>& actual,
-                        const Array<double>& expected,
-                        double tol) {
-    ASSERT_EQ(actual.nrows(), expected.nrows());
-    ASSERT_EQ(actual.ncols(), expected.ncols());
-    for (int col = 0; col < actual.ncols(); ++col) {
-        for (int row = 0; row < actual.nrows(); ++row) {
-            EXPECT_NEAR(actual(row, col), expected(row, col), tol)
-                << "row=" << row << ", col=" << col;
-        }
-    }
-}
-
-void expect_vectors_near(const Vector<double>& actual,
-                         const Vector<double>& expected,
-                         double tol) {
-    ASSERT_EQ(actual.size(), expected.size());
-    for (int i = 0; i < actual.size(); ++i) {
-        EXPECT_NEAR(actual(i), expected(i), tol) << "index=" << i;
-    }
-}
-
-void expect_array3_near(const Array3<double>& actual,
-                        const Array3<double>& expected,
-                        double tol) {
-    ASSERT_EQ(actual.nrows(), expected.nrows());
-    ASSERT_EQ(actual.ncols(), expected.ncols());
-    ASSERT_EQ(actual.nslices(), expected.nslices());
-    for (int slice = 0; slice < actual.nslices(); ++slice) {
-        for (int col = 0; col < actual.ncols(); ++col) {
-            for (int row = 0; row < actual.nrows(); ++row) {
-                EXPECT_NEAR(actual(row, col, slice), expected(row, col, slice), tol)
-                    << "row=" << row << ", col=" << col << ", slice=" << slice;
-            }
-        }
-    }
-}
-
-void fill_array3(Array3<double>& values, double value) {
-    for (int slice = 0; slice < values.nslices(); ++slice) {
-        for (int col = 0; col < values.ncols(); ++col) {
-            for (int row = 0; row < values.nrows(); ++row) {
-                values(row, col, slice) = value;
-            }
-        }
-    }
-}
-
-void expect_face_partition_identities(const SolverBasisAdapterCase& c,
-                                      const faceType& face,
-                                      int g,
-                                      double tol) {
-    double partition = 0.0;
-    std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
-
-    for (int a = 0; a < c.eNoN; ++a) {
-        EXPECT_TRUE(std::isfinite(face.N(a, g)))
-            << "element=" << static_cast<int>(c.type)
-            << ", node=" << a
-            << ", g=" << g;
-        partition += face.N(a, g);
-
-        for (int d = 0; d < c.insd; ++d) {
-            EXPECT_TRUE(std::isfinite(face.Nx(d, a, g)))
-                << "element=" << static_cast<int>(c.type)
-                << ", d=" << d
-                << ", node=" << a
-                << ", g=" << g;
-            gradient_sum[static_cast<std::size_t>(d)] += face.Nx(d, a, g);
-        }
-    }
-
-    EXPECT_NEAR(partition, 1.0, tol)
-        << "element=" << static_cast<int>(c.type) << ", g=" << g;
-    for (int d = 0; d < c.insd; ++d) {
-        EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
-            << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
-    }
-}
-
-bool array3_has_nonzero_component(const Array3<double>& values,
-                                  int row,
-                                  double tol) {
-    for (int slice = 0; slice < values.nslices(); ++slice) {
-        for (int col = 0; col < values.ncols(); ++col) {
-            if (std::abs(values(row, col, slice)) > tol) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-Array<double> single_point_xi(const SolverBasisAdapterCase& c,
-                              const Array<double>& xi,
-                              int g) {
-    Array<double> point(c.insd, 1);
-    for (int d = 0; d < c.insd; ++d) {
-        point(d, 0) = xi(d, g);
-    }
-    return point;
-}
-
-std::vector<double> finite_difference_solver_second_derivative(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& point,
-    int gradient_component,
-    int coordinate_component,
-    double eps) {
-    Array<double> xi_plus = point;
-    Array<double> xi_minus = point;
-    xi_plus(coordinate_component, 0) += eps;
-    xi_minus(coordinate_component, 0) -= eps;
-
-    Array<double> N_plus(c.eNoN, 1);
-    Array<double> N_minus(c.eNoN, 1);
-    Array3<double> Nx_plus(c.insd, c.eNoN, 1);
-    Array3<double> Nx_minus(c.insd, c.eNoN, 1);
-
-    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_plus, N_plus, Nx_plus);
-    nn::get_gnn(c.insd, c.type, c.eNoN, 0, xi_minus, N_minus, Nx_minus);
-
-    std::vector<double> values(static_cast<std::size_t>(c.eNoN));
-    for (int a = 0; a < c.eNoN; ++a) {
-        values[static_cast<std::size_t>(a)] =
-            (Nx_plus(gradient_component, a, 0) - Nx_minus(gradient_component, a, 0)) /
-            (2.0 * eps);
-    }
-    return values;
-}
-
-void expect_packed_hessian_component_matches_finite_difference(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& point,
-    const Array3<double>& Nxx,
-    int g,
-    int packed_row,
-    int first_derivative_component,
-    int second_derivative_component,
-    double tol) {
-    const double eps = 2e-6;
-    const auto numerical = finite_difference_solver_second_derivative(
-        c, point, first_derivative_component, second_derivative_component, eps);
-    for (int a = 0; a < c.eNoN; ++a) {
-        EXPECT_NEAR(Nxx(packed_row, a, g), numerical[static_cast<std::size_t>(a)], tol)
-            << "element=" << static_cast<int>(c.type)
-            << ", packed_row=" << packed_row
-            << ", node=" << a
-            << ", g=" << g;
-    }
-
-    if (first_derivative_component != second_derivative_component) {
-        const auto symmetric_numerical = finite_difference_solver_second_derivative(
-            c, point, second_derivative_component, first_derivative_component, eps);
-        for (int a = 0; a < c.eNoN; ++a) {
-            EXPECT_NEAR(Nxx(packed_row, a, g),
-                        symmetric_numerical[static_cast<std::size_t>(a)],
-                        tol)
-                << "element=" << static_cast<int>(c.type)
-                << ", symmetry packed_row=" << packed_row
-                << ", node=" << a
-                << ", g=" << g;
-        }
-    }
-}
-
-void expect_solver_hessian_matches_gradient_finite_difference(
-    const SolverBasisAdapterCase& c,
-    const Array<double>& xi,
-    int g,
-    const Array3<double>& Nxx,
-    double tol) {
-    const Array<double> point = single_point_xi(c, xi, g);
-
-    expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 0, 0, 0, tol);
-    if (c.insd >= 2) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 1, 1, 1, tol);
-    }
-    if (c.insd == 2) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 0, 1, tol);
-    } else if (c.insd >= 3) {
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 2, 2, 2, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 3, 0, 1, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 4, 1, 2, tol);
-        expect_packed_hessian_component_matches_finite_difference(c, point, Nxx, g, 5, 0, 2, tol);
-    }
-}
-
-void expect_partition_hessian_identity(const SolverBasisAdapterCase& c,
-                                       const Array3<double>& Nxx,
-                                       int g,
-                                       double tol) {
-    for (int row = 0; row < Nxx.nrows(); ++row) {
-        double sum = 0.0;
-        for (int a = 0; a < c.eNoN; ++a) {
-            sum += Nxx(row, a, g);
-        }
-        EXPECT_NEAR(sum, 0.0, tol)
-            << "element=" << static_cast<int>(c.type)
-            << ", packed_row=" << row
-            << ", g=" << g;
-    }
-}
-
-void expect_all_hessians_zero(const SolverBasisAdapterCase& c,
-                              const Array3<double>& Nxx,
-                              int g,
-                              double tol) {
-    for (int row = 0; row < Nxx.nrows(); ++row) {
-        for (int a = 0; a < c.eNoN; ++a) {
-            EXPECT_NEAR(Nxx(row, a, g), 0.0, tol)
-                << "element=" << static_cast<int>(c.type)
-                << ", packed_row=" << row
-                << ", node=" << a
-                << ", g=" << g;
-        }
-    }
-}
-
-mshType initialized_mesh_for_case(const SolverBasisAdapterCase& c, bool force_lShpF) {
-    mshType mesh;
-    mesh.nFs = 1;
-    mesh.eType = c.type;
-    mesh.eNoN = c.eNoN;
-    mesh.nG = c.nG;
-    mesh.lShpF = force_lShpF;
-    mesh.w.resize(c.nG);
-    mesh.xi.resize(c.insd, c.nG);
-    mesh.N.resize(c.eNoN, c.nG);
-    mesh.Nx.resize(c.insd, c.eNoN, c.nG);
-    mesh.xib.resize(2, c.insd);
-    mesh.Nb.resize(2, c.eNoN);
-
-    nn::get_gip(c.insd, c.quadrature_type, c.nG, mesh.w, mesh.xi);
-    for (int g = 0; g < c.nG; ++g) {
-        nn::get_gnn(c.insd, c.type, c.eNoN, g, mesh.xi, mesh.N, mesh.Nx);
-    }
-    nn::get_nn_bnds(c.insd, c.type, c.eNoN, mesh.xib, mesh.Nb);
-    return mesh;
-}
-
-enum class PyramidFace {
-    Base,
-    South,
-    East,
-    North,
-    West
-};
-
-enum class PyramidEdge {
-    BaseSouth,
-    BaseEast,
-    BaseNorth,
-    BaseWest,
-    VerticalSW,
-    VerticalSE,
-    VerticalNE,
-    VerticalNW
-};
-
-struct LagrangeAccuracyCase {
-    ElementType type;
-    int order;
-    std::vector<Point> points;
-};
-
-std::size_t expected_lagrange_size(ElementType type, int order) {
-    switch (type) {
-        case ElementType::Point1:
-            return 1u;
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return static_cast<std::size_t>(order + 1);
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 2) / 2;
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return static_cast<std::size_t>(order + 1) * static_cast<std::size_t>(order + 1);
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) *
-                   static_cast<std::size_t>(order + 3) / 6;
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1);
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) / 2;
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return static_cast<std::size_t>(order + 1) *
-                   static_cast<std::size_t>(order + 2) *
-                   static_cast<std::size_t>(2 * order + 3) / 6;
-        default:
-            return 0u;
-    }
-}
-
-int expected_dimension(ElementType type) {
-    switch (type) {
-        case ElementType::Point1:
-            return 0;
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return 1;
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return 2;
-        default:
-            return 3;
-    }
-}
-
-bool points_close(const Point& a,
-                  const Point& b,
-                  Real tol = Real(1e-12)) {
-    return std::abs(a[0] - b[0]) <= tol &&
-           std::abs(a[1] - b[1]) <= tol &&
-           std::abs(a[2] - b[2]) <= tol;
-}
-
-std::vector<Point> reference_node_coords(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-            return {
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-            };
-        case ElementType::Line3:
-            return {
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Triangle3:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-            };
-        case ElementType::Triangle6:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-            };
-        case ElementType::Quad4:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-            };
-        case ElementType::Quad8:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-            };
-        case ElementType::Quad9:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Tetra4:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-            };
-        case ElementType::Tetra10:
-            return {
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-                Point{Real(0), Real(0), Real(0.5)},
-                Point{Real(0.5), Real(0), Real(0.5)},
-                Point{Real(0), Real(0.5), Real(0.5)},
-            };
-        case ElementType::Hex8:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-            };
-        case ElementType::Hex20:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-                Point{Real(0), Real(-1), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(-1), Real(0), Real(-1)},
-                Point{Real(0), Real(-1), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(-1), Real(0), Real(1)},
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-            };
-        case ElementType::Hex27:
-            return {
-                Point{Real(-1), Real(-1), Real(-1)},
-                Point{Real(1), Real(-1), Real(-1)},
-                Point{Real(1), Real(1), Real(-1)},
-                Point{Real(-1), Real(1), Real(-1)},
-                Point{Real(-1), Real(-1), Real(1)},
-                Point{Real(1), Real(-1), Real(1)},
-                Point{Real(1), Real(1), Real(1)},
-                Point{Real(-1), Real(1), Real(1)},
-                Point{Real(0), Real(-1), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(-1), Real(0), Real(-1)},
-                Point{Real(0), Real(-1), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(-1), Real(0), Real(1)},
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        case ElementType::Wedge6:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-            };
-        case ElementType::Wedge15:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(0.5), Real(0), Real(-1)},
-                Point{Real(0.5), Real(0.5), Real(-1)},
-                Point{Real(0), Real(0.5), Real(-1)},
-                Point{Real(0.5), Real(0), Real(1)},
-                Point{Real(0.5), Real(0.5), Real(1)},
-                Point{Real(0), Real(0.5), Real(1)},
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-            };
-        case ElementType::Wedge18:
-            return {
-                Point{Real(0), Real(0), Real(-1)},
-                Point{Real(1), Real(0), Real(-1)},
-                Point{Real(0), Real(1), Real(-1)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(1), Real(0), Real(1)},
-                Point{Real(0), Real(1), Real(1)},
-                Point{Real(0.5), Real(0), Real(-1)},
-                Point{Real(0.5), Real(0.5), Real(-1)},
-                Point{Real(0), Real(0.5), Real(-1)},
-                Point{Real(0.5), Real(0), Real(1)},
-                Point{Real(0.5), Real(0.5), Real(1)},
-                Point{Real(0), Real(0.5), Real(1)},
-                Point{Real(0), Real(0), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(0.5), Real(0), Real(0)},
-                Point{Real(0.5), Real(0.5), Real(0)},
-                Point{Real(0), Real(0.5), Real(0)},
-            };
-        case ElementType::Pyramid5:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-            };
-        case ElementType::Pyramid13:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(-0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(0.5), Real(0.5)},
-                Point{Real(-0.5), Real(0.5), Real(0.5)},
-            };
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(-1), Real(-1), Real(0)},
-                Point{Real(1), Real(-1), Real(0)},
-                Point{Real(1), Real(1), Real(0)},
-                Point{Real(-1), Real(1), Real(0)},
-                Point{Real(0), Real(0), Real(1)},
-                Point{Real(0), Real(-1), Real(0)},
-                Point{Real(1), Real(0), Real(0)},
-                Point{Real(0), Real(1), Real(0)},
-                Point{Real(-1), Real(0), Real(0)},
-                Point{Real(-0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(-0.5), Real(0.5)},
-                Point{Real(0.5), Real(0.5), Real(0.5)},
-                Point{Real(-0.5), Real(0.5), Real(0.5)},
-                Point{Real(0), Real(0), Real(0)},
-            };
-        default:
-            return {};
-    }
-}
-
-void expect_nodes_match_node_ordering(ElementType canonical_type,
-                                      int order,
-                                      ElementType node_ordering_type) {
-    LagrangeBasis basis(canonical_type, order);
-    const auto& nodes = basis.nodes();
-
-    ASSERT_EQ(nodes.size(), ReferenceNodeLayout::num_nodes(node_ordering_type));
-    ASSERT_EQ(nodes.size(), basis.size());
-
-    for (std::size_t i = 0; i < nodes.size(); ++i) {
-        const auto expected = ReferenceNodeLayout::get_node_coords(node_ordering_type, i);
-        EXPECT_NEAR(nodes[i][0], expected[0], 1e-14);
-        EXPECT_NEAR(nodes[i][1], expected[1], 1e-14);
-        EXPECT_NEAR(nodes[i][2], expected[2], 1e-14);
-
-        std::vector<Real> vals;
-        basis.evaluate_values(expected, vals);
-        ASSERT_EQ(vals.size(), nodes.size());
-        for (std::size_t j = 0; j < vals.size(); ++j) {
-            const double expected_delta = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(vals[j], expected_delta, 1e-12);
-        }
-    }
-}
-
-void expect_alias_matches_canonical(ElementType alias_type,
-                                    ElementType canonical_type,
-                                    int canonical_order,
-                                    const std::vector<Point>& points,
-                                    Real tol = Real(1e-12)) {
-    LagrangeBasis alias(alias_type, canonical_order);
-    LagrangeBasis canonical(canonical_type, canonical_order);
-
-    ASSERT_EQ(alias.element_type(), canonical.element_type());
-    ASSERT_EQ(alias.order(), canonical.order());
-    ASSERT_EQ(alias.size(), canonical.size());
-    ASSERT_EQ(alias.nodes().size(), canonical.nodes().size());
-
-    for (std::size_t i = 0; i < alias.nodes().size(); ++i) {
-        EXPECT_NEAR(alias.nodes()[i][0], canonical.nodes()[i][0], tol);
-        EXPECT_NEAR(alias.nodes()[i][1], canonical.nodes()[i][1], tol);
-        EXPECT_NEAR(alias.nodes()[i][2], canonical.nodes()[i][2], tol);
-    }
-
-    for (const auto& xi : points) {
-        std::vector<Real> alias_values;
-        std::vector<Real> canonical_values;
-        std::vector<Gradient> alias_gradients;
-        std::vector<Gradient> canonical_gradients;
-        std::vector<Hessian> alias_hessians;
-        std::vector<Hessian> canonical_hessians;
-
-        alias.evaluate_values(xi, alias_values);
-        canonical.evaluate_values(xi, canonical_values);
-        alias.evaluate_gradients(xi, alias_gradients);
-        canonical.evaluate_gradients(xi, canonical_gradients);
-        alias.evaluate_hessians(xi, alias_hessians);
-        canonical.evaluate_hessians(xi, canonical_hessians);
-
-        ASSERT_EQ(alias_values.size(), canonical_values.size());
-        ASSERT_EQ(alias_gradients.size(), canonical_gradients.size());
-        ASSERT_EQ(alias_hessians.size(), canonical_hessians.size());
-
-        for (std::size_t i = 0; i < alias_values.size(); ++i) {
-            EXPECT_NEAR(alias_values[i], canonical_values[i], tol);
-            for (int d = 0; d < canonical.dimension(); ++d) {
-                const std::size_t sd = static_cast<std::size_t>(d);
-                EXPECT_NEAR(alias_gradients[i][sd], canonical_gradients[i][sd], tol);
-                for (int e = 0; e < canonical.dimension(); ++e) {
-                    const std::size_t se = static_cast<std::size_t>(e);
-                    EXPECT_NEAR(alias_hessians[i](sd, se), canonical_hessians[i](sd, se), Real(5) * tol);
-                }
-            }
-        }
-    }
-}
-
-std::vector<Point> sample_points_for(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return {
-                Point{Real(-0.7), Real(0), Real(0)},
-                Point{Real(0.1), Real(0), Real(0)},
-                Point{Real(0.65), Real(0), Real(0)}
-            };
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return {
-                Point{Real(0.15), Real(0.2), Real(0)},
-                Point{Real(0.25), Real(0.1), Real(0)},
-                Point{Real(0.2), Real(0.3), Real(0)}
-            };
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return {
-                Point{Real(0.2), Real(-0.35), Real(0)},
-                Point{Real(-0.4), Real(0.25), Real(0)},
-                Point{Real(0.55), Real(0.1), Real(0)}
-            };
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return {
-                Point{Real(0.1), Real(0.2), Real(0.15)},
-                Point{Real(0.2), Real(0.1), Real(0.25)},
-                Point{Real(0.15), Real(0.15), Real(0.2)}
-            };
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return {
-                Point{Real(0.2), Real(-0.3), Real(0.25)},
-                Point{Real(-0.5), Real(0.4), Real(-0.2)},
-                Point{Real(0.1), Real(0.15), Real(0.6)}
-            };
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return {
-                Point{Real(0.2), Real(0.25), Real(0.0)},
-                Point{Real(0.1), Real(0.2), Real(-0.45)},
-                Point{Real(0.3), Real(0.15), Real(0.5)}
-            };
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(0.0), Real(0.0), Real(0.25)},
-                Point{Real(0.15), Real(-0.1), Real(0.3)},
-                Point{Real(-0.1), Real(0.2), Real(0.4)}
-            };
-        default:
-            return {Point{Real(0), Real(0), Real(0)}};
-    }
-}
-
-std::vector<Point> boundary_stress_points_for(ElementType type);
-
-std::vector<Point> dense_sample_points_for(ElementType type) {
-    const auto interior = sample_points_for(type);
-    const auto boundary = boundary_stress_points_for(type);
-
-    std::vector<Point> points;
-    points.reserve(interior.size() + boundary.size());
-    points.insert(points.end(), interior.begin(), interior.end());
-    points.insert(points.end(), boundary.begin(), boundary.end());
-
-    if (type == ElementType::Pyramid5 || type == ElementType::Pyramid14) {
-        points.push_back(Point{Real(0.0), Real(0.0), Real(0.85)});
-        points.push_back(Point{Real(0.02), Real(-0.015), Real(0.95)});
-    }
-    return points;
-}
-
-std::vector<Point> boundary_stress_points_for(ElementType type) {
-    switch (type) {
-        case ElementType::Line2:
-        case ElementType::Line3:
-            return {
-                Point{Real(-0.999), Real(0), Real(0)},
-                Point{Real(-0.75), Real(0), Real(0)},
-                Point{Real(0.0), Real(0), Real(0)},
-                Point{Real(0.8), Real(0), Real(0)},
-                Point{Real(0.999), Real(0), Real(0)}
-            };
-        case ElementType::Triangle3:
-        case ElementType::Triangle6:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(0)},
-                Point{Real(0.98), Real(0.01), Real(0)},
-                Point{Real(0.01), Real(0.98), Real(0)},
-                Point{Real(0.25), Real(1e-4), Real(0)},
-                Point{Real(0.49), Real(0.49), Real(0)}
-            };
-        case ElementType::Quad4:
-        case ElementType::Quad9:
-            return {
-                Point{Real(-0.99), Real(-0.99), Real(0)},
-                Point{Real(0.99), Real(-0.99), Real(0)},
-                Point{Real(0.99), Real(0.99), Real(0)},
-                Point{Real(-0.99), Real(0.99), Real(0)},
-                Point{Real(0.0), Real(0.95), Real(0)}
-            };
-        case ElementType::Tetra4:
-        case ElementType::Tetra10:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(1e-6)},
-                Point{Real(0.97), Real(0.01), Real(0.01)},
-                Point{Real(0.01), Real(0.97), Real(0.01)},
-                Point{Real(0.01), Real(0.01), Real(0.97)},
-                Point{Real(0.32), Real(0.33), Real(0.01)}
-            };
-        case ElementType::Hex8:
-        case ElementType::Hex27:
-            return {
-                Point{Real(-0.99), Real(-0.99), Real(-0.99)},
-                Point{Real(0.99), Real(-0.99), Real(0.99)},
-                Point{Real(0.99), Real(0.99), Real(-0.99)},
-                Point{Real(-0.99), Real(0.99), Real(0.99)},
-                Point{Real(0.0), Real(0.0), Real(0.95)}
-            };
-        case ElementType::Wedge6:
-        case ElementType::Wedge18:
-            return {
-                Point{Real(1e-6), Real(1e-6), Real(-0.99)},
-                Point{Real(0.98), Real(0.01), Real(-0.99)},
-                Point{Real(0.01), Real(0.98), Real(0.99)},
-                Point{Real(0.49), Real(0.49), Real(0.0)},
-                Point{Real(0.25), Real(1e-4), Real(0.95)}
-            };
-        case ElementType::Pyramid5:
-        case ElementType::Pyramid14:
-            return {
-                Point{Real(0.0), Real(0.0), Real(0.95)},
-                Point{Real(0.01), Real(-0.01), Real(0.98)},
-                Point{Real(0.6), Real(-0.6), Real(0.2)},
-                Point{Real(0.79), Real(0.0), Real(0.2)},
-                Point{Real(0.0), Real(0.79), Real(0.2)}
-            };
-        default:
-            return {Point{Real(0), Real(0), Real(0)}};
-    }
-}
-
-Real monomial_value(const Point& xi, int px, int py, int pz) {
-    return std::pow(xi[0], px) * std::pow(xi[1], py) * std::pow(xi[2], pz);
-}
-
-void expect_gradients_match_finite_difference(const LagrangeAccuracyCase& c,
-                                              Real eps,
-                                              Real tol) {
-    LagrangeBasis basis(c.type, c.order);
-
-    for (const auto& xi : c.points) {
-        std::vector<Gradient> gradients;
-        basis.evaluate_gradients(xi, gradients);
-        ASSERT_EQ(gradients.size(), basis.size());
-
-        for (int d = 0; d < basis.dimension(); ++d) {
-            Point xp = xi;
-            Point xm = xi;
-            xp[d] += eps;
-            xm[d] -= eps;
-
-            std::vector<Real> values_p;
-            std::vector<Real> values_m;
-            basis.evaluate_values(xp, values_p);
-            basis.evaluate_values(xm, values_m);
-
-            ASSERT_EQ(values_p.size(), basis.size());
-            ASSERT_EQ(values_m.size(), basis.size());
-            for (std::size_t i = 0; i < basis.size(); ++i) {
-                const Real fd = (values_p[i] - values_m[i]) / (Real(2) * eps);
-                EXPECT_NEAR(gradients[i][d], fd, tol)
-                    << "type=" << static_cast<int>(c.type)
-                    << ", order=" << c.order
-                    << ", dim=" << d
-                    << ", basis_i=" << i
-                    << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-            }
-        }
-    }
-}
-
-void expect_polynomial_reproduction(const LagrangeAccuracyCase& c,
-                                    const std::vector<std::array<int, 3>>& exponents,
-                                    Real tol) {
-    LagrangeBasis basis(c.type, c.order);
-    const auto& nodes = basis.nodes();
-    ASSERT_EQ(nodes.size(), basis.size());
-
-    for (const auto& exp : exponents) {
-        std::vector<Real> coeffs(basis.size(), Real(0));
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            coeffs[i] = monomial_value(nodes[i], exp[0], exp[1], exp[2]);
-        }
-
-        for (const auto& xi : c.points) {
-            std::vector<Real> values;
-            basis.evaluate_values(xi, values);
-            ASSERT_EQ(values.size(), basis.size());
-
-            Real interpolated = Real(0);
-            for (std::size_t i = 0; i < basis.size(); ++i) {
-                interpolated += coeffs[i] * values[i];
-            }
-
-            const Real exact = monomial_value(xi, exp[0], exp[1], exp[2]);
-            EXPECT_NEAR(interpolated, exact, tol)
-                << "type=" << static_cast<int>(c.type)
-                << ", order=" << c.order
-                << ", monomial=(" << exp[0] << "," << exp[1] << "," << exp[2] << ")"
-                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-        }
-    }
-}
-
-template<typename Container>
-void expect_all_finite(const Container& values) {
-    for (const auto& value : values) {
-        for (std::size_t d = 0; d < 3; ++d) {
-            EXPECT_TRUE(std::isfinite(value[d]));
-        }
-    }
-}
-
-void expect_hessians_finite(const std::vector<Hessian>& hessians,
-                            int dimension) {
-    for (const auto& H : hessians) {
-        for (int i = 0; i < dimension; ++i) {
-            for (int j = 0; j < dimension; ++j) {
-                EXPECT_TRUE(std::isfinite(H(static_cast<std::size_t>(i),
-                                            static_cast<std::size_t>(j))));
-            }
-        }
-    }
-}
-
-void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
-                                            const std::vector<Point>& points,
-                                            Real value_tol,
-                                            Real derivative_tol) {
-    for (const auto& xi : points) {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        basis.evaluate_values(xi, values);
-        basis.evaluate_gradients(xi, gradients);
-        basis.evaluate_hessians(xi, hessians);
-
-        ASSERT_EQ(values.size(), basis.size());
-        ASSERT_EQ(gradients.size(), basis.size());
-        ASSERT_EQ(hessians.size(), basis.size());
-
-        Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            value_sum += values[i];
-            for (int d = 0; d < basis.dimension(); ++d) {
-                const std::size_t sd = static_cast<std::size_t>(d);
-                gradient_sum[sd] += gradients[i][sd];
-                for (int e = 0; e < basis.dimension(); ++e) {
-                    const std::size_t se = static_cast<std::size_t>(e);
-                    hessian_sum(sd, se) += hessians[i](sd, se);
-                }
-            }
-        }
-
-        EXPECT_NEAR(value_sum, Real(1), value_tol)
-            << "Element type " << static_cast<int>(basis.element_type())
-            << ", order " << basis.order()
-            << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
-
-        for (int d = 0; d < basis.dimension(); ++d) {
-            const std::size_t sd = static_cast<std::size_t>(d);
-            EXPECT_NEAR(gradient_sum[sd], Real(0), derivative_tol)
-                << "Gradient sum mismatch for element type " << static_cast<int>(basis.element_type())
-                << ", order " << basis.order()
-                << ", dim " << d;
-            for (int e = 0; e < basis.dimension(); ++e) {
-                const std::size_t se = static_cast<std::size_t>(e);
-                EXPECT_NEAR(hessian_sum(sd, se), Real(0), derivative_tol)
-                    << "Hessian sum mismatch for element type " << static_cast<int>(basis.element_type())
-                    << ", order " << basis.order()
-                    << ", component (" << d << "," << e << ")";
-            }
-        }
-    }
-}
-
-bool is_on_pyramid_face(const Point& point,
-                        PyramidFace face,
-                        Real tol = Real(1e-12)) {
-    const Real scale = Real(1) - point[2];
-    switch (face) {
-        case PyramidFace::Base:
-            return std::abs(point[2]) <= tol;
-        case PyramidFace::South:
-            return std::abs(point[1] + scale) <= tol;
-        case PyramidFace::East:
-            return std::abs(point[0] - scale) <= tol;
-        case PyramidFace::North:
-            return std::abs(point[1] - scale) <= tol;
-        case PyramidFace::West:
-            return std::abs(point[0] + scale) <= tol;
-    }
-    return false;
-}
-
-Point map_pyramid_face_to_reference(PyramidFace face,
-                                    const Point& point) {
-    const Real scale = Real(1) - point[2];
-    switch (face) {
-        case PyramidFace::Base:
-            return Point{point[0], point[1], Real(0)};
-        case PyramidFace::South:
-            return Point{(scale - point[0]) / Real(2), point[2], Real(0)};
-        case PyramidFace::East:
-            return Point{(scale + point[1]) / Real(2), point[2], Real(0)};
-        case PyramidFace::North:
-            return Point{(scale + point[0]) / Real(2), point[2], Real(0)};
-        case PyramidFace::West:
-            return Point{(scale - point[1]) / Real(2), point[2], Real(0)};
-    }
-    return Point{};
-}
-
-std::vector<Point> sample_points_for_pyramid_face(PyramidFace face) {
-    switch (face) {
-        case PyramidFace::Base:
-            return {
-                Point{Real(0.15), Real(-0.2), Real(0)},
-                Point{Real(-0.55), Real(0.35), Real(0)}
-            };
-        case PyramidFace::South:
-            return {
-                Point{Real(-0.2), Real(-0.8), Real(0.2)},
-                Point{Real(0.05), Real(-0.35), Real(0.65)}
-            };
-        case PyramidFace::East:
-            return {
-                Point{Real(0.8), Real(-0.25), Real(0.2)},
-                Point{Real(0.3), Real(0.08), Real(0.7)}
-            };
-        case PyramidFace::North:
-            return {
-                Point{Real(0.25), Real(0.8), Real(0.2)},
-                Point{Real(-0.08), Real(0.35), Real(0.65)}
-            };
-        case PyramidFace::West:
-            return {
-                Point{Real(-0.8), Real(0.2), Real(0.2)},
-                Point{Real(-0.3), Real(-0.05), Real(0.7)}
-            };
-    }
-    return {};
-}
-
-bool is_on_pyramid_edge(const Point& point,
-                        PyramidEdge edge,
-                        Real tol = Real(1e-12)) {
-    const Real scale = Real(1) - point[2];
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-            return std::abs(point[2]) <= tol && std::abs(point[1] + Real(1)) <= tol;
-        case PyramidEdge::BaseEast:
-            return std::abs(point[2]) <= tol && std::abs(point[0] - Real(1)) <= tol;
-        case PyramidEdge::BaseNorth:
-            return std::abs(point[2]) <= tol && std::abs(point[1] - Real(1)) <= tol;
-        case PyramidEdge::BaseWest:
-            return std::abs(point[2]) <= tol && std::abs(point[0] + Real(1)) <= tol;
-        case PyramidEdge::VerticalSW:
-            return std::abs(point[0] + scale) <= tol && std::abs(point[1] + scale) <= tol;
-        case PyramidEdge::VerticalSE:
-            return std::abs(point[0] - scale) <= tol && std::abs(point[1] + scale) <= tol;
-        case PyramidEdge::VerticalNE:
-            return std::abs(point[0] - scale) <= tol && std::abs(point[1] - scale) <= tol;
-        case PyramidEdge::VerticalNW:
-            return std::abs(point[0] + scale) <= tol && std::abs(point[1] - scale) <= tol;
-    }
-    return false;
-}
-
-Point map_pyramid_edge_to_reference(PyramidEdge edge,
-                                    const Point& point) {
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-        case PyramidEdge::BaseNorth:
-            return Point{point[0], Real(0), Real(0)};
-        case PyramidEdge::BaseEast:
-        case PyramidEdge::BaseWest:
-            return Point{point[1], Real(0), Real(0)};
-        case PyramidEdge::VerticalSW:
-        case PyramidEdge::VerticalSE:
-        case PyramidEdge::VerticalNE:
-        case PyramidEdge::VerticalNW:
-            return Point{Real(2) * point[2] - Real(1), Real(0), Real(0)};
-    }
-    return Point{};
-}
-
-std::vector<Point> sample_points_for_pyramid_edge(PyramidEdge edge) {
-    switch (edge) {
-        case PyramidEdge::BaseSouth:
-            return {Point{Real(-0.65), Real(-1), Real(0)}, Point{Real(0.35), Real(-1), Real(0)}};
-        case PyramidEdge::BaseEast:
-            return {Point{Real(1), Real(-0.45), Real(0)}, Point{Real(1), Real(0.55), Real(0)}};
-        case PyramidEdge::BaseNorth:
-            return {Point{Real(-0.55), Real(1), Real(0)}, Point{Real(0.45), Real(1), Real(0)}};
-        case PyramidEdge::BaseWest:
-            return {Point{Real(-1), Real(-0.55), Real(0)}, Point{Real(-1), Real(0.45), Real(0)}};
-        case PyramidEdge::VerticalSW:
-            return {Point{Real(-0.75), Real(-0.75), Real(0.25)}, Point{Real(-0.3), Real(-0.3), Real(0.7)}};
-        case PyramidEdge::VerticalSE:
-            return {Point{Real(0.75), Real(-0.75), Real(0.25)}, Point{Real(0.3), Real(-0.3), Real(0.7)}};
-        case PyramidEdge::VerticalNE:
-            return {Point{Real(0.75), Real(0.75), Real(0.25)}, Point{Real(0.3), Real(0.3), Real(0.7)}};
-        case PyramidEdge::VerticalNW:
-            return {Point{Real(-0.75), Real(0.75), Real(0.25)}, Point{Real(-0.3), Real(0.3), Real(0.7)}};
-    }
-    return {};
-}
-
-std::vector<int> map_pyramid_nodes_to_lower_basis_nodes(
-    const std::vector<Point>& pyramid_nodes,
-    const std::vector<Point>& lower_basis_nodes,
-    const std::function<bool(const Point&)>& selector,
-    const std::function<Point(const Point&)>& mapper) {
-    std::vector<int> mapping(pyramid_nodes.size(), -1);
-    std::size_t face_count = 0;
-    for (std::size_t i = 0; i < pyramid_nodes.size(); ++i) {
-        if (!selector(pyramid_nodes[i])) {
-            continue;
-        }
-
-        ++face_count;
-        const Point mapped = mapper(pyramid_nodes[i]);
-        bool found = false;
-        for (std::size_t j = 0; j < lower_basis_nodes.size(); ++j) {
-            if (points_close(mapped, lower_basis_nodes[j])) {
-                mapping[i] = static_cast<int>(j);
-                found = true;
-                break;
-            }
-        }
-        EXPECT_TRUE(found)
-            << "Failed to match pyramid trace node at (" << pyramid_nodes[i][0] << ","
-            << pyramid_nodes[i][1] << "," << pyramid_nodes[i][2] << ")";
-    }
-
-    EXPECT_EQ(face_count, lower_basis_nodes.size());
-    return mapping;
-}
-
-void expect_pyramid_face_trace_matches_lower_basis(int order,
-                                                   PyramidFace face,
-                                                   Real tol = Real(2e-10)) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, order);
-    const bool base_face = face == PyramidFace::Base;
-    LagrangeBasis lower(base_face ? ElementType::Quad4 : ElementType::Triangle3, order);
-
-    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
-        pyramid.nodes(),
-        lower.nodes(),
-        [&](const Point& point) { return is_on_pyramid_face(point, face); },
-        [&](const Point& point) { return map_pyramid_face_to_reference(face, point); });
-
-    for (const auto& face_point : sample_points_for_pyramid_face(face)) {
-        std::vector<Real> pyramid_values;
-        std::vector<Real> lower_values;
-        pyramid.evaluate_values(face_point, pyramid_values);
-        lower.evaluate_values(map_pyramid_face_to_reference(face, face_point), lower_values);
-
-        ASSERT_EQ(pyramid_values.size(), pyramid.size());
-        ASSERT_EQ(lower_values.size(), lower.size());
-
-        for (std::size_t i = 0; i < pyramid.size(); ++i) {
-            if (mapping[i] >= 0) {
-                EXPECT_NEAR(pyramid_values[i], lower_values[static_cast<std::size_t>(mapping[i])], tol)
-                    << "Face trace mismatch for order " << order
-                    << ", face " << static_cast<int>(face)
-                    << ", basis " << i;
-            } else {
-                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
-                    << "Off-face pyramid basis should vanish on face for order " << order
-                    << ", face " << static_cast<int>(face)
-                    << ", basis " << i;
-            }
-        }
-    }
-}
-
-void expect_pyramid_edge_trace_matches_line_basis(int order,
-                                                  PyramidEdge edge,
-                                                  Real tol = Real(2e-10)) {
-    LagrangeBasis pyramid(ElementType::Pyramid5, order);
-    LagrangeBasis line(ElementType::Line2, order);
-
-    const auto mapping = map_pyramid_nodes_to_lower_basis_nodes(
-        pyramid.nodes(),
-        line.nodes(),
-        [&](const Point& point) { return is_on_pyramid_edge(point, edge); },
-        [&](const Point& point) { return map_pyramid_edge_to_reference(edge, point); });
-
-    for (const auto& edge_point : sample_points_for_pyramid_edge(edge)) {
-        std::vector<Real> pyramid_values;
-        std::vector<Real> line_values;
-        pyramid.evaluate_values(edge_point, pyramid_values);
-        line.evaluate_values(map_pyramid_edge_to_reference(edge, edge_point), line_values);
-
-        ASSERT_EQ(pyramid_values.size(), pyramid.size());
-        ASSERT_EQ(line_values.size(), line.size());
-
-        for (std::size_t i = 0; i < pyramid.size(); ++i) {
-            if (mapping[i] >= 0) {
-                EXPECT_NEAR(pyramid_values[i], line_values[static_cast<std::size_t>(mapping[i])], tol)
-                    << "Edge trace mismatch for order " << order
-                    << ", edge " << static_cast<int>(edge)
-                    << ", basis " << i;
-            } else {
-                EXPECT_NEAR(pyramid_values[i], Real(0), tol)
-                    << "Off-edge pyramid basis should vanish on edge for order " << order
-                    << ", edge " << static_cast<int>(edge)
-                    << ", basis " << i;
-            }
-        }
-    }
-}
-
-struct StridedOutputRequest {
-    bool values;
-    bool gradients;
-    bool hessians;
-};
-
-void expect_strided_matches_pointwise(ElementType type,
-                                      int order,
-                                      const StridedOutputRequest& request) {
-    LagrangeBasis basis(type, order);
-    const auto points = dense_sample_points_for(type);
-    const std::size_t stride = points.size() + 3u;
-    constexpr Real sentinel = Real(-12345.25);
-
-    std::vector<Real> values(request.values ? basis.size() * stride : 0u, sentinel);
-    std::vector<Real> gradients(request.gradients ? basis.size() * 3u * stride : 0u, sentinel);
-    std::vector<Real> hessians(request.hessians ? basis.size() * 9u * stride : 0u, sentinel);
-
-    basis.evaluate_at_quadrature_points_strided(
-        points,
-        stride,
-        request.values ? values.data() : nullptr,
-        request.gradients ? gradients.data() : nullptr,
-        request.hessians ? hessians.data() : nullptr);
-
-    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
-        ? Real(5e-10)
-        : Real(1e-12);
-
-    for (std::size_t q = 0; q < points.size(); ++q) {
-        if (request.values) {
-            std::vector<Real> expected;
-            basis.evaluate_values(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                EXPECT_NEAR(values[d * stride + q], expected[d], tol)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", dof=" << d
-                    << ", q=" << q;
-            }
-        }
-
-        if (request.gradients) {
-            std::vector<Gradient> expected;
-            basis.evaluate_gradients(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_NEAR(gradients[(d * 3u + c) * stride + q], expected[d][c], tol)
-                        << "type=" << static_cast<int>(type)
-                        << ", order=" << order
-                        << ", dof=" << d
-                        << ", component=" << c
-                        << ", q=" << q;
-                }
-            }
-        }
-
-        if (request.hessians) {
-            std::vector<Hessian> expected;
-            basis.evaluate_hessians(points[q], expected);
-            ASSERT_EQ(expected.size(), basis.size());
-            for (std::size_t d = 0; d < basis.size(); ++d) {
-                for (std::size_t r = 0; r < 3u; ++r) {
-                    for (std::size_t c = 0; c < 3u; ++c) {
-                        EXPECT_NEAR(hessians[(d * 9u + r * 3u + c) * stride + q],
-                                    expected[d](r, c),
-                                    Real(4) * tol)
-                            << "type=" << static_cast<int>(type)
-                            << ", order=" << order
-                            << ", dof=" << d
-                            << ", hessian=(" << r << "," << c << ")"
-                            << ", q=" << q;
-                    }
-                }
-            }
-        }
-    }
-
-    const auto expect_padding_untouched = [&](const std::vector<Real>& buffer,
-                                              std::size_t rows) {
-        for (std::size_t row = 0; row < rows; ++row) {
-            for (std::size_t q = points.size(); q < stride; ++q) {
-                EXPECT_EQ(buffer[row * stride + q], sentinel)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", row=" << row
-                    << ", padding q=" << q;
-            }
-        }
-    };
-
-    if (request.values) {
-        expect_padding_untouched(values, basis.size());
-    }
-    if (request.gradients) {
-        expect_padding_untouched(gradients, basis.size() * 3u);
-    }
-    if (request.hessians) {
-        expect_padding_untouched(hessians, basis.size() * 9u);
-    }
-}
-
-void expect_raw_to_matches_vector_evaluation(ElementType type, int order) {
-    LagrangeBasis basis(type, order);
-    const Real tol = (type == ElementType::Pyramid5 || type == ElementType::Pyramid14)
-        ? Real(5e-10)
-        : Real(1e-12);
-
-    for (const auto& point : sample_points_for(type)) {
-        std::vector<Real> values;
-        std::vector<Gradient> gradients;
-        std::vector<Hessian> hessians;
-        basis.evaluate_all(point, values, gradients, hessians);
-
-        std::vector<Real> raw_values(basis.size());
-        std::vector<Real> raw_gradients(basis.size() * 3u);
-        std::vector<Real> raw_hessians(basis.size() * 9u);
-        basis.evaluate_values_to(point, raw_values.data());
-        basis.evaluate_gradients_to(point, raw_gradients.data());
-        basis.evaluate_hessians_to(point, raw_hessians.data());
-
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            EXPECT_NEAR(raw_values[i], values[i], tol)
-                << "type=" << static_cast<int>(type) << ", order=" << order << ", dof=" << i;
-            for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_NEAR(raw_gradients[i * 3u + c], gradients[i][c], tol)
-                    << "type=" << static_cast<int>(type)
-                    << ", order=" << order
-                    << ", dof=" << i
-                    << ", gradient component=" << c;
-            }
-            for (std::size_t r = 0; r < 3u; ++r) {
-                for (std::size_t c = 0; c < 3u; ++c) {
-                    EXPECT_NEAR(raw_hessians[i * 9u + r * 3u + c], hessians[i](r, c), Real(4) * tol)
-                        << "type=" << static_cast<int>(type)
-                        << ", order=" << order
-                        << ", dof=" << i
-                        << ", hessian=(" << r << "," << c << ")";
-                }
-            }
-        }
-    }
-}
-
-} // namespace
-
-TEST(SolverBasisAdapter, ShapeValuesGradientsAndMeshOverloadMatchLegacy) {
-    constexpr double tol = 2e-12;
-
-    for (const auto& c : solver_basis_adapter_cases()) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        Array<double> legacy_N(c.eNoN, c.nG);
-        Array<double> adapter_N(c.eNoN, c.nG);
-        Array3<double> legacy_Nx(c.insd, c.eNoN, c.nG);
-        Array3<double> adapter_Nx(c.insd, c.eNoN, c.nG);
-        auto legacy_shape = legacy_solver_nn::get_element_shape_data.find(c.type);
-
-        faceType legacy_face;
-        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
-            ASSERT_EQ(c.type, consts::ElementType::QUD8);
-            legacy_face.eType = c.type;
-            legacy_face.eNoN = c.eNoN;
-            legacy_face.nG = c.nG;
-            legacy_face.xi = xi;
-            legacy_face.N.resize(c.eNoN, c.nG);
-            legacy_face.Nx.resize(c.insd, c.eNoN, c.nG);
-        }
-
-        for (int g = 0; g < c.nG; ++g) {
-            if (legacy_shape != legacy_solver_nn::get_element_shape_data.end()) {
-                legacy_shape->second(c.insd, c.eNoN, g, xi, legacy_N, legacy_Nx);
-            } else {
-                legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
-            }
-            nn::get_gnn(c.insd, c.type, c.eNoN, g, xi, adapter_N, adapter_Nx);
-
-            double partition = 0.0;
-            std::array<double, 3> gradient_sum{0.0, 0.0, 0.0};
-            for (int a = 0; a < c.eNoN; ++a) {
-                partition += adapter_N(a, g);
-                for (int d = 0; d < c.insd; ++d) {
-                    gradient_sum[static_cast<std::size_t>(d)] += adapter_Nx(d, a, g);
-                }
-            }
-
-            EXPECT_NEAR(partition, 1.0, tol)
-                << "element=" << static_cast<int>(c.type) << ", g=" << g;
-            for (int d = 0; d < c.insd; ++d) {
-                EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], 0.0, tol)
-                    << "element=" << static_cast<int>(c.type) << ", d=" << d << ", g=" << g;
-            }
-        }
-
-        if (legacy_shape == legacy_solver_nn::get_element_shape_data.end()) {
-            legacy_N = legacy_face.N;
-            legacy_Nx = legacy_face.Nx;
-        }
-
-        expect_arrays_near(adapter_N, legacy_N, tol);
-        expect_array3_near(adapter_Nx, legacy_Nx, tol);
-
-        mshType mesh;
-        mesh.eType = c.type;
-        mesh.eNoN = c.eNoN;
-        mesh.nG = c.nG;
-        mesh.xi = xi;
-        mesh.N.resize(c.eNoN, c.nG);
-        mesh.Nx.resize(c.insd, c.eNoN, c.nG);
-        for (int g = 0; g < c.nG; ++g) {
-            nn::get_gnn(g, mesh);
-        }
-
-        expect_arrays_near(mesh.N, legacy_N, tol);
-        expect_array3_near(mesh.Nx, legacy_Nx, tol);
-    }
-}
-
-TEST(SolverFaceBasisAdapter, ShapeValuesGradientsAndDispatchMatchLegacyFaceTable) {
-    constexpr double tol = 2e-12;
-
-    int covered = 0;
-    for (const auto& c : solver_face_basis_adapter_cases()) {
-        SCOPED_TRACE("face element=" + std::to_string(static_cast<int>(c.type)));
-
-        faceType legacy_face = initialized_face_for_case(c);
-        faceType basis_face = initialized_face_for_case(c);
-
-        for (int g = 0; g < c.nG; ++g) {
-            legacy_solver_nn::set_face_shape_data.at(c.type)(g, legacy_face);
-            nn::get_gnn(nullptr, g, basis_face);
-            expect_face_partition_identities(c, basis_face, g, tol);
-        }
-
-        expect_arrays_near(basis_face.N, legacy_face.N, tol);
-        expect_array3_near(basis_face.Nx, legacy_face.Nx, tol);
-        ++covered;
-    }
-
-    EXPECT_EQ(covered, 7);
-}
-
-TEST(SolverFaceBasisAdapter, MappedFacesFailClosedWithoutLegacyFallback) {
-    using consts::ElementType;
-
-    SolverBasisAdapterCase c{ElementType::LIN1, ElementType::LIN1, 1, 3, 2};
-    faceType face = initialized_face_for_case(c);
-
-    try {
-        nn::get_gnn(nullptr, 0, face);
-        FAIL() << "Expected mapped face dispatch to reject mismatched eNoN";
-    } catch (const svmp::FE::basis::BasisEvaluationException& exception) {
-        const std::string message = exception.what();
-        EXPECT_NE(message.find("legacy fallback was not attempted"), std::string::npos)
-            << message;
-    }
-}
-
-TEST(SolverFaceBasisAdapter, PointFaceRemainsLegacyValuePath) {
-    faceType face;
-    face.eType = consts::ElementType::PNT;
-    face.eNoN = 1;
-    face.nG = 1;
-    face.N.resize(1, 1);
-    face.Nx.resize(1, 1, 1);
-    face.N(0, 0) = -7.0;
-    face.Nx(0, 0, 0) = 42.0;
-
-    nn::get_gnn(nullptr, 0, face);
-
-    EXPECT_DOUBLE_EQ(face.N(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(face.Nx(0, 0, 0), 42.0);
-}
-
-TEST(SolverFaceBasisAdapter, UnsupportedFacesThrowClearErrors) {
-    faceType nrb_face;
-    nrb_face.eType = consts::ElementType::NRB;
-    nrb_face.eNoN = 1;
-    nrb_face.nG = 1;
-    nrb_face.N.resize(1, 1);
-    nrb_face.Nx.resize(1, 1, 1);
-    EXPECT_THROW(nn::get_gnn(nullptr, 0, nrb_face), svmp::FE::NotImplementedException);
-
-    faceType unknown_face;
-    unknown_face.eType = consts::ElementType::NA;
-    unknown_face.eNoN = 1;
-    unknown_face.nG = 1;
-    unknown_face.N.resize(1, 1);
-    unknown_face.Nx.resize(1, 1, 1);
-    EXPECT_THROW(nn::get_gnn(nullptr, 0, unknown_face), svmp::FE::InvalidElementException);
-}
-
-TEST(SolverBasisAdapter, QuadraturePathsRemainLegacyCompatible) {
-    constexpr double tol = 0.0;
-
-    for (const auto& c : solver_basis_adapter_cases()) {
-        auto mesh_it = legacy_solver_nn::set_element_gauss_int_data.find(c.type);
-        if (mesh_it != legacy_solver_nn::set_element_gauss_int_data.end()) {
-            mshType legacy_mesh;
-            legacy_mesh.eType = c.type;
-            legacy_mesh.eNoN = c.eNoN;
-            legacy_mesh.nG = c.nG;
-            legacy_mesh.w.resize(c.nG);
-            legacy_mesh.xi.resize(c.insd, c.nG);
-            mesh_it->second(legacy_mesh);
-
-            mshType adapter_mesh;
-            adapter_mesh.eType = c.type;
-            adapter_mesh.eNoN = c.eNoN;
-            adapter_mesh.nG = c.nG;
-            adapter_mesh.w.resize(c.nG);
-            adapter_mesh.xi.resize(c.insd, c.nG);
-            nn::get_gip(adapter_mesh);
-
-            expect_vectors_near(adapter_mesh.w, legacy_mesh.w, tol);
-            expect_arrays_near(adapter_mesh.xi, legacy_mesh.xi, tol);
-        }
-
-        auto scalar_it = legacy_solver_nn::get_element_gauss_int_data.find(c.type);
-        if (scalar_it != legacy_solver_nn::get_element_gauss_int_data.end()) {
-            Vector<double> legacy_w(c.nG);
-            Vector<double> adapter_w(c.nG);
-            Array<double> legacy_xi(c.insd, c.nG);
-            Array<double> adapter_xi(c.insd, c.nG);
-
-            scalar_it->second(c.insd, c.nG, legacy_w, legacy_xi);
-            nn::get_gip(c.insd, c.type, c.nG, adapter_w, adapter_xi);
-
-            expect_vectors_near(adapter_w, legacy_w, tol);
-            expect_arrays_near(adapter_xi, legacy_xi, tol);
-        }
-    }
-
-    mshType legacy_tet;
-    legacy_tet.eType = consts::ElementType::TET4;
-    legacy_tet.eNoN = 4;
-    legacy_tet.nG = 4;
-    legacy_tet.qmTET4 = 0.25;
-    legacy_tet.w.resize(4);
-    legacy_tet.xi.resize(3, 4);
-    legacy_solver_nn::set_element_gauss_int_data.at(consts::ElementType::TET4)(legacy_tet);
-
-    mshType adapter_tet;
-    adapter_tet.eType = consts::ElementType::TET4;
-    adapter_tet.eNoN = 4;
-    adapter_tet.nG = 4;
-    adapter_tet.qmTET4 = 0.25;
-    adapter_tet.w.resize(4);
-    adapter_tet.xi.resize(3, 4);
-    nn::get_gip(adapter_tet);
-
-    expect_vectors_near(adapter_tet.w, legacy_tet.w, tol);
-    expect_arrays_near(adapter_tet.xi, legacy_tet.xi, tol);
-}
-
-TEST(SolverBasisAdapter, HessiansCoverEveryMappedScalarVolumeElement) {
-    constexpr double partition_tol = 2e-10;
-    constexpr double finite_difference_tol = 2e-5;
-    constexpr double zero_tol = 2e-12;
-
-    int covered = 0;
-    for (const auto& c : solver_hessian_adapter_cases()) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        const int ind2 = packed_hessian_components(c.insd);
-        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-        fill_array3(adapter_Nxx, std::numeric_limits<double>::quiet_NaN());
-
-        for (int g = 0; g < c.nG; ++g) {
-            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-            expect_partition_hessian_identity(c, adapter_Nxx, g, partition_tol);
-            expect_solver_hessian_matches_gradient_finite_difference(
-                c, xi, g, adapter_Nxx, finite_difference_tol);
-
-            if (c.type == consts::ElementType::LIN1 ||
-                c.type == consts::ElementType::TRI3 ||
-                c.type == consts::ElementType::TET4) {
-                expect_all_hessians_zero(c, adapter_Nxx, g, zero_tol);
-            }
-        }
-
-        if (c.type == consts::ElementType::QUD4) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 2, zero_tol));
-        } else if (c.type == consts::ElementType::HEX8) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 3, zero_tol));
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 4, zero_tol));
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
-        } else if (c.type == consts::ElementType::WDG) {
-            EXPECT_TRUE(array3_has_nonzero_component(adapter_Nxx, 5, zero_tol));
-        }
-        ++covered;
-    }
-
-    EXPECT_EQ(covered, 13);
-}
-
-TEST(SolverBasisAdapter, HessianPackingMatchesLegacyWhereLegacyIsApproved) {
-    constexpr double tol = 2e-12;
-
-    for (const auto& c : solver_legacy_hessian_parity_cases()) {
-        Vector<double> weights;
-        Array<double> xi;
-        fill_legacy_quadrature(c, weights, xi);
-
-        const int ind2 = packed_hessian_components(c.insd);
-        Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
-        Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-
-        for (int g = 0; g < c.nG; ++g) {
-            legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
-                c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
-            nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-        }
-
-        expect_array3_near(adapter_Nxx, legacy_Nxx, tol);
-    }
-}
-
-TEST(SolverBasisAdapter, Qud8HessiansDoNotUseLegacyFallback) {
-    using consts::ElementType;
-    SolverBasisAdapterCase c{ElementType::QUD8, ElementType::QUD9, 2, 8, 9};
-
-    Vector<double> weights;
-    Array<double> xi;
-    fill_legacy_quadrature(c, weights, xi);
-
-    const int ind2 = packed_hessian_components(c.insd);
-    Array3<double> legacy_Nxx(ind2, c.eNoN, c.nG);
-    Array3<double> adapter_Nxx(ind2, c.eNoN, c.nG);
-    fill_array3(legacy_Nxx, 0.0);
-    fill_array3(adapter_Nxx, 0.0);
-
-    for (int g = 0; g < c.nG; ++g) {
-        legacy_solver_nn::get_element_2nd_derivs.at(c.type)(
-            c.insd, ind2, c.eNoN, g, xi, legacy_Nxx);
-        nn::get_gn_nxx(c.insd, ind2, c.type, c.eNoN, g, xi, adapter_Nxx);
-    }
-
-    double max_abs_difference = 0.0;
-    for (int g = 0; g < c.nG; ++g) {
-        for (int a = 0; a < c.eNoN; ++a) {
-            for (int row = 0; row < ind2; ++row) {
-                max_abs_difference = std::max(
-                    max_abs_difference,
-                    std::abs(adapter_Nxx(row, a, g) - legacy_Nxx(row, a, g)));
-            }
-        }
-    }
-
-    EXPECT_GT(max_abs_difference, 1e-8);
-}
-
-TEST(SolverBasisAdapter, UnsupportedHessianFamiliesRemainNoOp) {
-    Array<double> xi(1, 1);
-    xi(0, 0) = 0.0;
-    Array3<double> Nxx(1, 1, 1);
-
-    for (const auto unsupported : {consts::ElementType::NRB, consts::ElementType::PNT}) {
-        fill_array3(Nxx, 42.0);
-        nn::get_gn_nxx(1, 1, unsupported, 1, 0, xi, Nxx);
-        EXPECT_DOUBLE_EQ(Nxx(0, 0, 0), 42.0)
-            << "element=" << static_cast<int>(unsupported);
-    }
-}
-
-TEST(SolverBasisAdapter, InitFsMshPopulatesMappedHessiansWithoutLShpFGate) {
-    using consts::ElementType;
-    const SolverBasisAdapterCase cases[] = {
-        {ElementType::QUD4, ElementType::QUD4, 2, 4, 4},
-        {ElementType::HEX8, ElementType::HEX8, 3, 8, 8},
-        {ElementType::HEX20, ElementType::HEX20, 3, 20, 27},
-        {ElementType::HEX27, ElementType::HEX27, 3, 27, 27},
-        {ElementType::WDG, ElementType::WDG, 3, 6, 6},
-    };
-
-    for (const auto& c : cases) {
-        SCOPED_TRACE("element=" + std::to_string(static_cast<int>(c.type)));
-        ComMod com_mod;
-        com_mod.nsd = c.insd;
-        mshType mesh = initialized_mesh_for_case(c, true);
-
-        fs::init_fs_msh(com_mod, mesh);
-
-        ASSERT_EQ(mesh.fs.size(), 1u);
-        ASSERT_EQ(mesh.fs[0].Nxx.nrows(), packed_hessian_components(c.insd));
-        if (c.type == ElementType::QUD4) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 2, 2e-12));
-        } else if (c.type == ElementType::HEX8) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 3, 2e-12));
-        } else if (c.type == ElementType::WDG) {
-            EXPECT_TRUE(array3_has_nonzero_component(mesh.fs[0].Nxx, 5, 2e-12));
-        } else {
-            bool has_nonzero = false;
-            for (int row = 0; row < mesh.fs[0].Nxx.nrows(); ++row) {
-                has_nonzero = has_nonzero ||
-                    array3_has_nonzero_component(mesh.fs[0].Nxx, row, 2e-12);
-            }
-            EXPECT_TRUE(has_nonzero);
-        }
-    }
-}
-
-TEST(LagrangeBasis, QuadPartitionOfUnity) {
-    LagrangeBasis basis(ElementType::Quad4, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.2, -0.3, 0.0};
-
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-
-    double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-}
-
-TEST(LagrangeBasis, LineGradientLinear) {
-    LagrangeBasis basis(ElementType::Line2, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.0, 0.0, 0.0};
-    std::vector<Gradient> grad;
-    basis.evaluate_gradients(xi, grad);
-
-    ASSERT_EQ(grad.size(), 2u);
-    EXPECT_NEAR(grad[0][0], -0.5, 1e-12);
-    EXPECT_NEAR(grad[1][0], 0.5, 1e-12);
-}
-
-TEST(LagrangeBasis, TrianglePartitionOfUnity) {
-    LagrangeBasis basis(ElementType::Triangle3, 1);
-    svmp::FE::math::Vector<Real, 3> xi{0.2, 0.3, 0.0};
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-
-    double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-}
-
-TEST(LagrangeBasis, SizeFormulasPerElement) {
-    for (int order = 0; order <= 3; ++order) {
-        {
-            LagrangeBasis line(ElementType::Line2, order);
-            EXPECT_EQ(line.size(), static_cast<std::size_t>(order + 1));
-        }
-        {
-            LagrangeBasis quad(ElementType::Quad4, order);
-            const std::size_t n1d = static_cast<std::size_t>(order + 1);
-            EXPECT_EQ(quad.size(), n1d * n1d);
-        }
-        {
-            LagrangeBasis hex(ElementType::Hex8, order);
-            const std::size_t n1d = static_cast<std::size_t>(order + 1);
-            EXPECT_EQ(hex.size(), n1d * n1d * n1d);
-        }
-        {
-            LagrangeBasis tri(ElementType::Triangle3, order);
-            const std::size_t expected =
-                static_cast<std::size_t>(order + 1) *
-                static_cast<std::size_t>(order + 2) / 2;
-            EXPECT_EQ(tri.size(), expected);
-        }
-        {
-            LagrangeBasis tet(ElementType::Tetra4, order);
-            const std::size_t expected =
-                static_cast<std::size_t>(order + 1) *
-                static_cast<std::size_t>(order + 2) *
-                static_cast<std::size_t>(order + 3) / 6;
-            EXPECT_EQ(tet.size(), expected);
-        }
-    }
-}
-
-TEST(LagrangeBasis, KroneckerDeltaAtNodes) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 1},
-        {ElementType::Quad4, 1},
-        {ElementType::Triangle3, 1},
-        {ElementType::Tetra4, 1},
-        {ElementType::Hex8, 1},
-        {ElementType::Triangle3, 2},
-        {ElementType::Tetra4, 2},
-        {ElementType::Quad4, 2},
-        {ElementType::Hex8, 2},
-        {ElementType::Wedge6, 2}
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.first, c.second);
-        const auto& nodes = basis.nodes();
-        ASSERT_EQ(nodes.size(), basis.size());
-
-        for (std::size_t i = 0; i < nodes.size(); ++i) {
-            std::vector<Real> vals;
-            basis.evaluate_values(nodes[i], vals);
-            ASSERT_EQ(vals.size(), nodes.size());
-            for (std::size_t j = 0; j < nodes.size(); ++j) {
-                if (i == j) {
-                    EXPECT_NEAR(vals[j], 1.0, 1e-12);
-                } else {
-                    EXPECT_NEAR(vals[j], 0.0, 1e-12);
-                }
-            }
-        }
-    }
-}
-
-TEST(LagrangeBasis, MatchesNodeOrderingConventionsForLinearAndQuadratic) {
-    // Tensor-product elements
-    expect_nodes_match_node_ordering(ElementType::Line2, 1, ElementType::Line2);
-    expect_nodes_match_node_ordering(ElementType::Line2, 2, ElementType::Line3);
-    expect_nodes_match_node_ordering(ElementType::Quad4, 1, ElementType::Quad4);
-    expect_nodes_match_node_ordering(ElementType::Quad4, 2, ElementType::Quad9);
-    expect_nodes_match_node_ordering(ElementType::Hex8, 1, ElementType::Hex8);
-    expect_nodes_match_node_ordering(ElementType::Hex8, 2, ElementType::Hex27);
-
-    // Simplex elements
-    expect_nodes_match_node_ordering(ElementType::Triangle3, 1, ElementType::Triangle3);
-    expect_nodes_match_node_ordering(ElementType::Triangle3, 2, ElementType::Triangle6);
-    expect_nodes_match_node_ordering(ElementType::Tetra4, 1, ElementType::Tetra4);
-    expect_nodes_match_node_ordering(ElementType::Tetra4, 2, ElementType::Tetra10);
-
-    // Mixed topology
-    expect_nodes_match_node_ordering(ElementType::Wedge6, 1, ElementType::Wedge6);
-    expect_nodes_match_node_ordering(ElementType::Wedge6, 2, ElementType::Wedge18);
-
-    // Pyramid
-    expect_nodes_match_node_ordering(ElementType::Pyramid5, 1, ElementType::Pyramid5);
-    expect_nodes_match_node_ordering(ElementType::Pyramid14, 2, ElementType::Pyramid14);
-}
-
-TEST(LagrangeBasis, WedgeAndPyramidPartitionOfUnity) {
-    {
-        LagrangeBasis wedge(ElementType::Wedge6, 1);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(0.3)};
-        std::vector<Real> vals;
-        wedge.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-    }
-
-    {
-        LagrangeBasis wedge_q(ElementType::Wedge18, 2);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.2), Real(0.1), Real(-0.25)};
-        std::vector<Real> vals;
-        wedge_q.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-
-        // Wedge18 should report 18 nodes in ReferenceNodeLayout
-        EXPECT_EQ(ReferenceNodeLayout::num_nodes(ElementType::Wedge18), 18u);
-        // Corner nodes should match Wedge6 vertices
-        auto v0 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 0);
-        auto v1 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 1);
-        auto v2 = ReferenceNodeLayout::get_node_coords(ElementType::Wedge18, 2);
-        EXPECT_NEAR(v0[0], Real(0), 1e-14);
-        EXPECT_NEAR(v0[1], Real(0), 1e-14);
-        EXPECT_NEAR(v0[2], Real(-1), 1e-14);
-        EXPECT_NEAR(v1[0], Real(1), 1e-14);
-        EXPECT_NEAR(v1[1], Real(0), 1e-14);
-        EXPECT_NEAR(v1[2], Real(-1), 1e-14);
-        EXPECT_NEAR(v2[0], Real(0), 1e-14);
-        EXPECT_NEAR(v2[1], Real(1), 1e-14);
-        EXPECT_NEAR(v2[2], Real(-1), 1e-14);
-    }
-
-    {
-        LagrangeBasis pyr(ElementType::Pyramid5, 1);
-        svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.4)};
-        std::vector<Real> vals;
-        pyr.evaluate_values(xi, vals);
-        const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
-    }
-}
-
-TEST(LagrangeBasis, NonTensorStridedEvaluationMatchesPointwise) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Triangle3, 3},
-        {ElementType::Tetra4, 3},
-        {ElementType::Wedge6, 3},
-        {ElementType::Pyramid5, 3},
-    };
-    const std::vector<StridedOutputRequest> requests = {
-        {true, false, false},
-        {false, true, false},
-        {false, false, true},
-        {true, true, false},
-        {true, false, true},
-        {false, true, true},
-        {true, true, true},
-    };
-
-    for (const auto& [type, order] : cases) {
-        for (const auto& request : requests) {
-            SCOPED_TRACE(static_cast<int>(type));
-            SCOPED_TRACE(order);
-            SCOPED_TRACE(request.values ? "values" : "no values");
-            SCOPED_TRACE(request.gradients ? "gradients" : "no gradients");
-            SCOPED_TRACE(request.hessians ? "hessians" : "no hessians");
-            expect_strided_matches_pointwise(type, order, request);
-        }
-    }
-}
-
-TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 4},
-        {ElementType::Quad4, 3},
-        {ElementType::Hex8, 3},
-        {ElementType::Triangle3, 4},
-        {ElementType::Tetra4, 3},
-        {ElementType::Wedge6, 3},
-        {ElementType::Pyramid5, 3},
-    };
-
-    for (const auto& [type, order] : cases) {
-        SCOPED_TRACE(static_cast<int>(type));
-        SCOPED_TRACE(order);
-        expect_raw_to_matches_vector_evaluation(type, order);
-    }
-}
-
-TEST(LagrangeBasis, CanonicalConstructorsSupportArbitraryOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
-
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            EXPECT_EQ(basis.element_type(), c.type);
-            EXPECT_EQ(basis.order(), order);
-            EXPECT_EQ(basis.dimension(), expected_dimension(c.type));
-            EXPECT_EQ(basis.size(), expected_lagrange_size(c.type, order));
-            EXPECT_EQ(basis.nodes().size(), basis.size());
-        }
-    }
-}
-
-TEST(LagrangeBasis, AliasVariantsNormalizeToCanonicalPaths) {
-    expect_alias_matches_canonical(
-        ElementType::Line3, ElementType::Line2, 2, sample_points_for(ElementType::Line2));
-    expect_alias_matches_canonical(
-        ElementType::Triangle6, ElementType::Triangle3, 2, sample_points_for(ElementType::Triangle3));
-    expect_alias_matches_canonical(
-        ElementType::Quad9, ElementType::Quad4, 2, sample_points_for(ElementType::Quad4));
-    expect_alias_matches_canonical(
-        ElementType::Tetra10, ElementType::Tetra4, 2, sample_points_for(ElementType::Tetra4));
-    expect_alias_matches_canonical(
-        ElementType::Hex27, ElementType::Hex8, 2, sample_points_for(ElementType::Hex8));
-    expect_alias_matches_canonical(
-        ElementType::Wedge18, ElementType::Wedge6, 2, sample_points_for(ElementType::Wedge6));
-    expect_alias_matches_canonical(
-        ElementType::Pyramid14, ElementType::Pyramid5, 2, sample_points_for(ElementType::Pyramid5),
-        Real(2e-10));
-}
 
-TEST(LagrangeBasis, SerendipityVariantsRemainRejected) {
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Quad8, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Hex20, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Wedge15, 2), svmp::FE::FEException);
-    EXPECT_THROW((void)LagrangeBasis(ElementType::Pyramid13, 2), svmp::FE::FEException);
-}
+#include "FE/Basis/BasisExceptions.h"
+#include "FE/Basis/BasisFactory.h"
+#include "FE/Basis/LagrangeBasis.h"
+#include "FE/Basis/NodeOrderingConventions.h"
 
-TEST(LagrangeBasis, GeneratedNodeOrderingIsDeterministicAcrossOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
+#include <array>
+#include <tuple>
+#include <vector>
 
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            const auto generated_a = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
-            const auto generated_b = ReferenceNodeLayout::get_lagrange_node_coords(c.type, order);
-            ASSERT_EQ(generated_a.size(), expected_lagrange_size(c.type, order));
-            ASSERT_EQ(generated_a.size(), generated_b.size());
-            for (std::size_t i = 0; i < generated_a.size(); ++i) {
-                EXPECT_TRUE(points_close(generated_a[i], generated_b[i]));
-            }
-        }
-    }
-}
+using namespace svmp::FE;
+using namespace svmp::FE::basis;
 
-TEST(LagrangeBasis, NodeOrderingMatchesReferenceCoordinateOracles) {
-    const std::array<ElementType, 18> cases = {
-        ElementType::Line2, ElementType::Line3,
-        ElementType::Triangle3, ElementType::Triangle6,
-        ElementType::Quad4, ElementType::Quad8, ElementType::Quad9,
-        ElementType::Tetra4, ElementType::Tetra10,
-        ElementType::Hex8, ElementType::Hex20, ElementType::Hex27,
-        ElementType::Wedge6, ElementType::Wedge15, ElementType::Wedge18,
-        ElementType::Pyramid5, ElementType::Pyramid13, ElementType::Pyramid14,
-    };
+namespace {
 
-    for (ElementType type : cases) {
-        const auto expected = reference_node_coords(type);
-        ASSERT_FALSE(expected.empty());
-        ASSERT_EQ(ReferenceNodeLayout::num_nodes(type), expected.size());
-        for (std::size_t i = 0; i < expected.size(); ++i) {
-            const auto actual = ReferenceNodeLayout::get_node_coords(type, i);
-            EXPECT_TRUE(points_close(actual, expected[i]))
-                << "Element type " << static_cast<int>(type)
-                << ", node " << i;
-        }
-    }
-}
+using Point = math::Vector<Real, 3>;
 
-TEST(LagrangeBasis, GeneratedLowOrderOrderingMatchesPublicAliasPaths) {
-    const struct Case {
-        ElementType type;
-        int order;
-        ElementType public_alias;
-    } cases[] = {
-        {ElementType::Line2, 1, ElementType::Line2},
-        {ElementType::Line2, 2, ElementType::Line3},
-        {ElementType::Triangle3, 1, ElementType::Triangle3},
-        {ElementType::Triangle3, 2, ElementType::Triangle6},
-        {ElementType::Quad4, 1, ElementType::Quad4},
-        {ElementType::Quad4, 2, ElementType::Quad9},
-        {ElementType::Tetra4, 1, ElementType::Tetra4},
-        {ElementType::Tetra4, 2, ElementType::Tetra10},
-        {ElementType::Hex8, 1, ElementType::Hex8},
-        {ElementType::Hex8, 2, ElementType::Hex27},
-        {ElementType::Wedge6, 1, ElementType::Wedge6},
-        {ElementType::Wedge6, 2, ElementType::Wedge18},
-        {ElementType::Pyramid5, 1, ElementType::Pyramid5},
-        {ElementType::Pyramid5, 2, ElementType::Pyramid14},
-    };
+struct CanonicalCase {  // One row of the canonical_cases() table: a basis configuration plus the values the tests expect from it.
+    ElementType type;  // element type passed to the LagrangeBasis constructor
+    int order;  // order passed to the LagrangeBasis constructor
+    std::size_t size;  // expected basis.size()
+    int dimension;  // expected basis.dimension()
+    std::vector<Point> points;  // sample points at which the basis is evaluated
+    Real derivative_tol;  // tolerance used for the gradient-sum and Hessian-sum checks
+};
 
-    for (const auto& c : cases) {
-        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(c.type, c.order);
-        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(c.public_alias));
-        for (std::size_t i = 0; i < generated.size(); ++i) {
-            const auto public_alias = ReferenceNodeLayout::get_node_coords(c.public_alias, i);
-            EXPECT_TRUE(points_close(generated[i], public_alias));
-        }
-    }
+const std::vector<CanonicalCase>& canonical_cases() {  // Returns the shared table of basis configurations that the tests iterate over.
+    static const std::vector<CanonicalCase> cases = {  // function-local static, handed out by const reference
+        {ElementType::Line2, 3, 4u, 1,  // row layout: type, order, size, dimension
+         {{Real(-0.35), Real(0), Real(0)}, {Real(0.2), Real(0), Real(0)}},  // points (two per row)
+         Real(1e-11)},  // derivative_tol
+        {ElementType::Triangle3, 3, 10u, 2,
+         {{Real(0.15), Real(0.2), Real(0)}, {Real(0.25), Real(0.1), Real(0)}},
+         Real(1e-9)},
+        {ElementType::Quad4, 3, 16u, 2,
+         {{Real(0.2), Real(-0.3), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}},
+         Real(1e-11)},
+        {ElementType::Tetra4, 2, 10u, 3,
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}},
+         Real(1e-9)},
+        {ElementType::Hex8, 2, 27u, 3,
+         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}},
+         Real(1e-10)},
+        {ElementType::Wedge6, 2, 18u, 3,
+         {{Real(0.18), Real(0.22), Real(-0.2)}, {Real(0.12), Real(0.16), Real(0.1)}},
+         Real(1e-9)},
+    };  // note: the table holds no pyramid row
+    return cases;
 }
 
-TEST(LagrangeBasis, KroneckerDeltaAcrossCanonicalTopologiesAndOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-    } cases[] = {
-        {ElementType::Line2, 8},
-        {ElementType::Triangle3, 6},
-        {ElementType::Quad4, 6},
-        {ElementType::Tetra4, 5},
-        {ElementType::Hex8, 5},
-        {ElementType::Wedge6, 5},
-        {ElementType::Pyramid5, 5},
-    };
-
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            ASSERT_EQ(basis.size(), expected_lagrange_size(c.type, order));
-
-            std::vector<Real> values;
-            for (std::size_t node_i = 0; node_i < basis.size(); ++node_i) {
-                basis.evaluate_values(basis.nodes()[node_i], values);
-                ASSERT_EQ(values.size(), basis.size());
-                for (std::size_t basis_i = 0; basis_i < basis.size(); ++basis_i) {
-                    EXPECT_NEAR(values[basis_i], basis_i == node_i ? Real(1) : Real(0), Real(2e-10))
-                        << "Element type " << static_cast<int>(c.type)
-                        << ", order " << order
-                        << ", node " << node_i
-                        << ", basis " << basis_i;
-                }
-            }
+std::vector<Point> sample_points_for(ElementType type) {  // Looks up the sample points stored for `type` in canonical_cases().
+    for (const auto& c : canonical_cases()) {  // linear scan over the table rows
+        if (c.type == type) {
+            return c.points;  // first matching row wins; returned by value, so the caller gets a copy
         }
     }
+    return {};  // no row for `type`: empty list, so a caller looping over the result performs no checks
 }
 
-TEST(LagrangeBasis, PartitionGradientAndHessianSumsAcrossCanonicalTopologiesAndOrders) {
-    const struct Case {
-        ElementType type;
-        int max_order;
-        Real tol;
-    } cases[] = {
-        {ElementType::Line2, 8, Real(1e-11)},
-        {ElementType::Triangle3, 6, Real(1e-10)},
-        {ElementType::Quad4, 6, Real(1e-10)},
-        {ElementType::Tetra4, 5, Real(2e-10)},
-        {ElementType::Hex8, 5, Real(2e-10)},
-        {ElementType::Wedge6, 5, Real(5e-10)},
-        {ElementType::Pyramid5, 5, Real(5e-7)},
-    };
+void expect_kronecker_at_nodes(const LagrangeBasis& basis, Real tol)  // Checks that basis function i evaluates to 1 at node i and to 0 at every other node, within `tol`.
+{
+    const auto& nodes = basis.nodes();
+    ASSERT_EQ(nodes.size(), basis.size());  // fatal assertion: on failure it returns from this helper only
 
-    for (const auto& c : cases) {
-        for (int order = 0; order <= c.max_order; ++order) {
-            LagrangeBasis basis(c.type, order);
-            expect_partition_gradient_hessian_sums(basis, dense_sample_points_for(c.type), c.tol, c.tol);
+    std::vector<Real> values;  // declared once and passed to evaluate_values() for every node
+    for (std::size_t node = 0; node < nodes.size(); ++node) {
+        basis.evaluate_values(nodes[node], values);
+        ASSERT_EQ(values.size(), basis.size());  // guards the indexing in the loop below
+        for (std::size_t i = 0; i < values.size(); ++i) {
+            EXPECT_NEAR(values[i], i == node ? Real(1) : Real(0), tol)  // expected value is the Kronecker delta of (i, node)
+                << "node=" << node << " basis=" << i;  // failure message identifies the offending pair
         }
     }
 }
 
-TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Point point;
-        Real tolerance;
-    } cases[] = {
-        {ElementType::Triangle3, 13, Point{Real(0.19), Real(0.31), Real(0)}, Real(1e-8)},
-        {ElementType::Tetra4, 13, Point{Real(0.13), Real(0.17), Real(0.19)}, Real(1e-7)},
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
+void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,  // At each sample point, checks that the basis values sum to 1 and that the gradient and Hessian sums are 0.
+                                            const std::vector<Point>& points,  // points at which the basis is evaluated
+                                            Real derivative_tol)  // tolerance for the gradient-sum and Hessian-sum checks only
+{
+    for (const auto& xi : points) {  // every point is checked independently with fresh output vectors
         std::vector<Real> values;
         std::vector<Gradient> gradients;
         std::vector<Hessian> hessians;
-        basis.evaluate_all(c.point, values, gradients, hessians);
-
-        ASSERT_EQ(values.size(), basis.size());
-        ASSERT_EQ(gradients.size(), basis.size());
-        ASSERT_EQ(hessians.size(), basis.size());
+        basis.evaluate_all(xi, values, gradients, hessians);  // one call fills all three output vectors
 
         Real value_sum = Real(0);
         Gradient gradient_sum{};
         Hessian hessian_sum{};
-        for (std::size_t i = 0; i < basis.size(); ++i) {
+        for (std::size_t i = 0; i < values.size(); ++i) {  // bound is values.size(); gradients[i] is indexed with the same i
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
                 gradient_sum[d] += gradients[i][d];
@@ -2253,776 +102,297 @@ TEST(LagrangeBasis, SimplexAxisScratchDynamicFallbackForHighOrder) {
             }
         }
 
-        EXPECT_NEAR(value_sum, Real(1), c.tolerance);
-        for (std::size_t d = 0; d < 3u; ++d) {
-            EXPECT_NEAR(gradient_sum[d], Real(0), c.tolerance);
-            for (std::size_t e = 0; e < 3u; ++e) {
-                EXPECT_NEAR(hessian_sum(d, e), Real(0), Real(10) * c.tolerance);
+        EXPECT_NEAR(value_sum, Real(1), Real(1e-12));  // value-sum tolerance is fixed at 1e-12, independent of derivative_tol
+        for (int d = 0; d < basis.dimension(); ++d) {  // only the first basis.dimension() components are checked
+            EXPECT_NEAR(gradient_sum[static_cast<std::size_t>(d)], Real(0), derivative_tol);
+            for (int e = 0; e < basis.dimension(); ++e) {
+                EXPECT_NEAR(hessian_sum(static_cast<std::size_t>(d),  // casts convert the int loop indices to std::size_t for indexing
+                                        static_cast<std::size_t>(e)),
+                            Real(0),
+                            derivative_tol);
             }
         }
     }
 }
 
-TEST(LagrangeBasis, HighOrderAxisNearNodeMaintainsPartitionAndDerivativeSums) {
-    const int order = 16;
-    const LagrangeBasis basis(ElementType::Line2, order);
-    const Real node = Real(-1) + Real(2 * 5) / static_cast<Real>(order);
-    const Point point{node + Real(1e-7), Real(0), Real(0)};
-
+void expect_raw_sinks_match_vector_evaluation(const LagrangeBasis& basis,  // Compares the pointer-based evaluate_*_to() outputs against evaluate_all() at one point.
+                                              const Point& xi)  // the single evaluation point
+{
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
-    basis.evaluate_all(point, values, gradients, hessians);
-    ASSERT_EQ(values.size(), basis.size());
-
-    Real value_sum = Real(0);
-    Real gradient_sum = Real(0);
-    Real hessian_sum = Real(0);
-    for (std::size_t i = 0; i < basis.size(); ++i) {
-        value_sum += values[i];
-        gradient_sum += gradients[i][0];
-        hessian_sum += hessians[i](0, 0);
-    }
-
-    EXPECT_NEAR(value_sum, Real(1), Real(1e-12));
-    EXPECT_NEAR(gradient_sum, Real(0), Real(1e-8));
-    EXPECT_NEAR(hessian_sum, Real(0), Real(1e-5));
-}
-
-TEST(LagrangeBasis, PyramidFaceTracesMatchLowerDimensionalLagrangeBases) {
-    const PyramidFace faces[] = {
-        PyramidFace::Base,
-        PyramidFace::South,
-        PyramidFace::East,
-        PyramidFace::North,
-        PyramidFace::West,
-    };
-
-    for (int order = 1; order <= 5; ++order) {
-        for (const auto face : faces) {
-            expect_pyramid_face_trace_matches_lower_basis(
-                order, face, face == PyramidFace::Base ? Real(2e-10) : Real(5e-10));
-        }
-    }
-}
-
-TEST(LagrangeBasis, PyramidEdgeTracesMatchLineLagrangeBasis) {
-    const PyramidEdge edges[] = {
-        PyramidEdge::BaseSouth,
-        PyramidEdge::BaseEast,
-        PyramidEdge::BaseNorth,
-        PyramidEdge::BaseWest,
-        PyramidEdge::VerticalSW,
-        PyramidEdge::VerticalSE,
-        PyramidEdge::VerticalNE,
-        PyramidEdge::VerticalNW,
-    };
-
-    for (int order = 1; order <= 5; ++order) {
-        for (const auto edge : edges) {
-            expect_pyramid_edge_trace_matches_line_basis(order, edge, Real(5e-10));
-        }
-    }
-}
+    basis.evaluate_all(xi, values, gradients, hessians);  // reference results in the vector-based form
 
-TEST(LagrangeBasis, Pyramid14RationalNodalAndPartition) {
-    using svmp::FE::basis::ReferenceNodeLayout;
+    std::vector<Real> flat_values(basis.size());  // one entry per basis function
+    std::vector<Real> flat_gradients(basis.size() * 3u);  // three entries per basis function
+    std::vector<Real> flat_hessians(basis.size() * 9u);  // nine entries per basis function
+    basis.evaluate_values_to(xi, flat_values.data());  // each *_to call receives a raw pointer into a pre-sized buffer
+    basis.evaluate_gradients_to(xi, flat_gradients.data());
+    basis.evaluate_hessians_to(xi, flat_hessians.data());
 
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    EXPECT_EQ(basis.dimension(), 3);
-    EXPECT_EQ(basis.size(), 14u);
-
-    // Kronecker nodal property at all Pyramid14 nodes
     for (std::size_t i = 0; i < basis.size(); ++i) {
-        auto xi = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
-        std::vector<Real> vals;
-        basis.evaluate_values(xi, vals);
-        ASSERT_EQ(vals.size(), basis.size());
-        for (std::size_t j = 0; j < basis.size(); ++j) {
-            const double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(vals[j], expected, 1e-12);
+        EXPECT_NEAR(flat_values[i], values[i], Real(1e-14));  // all comparisons in this helper use a fixed 1e-14 tolerance
+        for (std::size_t d = 0; d < 3u; ++d) {
+            EXPECT_NEAR(flat_gradients[i * 3u + d], gradients[i][d], Real(1e-14));  // flat gradient index: i * 3 + d
+            for (std::size_t e = 0; e < 3u; ++e) {
+                EXPECT_NEAR(flat_hessians[i * 9u + d * 3u + e],  // flat Hessian index: i * 9 + d * 3 + e
+                            hessians[i](d, e),
+                            Real(1e-14));
+            }
         }
     }
-
-    // Partition of unity at an interior point
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.3)};
-    std::vector<Real> vals;
-    basis.evaluate_values(xi, vals);
-    const double sum = std::accumulate(vals.begin(), vals.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
 }
 
-TEST(LagrangeBasis, Pyramid14GradientSumZero) {
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.15), Real(-0.1), Real(0.3)};
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    ASSERT_EQ(grads.size(), basis.size());
-
-    Gradient sum{};
-    for (const auto& g : grads) {
-        sum[0] += g[0];
-        sum[1] += g[1];
-        sum[2] += g[2];
+void expect_nodes_close(const std::vector<Point>& lhs,  // Checks that two node lists have equal length and agree coordinate by coordinate within `tol`.
+                        const std::vector<Point>& rhs,
+                        Real tol)
+{
+    ASSERT_EQ(lhs.size(), rhs.size());  // fatal: a length mismatch returns from this helper before any indexing
+    for (std::size_t i = 0; i < lhs.size(); ++i) {
+        EXPECT_NEAR(lhs[i][0], rhs[i][0], tol) << "node=" << i;  // all three components are compared for every node
+        EXPECT_NEAR(lhs[i][1], rhs[i][1], tol) << "node=" << i;
+        EXPECT_NEAR(lhs[i][2], rhs[i][2], tol) << "node=" << i;
     }
-    EXPECT_NEAR(sum[0], 0.0, 1e-8);
-    EXPECT_NEAR(sum[1], 0.0, 1e-8);
-    EXPECT_NEAR(sum[2], 0.0, 1e-8);
 }
 
-TEST(LagrangeBasis, HigherOrderP4KroneckerAndPartition) {
-    struct Case {
-        ElementType type;
-        int order;
-        svmp::FE::math::Vector<Real, 3> xi;
-    };
-
-    const std::vector<Case> cases = {
-        {ElementType::Line2, 4, {Real(0.11), Real(0), Real(0)}},
-        {ElementType::Quad4, 4, {Real(0.2), Real(-0.3), Real(0)}},
-        {ElementType::Triangle3, 4, {Real(0.2), Real(0.3), Real(0)}},
-        {ElementType::Hex8, 4, {Real(0.2), Real(-0.3), Real(0.4)}},
-    };
-
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-
-        // Partition of unity at an interior point
-        std::vector<Real> values;
-        basis.evaluate_values(c.xi, values);
-        const double sum = std::accumulate(values.begin(), values.end(), 0.0);
-        EXPECT_NEAR(sum, 1.0, 1e-12);
+void expect_evaluations_match(const LagrangeBasis& lhs,  // Checks that two bases produce matching values, gradients and Hessians at every sample point.
+                              const LagrangeBasis& rhs,
+                              const std::vector<Point>& points,  // an empty list makes the loop below a no-op
+                              Real tol)  // single tolerance shared by values, gradients and Hessians
+{
+    ASSERT_EQ(lhs.size(), rhs.size());  // fatal: mismatched basis sizes return from this helper immediately
 
-        // Kronecker delta property at all nodes
-        const auto& nodes = basis.nodes();
-        ASSERT_EQ(nodes.size(), basis.size());
-        for (std::size_t i = 0; i < nodes.size(); ++i) {
-            basis.evaluate_values(nodes[i], values);
-            ASSERT_EQ(values.size(), nodes.size());
-            for (std::size_t j = 0; j < nodes.size(); ++j) {
-                const double expected = (i == j) ? 1.0 : 0.0;
-                EXPECT_NEAR(values[j], expected, 1e-12);
+    for (const auto& xi : points) {
+        std::vector<Real> lhs_values;
+        std::vector<Real> rhs_values;
+        std::vector<Gradient> lhs_gradients;
+        std::vector<Gradient> rhs_gradients;
+        std::vector<Hessian> lhs_hessians;
+        std::vector<Hessian> rhs_hessians;
+
+        lhs.evaluate_all(xi, lhs_values, lhs_gradients, lhs_hessians);
+        rhs.evaluate_all(xi, rhs_values, rhs_gradients, rhs_hessians);
+
+        for (std::size_t i = 0; i < lhs.size(); ++i) {  // indexes the outputs by lhs.size(); the output vector lengths are not asserted here
+            EXPECT_NEAR(lhs_values[i], rhs_values[i], tol);
+            for (std::size_t d = 0; d < 3u; ++d) {  // all three components are compared regardless of basis dimension
+                EXPECT_NEAR(lhs_gradients[i][d], rhs_gradients[i][d], tol);
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(lhs_hessians[i](d, e), rhs_hessians[i](d, e), tol);
+                }
             }
         }
     }
 }
 
-TEST(LagrangeBasis, Pyramid14InterpolatesQuadraticPolynomials) {
-    using svmp::FE::basis::ReferenceNodeLayout;
-
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    const std::size_t n = basis.size();
-
-    // Precompute nodal coordinates
-    std::vector<svmp::FE::math::Vector<Real,3>> nodes;
-    nodes.reserve(n);
-    for (std::size_t i = 0; i < n; ++i) {
-        nodes.push_back(ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i));
-    }
-
-    auto interpolate_and_check = [&](auto f, Real tol) {
-        // Nodal coefficients
-        std::vector<Real> coeffs(n);
-        for (std::size_t i = 0; i < n; ++i) {
-            const auto& x = nodes[i];
-            coeffs[i] = f(x[0], x[1], x[2]);
-        }
-
-        // Test at a few interior points
-        const svmp::FE::math::Vector<Real,3> test_pts[] = {
-            {Real(0.1), Real(-0.2), Real(0.2)},
-            {Real(-0.2), Real(0.15), Real(0.4)},
-            {Real(0.05), Real(0.05), Real(0.3)}
-        };
-
-        for (const auto& xi : test_pts) {
-            std::vector<Real> vals;
-            basis.evaluate_values(xi, vals);
-            ASSERT_EQ(vals.size(), n);
-
-            Real u_interp = Real(0);
-            for (std::size_t i = 0; i < n; ++i) {
-                u_interp += coeffs[i] * vals[i];
-            }
-
-            const Real u_exact = f(xi[0], xi[1], xi[2]);
-            EXPECT_NEAR(u_interp, u_exact, tol);
-        }
-    };
-
-    // Constant, linear and quadratic monomials
-    interpolate_and_check([](Real, Real, Real) { return Real(1); }, Real(1e-12));
-    interpolate_and_check([](Real x, Real, Real) { return x; }, Real(1e-11));
-    interpolate_and_check([](Real, Real y, Real) { return y; }, Real(1e-11));
-    interpolate_and_check([](Real, Real, Real z) { return z; }, Real(1e-11));
-    interpolate_and_check([](Real x, Real y, Real) { return x * y; }, Real(1e-10));
-    interpolate_and_check([](Real x, Real, Real z) { return x * z; }, Real(1e-10));
-    interpolate_and_check([](Real, Real y, Real z) { return y * z; }, Real(1e-10));
-    interpolate_and_check([](Real x, Real, Real) { return x * x; }, Real(1e-10));
-    interpolate_and_check([](Real, Real y, Real) { return y * y; }, Real(1e-10));
-    interpolate_and_check([](Real, Real, Real z) { return z * z; }, Real(1e-10));
+Real linear_function(const Point& p) {  // Affine test function of the three point coordinates.
+    return Real(2) + Real(3) * p[0] - Real(4) * p[1] + Real(5) * p[2];  // 2 + 3*p0 - 4*p1 + 5*p2
 }
 
-TEST(LagrangeBasis, Pyramid14GradientMatchesLinearFunctionGradient) {
-    using svmp::FE::basis::ReferenceNodeLayout;
-
-    LagrangeBasis basis(ElementType::Pyramid14, 2);
-    const std::size_t n = basis.size();
-
-    // Nodal coordinates and coefficients for f(x,y,z) = ax + by + cz
-    const Real a = Real(1.2);
-    const Real b = Real(-0.7);
-    const Real c = Real(0.5);
-
-    std::vector<Real> coeffs(n);
-    for (std::size_t i = 0; i < n; ++i) {
-        const auto x = ReferenceNodeLayout::get_node_coords(ElementType::Pyramid14, i);
-        coeffs[i] = a * x[0] + b * x[1] + c * x[2];
-    }
-
-    const svmp::FE::math::Vector<Real,3> xi{Real(0.1), Real(-0.15), Real(0.35)};
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    ASSERT_EQ(grads.size(), n);
-
-    Gradient g_interp{};
-    for (std::size_t i = 0; i < n; ++i) {
-        g_interp[0] += coeffs[i] * grads[i][0];
-        g_interp[1] += coeffs[i] * grads[i][1];
-        g_interp[2] += coeffs[i] * grads[i][2];
-    }
-
-    EXPECT_NEAR(g_interp[0], a, 1e-6);
-    EXPECT_NEAR(g_interp[1], b, 1e-6);
-    EXPECT_NEAR(g_interp[2], c, 1e-6);
+Gradient linear_gradient() {  // Returns the constant vector (3, -4, 5): the coefficients of p0, p1, p2 in linear_function().
+    Gradient g{};  // value-initialized, then each component is set explicitly
+    g[0] = Real(3);
+    g[1] = Real(-4);
+    g[2] = Real(5);
+    return g;
 }
 
-TEST(LagrangeBasis, PyramidApexValuesRemainExactAcrossRepresentativeOrders) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
-
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        std::vector<Real> values;
-        basis.evaluate_values(apex, values);
-        ASSERT_EQ(values.size(), basis.size());
-
-        const auto& nodes = basis.nodes();
-        auto apex_it = std::find_if(
-            nodes.begin(), nodes.end(),
-            [](const auto& node) {
-                return std::abs(node[0]) <= Real(1e-14) &&
-                       std::abs(node[1]) <= Real(1e-14) &&
-                       std::abs(node[2] - Real(1)) <= Real(1e-14);
-            });
-        ASSERT_NE(apex_it, nodes.end());
-        const std::size_t apex_index = static_cast<std::size_t>(
-            std::distance(nodes.begin(), apex_it));
+Real quadratic_function(const Point& p) {  // Test polynomial of total degree two in the three point coordinates.
+    return Real(1) + Real(2) * p[0] - p[1] + Real(0.5) * p[2] +  // constant and linear terms
+           p[0] * p[0] + Real(0.75) * p[1] * p[1] - Real(0.25) * p[2] * p[2] +  // pure squares
+           Real(0.2) * p[0] * p[1] - Real(0.3) * p[0] * p[2] +  // mixed products
+           Real(0.4) * p[1] * p[2];
+}
 
-        Real sum = Real(0);
-        for (std::size_t i = 0; i < values.size(); ++i) {
-            EXPECT_TRUE(std::isfinite(static_cast<double>(values[i])));
-            sum += values[i];
-            const Real expected = (i == apex_index) ? Real(1) : Real(0);
-            EXPECT_NEAR(values[i], expected, 1e-12)
-                << "order " << c.order << ", basis " << i;
-        }
-        EXPECT_NEAR(sum, Real(1), 1e-12);
+template<typename Function>  // Function: callable invoked as nodal_function(node), with its result multiplied by a Real
+Real interpolate_value(const LagrangeBasis& basis,  // Returns the sum over i of values[i] * nodal_function(basis.nodes()[i]).
+                       const std::vector<Real>& values,  // per-node weights; presumably basis values at one point (callers are outside this view)
+                       Function&& nodal_function)
+{
+    Real result = Real(0);
+    const auto& nodes = basis.nodes();
+    for (std::size_t i = 0; i < values.size(); ++i) {  // bound is values.size(); nodes is indexed with the same i
+        result += values[i] * nodal_function(nodes[i]);
     }
+    return result;
 }
 
-TEST(LagrangeBasis, PyramidGradientAtExactApexThrowsWhenLimitIsNotUnique) {
-    const struct Case {
-        ElementType type;
-        int order;
-    } cases[] = {
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
-        {ElementType::Pyramid5, 4},
-    };
+} // namespace
 
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, CanonicalTopologiesHaveExpectedSizesAndDimensions) {  // Checks the metadata accessors of each canonical basis against its table row.
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        std::vector<Gradient> gradients;
-        EXPECT_THROW(basis.evaluate_gradients(apex, gradients), svmp::FE::basis::BasisEvaluationException)
-            << "order " << c.order;
+        EXPECT_EQ(basis.basis_type(), BasisType::Lagrange);
+        EXPECT_EQ(basis.element_type(), c.type);  // type and order must be reported exactly as constructed
+        EXPECT_EQ(basis.order(), c.order);
+        EXPECT_EQ(basis.size(), c.size);  // size and dimension are compared with the values hard-coded in the row
+        EXPECT_EQ(basis.dimension(), c.dimension);
     }
 }
 
-TEST(LagrangeBasis, PyramidApexValuesMatchDirectionalNearApexLimits) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Real tol;
-    } cases[] = {
-        {ElementType::Pyramid5, 1, Real(3e-6)},
-        {ElementType::Pyramid14, 2, Real(4e-6)},
-        {ElementType::Pyramid5, 4, Real(1e-5)},
-    };
-
-    const std::array<std::array<Real, 2>, 4> directions = {{
-        {Real(0), Real(0)},
-        {Real(0.35), Real(-0.25)},
-        {Real(-0.50), Real(0.45)},
-        {Real(0.20), Real(0.60)},
-    }};
-    const Real t = Real(1e-6);
-    const svmp::FE::math::Vector<Real, 3> apex{Real(0), Real(0), Real(1)};
-
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {  // Runs the nodal (Kronecker) check and the sum checks on every canonical case.
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        std::vector<Real> apex_values;
-        basis.evaluate_values(apex, apex_values);
-
-        for (const auto& direction : directions) {
-            const svmp::FE::math::Vector<Real, 3> xi{
-                t * direction[0],
-                t * direction[1],
-                Real(1) - t
-            };
-
-            std::vector<Real> values;
-            basis.evaluate_values(xi, values);
-            ASSERT_EQ(values.size(), apex_values.size());
-
-            for (std::size_t i = 0; i < values.size(); ++i) {
-                EXPECT_NEAR(values[i], apex_values[i], c.tol)
-                    << "order " << c.order
-                    << ", basis " << i
-                    << ", direction (" << direction[0] << ", " << direction[1] << ")";
-            }
-        }
+        expect_kronecker_at_nodes(basis, Real(2e-10));  // the nodal tolerance is the same for every case
+        expect_partition_gradient_hessian_sums(basis, c.points, c.derivative_tol);  // the derivative tolerance comes from the table row
     }
 }
 
-TEST(LagrangeBasis, PyramidNearApexGradientShowsDirectionalSpread) {
-    const struct Case {
-        ElementType type;
-        int order;
-        Real min_spread;
-    } cases[] = {
-        {ElementType::Pyramid5, 1, Real(5e-2)},
-        {ElementType::Pyramid14, 2, Real(5e-2)},
-    };
-
-    const std::array<std::array<Real, 2>, 4> directions = {{
-        {Real(0), Real(0)},
-        {Real(0.45), Real(-0.30)},
-        {Real(-0.35), Real(0.40)},
-        {Real(0.25), Real(0.55)},
-    }};
-    const Real t = Real(1e-6);
-
-    for (const auto& c : cases) {
+TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {  // Compares pointer-based and vector-based evaluation for every canonical case.
+    for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        double max_spread = 0.0;
-
-        std::vector<std::vector<Gradient>> directional_gradients;
-        directional_gradients.reserve(directions.size());
-        for (const auto& direction : directions) {
-            const svmp::FE::math::Vector<Real, 3> xi{
-                t * direction[0],
-                t * direction[1],
-                Real(1) - t
-            };
-
-            std::vector<Gradient> gradients;
-            basis.evaluate_gradients(xi, gradients);
-            directional_gradients.push_back(std::move(gradients));
-        }
-
-        for (std::size_t i = 0; i < basis.size(); ++i) {
-            for (int d = 0; d < 3; ++d) {
-                double min_value = std::numeric_limits<double>::infinity();
-                double max_value = -std::numeric_limits<double>::infinity();
-                for (const auto& gradients : directional_gradients) {
-                    const double value = static_cast<double>(gradients[i][static_cast<std::size_t>(d)]);
-                    min_value = std::min(min_value, value);
-                    max_value = std::max(max_value, value);
-                }
-                max_spread = std::max(max_spread, max_value - min_value);
-            }
-        }
-
-        EXPECT_GT(max_spread, static_cast<double>(c.min_spread))
-            << "order " << c.order;
+        expect_raw_sinks_match_vector_evaluation(basis, c.points.front());  // only the first sample point of each row is used
     }
 }
 
-TEST(LagrangeBasis, GradientSumZeroQuadAndTet) {
-    const std::vector<std::pair<ElementType, svmp::FE::math::Vector<Real, 3>>> cases = {
-        {ElementType::Quad4, svmp::FE::math::Vector<Real, 3>{Real(0.2), Real(-0.1), Real(0)}},
-        {ElementType::Tetra4, svmp::FE::math::Vector<Real, 3>{Real(0.1), Real(0.2), Real(0.1)}}
+TEST(LagrangeBasis, CompleteAliasesNormalizeToCanonicalBases) {  // Checks that a basis built from an alias element type matches the basis built from its canonical type.
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {  // tuple layout: alias type, canonical type, expected order
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
     };
 
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.first, 1);
-        std::vector<Gradient> grads;
-        basis.evaluate_gradients(c.second, grads);
-
-        ASSERT_EQ(grads.size(), basis.size());
-        Gradient sum{};
-        for (const auto& g : grads) {
-            sum[0] += g[0];
-            sum[1] += g[1];
-            sum[2] += g[2];
-        }
-        EXPECT_NEAR(sum[0], 0.0, 1e-12);
-        EXPECT_NEAR(sum[1], 0.0, 1e-12);
-        EXPECT_NEAR(sum[2], 0.0, 1e-12);
-    }
-}
-
-TEST(LagrangeBasis, HexPartitionAndGradientSumZeroOrderThree) {
-    LagrangeBasis basis(ElementType::Hex8, 3);
-    svmp::FE::math::Vector<Real, 3> xi{Real(0.1), Real(-0.2), Real(0.25)};
-
-    std::vector<Real> values;
-    basis.evaluate_values(xi, values);
-    const double sum = std::accumulate(values.begin(), values.end(), 0.0);
-    EXPECT_NEAR(sum, 1.0, 1e-12);
-
-    std::vector<Gradient> grads;
-    basis.evaluate_gradients(xi, grads);
-    Gradient gsum{};
-    for (const auto& g : grads) {
-        gsum[0] += g[0];
-        gsum[1] += g[1];
-        gsum[2] += g[2];
-    }
-    EXPECT_NEAR(gsum[0], 0.0, 1e-10);
-    EXPECT_NEAR(gsum[1], 0.0, 1e-10);
-    EXPECT_NEAR(gsum[2], 0.0, 1e-10);
-}
-
-TEST(LagrangeBasis, OracleLine3ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Line3, 2);
-    const Point xi{Real(0.2), Real(0), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 3u);
-    ASSERT_EQ(gradients.size(), 3u);
-    ASSERT_EQ(hessians.size(), 3u);
-
-    const Real expected_values[] = {Real(-2) / Real(25), Real(3) / Real(25), Real(24) / Real(25)};
-    const Real expected_gradients[] = {Real(-3) / Real(10), Real(7) / Real(10), Real(-2) / Real(5)};
-    const Real expected_hessians[] = {Real(1), Real(1), Real(-2)};
+    for (const auto& [alias, canonical, order] : aliases) {
+        LagrangeBasis alias_basis(alias, 1);  // NOTE(review): built with order 1 although order() is expected to equal `order` (2 in every row) below; presumably the alias type fixes the order - confirm this is intentional
+        LagrangeBasis canonical_basis(canonical, order);
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);  // node coordinates requested for the canonical type and order
 
-    for (std::size_t i = 0; i < 3; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), expected_hessians[i], 1e-14);
+        EXPECT_EQ(alias_basis.element_type(), canonical);  // the alias basis must report the canonical element type
+        EXPECT_EQ(alias_basis.order(), order);
+        expect_nodes_close(alias_basis.nodes(), generated, Real(1e-14));  // alias nodes against the generated layout
+        expect_nodes_close(alias_basis.nodes(), canonical_basis.nodes(), Real(1e-14));  // alias nodes against the canonical basis nodes
+        expect_evaluations_match(alias_basis,
+                                 canonical_basis,
+                                 sample_points_for(canonical),  // every canonical type listed above has a row in canonical_cases()
+                                 Real(1e-12));
     }
 }
 
-TEST(LagrangeBasis, OracleTriangle3ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Triangle3, 1);
-    const Point xi{Real(0.2), Real(0.3), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 3u);
-    const Point expected_gradients[] = {
-        Point{Real(-1), Real(-1), Real(0)},
-        Point{Real(1), Real(0), Real(0)},
-        Point{Real(0), Real(1), Real(0)}
+TEST(LagrangeBasis, NodeOrderingMatchesPublicAliasLayouts) {
+    const std::vector<std::tuple<ElementType, ElementType, int>> aliases = {
+        {ElementType::Line2, ElementType::Line2, 1},
+        {ElementType::Line3, ElementType::Line2, 2},
+        {ElementType::Triangle3, ElementType::Triangle3, 1},
+        {ElementType::Triangle6, ElementType::Triangle3, 2},
+        {ElementType::Quad4, ElementType::Quad4, 1},
+        {ElementType::Quad9, ElementType::Quad4, 2},
+        {ElementType::Tetra4, ElementType::Tetra4, 1},
+        {ElementType::Tetra10, ElementType::Tetra4, 2},
+        {ElementType::Hex8, ElementType::Hex8, 1},
+        {ElementType::Hex27, ElementType::Hex8, 2},
+        {ElementType::Wedge6, ElementType::Wedge6, 1},
+        {ElementType::Wedge18, ElementType::Wedge6, 2},
     };
-    const Real expected_values[] = {Real(0.5), Real(0.2), Real(0.3)};
-
-    for (std::size_t i = 0; i < 3; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        for (int a = 0; a < 2; ++a) {
-            for (int b = 0; b < 2; ++b) {
-                EXPECT_NEAR(hessians[i](static_cast<std::size_t>(a), static_cast<std::size_t>(b)),
-                            Real(0), 1e-14);
-            }
-        }
-    }
-}
-
-TEST(LagrangeBasis, OracleQuad4ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Quad4, 1);
-    const Point xi{Real(0.2), Real(-0.4), Real(0)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
 
-    ASSERT_EQ(values.size(), 4u);
-    const Real expected_values[] = {Real(7) / Real(25), Real(21) / Real(50),
-                                    Real(9) / Real(50), Real(3) / Real(25)};
-    const Point expected_gradients[] = {
-        Point{Real(-7) / Real(20), Real(-1) / Real(5), Real(0)},
-        Point{Real(7) / Real(20), Real(-3) / Real(10), Real(0)},
-        Point{Real(3) / Real(20), Real(3) / Real(10), Real(0)},
-        Point{Real(-3) / Real(20), Real(1) / Real(5), Real(0)}
-    };
-    const Real expected_hxy[] = {Real(1) / Real(4), Real(-1) / Real(4),
-                                 Real(1) / Real(4), Real(-1) / Real(4)};
+    for (const auto& [alias, canonical, order] : aliases) {
+        const auto generated = ReferenceNodeLayout::get_lagrange_node_coords(canonical, order);
+        ASSERT_EQ(generated.size(), ReferenceNodeLayout::num_nodes(alias));
 
-    for (std::size_t i = 0; i < 4; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 1), expected_hxy[i], 1e-14);
-        EXPECT_NEAR(hessians[i](1, 0), expected_hxy[i], 1e-14);
+        for (std::size_t i = 0; i < generated.size(); ++i) {
+            const auto public_node = ReferenceNodeLayout::get_node_coords(alias, i);
+            EXPECT_NEAR(public_node[0], generated[i][0], Real(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[1], generated[i][1], Real(1e-14)) << "node=" << i;
+            EXPECT_NEAR(public_node[2], generated[i][2], Real(1e-14)) << "node=" << i;
+        }
     }
 }
 
-TEST(LagrangeBasis, OracleWedge6ValuesGradientsAndHessians) {
-    LagrangeBasis basis(ElementType::Wedge6, 1);
-    const Point xi{Real(0.2), Real(0.25), Real(-0.3)};
-
-    std::vector<Real> values;
-    std::vector<Gradient> gradients;
-    std::vector<Hessian> hessians;
-    basis.evaluate_values(xi, values);
-    basis.evaluate_gradients(xi, gradients);
-    basis.evaluate_hessians(xi, hessians);
-
-    ASSERT_EQ(values.size(), 6u);
-    const Real expected_values[] = {
-        Real(143) / Real(400), Real(13) / Real(100), Real(13) / Real(80),
-        Real(77) / Real(400), Real(7) / Real(100), Real(7) / Real(80)
-    };
-    const Point expected_gradients[] = {
-        Point{Real(-13) / Real(20), Real(-13) / Real(20), Real(-11) / Real(40)},
-        Point{Real(13) / Real(20), Real(0), Real(-1) / Real(10)},
-        Point{Real(0), Real(13) / Real(20), Real(-1) / Real(8)},
-        Point{Real(-7) / Real(20), Real(-7) / Real(20), Real(11) / Real(40)},
-        Point{Real(7) / Real(20), Real(0), Real(1) / Real(10)},
-        Point{Real(0), Real(7) / Real(20), Real(1) / Real(8)}
-    };
-    const Point expected_hxz[] = {
-        Point{Real(1) / Real(2), Real(1) / Real(2), Real(0)},
-        Point{Real(-1) / Real(2), Real(0), Real(0)},
-        Point{Real(0), Real(-1) / Real(2), Real(0)},
-        Point{Real(-1) / Real(2), Real(-1) / Real(2), Real(0)},
-        Point{Real(1) / Real(2), Real(0), Real(0)},
-        Point{Real(0), Real(1) / Real(2), Real(0)}
+TEST(LagrangeBasis, RemovedOrSerendipityFamiliesAreRejected) {
+    const std::array<ElementType, 6> unsupported = {
+        ElementType::Quad8,
+        ElementType::Hex20,
+        ElementType::Wedge15,
+        ElementType::Pyramid5,
+        ElementType::Pyramid13,
+        ElementType::Pyramid14,
     };
 
-    for (std::size_t i = 0; i < 6; ++i) {
-        EXPECT_NEAR(values[i], expected_values[i], 1e-14);
-        EXPECT_NEAR(gradients[i][0], expected_gradients[i][0], 1e-14);
-        EXPECT_NEAR(gradients[i][1], expected_gradients[i][1], 1e-14);
-        EXPECT_NEAR(gradients[i][2], expected_gradients[i][2], 1e-14);
-        EXPECT_NEAR(hessians[i](0, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](2, 2), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 1), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](1, 0), Real(0), 1e-14);
-        EXPECT_NEAR(hessians[i](0, 2), expected_hxz[i][0], 1e-14);
-        EXPECT_NEAR(hessians[i](2, 0), expected_hxz[i][0], 1e-14);
-        EXPECT_NEAR(hessians[i](1, 2), expected_hxz[i][1], 1e-14);
-        EXPECT_NEAR(hessians[i](2, 1), expected_hxz[i][1], 1e-14);
+    for (const auto type : unsupported) {
+        EXPECT_THROW((void)LagrangeBasis(type, 2), BasisElementCompatibilityException)
+            << "element=" << static_cast<int>(type);
     }
 }
 
-TEST(LagrangeBasis, DeterministicBoundarySweepMaintainsPartitionAndFiniteDerivatives) {
-    const std::vector<std::pair<ElementType, int>> cases = {
-        {ElementType::Line2, 1},
-        {ElementType::Line3, 2},
-        {ElementType::Triangle3, 1},
-        {ElementType::Triangle6, 2},
-        {ElementType::Quad4, 1},
-        {ElementType::Quad9, 2},
-        {ElementType::Tetra4, 1},
-        {ElementType::Tetra10, 2},
-        {ElementType::Hex8, 1},
-        {ElementType::Hex27, 2},
-        {ElementType::Wedge6, 1},
-        {ElementType::Wedge18, 2},
-        {ElementType::Pyramid5, 1},
-        {ElementType::Pyramid14, 2},
+TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
+    const std::vector<std::pair<ElementType, Point>> cases = {
+        {ElementType::Line2, {Real(-0.2), Real(0), Real(0)}},
+        {ElementType::Triangle3, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Quad4, {Real(0.25), Real(-0.4), Real(0)}},
+        {ElementType::Tetra4, {Real(0.1), Real(0.2), Real(0.3)}},
+        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
+        {ElementType::Wedge6, {Real(0.2), Real(0.15), Real(-0.3)}},
     };
+    const Gradient expected_gradient = linear_gradient();
 
-    for (const auto& [type, order] : cases) {
-        LagrangeBasis basis(type, order);
-        for (const auto& xi : boundary_stress_points_for(type)) {
-            std::vector<Real> values;
-            std::vector<Gradient> gradients;
-            std::vector<Hessian> hessians;
-            basis.evaluate_values(xi, values);
-            basis.evaluate_gradients(xi, gradients);
-            basis.evaluate_hessians(xi, hessians);
+    for (const auto& [type, point] : cases) {
+        LagrangeBasis basis(type, 1);
+        std::vector<Real> values;
+        std::vector<Gradient> gradients;
+        basis.evaluate_values(point, values);
+        basis.evaluate_gradients(point, gradients);
 
-            ASSERT_EQ(values.size(), basis.size());
-            ASSERT_EQ(gradients.size(), basis.size());
-            ASSERT_EQ(hessians.size(), basis.size());
+        const Real interpolated =
+            interpolate_value(basis, values, linear_function);
+        EXPECT_NEAR(interpolated, linear_function(point), Real(1e-12));
 
-            Real sum = Real(0);
-            for (Real value : values) {
-                EXPECT_TRUE(std::isfinite(value));
-                sum += value;
+        Gradient interpolated_gradient{};
+        for (std::size_t i = 0; i < gradients.size(); ++i) {
+            const Real nodal_value = linear_function(basis.nodes()[i]);
+            for (int d = 0; d < basis.dimension(); ++d) {
+                interpolated_gradient[static_cast<std::size_t>(d)] +=
+                    nodal_value * gradients[i][static_cast<std::size_t>(d)];
             }
-            expect_all_finite(gradients);
-            expect_hessians_finite(hessians, basis.dimension());
-            EXPECT_NEAR(sum, Real(1), type == ElementType::Pyramid5 || type == ElementType::Pyramid14
-                                       ? Real(1e-8)
-                                       : Real(1e-12))
-                << "type=" << static_cast<int>(type)
-                << ", order=" << order
-                << ", xi=(" << xi[0] << "," << xi[1] << "," << xi[2] << ")";
         }
-    }
-}
-
-TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedLinearShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
-        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
-        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
-        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
-        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
-        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
-        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
-    };
-
-    for (const auto& c : cases) {
-        expect_gradients_match_finite_difference(c, Real(1e-6), Real(1e-6));
-    }
-}
-
-TEST(LagrangeBasis, FiniteDifferenceGradientsAcrossSupportedQuadraticShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
-        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
-        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
-        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
-        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
-        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
-        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
-    };
-
-    for (const auto& c : cases) {
-        expect_gradients_match_finite_difference(c, Real(1e-6), Real(2e-6));
-    }
-}
-
-TEST(LagrangeBasis, LinearPolynomialReproductionAcrossSupportedLinearShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 1, sample_points_for(ElementType::Line2)},
-        {ElementType::Triangle3, 1, sample_points_for(ElementType::Triangle3)},
-        {ElementType::Quad4, 1, sample_points_for(ElementType::Quad4)},
-        {ElementType::Tetra4, 1, sample_points_for(ElementType::Tetra4)},
-        {ElementType::Hex8, 1, sample_points_for(ElementType::Hex8)},
-        {ElementType::Wedge6, 1, sample_points_for(ElementType::Wedge6)},
-        {ElementType::Pyramid5, 1, sample_points_for(ElementType::Pyramid5)},
-    };
-
-    const std::vector<std::array<int, 3>> exponents = {
-        {0, 0, 0},
-        {1, 0, 0},
-        {0, 1, 0},
-        {0, 0, 1},
-    };
-
-    for (const auto& c : cases) {
-        const std::vector<std::array<int, 3>> relevant(
-            exponents.begin(),
-            exponents.begin() + static_cast<std::ptrdiff_t>(c.type == ElementType::Line2 ? 2 :
-                                                            (c.type == ElementType::Triangle3 ||
-                                                             c.type == ElementType::Quad4) ? 3 : 4));
-        expect_polynomial_reproduction(c, relevant, Real(1e-12));
-    }
-}
-
-TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossSupportedQuadraticShapes) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line3, 2, sample_points_for(ElementType::Line3)},
-        {ElementType::Triangle6, 2, sample_points_for(ElementType::Triangle6)},
-        {ElementType::Quad9, 2, sample_points_for(ElementType::Quad9)},
-        {ElementType::Tetra10, 2, sample_points_for(ElementType::Tetra10)},
-        {ElementType::Hex27, 2, sample_points_for(ElementType::Hex27)},
-        {ElementType::Wedge18, 2, sample_points_for(ElementType::Wedge18)},
-        {ElementType::Pyramid14, 2, sample_points_for(ElementType::Pyramid14)},
-    };
-
-    const std::vector<std::array<int, 3>> line_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {2, 0, 0}
-    };
-    const std::vector<std::array<int, 3>> surface_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {0, 1, 0},
-        {2, 0, 0}, {1, 1, 0}, {0, 2, 0}
-    };
-    const std::vector<std::array<int, 3>> volume_exponents = {
-        {0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 1},
-        {2, 0, 0}, {1, 1, 0}, {0, 2, 0},
-        {1, 0, 1}, {0, 1, 1}, {0, 0, 2}
-    };
-
-    for (const auto& c : cases) {
-        if (c.type == ElementType::Line3) {
-            expect_polynomial_reproduction(c, line_exponents, Real(1e-12));
-        } else if (c.type == ElementType::Triangle6 || c.type == ElementType::Quad9) {
-            expect_polynomial_reproduction(c, surface_exponents, Real(1e-11));
-        } else {
-            expect_polynomial_reproduction(c, volume_exponents, Real(2e-10));
+        for (int d = 0; d < basis.dimension(); ++d) {
+            EXPECT_NEAR(interpolated_gradient[static_cast<std::size_t>(d)],
+                        expected_gradient[static_cast<std::size_t>(d)],
+                        Real(1e-12));
         }
     }
 }
 
-TEST(LagrangeBasis, HighOrderTensorLagrangeMaintainsPartitionAndDerivativeSums) {
-    const std::vector<LagrangeAccuracyCase> cases = {
-        {ElementType::Line2, 8, {Point{-0.875, 0, 0}, Point{0.125, 0, 0}, Point{1, 0, 0}}},
-        {ElementType::Quad4, 7, {Point{0.2, -0.35, 0}, Point{-1, 0.5, 0}, Point{0.5, 1, 0}}},
-        {ElementType::Hex8, 6, {Point{0.1, -0.2, 0.3}, Point{-1, 0.5, 1}, Point{0.75, -1, -0.5}}},
+TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
+    const std::vector<std::pair<ElementType, Point>> cases = {
+        {ElementType::Line3, {Real(-0.2), Real(0), Real(0)}},
+        {ElementType::Triangle6, {Real(0.2), Real(0.3), Real(0)}},
+        {ElementType::Quad9, {Real(0.25), Real(-0.4), Real(0)}},
+        {ElementType::Tetra10, {Real(0.1), Real(0.2), Real(0.3)}},
+        {ElementType::Hex27, {Real(0.15), Real(-0.2), Real(0.25)}},
+        {ElementType::Wedge18, {Real(0.2), Real(0.15), Real(-0.3)}},
     };
 
-    for (const auto& c : cases) {
-        LagrangeBasis basis(c.type, c.order);
-        expect_partition_gradient_hessian_sums(basis, c.points, Real(2e-12), Real(2e-8));
-    }
-}
-
-TEST(LagrangeBasis, HighOrderTensorLagrangeReproducesTensorPolynomials) {
-    const LagrangeAccuracyCase line{ElementType::Line2,
-                                    8,
-                                    {Point{-0.73, 0, 0}, Point{-0.1, 0, 0}, Point{0.64, 0, 0}}};
-    expect_polynomial_reproduction(line,
-                                   {{0, 0, 0}, {1, 0, 0}, {4, 0, 0}, {8, 0, 0}},
-                                   Real(1e-11));
-
-    const LagrangeAccuracyCase quad{ElementType::Quad4,
-                                    7,
-                                    {Point{-0.6, -0.2, 0}, Point{0.15, 0.45, 0}, Point{0.8, -0.55, 0}}};
-    expect_polynomial_reproduction(quad,
-                                   {{0, 0, 0}, {7, 0, 0}, {0, 7, 0}, {4, 3, 0}},
-                                   Real(5e-10));
-
-    const LagrangeAccuracyCase hex{ElementType::Hex8,
-                                   6,
-                                   {Point{-0.4, 0.2, -0.3}, Point{0.35, -0.55, 0.25}, Point{0.75, 0.4, -0.65}}};
-    expect_polynomial_reproduction(hex,
-                                   {{0, 0, 0}, {6, 0, 0}, {0, 6, 0}, {0, 0, 6}, {3, 2, 4}},
-                                   Real(2e-9));
+    for (const auto& [type, point] : cases) {
+        LagrangeBasis basis(type, 1);
+        std::vector<Real> values;
+        basis.evaluate_values(point, values);
+
+        const Real interpolated =
+            interpolate_value(basis, values, quadratic_function);
+        EXPECT_NEAR(interpolated, quadratic_function(point), Real(5e-12))
+            << "element=" << static_cast<int>(type);
+    }
+}
+
+TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
+    auto lagrange =
+        basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1});
+    ASSERT_NE(lagrange, nullptr);
+    EXPECT_EQ(lagrange->basis_type(), BasisType::Lagrange);
+    EXPECT_EQ(lagrange->element_type(), ElementType::Hex8);
+    EXPECT_EQ(lagrange->order(), 2);
+
+    auto serendipity =
+        basis_factory::create(BasisRequest{ElementType::Quad8, BasisType::Serendipity, 2});
+    ASSERT_NE(serendipity, nullptr);
+    EXPECT_EQ(serendipity->basis_type(), BasisType::Serendipity);
+
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid5, BasisType::Lagrange, 1}),
+                 BasisElementCompatibilityException);
+    EXPECT_THROW((void)basis_factory::create(
+                     BasisRequest{ElementType::Pyramid13, BasisType::Serendipity, 2}),
+                 BasisElementCompatibilityException);
 }
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
index 9f2bf8be5..30f876420 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -98,19 +98,9 @@ TEST(SerendipityBasis, Wedge15IsNodalAndPartitionsUnity) {
     expect_partition_of_unity(basis, {Real(0.2), Real(0.3), Real(0.1)});
 }
 
-TEST(SerendipityBasis, Pyramid13IsNodalAndPartitionsUnity) {
-    SerendipityBasis basis(ElementType::Pyramid13, 2);
-
-    EXPECT_EQ(basis.size(), 13u);
-    expect_nodal_delta(basis,
-                       reference_nodes(ElementType::Pyramid13, basis.size()),
-                       Real(1e-8));
-    expect_partition_of_unity(basis, {Real(0.1), Real(-0.2), Real(0.4)});
-}
-
 TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Quad9, 2), FEException);
+    EXPECT_THROW(SerendipityBasis(ElementType::Pyramid13, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
 }
-
diff --git a/tests/unitTests/test_common.h b/tests/unitTests/test_common.h
index 98709f600..7227b2beb 100644
--- a/tests/unitTests/test_common.h
+++ b/tests/unitTests/test_common.h
@@ -33,7 +33,6 @@
 #include <stdlib.h>
 #include <iostream>
 #include <random>
-#include <chrono>
 #include "CepMod.h"
 #include "ComMod.h"
 #include "gtest/gtest.h"
@@ -96,4 +95,4 @@ class TestBase {
 };
 
 
-#endif
\ No newline at end of file
+#endif

From 3876ee1fb1c0cd3231a8a2fdf4ea79b10c1dac24 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 13:11:04 -0700
Subject: [PATCH 07/22] removing prewarmed evaluations and switching to std
 library constants; removed associated unit tests for these changes

---
 Code/Source/solver/CMakeLists.txt             |   6 -
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  10 -
 Code/Source/solver/FE/Basis/BasisFunction.h   |   2 -
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |   5 -
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |   2 -
 Code/Source/solver/FE/Math/MathConstants.h    | 388 ------------------
 Code/Source/solver/FE/Math/Matrix.h           |   1 -
 Code/Source/solver/FE/Math/Vector.h           |  19 +-
 Code/Source/solver/nn.cpp                     | 144 +++----
 .../FE/Basis/test_BasisErrorPaths.cpp         |   1 -
 .../unitTests/FE/Math/test_ExpressionOps.cpp  |   1 -
 .../unitTests/FE/Math/test_MathConstants.cpp  | 341 ---------------
 tests/unitTests/FE/Math/test_Matrix.cpp       |   1 -
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |   1 -
 tests/unitTests/FE/Math/test_Vector.cpp       |   1 -
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |   1 -
 16 files changed, 77 insertions(+), 847 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/MathConstants.h
 delete mode 100644 tests/unitTests/FE/Math/test_MathConstants.cpp

diff --git a/Code/Source/solver/CMakeLists.txt b/Code/Source/solver/CMakeLists.txt
index bdebc4a52..eace4d0b2 100644
--- a/Code/Source/solver/CMakeLists.txt
+++ b/Code/Source/solver/CMakeLists.txt
@@ -258,17 +258,11 @@ file(GLOB SOLVER_FE_MATH_SRCS CONFIGURE_DEPENDS
   FE/Math/*.h
 )
 
-file(GLOB SOLVER_FE_QUADRATURE_SRCS CONFIGURE_DEPENDS
-  FE/Quadrature/*.cpp
-  FE/Quadrature/*.h
-)
-
 list(APPEND CSRCS
   ${SOLVER_CORE_SRCS}
   ${SOLVER_FE_COMMON_SRCS}
   ${SOLVER_FE_BASIS_SRCS}
   ${SOLVER_FE_MATH_SRCS}
-  ${SOLVER_FE_QUADRATURE_SRCS}
 )
 
   # Set PETSc interace code.
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 2a1d4f6b0..578c46c88 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -19,12 +19,6 @@ struct BasisFunctionScratch {
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
-
-    void prewarm(std::size_t max_size) {
-        values.reserve(max_size);
-        gradients.reserve(max_size);
-        hessians.reserve(max_size);
-    }
 };
 
 BasisFunctionScratch& scratch() {
@@ -34,10 +28,6 @@ BasisFunctionScratch& scratch() {
 
 } // namespace
 
-void prewarm_basis_function_scratch(std::size_t max_size) {
-    scratch().prewarm(max_size);
-}
-
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index dbabf7061..bf6ac5de7 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -23,8 +23,6 @@ namespace basis {
 using Gradient = math::Vector<Real, 3>;
 using Hessian  = math::Matrix<Real, 3, 3>;
 
-void prewarm_basis_function_scratch(std::size_t max_size);
-
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
                                                     Real yy,
                                                     Real zz,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 7516d514a..372209722 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -303,11 +303,6 @@ void store_gradient(const Gradient& gradient, Real* dst) {
 
 } // namespace
 
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts) {
-    const auto n = static_cast<std::size_t>(std::max(0, max_order) + 1);
-    prewarm_basis_function_scratch(std::max(n * n * n, max_qpts));
-}
-
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
     : element_type_(type), order_(order) {
     const auto normalized = normalize_lagrange_request(element_type_, order_);
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index a5fe8e0fa..dae149872 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -18,8 +18,6 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-void prewarm_lagrange_basis_scratch(int max_order, std::size_t max_qpts = 0);
-
 class LagrangeBasis : public BasisFunction {
 public:
     using TensorNodeIndex = std::array<std::size_t, 3>;
diff --git a/Code/Source/solver/FE/Math/MathConstants.h b/Code/Source/solver/FE/Math/MathConstants.h
deleted file mode 100644
index 145520ab2..000000000
--- a/Code/Source/solver/FE/Math/MathConstants.h
+++ /dev/null
@@ -1,388 +0,0 @@
-#ifndef SVMP_FE_MATH_CONSTANTS_H
-#define SVMP_FE_MATH_CONSTANTS_H
-
-/**
- * @file MathConstants.h
- * @brief Mathematical constants and numerical tolerances for FE computations
- *
- * This header provides mathematical constants (π, e, √2, etc.) and numerical
- * tolerances used throughout the FE library. All constants are templated
- * to support different precision types.
- */
-
-#include <cmath>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-/**
- * @brief Mathematical constants templated by type
- * @tparam T The numeric type (float, double, long double)
- */
-template<typename T>
-struct Constants {
-    static_assert(std::is_floating_point_v<T>,
-                  "Constants only defined for floating-point types");
-
-    // Mathematical constants
-    static constexpr T pi           = T(3.14159265358979323846264338327950288419716939937510L);
-    static constexpr T two_pi       = T(6.28318530717958647692528676655900576839433879875021L);
-    static constexpr T half_pi      = T(1.57079632679489661923132169163975144209858469968755L);
-    static constexpr T quarter_pi   = T(0.78539816339744830961566084581987572104929234984378L);
-    static constexpr T inv_pi       = T(0.31830988618379067153776752674502872406891929148091L);
-    static constexpr T inv_two_pi   = T(0.15915494309189533576888376337251436203445964574046L);
-
-    static constexpr T e            = T(2.71828182845904523536028747135266249775724709369995L);
-    static constexpr T log2e        = T(1.44269504088896340735992468100189213742664595415299L);
-    static constexpr T log10e       = T(0.43429448190325182765112891891660508229439700580367L);
-    static constexpr T ln2          = T(0.69314718055994530941723212145817656807550013436026L);
-    static constexpr T ln10         = T(2.30258509299404568401799145468436420760110148862877L);
-
-    static constexpr T sqrt2        = T(1.41421356237309504880168872420969807856967187537694L);
-    static constexpr T sqrt3        = T(1.73205080756887729352744634150587236694280525381038L);
-    static constexpr T inv_sqrt2    = T(0.70710678118654752440084436210484903928483593768847L);
-    static constexpr T inv_sqrt3    = T(0.57735026918962576450914878050195745564760175127013L);
-
-    // Golden ratio
-    static constexpr T phi          = T(1.61803398874989484820458683436563811772030917980576L);
-
-    // Degrees to radians conversion
-    static constexpr T deg_to_rad   = pi / T(180);
-    static constexpr T rad_to_deg   = T(180) / pi;
-};
-
-/**
- * @brief Numerical tolerances and machine epsilon
- * @tparam T The numeric type
- */
-template<typename T>
-struct Tolerances {
-    static_assert(std::is_floating_point_v<T>,
-                  "Tolerances only defined for floating-point types");
-
-    // Machine epsilon
-    static constexpr T epsilon      = std::numeric_limits<T>::epsilon();
-
-    // Default tolerance (1000 * machine epsilon)
-    static constexpr T tolerance    = T(1000) * epsilon;
-
-    // Strict tolerance (10 * machine epsilon)
-    static constexpr T strict       = T(10) * epsilon;
-
-    // Loose tolerance (10000 * machine epsilon)
-    static constexpr T loose        = T(10000) * epsilon;
-
-    // Square root of epsilon (useful for finite differences)
-    static inline const T sqrt_epsilon = std::sqrt(epsilon);
-
-    // Cube root of epsilon (useful for numerical derivatives)
-    static inline const T cbrt_epsilon = std::cbrt(epsilon);
-
-    // Smallest positive normalized value
-    static constexpr T min_positive = std::numeric_limits<T>::min();
-
-    // Largest representable value
-    static constexpr T max_value    = std::numeric_limits<T>::max();
-
-    // Infinity
-    static constexpr T infinity     = std::numeric_limits<T>::infinity();
-
-    // Not-a-Number
-    static constexpr T nan          = std::numeric_limits<T>::quiet_NaN();
-};
-
-/**
- * @brief Convenient aliases for common types
- */
-template<typename T> inline constexpr T pi           = Constants<T>::pi;
-template<typename T> inline constexpr T two_pi       = Constants<T>::two_pi;
-template<typename T> inline constexpr T half_pi      = Constants<T>::half_pi;
-template<typename T> inline constexpr T quarter_pi   = Constants<T>::quarter_pi;
-template<typename T> inline constexpr T inv_pi       = Constants<T>::inv_pi;
-template<typename T> inline constexpr T inv_two_pi   = Constants<T>::inv_two_pi;
-
-template<typename T> inline constexpr T e            = Constants<T>::e;
-template<typename T> inline constexpr T log2e        = Constants<T>::log2e;
-template<typename T> inline constexpr T log10e       = Constants<T>::log10e;
-template<typename T> inline constexpr T ln2          = Constants<T>::ln2;
-template<typename T> inline constexpr T ln10         = Constants<T>::ln10;
-
-template<typename T> inline constexpr T sqrt2        = Constants<T>::sqrt2;
-template<typename T> inline constexpr T sqrt3        = Constants<T>::sqrt3;
-template<typename T> inline constexpr T inv_sqrt2    = Constants<T>::inv_sqrt2;
-template<typename T> inline constexpr T inv_sqrt3    = Constants<T>::inv_sqrt3;
-
-template<typename T> inline constexpr T phi          = Constants<T>::phi;
-
-template<typename T> inline constexpr T deg_to_rad   = Constants<T>::deg_to_rad;
-template<typename T> inline constexpr T rad_to_deg   = Constants<T>::rad_to_deg;
-
-template<typename T> inline constexpr T epsilon      = Tolerances<T>::epsilon;
-template<typename T> inline constexpr T tolerance    = Tolerances<T>::tolerance;
-template<typename T> inline constexpr T strict_tol   = Tolerances<T>::strict;
-template<typename T> inline constexpr T loose_tol    = Tolerances<T>::loose;
-template<typename T> inline const T sqrt_epsilon = Tolerances<T>::sqrt_epsilon;
-template<typename T> inline const T cbrt_epsilon = Tolerances<T>::cbrt_epsilon;
-template<typename T> inline constexpr T min_positive = Tolerances<T>::min_positive;
-template<typename T> inline constexpr T max_value    = Tolerances<T>::max_value;
-template<typename T> inline constexpr T infinity     = Tolerances<T>::infinity;
-
-/**
- * @brief Comparison functions with tolerance
- */
-
-/**
- * @brief Check if two values are approximately equal
- * @param a First value
- * @param b Second value
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if |a - b| <= tol * max(|a|, |b|, 1)
- */
-template<typename T>
-inline constexpr bool approx_equal(T a, T b, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "approx_equal only defined for floating-point types");
-    const T scale = std::max({std::abs(a), std::abs(b), T(1)});
-    return std::abs(a - b) <= tol * scale;
-}
-
-/**
- * @brief Check if a value is approximately zero
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if |a| <= tol
- */
-template<typename T>
-inline constexpr bool approx_zero(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "approx_zero only defined for floating-point types");
-    return std::abs(a) <= tol;
-}
-
-/**
- * @brief Check if a value is positive (greater than tolerance)
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if a > tol
- */
-template<typename T>
-inline constexpr bool is_positive(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_positive only defined for floating-point types");
-    return a > tol;
-}
-
-/**
- * @brief Check if a value is negative (less than -tolerance)
- * @param a Value to check
- * @param tol Tolerance (default: 1000 * epsilon)
- * @return true if a < -tol
- */
-template<typename T>
-inline constexpr bool is_negative(T a, T tol = tolerance<T>) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_negative only defined for floating-point types");
-    return a < -tol;
-}
-
-/**
- * @brief Check if a value is finite (not infinite or NaN)
- * @param a Value to check
- * @return true if value is finite
- */
-template<typename T>
-inline constexpr bool is_finite(T a) {
-    static_assert(std::is_floating_point_v<T>,
-                  "is_finite only defined for floating-point types");
-    return std::isfinite(a);
-}
-
-/**
- * @brief Degrees to radians conversion
- * @param degrees Angle in degrees
- * @return Angle in radians
- */
-template<typename T>
-inline constexpr T to_radians(T degrees) {
-    static_assert(std::is_floating_point_v<T>,
-                  "to_radians only defined for floating-point types");
-    return degrees * deg_to_rad<T>;
-}
-
-/**
- * @brief Radians to degrees conversion
- * @param radians Angle in radians
- * @return Angle in degrees
- */
-template<typename T>
-inline constexpr T to_degrees(T radians) {
-    static_assert(std::is_floating_point_v<T>,
-                  "to_degrees only defined for floating-point types");
-    return radians * rad_to_deg<T>;
-}
-
-// =============================================================================
-// Constants namespace for compatibility with test expectations
-// =============================================================================
-namespace constants {
-
-// Mathematical constants (double precision defaults)
-inline constexpr double PI         = Constants<double>::pi;
-inline constexpr double PI_2       = Constants<double>::half_pi;
-inline constexpr double PI_4       = Constants<double>::quarter_pi;
-inline constexpr double TWO_PI     = Constants<double>::two_pi;
-inline constexpr double INV_PI     = Constants<double>::inv_pi;
-
-inline constexpr double E          = Constants<double>::e;
-inline constexpr double LN_2       = Constants<double>::ln2;
-inline constexpr double LN_10      = Constants<double>::ln10;
-inline constexpr double LOG10_E    = Constants<double>::log10e;
-inline constexpr double LOG2_E     = Constants<double>::log2e;
-
-inline constexpr double SQRT_2     = Constants<double>::sqrt2;
-inline constexpr double SQRT_3     = Constants<double>::sqrt3;
-inline constexpr double SQRT_5     = 2.2360679774997896964091736687312L;
-inline constexpr double INV_SQRT_2  = Constants<double>::inv_sqrt2;
-inline constexpr double INV_SQRT_3  = Constants<double>::inv_sqrt3;
-
-inline constexpr double PHI        = Constants<double>::phi;
-
-// Angle conversion functions
-template<typename T>
-inline constexpr T deg_to_rad(T degrees) {
-    return degrees * Constants<T>::deg_to_rad;
-}
-
-template<typename T>
-inline constexpr T rad_to_deg(T radians) {
-    return radians * Constants<T>::rad_to_deg;
-}
-
-// Templated tolerances
-template<typename T>
-inline constexpr T tolerance() {
-    return Tolerances<T>::tolerance;
-}
-
-template<typename T>
-inline constexpr T machine_epsilon() {
-    return Tolerances<T>::epsilon;
-}
-
-// Additional constants and utility functions for tests
-inline constexpr double DEFAULT_TOLERANCE = Tolerances<double>::tolerance;
-inline constexpr double DEFAULT_REL_TOLERANCE = 1e-12;
-inline constexpr double GEOMETRY_TOLERANCE = 1e-10;
-inline constexpr double SOLVER_TOLERANCE = Tolerances<double>::strict;
-inline constexpr double EPSILON = Tolerances<double>::epsilon;
-inline constexpr double INF_VALUE = Tolerances<double>::infinity;  // Renamed from INFINITY
-inline constexpr double NOT_A_NUMBER = Tolerances<double>::nan;  // Renamed from NAN
-inline constexpr double MAX_DOUBLE = Tolerances<double>::max_value;
-inline constexpr double MIN_DOUBLE = Tolerances<double>::min_positive;
-inline constexpr double LOWEST_DOUBLE = -Tolerances<double>::max_value;
-
-// Physical constants
-inline constexpr double SPEED_OF_LIGHT = 299792458.0;         // m/s
-inline constexpr double GRAVITATIONAL_CONSTANT = 6.67430e-11;  // m³/(kg·s²)
-inline constexpr double PLANCK_CONSTANT = 6.62607015e-34;      // J·s
-inline constexpr double AVOGADRO_NUMBER = 6.02214076e23;       // mol⁻¹
-inline constexpr double BOLTZMANN_CONSTANT = 1.380649e-23;     // J/K
-inline constexpr double STANDARD_GRAVITY = 9.80665;            // m/s²
-
-// Float and long double versions
-inline constexpr float PI_F = static_cast<float>(PI);
-inline constexpr float E_F = static_cast<float>(E);
-inline constexpr float SQRT_2_F = static_cast<float>(SQRT_2);
-inline constexpr float EPSILON_F = Tolerances<float>::epsilon;
-
-inline constexpr long double PI_L = static_cast<long double>(PI);
-inline constexpr long double E_L = static_cast<long double>(E);
-inline constexpr long double SQRT_2_L = static_cast<long double>(SQRT_2);
-inline constexpr long double EPSILON_L = Tolerances<long double>::epsilon;
-
-// Additional mathematical constants
-inline constexpr double SQRT_PI = 1.7724538509055160272981674833411L;
-
-// Utility functions
-template<typename T>
-inline constexpr int sign(T value) {
-    return (T(0) < value) - (value < T(0));
-}
-
-template<typename T>
-inline constexpr bool is_zero(T value, T tol = DEFAULT_TOLERANCE) {
-    return std::abs(value) <= tol;
-}
-
-template<typename T>
-inline bool near(T a, T b, T tol = DEFAULT_TOLERANCE) {
-    return std::abs(a - b) <= tol;
-}
-
-template<typename T>
-inline bool near_relative(T a, T b, T rel_tol = DEFAULT_REL_TOLERANCE) {
-    T scale = std::max(std::abs(a), std::abs(b));
-    return std::abs(a - b) <= rel_tol * scale;
-}
-
-template<typename T>
-inline constexpr T clamp(T value, T min_val, T max_val) {
-    return value < min_val ? min_val : (value > max_val ? max_val : value);
-}
-
-template<typename T>
-inline constexpr T lerp(T a, T b, T t) {
-    return a + t * (b - a);
-}
-
-template<typename T>
-inline T safe_divide(T numerator, T denominator, T default_val = T(0)) {
-    return is_zero(denominator) ? default_val : numerator / denominator;
-}
-
-template<typename T>
-inline bool isinf(T value) {
-    return std::isinf(value);
-}
-
-template<typename T>
-inline bool isnan(T value) {
-    return std::isnan(value);
-}
-
-} // namespace constants
-
-// Physical constants for FE analysis
-namespace physical_constants {
-
-// Material properties (SI units)
-inline constexpr double water_density = 1000.0;         // kg/m³
-inline constexpr double steel_density = 7850.0;         // kg/m³
-inline constexpr double aluminum_density = 2700.0;      // kg/m³
-
-inline constexpr double water_viscosity = 0.001;        // Pa·s at 20°C
-inline constexpr double air_viscosity = 1.81e-5;        // Pa·s at 20°C
-
-inline constexpr double steel_youngs_modulus = 200e9;   // Pa
-inline constexpr double aluminum_youngs_modulus = 70e9; // Pa
-
-inline constexpr double steel_poisson_ratio = 0.3;      // dimensionless
-inline constexpr double aluminum_poisson_ratio = 0.33;  // dimensionless
-
-// Physical constants
-inline constexpr double gravity = 9.80665;              // m/s²
-inline constexpr double gas_constant = 8.314462618;     // J/(mol·K)
-inline constexpr double boltzmann = 1.380649e-23;       // J/K
-inline constexpr double avogadro = 6.02214076e23;       // mol⁻¹
-
-} // namespace physical_constants
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_CONSTANTS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 6058ab943..8cb28e5d5 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -13,7 +13,6 @@
 
 #include "MatrixExpr.h"
 #include "Vector.h"
-#include "MathConstants.h"
 #include "../Common/Types.h"
 #include <algorithm>
 #include <array>
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 76c7be152..777f9945b 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -11,12 +11,12 @@
  */
 
 #include "VectorExpr.h"
-#include "MathConstants.h"
 #include "../Common/Types.h"
 #include <algorithm>
 #include <array>
 #include <cmath>
 #include <initializer_list>
+#include <limits>
 #include <ostream>
 #include <stdexcept>
 #include <type_traits>
@@ -25,6 +25,23 @@ namespace svmp {
 namespace FE {
 namespace math {
 
+template<typename T>
+inline constexpr T tolerance =  // default comparison tolerance: 1000 * machine epsilon for floating-point T, T(0) otherwise
+    std::is_floating_point_v<T> ? T(1000) * std::numeric_limits<T>::epsilon() : T(0);
+
+template<typename T>
+inline bool approx_zero(T value, T tol = tolerance<T>) {  // true when |value| <= tol (absolute test, no scaling)
+    using std::abs;  // unqualified call below: std::abs, or an abs found by argument-dependent lookup
+    return abs(value) <= tol;
+}
+
+template<typename T>
+inline bool approx_equal(T a, T b, T tol = tolerance<T>) {  // true when |a - b| <= tol * max(|a|, |b|, 1)
+    using std::abs;  // unqualified call below: std::abs, or an abs found by argument-dependent lookup
+    const T scale = std::max({abs(a), abs(b), T(1)});  // floor of 1: relative test for large magnitudes, absolute for small
+    return abs(a - b) <= tol * scale;
+}
+
 /**
  * @brief Fixed-size vector for element-level computations
  * @tparam T Scalar type (float, double)
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 51c126708..a9e0aebc3 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -35,6 +35,7 @@
 #include <math.h> 
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <vector>
 
@@ -152,36 +153,27 @@ std::string solver_element_name(consts::ElementType eType)
 
 std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
 {
-  switch (eType) {
-    case consts::ElementType::LIN1:
-      return BasisSelection{fe::ElementType::Line2, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::LIN2:
-      return BasisSelection{fe::ElementType::Line3, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::TRI3:
-      return BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::TRI6:
-      return BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::QUD4:
-      return BasisSelection{fe::ElementType::Quad4, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::QUD8:
-      return BasisSelection{fe::ElementType::Quad8, fe::BasisType::Serendipity, 2};
-    case consts::ElementType::QUD9:
-      return BasisSelection{fe::ElementType::Quad9, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::TET4:
-      return BasisSelection{fe::ElementType::Tetra4, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::TET10:
-      return BasisSelection{fe::ElementType::Tetra10, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::HEX8:
-      return BasisSelection{fe::ElementType::Hex8, fe::BasisType::Lagrange, 1};
-    case consts::ElementType::HEX20:
-      return BasisSelection{fe::ElementType::Hex20, fe::BasisType::Serendipity, 2};
-    case consts::ElementType::HEX27:
-      return BasisSelection{fe::ElementType::Hex27, fe::BasisType::Lagrange, 2};
-    case consts::ElementType::WDG:
-      return BasisSelection{fe::ElementType::Wedge6, fe::BasisType::Lagrange, 1};
-    default:
-      return std::nullopt;
-  }
+  // Rows are keyed by solver element type, so the lookup never depends on enumerator values or order.
+  struct Entry { consts::ElementType solver; BasisSelection selection; };
+  static constexpr std::array<Entry, 13> supported{{
+      {consts::ElementType::LIN1,  BasisSelection{fe::ElementType::Line2,     fe::BasisType::Lagrange,    1}},
+      {consts::ElementType::LIN2,  BasisSelection{fe::ElementType::Line3,     fe::BasisType::Lagrange,    2}},
+      {consts::ElementType::TRI3,  BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange,    1}},
+      {consts::ElementType::TRI6,  BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange,    2}},
+      {consts::ElementType::QUD4,  BasisSelection{fe::ElementType::Quad4,     fe::BasisType::Lagrange,    1}},
+      {consts::ElementType::QUD8,  BasisSelection{fe::ElementType::Quad8,     fe::BasisType::Serendipity, 2}},
+      {consts::ElementType::QUD9,  BasisSelection{fe::ElementType::Quad9,     fe::BasisType::Lagrange,    2}},
+      {consts::ElementType::TET4,  BasisSelection{fe::ElementType::Tetra4,    fe::BasisType::Lagrange,    1}},
+      {consts::ElementType::TET10, BasisSelection{fe::ElementType::Tetra10,   fe::BasisType::Lagrange,    2}},
+      {consts::ElementType::HEX8,  BasisSelection{fe::ElementType::Hex8,      fe::BasisType::Lagrange,    1}},
+      {consts::ElementType::HEX20, BasisSelection{fe::ElementType::Hex20,     fe::BasisType::Serendipity, 2}},
+      {consts::ElementType::HEX27, BasisSelection{fe::ElementType::Hex27,     fe::BasisType::Lagrange,    2}},
+      {consts::ElementType::WDG,   BasisSelection{fe::ElementType::Wedge6,    fe::BasisType::Lagrange,    1}},
+  }};
+  for (const Entry& entry : supported) {
+    if (entry.solver == eType) return entry.selection;  // explicit match, as the replaced switch did
+  }
+  return std::nullopt;  // element type has no FE basis equivalent
 }
 
 bool use_basis_adapter_for(consts::ElementType eType)
@@ -189,11 +181,6 @@ bool use_basis_adapter_for(consts::ElementType eType)
   return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
 }
 
-bool supports_basis_hessian_adapter_for(consts::ElementType eType)
-{
-  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
-}
-
 bool supports_face_basis_adapter_for(consts::ElementType eType)
 {
   if (!basis_mode_allows_fe_adapter()) {
@@ -223,26 +210,36 @@ std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::El
         __FILE__, __LINE__, __func__);
   }
 
-  febasis::BasisRequest request;
-  request.element_type = selection->element;
-  request.basis_type = selection->basis;
-  request.order = selection->order;
-  return febasis::basis_factory::create(request);
+  return febasis::basis_factory::create(
+      {selection->element, selection->basis, selection->order});
 }
 
-template <std::size_t NumNodes>
-std::size_t mapped_basis_index(const std::array<std::size_t, NumNodes>& map,
-                               consts::ElementType eType,
-                               const int solver_node)
+std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)  // node-index table for eType, or an empty span
 {
-  if (solver_node < 0 || static_cast<std::size_t>(solver_node) >= map.size()) {
-    throw febasis::BasisNodeOrderingException(
-        "Solver node " + std::to_string(solver_node) +
-            " is outside node map for " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
-  }
+  static constexpr std::array<std::size_t, 3> tri3{1, 2, 0};  // tables are indexed by solver node; basis_index_for_solver_node returns map[node]
+  static constexpr std::array<std::size_t, 6> tri6{1, 2, 0, 4, 5, 3};
+  static constexpr std::array<std::size_t, 4> tet4{1, 2, 3, 0};
+  static constexpr std::array<std::size_t, 10> tet10{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
+  static constexpr std::array<std::size_t, 27> hex27{  // entries 0-19 and 26 map to themselves; only 20-25 are permuted
+      0, 1, 2, 3, 4, 5, 6, 7,
+      8, 9, 10, 11, 12, 13, 14, 15,
+      16, 17, 18, 19, 25, 23, 22, 24, 20, 21, 26};
 
-  return map[static_cast<std::size_t>(solver_node)];
+  switch (eType) {
+    case consts::ElementType::TRI3:
+      return tri3;
+    case consts::ElementType::TRI6:
+    case consts::ElementType::WDG:  // WDG deliberately shares the 6-entry TRI6 table
+      return tri6;
+    case consts::ElementType::TET4:
+      return tet4;
+    case consts::ElementType::TET10:
+      return tet10;
+    case consts::ElementType::HEX27:
+      return hex27;
+    default:
+      return {};  // empty span: no table; basis_index_for_solver_node then returns the node index unchanged
+  }
 }
 
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
@@ -255,40 +252,17 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   }
 
   const auto node = static_cast<std::size_t>(solver_node);
-
-  switch (eType) {
-    case consts::ElementType::TRI3: {
-      static constexpr std::array<std::size_t, 3> map{1, 2, 0};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TRI6: {
-      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TET4: {
-      static constexpr std::array<std::size_t, 4> map{1, 2, 3, 0};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::TET10: {
-      static constexpr std::array<std::size_t, 10> map{1, 2, 3, 0, 5, 9, 8, 4, 6, 7};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::WDG: {
-      static constexpr std::array<std::size_t, 6> map{1, 2, 0, 4, 5, 3};
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    case consts::ElementType::HEX27: {
-      static constexpr std::array<std::size_t, 27> map{
-        0, 1, 2, 3, 4, 5, 6, 7,
-        8, 9, 10, 11, 12, 13, 14, 15,
-        16, 17, 18, 19,
-        25, 23, 22, 24, 20, 21, 26
-      };
-      return mapped_basis_index(map, eType, solver_node);
-    }
-    default:
-      return node;
+  const auto map = solver_to_basis_node_map(eType);
+  if (map.empty()) {
+    return node;
+  }
+  if (node < map.size()) {
+    return map[node];
   }
+  throw febasis::BasisNodeOrderingException(
+      "Solver node " + std::to_string(solver_node) +
+          " is outside node map for " + solver_element_name(eType),
+      __FILE__, __LINE__, __func__);
 }
 
 fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
@@ -710,7 +684,7 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
     return;
   }
 
-  if (supports_basis_hessian_adapter_for(eType)) {
+  if (use_basis_adapter_for(eType)) {
     try {
       evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
       return;
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 7838702b0..430390e54 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -156,7 +156,6 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
 TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
-    prewarm_basis_function_scratch(basis.size());
 
     std::vector<Real> flat_values(basis.size());
     std::vector<Real> flat_gradients(basis.size() * 3u);
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
index 307b308a1..a368e345e 100644
--- a/tests/unitTests/FE/Math/test_ExpressionOps.cpp
+++ b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/ExpressionOps.h"
 #include "FE/Math/Vector.h"
 #include "FE/Math/Matrix.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <complex>
diff --git a/tests/unitTests/FE/Math/test_MathConstants.cpp b/tests/unitTests/FE/Math/test_MathConstants.cpp
deleted file mode 100644
index 5619690ed..000000000
--- a/tests/unitTests/FE/Math/test_MathConstants.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/**
- * @file test_MathConstants.cpp
- * @brief Unit tests for MathConstants.h - mathematical constants and tolerances
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/MathConstants.h"
-#include <cmath>
-#include <limits>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for MathConstants tests
-class MathConstantsTest : public ::testing::Test {
-protected:
-    void SetUp() override {}
-    void TearDown() override {}
-};
-
-// =============================================================================
-// Mathematical Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, PiConstants) {
-    // Test PI value
-    EXPECT_NEAR(constants::PI, 3.14159265358979323846, 1e-15);
-
-    // Test PI/2
-    EXPECT_NEAR(constants::PI_2, constants::PI / 2.0, 1e-15);
-
-    // Test PI/4
-    EXPECT_NEAR(constants::PI_4, constants::PI / 4.0, 1e-15);
-
-    // Test 2*PI
-    EXPECT_NEAR(constants::TWO_PI, 2.0 * constants::PI, 1e-15);
-
-    // Test 1/PI
-    EXPECT_NEAR(constants::INV_PI, 1.0 / constants::PI, 1e-15);
-
-    // Test sqrt(PI)
-    EXPECT_NEAR(constants::SQRT_PI, std::sqrt(constants::PI), 1e-15);
-}
-
-TEST_F(MathConstantsTest, EulerConstant) {
-    // Test e (Euler's number)
-    EXPECT_NEAR(constants::E, std::exp(1.0), 1e-15);
-
-    // Test ln(2)
-    EXPECT_NEAR(constants::LN_2, std::log(2.0), 1e-15);
-
-    // Test ln(10)
-    EXPECT_NEAR(constants::LN_10, std::log(10.0), 1e-15);
-
-    // Test log10(e)
-    EXPECT_NEAR(constants::LOG10_E, std::log10(constants::E), 1e-15);
-
-    // Test log2(e)
-    EXPECT_NEAR(constants::LOG2_E, std::log2(constants::E), 1e-15);
-}
-
-TEST_F(MathConstantsTest, SquareRootConstants) {
-    // Test sqrt(2)
-    EXPECT_NEAR(constants::SQRT_2, std::sqrt(2.0), 1e-15);
-
-    // Test sqrt(3)
-    EXPECT_NEAR(constants::SQRT_3, std::sqrt(3.0), 1e-15);
-
-    // Test sqrt(5)
-    EXPECT_NEAR(constants::SQRT_5, std::sqrt(5.0), 1e-15);
-
-    // Test 1/sqrt(2)
-    EXPECT_NEAR(constants::INV_SQRT_2, 1.0 / std::sqrt(2.0), 1e-15);
-
-    // Test 1/sqrt(3)
-    EXPECT_NEAR(constants::INV_SQRT_3, 1.0 / std::sqrt(3.0), 1e-15);
-}
-
-TEST_F(MathConstantsTest, GoldenRatio) {
-    // Test golden ratio φ = (1 + sqrt(5))/2
-    EXPECT_NEAR(constants::PHI, (1.0 + std::sqrt(5.0)) / 2.0, 1e-15);
-
-    // Property: φ² = φ + 1
-    EXPECT_NEAR(constants::PHI * constants::PHI, constants::PHI + 1.0, 1e-14);
-
-    // Property: 1/φ = φ - 1
-    EXPECT_NEAR(1.0 / constants::PHI, constants::PHI - 1.0, 1e-14);
-}
-
-// =============================================================================
-// Angle Conversion Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, DegreesToRadians) {
-    // Test common conversions
-    EXPECT_NEAR(constants::deg_to_rad(0.0), 0.0, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(90.0), constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(180.0), constants::PI, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(270.0), 3.0 * constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(360.0), constants::TWO_PI, 1e-15);
-
-    // Test negative angles
-    EXPECT_NEAR(constants::deg_to_rad(-90.0), -constants::PI_2, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(-180.0), -constants::PI, 1e-15);
-
-    // Test arbitrary angle
-    EXPECT_NEAR(constants::deg_to_rad(45.0), constants::PI_4, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(30.0), constants::PI / 6.0, 1e-15);
-    EXPECT_NEAR(constants::deg_to_rad(60.0), constants::PI / 3.0, 1e-15);
-}
-
-TEST_F(MathConstantsTest, RadiansToDegrees) {
-    // Test common conversions
-    EXPECT_NEAR(constants::rad_to_deg(0.0), 0.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::PI_2), 90.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::PI), 180.0, 1e-13);
-    EXPECT_NEAR(constants::rad_to_deg(constants::TWO_PI), 360.0, 1e-13);
-
-    // Test negative angles
-    EXPECT_NEAR(constants::rad_to_deg(-constants::PI), -180.0, 1e-13);
-
-    // Test round-trip conversion
-    double angle_deg = 123.456;
-    double angle_rad = constants::deg_to_rad(angle_deg);
-    double back_to_deg = constants::rad_to_deg(angle_rad);
-    EXPECT_NEAR(back_to_deg, angle_deg, 1e-13);
-}
-
-// =============================================================================
-// Machine Precision Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, MachineEpsilon) {
-    // Test double precision epsilon
-    EXPECT_EQ(constants::EPSILON, std::numeric_limits<double>::epsilon());
-
-    // Test float precision epsilon
-    EXPECT_EQ(constants::EPSILON_F, std::numeric_limits<float>::epsilon());
-
-    // Verify epsilon is the smallest value such that 1.0 + epsilon != 1.0
-    double one_plus_eps = 1.0 + constants::EPSILON;
-    double one_plus_half_eps = 1.0 + constants::EPSILON / 2.0;
-
-    EXPECT_NE(one_plus_eps, 1.0);
-    EXPECT_EQ(one_plus_half_eps, 1.0);
-}
-
-TEST_F(MathConstantsTest, NumericalLimits) {
-    // Test infinity
-    EXPECT_TRUE(std::isinf(constants::INF_VALUE));
-    EXPECT_GT(constants::INF_VALUE, std::numeric_limits<double>::max());
-
-    // Test NaN
-    EXPECT_TRUE(std::isnan(constants::NOT_A_NUMBER));
-    EXPECT_NE(constants::NOT_A_NUMBER, constants::NOT_A_NUMBER);  // NaN != NaN
-
-    // Test max/min values
-    EXPECT_EQ(constants::MAX_DOUBLE, std::numeric_limits<double>::max());
-    EXPECT_EQ(constants::MIN_DOUBLE, std::numeric_limits<double>::min());
-    EXPECT_EQ(constants::LOWEST_DOUBLE, std::numeric_limits<double>::lowest());
-}
-
-// =============================================================================
-// Tolerance Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, DefaultTolerances) {
-    // Test default absolute tolerance
-    EXPECT_GT(constants::DEFAULT_TOLERANCE, 0.0);
-    EXPECT_LT(constants::DEFAULT_TOLERANCE, 1e-10);
-
-    // Test default relative tolerance
-    EXPECT_GT(constants::DEFAULT_REL_TOLERANCE, 0.0);
-    EXPECT_LT(constants::DEFAULT_REL_TOLERANCE, 1e-10);
-
-    // Test solver tolerance
-    EXPECT_GT(constants::SOLVER_TOLERANCE, 0.0);
-    EXPECT_LE(constants::SOLVER_TOLERANCE, constants::DEFAULT_TOLERANCE);
-
-    // Test geometry tolerance (typically larger)
-    EXPECT_GT(constants::GEOMETRY_TOLERANCE, 0.0);
-    EXPECT_GE(constants::GEOMETRY_TOLERANCE, constants::DEFAULT_TOLERANCE);
-}
-
-TEST_F(MathConstantsTest, ToleranceComparison) {
-    double a = 1.0;
-    double b = 1.0 + constants::DEFAULT_TOLERANCE / 2.0;
-    double c = 1.0 + constants::DEFAULT_TOLERANCE * 2.0;
-
-    // Values within tolerance should be considered equal
-    EXPECT_TRUE(constants::near(a, b, constants::DEFAULT_TOLERANCE));
-
-    // Values outside tolerance should not be equal
-    EXPECT_FALSE(constants::near(a, c, constants::DEFAULT_TOLERANCE));
-
-    // Test relative tolerance
-    double large_a = 1e10;
-    double large_b = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE / 2.0);
-    double large_c = large_a * (1.0 + constants::DEFAULT_REL_TOLERANCE * 2.0);
-
-    EXPECT_TRUE(constants::near_relative(large_a, large_b, constants::DEFAULT_REL_TOLERANCE));
-    EXPECT_FALSE(constants::near_relative(large_a, large_c, constants::DEFAULT_REL_TOLERANCE));
-}
-
-TEST_F(MathConstantsTest, ZeroComparison) {
-    // Test near zero detection
-    EXPECT_TRUE(constants::is_zero(0.0));
-    EXPECT_TRUE(constants::is_zero(constants::DEFAULT_TOLERANCE / 2.0));
-    EXPECT_FALSE(constants::is_zero(constants::DEFAULT_TOLERANCE * 2.0));
-
-    // Test with negative values
-    EXPECT_TRUE(constants::is_zero(-constants::DEFAULT_TOLERANCE / 2.0));
-    EXPECT_FALSE(constants::is_zero(-constants::DEFAULT_TOLERANCE * 2.0));
-}
-
-// =============================================================================
-// Physical Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, PhysicalConstants) {
-    // Test speed of light (m/s)
-    EXPECT_NEAR(constants::SPEED_OF_LIGHT, 299792458.0, 1.0);
-
-    // Test gravitational constant (m³/kg/s²)
-    EXPECT_NEAR(constants::GRAVITATIONAL_CONSTANT, 6.67430e-11, 1e-16);
-
-    // Test standard gravity (m/s²)
-    EXPECT_NEAR(constants::STANDARD_GRAVITY, 9.80665, 1e-10);
-
-    // Test Planck constant (J⋅s)
-    EXPECT_NEAR(constants::PLANCK_CONSTANT, 6.62607015e-34, 1e-42);
-
-    // Test Boltzmann constant (J/K)
-    EXPECT_NEAR(constants::BOLTZMANN_CONSTANT, 1.380649e-23, 1e-29);
-
-    // Test Avogadro's number (1/mol)
-    EXPECT_NEAR(constants::AVOGADRO_NUMBER, 6.02214076e23, 1e15);
-}
-
-// =============================================================================
-// Compile-Time Constants Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, CompileTimeConstants) {
-    // Test that constants are constexpr (compile-time)
-    constexpr double pi = constants::PI;
-    constexpr double e = constants::E;
-    constexpr double sqrt2 = constants::SQRT_2;
-
-    EXPECT_EQ(pi, constants::PI);
-    EXPECT_EQ(e, constants::E);
-    EXPECT_EQ(sqrt2, constants::SQRT_2);
-
-    // Test compile-time functions
-    constexpr double angle_rad = constants::deg_to_rad(90.0);
-    EXPECT_NEAR(angle_rad, constants::PI_2, 1e-15);
-
-    constexpr double angle_deg = constants::rad_to_deg(constants::PI);
-    EXPECT_NEAR(angle_deg, 180.0, 1e-13);
-}
-
-// =============================================================================
-// Type Traits Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, TypedConstants) {
-    // Test float versions
-    EXPECT_NEAR(constants::PI_F, static_cast<float>(constants::PI), 1e-7f);
-    EXPECT_NEAR(constants::E_F, static_cast<float>(constants::E), 1e-7f);
-    EXPECT_NEAR(constants::SQRT_2_F, static_cast<float>(constants::SQRT_2), 1e-7f);
-
-    // Test long double versions
-    EXPECT_NEAR(constants::PI_L, static_cast<long double>(constants::PI), 1e-18L);
-    EXPECT_NEAR(constants::E_L, static_cast<long double>(constants::E), 1e-18L);
-}
-
-// =============================================================================
-// Special Functions Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, SignFunction) {
-    // Test sign function
-    EXPECT_EQ(constants::sign(5.0), 1);
-    EXPECT_EQ(constants::sign(-5.0), -1);
-    EXPECT_EQ(constants::sign(0.0), 0);
-
-    // Test with very small values
-    EXPECT_EQ(constants::sign(constants::EPSILON), 1);
-    EXPECT_EQ(constants::sign(-constants::EPSILON), -1);
-
-    // Test with infinity
-    EXPECT_EQ(constants::sign(constants::INF_VALUE), 1);
-    EXPECT_EQ(constants::sign(-constants::INF_VALUE), -1);
-}
-
-TEST_F(MathConstantsTest, SafeDivision) {
-    // Test safe division
-    EXPECT_NEAR(constants::safe_divide(10.0, 2.0), 5.0, 1e-15);
-    EXPECT_NEAR(constants::safe_divide(1.0, 3.0), 1.0/3.0, 1e-15);
-
-    // Test division by zero returns default
-    EXPECT_EQ(constants::safe_divide(1.0, 0.0, 999.0), 999.0);
-    EXPECT_EQ(constants::safe_divide(1.0, constants::EPSILON/2.0, -1.0), -1.0);
-
-    // Test division by near-zero
-    double tiny = constants::DEFAULT_TOLERANCE / 10.0;
-    EXPECT_EQ(constants::safe_divide(1.0, tiny, 0.0), 0.0);
-}
-
-// =============================================================================
-// Utility Functions Tests
-// =============================================================================
-
-TEST_F(MathConstantsTest, ClampFunction) {
-    // Test clamping
-    EXPECT_EQ(constants::clamp(5.0, 0.0, 10.0), 5.0);
-    EXPECT_EQ(constants::clamp(-5.0, 0.0, 10.0), 0.0);
-    EXPECT_EQ(constants::clamp(15.0, 0.0, 10.0), 10.0);
-
-    // Test with same min/max
-    EXPECT_EQ(constants::clamp(5.0, 3.0, 3.0), 3.0);
-
-    // Test with infinity
-    EXPECT_EQ(constants::clamp(constants::INF_VALUE, 0.0, 10.0), 10.0);
-    EXPECT_EQ(constants::clamp(-constants::INF_VALUE, 0.0, 10.0), 0.0);
-}
-
-TEST_F(MathConstantsTest, LerpFunction) {
-    // Test linear interpolation
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.0), 0.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.0), 10.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.5), 5.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 0.25), 2.5, 1e-15);
-
-    // Test extrapolation
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, -0.5), -5.0, 1e-15);
-    EXPECT_NEAR(constants::lerp(0.0, 10.0, 1.5), 15.0, 1e-15);
-
-    // Test with negative range
-    EXPECT_NEAR(constants::lerp(-10.0, -5.0, 0.5), -7.5, 1e-15);
-}
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
index c186c26ee..3b2fe664a 100644
--- a/tests/unitTests/FE/Math/test_Matrix.cpp
+++ b/tests/unitTests/FE/Math/test_Matrix.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/Matrix.h"
 #include "FE/Math/Vector.h"
 #include "FE/Math/MatrixExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <thread>
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
index 9486f409c..b17bce928 100644
--- a/tests/unitTests/FE/Math/test_MatrixExpr.cpp
+++ b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
@@ -7,7 +7,6 @@
 #include "FE/Math/Matrix.h"
 #include "FE/Math/MatrixExpr.h"
 #include "FE/Math/Vector.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <memory>
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
index a38a71727..754ad819d 100644
--- a/tests/unitTests/FE/Math/test_Vector.cpp
+++ b/tests/unitTests/FE/Math/test_Vector.cpp
@@ -6,7 +6,6 @@
 #include <gtest/gtest.h>
 #include "FE/Math/Vector.h"
 #include "FE/Math/VectorExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <sstream>
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
index bd6d85d51..0e7363c64 100644
--- a/tests/unitTests/FE/Math/test_VectorExpr.cpp
+++ b/tests/unitTests/FE/Math/test_VectorExpr.cpp
@@ -6,7 +6,6 @@
 #include <gtest/gtest.h>
 #include "FE/Math/Vector.h"
 #include "FE/Math/VectorExpr.h"
-#include "FE/Math/MathConstants.h"
 #include <limits>
 #include <cmath>
 #include <memory>

From 2a97fa0466796913614d22a7e9f4c089e1a1d257 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 13:43:26 -0700
Subject: [PATCH 08/22] consolidating math support for integer functions and
 expression operations

---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  25 +-
 Code/Source/solver/FE/Math/ExpressionOps.h    |  99 ----
 Code/Source/solver/FE/Math/IntegerMath.h      |  98 ----
 Code/Source/solver/FE/Math/MatrixExpr.h       |   5 +-
 Code/Source/solver/FE/Math/VectorExpr.h       |  59 +-
 .../unitTests/FE/Math/test_ExpressionOps.cpp  | 508 ------------------
 6 files changed, 75 insertions(+), 719 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/ExpressionOps.h
 delete mode 100644 Code/Source/solver/FE/Math/IntegerMath.h
 delete mode 100644 tests/unitTests/FE/Math/test_ExpressionOps.cpp

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index e6395cee4..237f8c2ce 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -9,7 +9,6 @@
 #include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
 #include "Math/DenseLinearAlgebra.h"
-#include "Math/IntegerMath.h"
 
 #include <algorithm>
 #include <array>
@@ -21,8 +20,23 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
-using math::pow_int;
-
 namespace {
 using Vec3 = math::Vector<Real, 3>;
+
+// Integer power by repeated squaring (negative exponents give the reciprocal).
+// File-local replacement for the removed math::pow_int: the monomial exponents
+// are integers, so plain multiplication avoids a general std::pow call.
+[[nodiscard]] constexpr Real pow_int(Real base, int exponent) noexcept {
+    const bool invert = exponent < 0;
+    Real result = Real(1);
+    for (int power = invert ? -exponent : exponent; power > 0; power >>= 1) {
+        if ((power & 1) != 0) {
+            result *= base;
+        }
+        if (power > 1) {
+            base *= base;
+        }
+    }
+    return invert ? Real(1) / result : result;
+}
 
diff --git a/Code/Source/solver/FE/Math/ExpressionOps.h b/Code/Source/solver/FE/Math/ExpressionOps.h
deleted file mode 100644
index 96cea1037..000000000
--- a/Code/Source/solver/FE/Math/ExpressionOps.h
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef SVMP_FE_MATH_EXPRESSION_OPS_H
-#define SVMP_FE_MATH_EXPRESSION_OPS_H
-
-/**
- * @file ExpressionOps.h
- * @brief Common expression template operators for vector and matrix expressions
- *
- * This header provides shared operator functors used by both VectorExpr.h and
- * MatrixExpr.h to avoid code duplication and namespace conflicts. All operators
- * are defined in the detail::ops namespace for internal use by expression templates.
- */
-
-#include <cmath>
-
-namespace svmp {
-namespace FE {
-namespace math {
-namespace detail {
-namespace ops {
-
-/**
- * @brief Addition operator functor
- */
-struct Add {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a + b;
-    }
-};
-
-/**
- * @brief Subtraction operator functor
- */
-struct Sub {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a - b;
-    }
-};
-
-/**
- * @brief Multiplication operator functor
- */
-struct Mul {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a * b;
-    }
-};
-
-/**
- * @brief Division operator functor
- */
-struct Div {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a / b;
-    }
-};
-
-/**
- * @brief Negation operator functor
- */
-struct Negate {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        return -a;
-    }
-};
-
-/**
- * @brief Absolute value operator functor
- */
-struct Abs {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::abs;
-        return abs(a);
-    }
-};
-
-/**
- * @brief Square root operator functor
- */
-struct Sqrt {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::sqrt;
-        return sqrt(a);
-    }
-};
-
-} // namespace ops
-} // namespace detail
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_EXPRESSION_OPS_H
diff --git a/Code/Source/solver/FE/Math/IntegerMath.h b/Code/Source/solver/FE/Math/IntegerMath.h
deleted file mode 100644
index 52a50117f..000000000
--- a/Code/Source/solver/FE/Math/IntegerMath.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
-
-#ifndef SVMP_FE_MATH_INTEGERMATH_H
-#define SVMP_FE_MATH_INTEGERMATH_H
-
-#include "Types.h"
-
-#include <cstddef>
-#include <limits>
-#include <numeric>
-#include <stdexcept>
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-[[nodiscard]] constexpr Real pow_int_nonnegative(Real base, int exponent) noexcept {
-    Real result = Real(1);
-    Real factor = base;
-    int power = exponent;
-    while (power > 0) {
-        if ((power & 1) != 0) {
-            result *= factor;
-        }
-        power >>= 1;
-        if (power > 0) {
-            factor *= factor;
-        }
-    }
-    return result;
-}
-
-[[nodiscard]] constexpr Real pow_int(Real base, int exponent) noexcept {
-    if (exponent < 0) {
-        return Real(1) / pow_int_nonnegative(base, -exponent);
-    }
-    return pow_int_nonnegative(base, exponent);
-}
-
-[[nodiscard]] constexpr std::size_t binomial_size(int n, int k) {
-    if (n < 0 || k < 0 || k > n) {
-        return 0u;
-    }
-    if (k > n - k) {
-        k = n - k;
-    }
-
-    std::size_t result = 1u;
-    for (int i = 1; i <= k; ++i) {
-        auto numerator = static_cast<std::size_t>(n - (k - i));
-        auto denominator = static_cast<std::size_t>(i);
-
-        const auto numerator_gcd = std::gcd(numerator, denominator);
-        numerator /= numerator_gcd;
-        denominator /= numerator_gcd;
-
-        const auto result_gcd = std::gcd(result, denominator);
-        result /= result_gcd;
-        denominator /= result_gcd;
-        if (denominator != 1u) {
-            throw std::overflow_error(
-                "binomial_size: failed to reduce exact binomial factor");
-        }
-        if (numerator != 0u &&
-            result > std::numeric_limits<std::size_t>::max() / numerator) {
-            throw std::overflow_error("binomial_size: result does not fit in size_t");
-        }
-        result *= numerator;
-    }
-    return result;
-}
-
-[[nodiscard]] constexpr Real binomial_real(int n, int k) noexcept {
-    if (k < 0 || k > n) {
-        return Real(0);
-    }
-    if (k > n - k) {
-        k = n - k;
-    }
-
-    Real result = Real(1);
-    for (int i = 1; i <= k; ++i) {
-        result *= static_cast<Real>(n - (k - i));
-        result /= static_cast<Real>(i);
-    }
-    return result;
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_INTEGERMATH_H
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index 097f35361..13010bddf 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -11,10 +11,11 @@
  */
 
 #include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <type_traits>
-#include <cmath>
-#include "ExpressionOps.h"
+
+#include "VectorExpr.h"
 
 namespace svmp {
 namespace FE {
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 627d2fd88..178b66b8a 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -10,14 +10,79 @@
  * of assignment, eliminating intermediate allocations and improving performance.
  */
 
+#include <cmath>
 #include <cstddef>
 #include <type_traits>
-#include <cmath>
-#include "ExpressionOps.h"
 
 namespace svmp {
 namespace FE {
 namespace math {
+namespace detail {
+namespace ops {
+
+// Operator functors shared by the vector and matrix expression templates.
+// MatrixExpr.h picks them up by including this header.
+
+/** @brief Addition functor: returns a + b. */
+struct Add {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a + b;
+    }
+};
+
+/** @brief Subtraction functor: returns a - b. */
+struct Sub {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a - b;
+    }
+};
+
+/** @brief Multiplication functor: returns a * b. */
+struct Mul {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a * b;
+    }
+};
+
+/** @brief Division functor: returns a / b. */
+struct Div {
+    template<typename T1, typename T2>
+    constexpr auto operator()(const T1& a, const T2& b) const {
+        return a / b;
+    }
+};
+
+/** @brief Negation functor: returns -a. */
+struct Negate {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        return -a;
+    }
+};
+
+/** @brief Absolute-value functor: unqualified abs(a) with std::abs in scope. */
+struct Abs {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::abs;
+        return abs(a);
+    }
+};
+
+/** @brief Square-root functor: unqualified sqrt(a) with std::sqrt in scope. */
+struct Sqrt {
+    template<typename T>
+    constexpr auto operator()(const T& a) const {
+        using std::sqrt;
+        return sqrt(a);
+    }
+};
+
+} // namespace ops
+} // namespace detail
 
 /**
  * @brief Base class for all vector expressions using CRTP
diff --git a/tests/unitTests/FE/Math/test_ExpressionOps.cpp b/tests/unitTests/FE/Math/test_ExpressionOps.cpp
deleted file mode 100644
index a368e345e..000000000
--- a/tests/unitTests/FE/Math/test_ExpressionOps.cpp
+++ /dev/null
@@ -1,508 +0,0 @@
-/**
- * @file test_ExpressionOps.cpp
- * @brief Unit tests for ExpressionOps.h - expression template operators
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/ExpressionOps.h"
-#include "FE/Math/Vector.h"
-#include "FE/Math/Matrix.h"
-#include <limits>
-#include <cmath>
-#include <complex>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-using namespace svmp::FE::math::detail::ops;
-
-// Test fixture for ExpressionOps tests
-class ExpressionOpsTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Binary Operation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, AddOperator) {
-    Add op;
-
-    // Integer addition
-    EXPECT_EQ(op(5, 3), 8);
-    EXPECT_EQ(op(-5, 3), -2);
-    EXPECT_EQ(op(-5, -3), -8);
-
-    // Floating point addition
-    EXPECT_DOUBLE_EQ(op(3.14, 2.86), 6.0);
-    EXPECT_DOUBLE_EQ(op(-1.5, 2.5), 1.0);
-
-    // Mixed types
-    auto result = op(3, 2.5);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 5.5);
-}
-
-TEST_F(ExpressionOpsTest, SubOperator) {
-    Sub op;
-
-    // Integer subtraction
-    EXPECT_EQ(op(5, 3), 2);
-    EXPECT_EQ(op(3, 5), -2);
-    EXPECT_EQ(op(-5, -3), -2);
-
-    // Floating point subtraction
-    EXPECT_DOUBLE_EQ(op(5.5, 2.5), 3.0);
-    EXPECT_DOUBLE_EQ(op(2.5, 5.5), -3.0);
-
-    // Mixed types
-    auto result = op(5.5, 2);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 3.5);
-}
-
-TEST_F(ExpressionOpsTest, MulOperator) {
-    Mul op;
-
-    // Integer multiplication
-    EXPECT_EQ(op(5, 3), 15);
-    EXPECT_EQ(op(-5, 3), -15);
-    EXPECT_EQ(op(-5, -3), 15);
-
-    // Floating point multiplication
-    EXPECT_DOUBLE_EQ(op(2.5, 4.0), 10.0);
-    EXPECT_DOUBLE_EQ(op(-2.5, 4.0), -10.0);
-
-    // Zero multiplication
-    EXPECT_EQ(op(0, 100), 0);
-    EXPECT_DOUBLE_EQ(op(0.0, 3.14), 0.0);
-
-    // Mixed types
-    auto result = op(3, 2.5);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 7.5);
-}
-
-TEST_F(ExpressionOpsTest, DivOperator) {
-    Div op;
-
-    // Integer division
-    EXPECT_EQ(op(10, 2), 5);
-    EXPECT_EQ(op(10, 3), 3);  // Integer division truncates
-    EXPECT_EQ(op(-10, 2), -5);
-
-    // Floating point division
-    EXPECT_DOUBLE_EQ(op(10.0, 2.0), 5.0);
-    EXPECT_DOUBLE_EQ(op(10.0, 3.0), 10.0/3.0);
-    EXPECT_DOUBLE_EQ(op(-10.0, 2.0), -5.0);
-
-    // Mixed types
-    auto result = op(10.0, 3);
-    EXPECT_TRUE((std::is_same_v<decltype(result), double>));
-    EXPECT_DOUBLE_EQ(result, 10.0/3.0);
-}
-
-// =============================================================================
-// Unary Operation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, NegateOperator) {
-    Negate op;
-
-    // Integer negation
-    EXPECT_EQ(op(5), -5);
-    EXPECT_EQ(op(-5), 5);
-    EXPECT_EQ(op(0), 0);
-
-    // Floating point negation
-    EXPECT_DOUBLE_EQ(op(3.14), -3.14);
-    EXPECT_DOUBLE_EQ(op(-2.71), 2.71);
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-
-    // Type preservation
-    auto int_result = op(5);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    auto double_result = op(5.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-}
-
-TEST_F(ExpressionOpsTest, AbsOperator) {
-    Abs op;
-
-    // Integer absolute value
-    EXPECT_EQ(op(5), 5);
-    EXPECT_EQ(op(-5), 5);
-    EXPECT_EQ(op(0), 0);
-
-    // Floating point absolute value
-    EXPECT_DOUBLE_EQ(op(3.14), 3.14);
-    EXPECT_DOUBLE_EQ(op(-3.14), 3.14);
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-
-    // Special cases
-    EXPECT_DOUBLE_EQ(op(-0.0), 0.0);
-
-    // Type preservation
-    auto int_result = op(-5);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    auto double_result = op(-5.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-}
-
-TEST_F(ExpressionOpsTest, SqrtOperator) {
-    Sqrt op;
-
-    // Perfect squares
-    EXPECT_DOUBLE_EQ(op(4.0), 2.0);
-    EXPECT_DOUBLE_EQ(op(9.0), 3.0);
-    EXPECT_DOUBLE_EQ(op(16.0), 4.0);
-    EXPECT_DOUBLE_EQ(op(25.0), 5.0);
-
-    // Non-perfect squares
-    EXPECT_DOUBLE_EQ(op(2.0), std::sqrt(2.0));
-    EXPECT_DOUBLE_EQ(op(3.0), std::sqrt(3.0));
-
-    // Special cases
-    EXPECT_DOUBLE_EQ(op(0.0), 0.0);
-    EXPECT_DOUBLE_EQ(op(1.0), 1.0);
-
-    // Type conversion
-    auto result = op(4);  // Integer input
-    EXPECT_DOUBLE_EQ(result, 2.0);
-}
-
-// =============================================================================
-// Constexpr Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ConstexprOperators) {
-    // Test that operators can be used in constexpr contexts
-    constexpr Add add_op;
-    constexpr Sub sub_op;
-    constexpr Mul mul_op;
-    constexpr Div div_op;
-    constexpr Negate neg_op;
-
-    // Compile-time evaluation
-    constexpr auto sum = add_op(3, 4);
-    constexpr auto diff = sub_op(7, 3);
-    constexpr auto prod = mul_op(3, 4);
-    constexpr auto quot = div_op(12, 3);
-    constexpr auto neg = neg_op(5);
-
-    EXPECT_EQ(sum, 7);
-    EXPECT_EQ(diff, 4);
-    EXPECT_EQ(prod, 12);
-    EXPECT_EQ(quot, 4);
-    EXPECT_EQ(neg, -5);
-
-    // Static assertions to verify compile-time evaluation
-    static_assert(add_op(2, 3) == 5);
-    static_assert(sub_op(5, 2) == 3);
-    static_assert(mul_op(3, 4) == 12);
-    static_assert(div_op(10, 2) == 5);
-    static_assert(neg_op(3) == -3);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, TypeDeduction) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-
-    // int + int -> int
-    auto int_result = add_op(3, 4);
-    EXPECT_TRUE((std::is_same_v<decltype(int_result), int>));
-
-    // double + double -> double
-    auto double_result = add_op(3.0, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(double_result), double>));
-
-    // int + double -> double
-    auto mixed_result1 = add_op(3, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(mixed_result1), double>));
-
-    // double + int -> double
-    auto mixed_result2 = add_op(3.0, 4);
-    EXPECT_TRUE((std::is_same_v<decltype(mixed_result2), double>));
-
-    // float + double -> double
-    auto float_double_result = add_op(3.0f, 4.0);
-    EXPECT_TRUE((std::is_same_v<decltype(float_double_result), double>));
-}
-
-// =============================================================================
-// Complex Expression Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ChainedOperations) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Negate neg_op;
-
-    // Simulate complex expression: -(a + b) * c / d
-    double a = 2.0, b = 3.0, c = 4.0, d = 2.0;
-
-    auto sum = add_op(a, b);       // 5.0
-    auto negated = neg_op(sum);    // -5.0
-    auto product = mul_op(negated, c);  // -20.0
-    auto result = div_op(product, d);   // -10.0
-
-    EXPECT_DOUBLE_EQ(result, -10.0);
-}
-
-TEST_F(ExpressionOpsTest, MixedPrecisionChain) {
-    Add add_op;
-    Mul mul_op;
-
-    // Mixed precision chain
-    int a = 2;
-    float b = 3.5f;
-    double c = 1.5;
-
-    auto step1 = add_op(a, b);    // int + float -> float (5.5f)
-    auto step2 = mul_op(step1, c); // float + double -> double (8.25)
-
-    EXPECT_TRUE((std::is_same_v<decltype(step2), double>));
-    EXPECT_DOUBLE_EQ(step2, 8.25);
-}
-
-// =============================================================================
-// Operator Integration with Vector/Matrix Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, VectorIntegration) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2{4.0, 5.0, 6.0};
-
-    // Test that operators work correctly in vector expressions
-    Vector<double, 3> sum = v1 + v2;
-    Vector<double, 3> diff = v1 - v2;
-    Vector<double, 3> neg = -v1;
-    Vector<double, 3> scaled = v1 * 2.0;
-
-    EXPECT_DOUBLE_EQ(sum[0], 5.0);
-    EXPECT_DOUBLE_EQ(sum[1], 7.0);
-    EXPECT_DOUBLE_EQ(sum[2], 9.0);
-
-    EXPECT_DOUBLE_EQ(diff[0], -3.0);
-    EXPECT_DOUBLE_EQ(diff[1], -3.0);
-    EXPECT_DOUBLE_EQ(diff[2], -3.0);
-
-    EXPECT_DOUBLE_EQ(neg[0], -1.0);
-    EXPECT_DOUBLE_EQ(neg[1], -2.0);
-    EXPECT_DOUBLE_EQ(neg[2], -3.0);
-
-    EXPECT_DOUBLE_EQ(scaled[0], 2.0);
-    EXPECT_DOUBLE_EQ(scaled[1], 4.0);
-    EXPECT_DOUBLE_EQ(scaled[2], 6.0);
-}
-
-TEST_F(ExpressionOpsTest, MatrixIntegration) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> m2{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Test that operators work correctly in matrix expressions
-    Matrix<double, 2, 2> sum = m1 + m2;
-    Matrix<double, 2, 2> diff = m1 - m2;
-    Matrix<double, 2, 2> neg = -m1;
-    Matrix<double, 2, 2> scaled = m1 * 2.0;
-
-    EXPECT_DOUBLE_EQ(sum(0, 0), 6.0);
-    EXPECT_DOUBLE_EQ(sum(0, 1), 8.0);
-    EXPECT_DOUBLE_EQ(sum(1, 0), 10.0);
-    EXPECT_DOUBLE_EQ(sum(1, 1), 12.0);
-
-    EXPECT_DOUBLE_EQ(diff(0, 0), -4.0);
-    EXPECT_DOUBLE_EQ(diff(0, 1), -4.0);
-    EXPECT_DOUBLE_EQ(diff(1, 0), -4.0);
-    EXPECT_DOUBLE_EQ(diff(1, 1), -4.0);
-
-    EXPECT_DOUBLE_EQ(neg(0, 0), -1.0);
-    EXPECT_DOUBLE_EQ(neg(0, 1), -2.0);
-    EXPECT_DOUBLE_EQ(neg(1, 0), -3.0);
-    EXPECT_DOUBLE_EQ(neg(1, 1), -4.0);
-
-    EXPECT_DOUBLE_EQ(scaled(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(scaled(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(scaled(1, 0), 6.0);
-    EXPECT_DOUBLE_EQ(scaled(1, 1), 8.0);
-}
-
-// =============================================================================
-// Edge Cases and Special Values Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, SpecialFloatingPointValues) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Abs abs_op;
-    Negate neg_op;
-
-    // Infinity handling
-    double inf = std::numeric_limits<double>::infinity();
-    EXPECT_DOUBLE_EQ(add_op(inf, 1.0), inf);
-    EXPECT_DOUBLE_EQ(sub_op(inf, 1.0), inf);
-    EXPECT_DOUBLE_EQ(mul_op(inf, 2.0), inf);
-    EXPECT_DOUBLE_EQ(div_op(inf, 2.0), inf);
-    EXPECT_DOUBLE_EQ(abs_op(inf), inf);
-    EXPECT_DOUBLE_EQ(neg_op(inf), -inf);
-
-    // NaN handling
-    double nan = std::numeric_limits<double>::quiet_NaN();
-    EXPECT_TRUE(std::isnan(add_op(nan, 1.0)));
-    EXPECT_TRUE(std::isnan(sub_op(nan, 1.0)));
-    EXPECT_TRUE(std::isnan(mul_op(nan, 2.0)));
-    EXPECT_TRUE(std::isnan(div_op(nan, 2.0)));
-    EXPECT_TRUE(std::isnan(abs_op(nan)));
-    EXPECT_TRUE(std::isnan(neg_op(nan)));
-
-    // Division by zero
-    EXPECT_DOUBLE_EQ(div_op(1.0, 0.0), inf);
-    EXPECT_DOUBLE_EQ(div_op(-1.0, 0.0), -inf);
-    EXPECT_TRUE(std::isnan(div_op(0.0, 0.0)));
-}
-
-TEST_F(ExpressionOpsTest, LargeAndSmallValues) {
-    Add add_op;
-    Mul mul_op;
-
-    // Large values
-    double large = 1e308;
-    double result = add_op(large, large);
-    EXPECT_TRUE(std::isinf(result));  // Overflow to infinity
-
-    // Small values
-    double tiny = std::numeric_limits<double>::min();
-    double tiny_result = mul_op(tiny, 0.5);
-    EXPECT_GT(tiny_result, 0.0);  // Should still be positive
-    EXPECT_LT(tiny_result, tiny);  // But smaller
-
-    // Denormalized numbers
-    double denorm = std::numeric_limits<double>::denorm_min();
-    double denorm_result = add_op(denorm, denorm);
-    EXPECT_EQ(denorm_result, 2.0 * denorm);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Constraint Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, SFINAECompatibility) {
-    // Test that operators work with any arithmetic types
-    Add add_op;
-
-    // Various integer types
-    EXPECT_EQ(add_op(int8_t(3), int8_t(4)), 7);
-    EXPECT_EQ(add_op(int16_t(100), int16_t(200)), 300);
-    EXPECT_EQ(add_op(int32_t(1000), int32_t(2000)), 3000);
-    EXPECT_EQ(add_op(int64_t(10000), int64_t(20000)), 30000);
-
-    // Unsigned types
-    EXPECT_EQ(add_op(uint8_t(3), uint8_t(4)), 7u);
-    EXPECT_EQ(add_op(uint16_t(100), uint16_t(200)), 300u);
-    EXPECT_EQ(add_op(uint32_t(1000), uint32_t(2000)), 3000u);
-
-    // Floating point types
-    EXPECT_FLOAT_EQ(add_op(3.0f, 4.0f), 7.0f);
-    EXPECT_DOUBLE_EQ(add_op(3.0, 4.0), 7.0);
-
-    // Long double
-    long double ld1 = 3.0L;
-    long double ld2 = 4.0L;
-    EXPECT_DOUBLE_EQ(add_op(ld1, ld2), 7.0L);
-}
-
-// =============================================================================
-// Template Instantiation Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, TemplateInstantiations) {
-    // Test that operators can be instantiated with various types
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Abs abs_op;
-    Sqrt sqrt_op;
-    Negate neg_op;
-
-    // Custom types that support arithmetic operations
-    struct CustomNumber {
-        double value;
-        CustomNumber(double v) : value(v) {}
-        CustomNumber operator+(const CustomNumber& other) const { return CustomNumber(value + other.value); }
-        CustomNumber operator-(const CustomNumber& other) const { return CustomNumber(value - other.value); }
-        CustomNumber operator*(const CustomNumber& other) const { return CustomNumber(value * other.value); }
-        CustomNumber operator/(const CustomNumber& other) const { return CustomNumber(value / other.value); }
-        CustomNumber operator-() const { return CustomNumber(-value); }
-        bool operator==(const CustomNumber& other) const { return value == other.value; }
-    };
-
-    CustomNumber cn1(3.0);
-    CustomNumber cn2(4.0);
-
-    auto cn_sum = add_op(cn1, cn2);
-    EXPECT_EQ(cn_sum.value, 7.0);
-
-    auto cn_diff = sub_op(cn1, cn2);
-    EXPECT_EQ(cn_diff.value, -1.0);
-
-    auto cn_prod = mul_op(cn1, cn2);
-    EXPECT_EQ(cn_prod.value, 12.0);
-
-    auto cn_quot = div_op(cn1, cn2);
-    EXPECT_EQ(cn_quot.value, 0.75);
-
-    auto cn_neg = neg_op(cn1);
-    EXPECT_EQ(cn_neg.value, -3.0);
-}
-
-// =============================================================================
-// Complex Number Support Tests
-// =============================================================================
-
-TEST_F(ExpressionOpsTest, ComplexNumberSupport) {
-    Add add_op;
-    Sub sub_op;
-    Mul mul_op;
-    Div div_op;
-    Negate neg_op;
-
-    std::complex<double> c1(3.0, 4.0);
-    std::complex<double> c2(1.0, 2.0);
-
-    auto c_sum = add_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_sum.real(), 4.0);
-    EXPECT_DOUBLE_EQ(c_sum.imag(), 6.0);
-
-    auto c_diff = sub_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_diff.real(), 2.0);
-    EXPECT_DOUBLE_EQ(c_diff.imag(), 2.0);
-
-    auto c_prod = mul_op(c1, c2);
-    EXPECT_DOUBLE_EQ(c_prod.real(), -5.0);  // (3+4i)(1+2i) = 3+6i+4i+8i² = 3+10i-8 = -5+10i
-    EXPECT_DOUBLE_EQ(c_prod.imag(), 10.0);
-
-    auto c_neg = neg_op(c1);
-    EXPECT_DOUBLE_EQ(c_neg.real(), -3.0);
-    EXPECT_DOUBLE_EQ(c_neg.imag(), -4.0);
-}

From 7f2e0202de0896246f4a88b4d42ec38e60b72b3a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 14:16:50 -0700
Subject: [PATCH 09/22] removing the previous basis functions so that we are
 not maintaining two basis function infrastructures

---
 Code/Source/solver/README.md       |    2 +-
 Code/Source/solver/nn.cpp          |  256 +----
 Code/Source/solver/nn_elem_gnn.h   | 1586 ----------------------------
 Code/Source/solver/nn_elem_gnnxx.h |  139 ---
 4 files changed, 32 insertions(+), 1951 deletions(-)
 delete mode 100644 Code/Source/solver/nn_elem_gnn.h
 delete mode 100644 Code/Source/solver/nn_elem_gnnxx.h

diff --git a/Code/Source/solver/README.md b/Code/Source/solver/README.md
index 252999e8f..d11378e35 100644
--- a/Code/Source/solver/README.md
+++ b/Code/Source/solver/README.md
@@ -601,7 +601,7 @@ A map type used to set element properties.
 
 Computes shape functions and derivatives at given natural coords.
 
-- `set_face_shape_data[face.eType](gaus_pt, face)`
+- FE Basis face evaluation for supported mapped face elements.
 
 
 <!-- ============= -->
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index a9e0aebc3..1ec9984b6 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -1,7 +1,8 @@
 // SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
 // SPDX-License-Identifier: BSD-3-Clause
 
-// The functions defined here replicate the Fortran functions defined in NN.f.
+// Solver-facing element setup, Gauss integration, FE Basis evaluation, and
+// shape-function bounds.
 //
 // The functions are used to 
 //
@@ -25,13 +26,8 @@
 
 #include "lapack_defs.h"
 
-#include <algorithm>
 #include <array>
-#include <cstdlib>
-#include <cctype>
-#include <exception>
 #include <functional>
-#include <iostream> 
 #include <math.h> 
 #include <memory>
 #include <optional>
@@ -51,12 +47,6 @@ using namespace consts;
 // Define maps used to set element Gauss integration data. 
 #include "nn_elem_gip.h"
 
-// Define maps used to set element shape function data. 
-#include "nn_elem_gnn.h"
-
-// Define maps used to get element shape function 2nd derivative data. 
-#include "nn_elem_gnnxx.h"
-
 // Define a map type used to set the bounds of element shape functions.
 #include "nn_elem_nn_bnds.h"
 
@@ -71,77 +61,6 @@ struct BasisSelection {
   int order;
 };
 
-enum class BasisMode {
-  Auto,
-  Legacy,
-  Fe
-};
-
-std::string normalize_basis_mode_name(std::string value)
-{
-  std::transform(value.begin(), value.end(), value.begin(),
-      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-  return value;
-}
-
-BasisMode parse_basis_mode()
-{
-  const char* mode_env = std::getenv("SVMP_BASIS_MODE");
-  if (mode_env == nullptr || *mode_env == '\0') {
-    return BasisMode::Auto;
-  }
-
-  const std::string mode = normalize_basis_mode_name(mode_env);
-  if (mode == "auto") {
-    return BasisMode::Auto;
-  }
-  if (mode == "legacy") {
-    return BasisMode::Legacy;
-  }
-  if (mode == "fe") {
-    return BasisMode::Fe;
-  }
-
-  throw febasis::BasisConfigurationException(
-      "Invalid SVMP_BASIS_MODE='" + std::string(mode_env) +
-          "'. Expected one of: auto, legacy, fe",
-      __FILE__, __LINE__, __func__);
-}
-
-BasisMode active_basis_mode()
-{
-  static const BasisMode mode = parse_basis_mode();
-  return mode;
-}
-
-const char* basis_mode_name(BasisMode mode)
-{
-  switch (mode) {
-    case BasisMode::Auto:
-      return "auto";
-    case BasisMode::Legacy:
-      return "legacy";
-    case BasisMode::Fe:
-      return "fe";
-  }
-  return "unknown";
-}
-
-void log_basis_mode_once()
-{
-  static const bool logged = []() {
-    std::cout << "[svMultiPhysics] SVMP_BASIS_MODE="
-              << basis_mode_name(active_basis_mode()) << std::endl;
-    return true;
-  }();
-  (void)logged;
-}
-
-bool basis_mode_allows_fe_adapter()
-{
-  return active_basis_mode() != BasisMode::Legacy;
-}
-
 std::string solver_element_name(consts::ElementType eType)
 {
   auto it = consts::element_type_to_string.find(eType);
@@ -178,15 +97,11 @@ std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
 
 bool use_basis_adapter_for(consts::ElementType eType)
 {
-  return basis_mode_allows_fe_adapter() && to_basis_selection(eType).has_value();
+  return to_basis_selection(eType).has_value();
 }
 
 bool supports_face_basis_adapter_for(consts::ElementType eType)
 {
-  if (!basis_mode_allows_fe_adapter()) {
-    return false;
-  }
-
   switch (eType) {
     case consts::ElementType::LIN1:
     case consts::ElementType::LIN2:
@@ -464,73 +379,20 @@ void evaluate_basis_hessians(const int insd,
   copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis->dimension(), hessians, Nxx);
 }
 
-void call_legacy_get_gnn(const int insd,
-                         consts::ElementType eType,
-                         const int eNoN,
-                         const int g,
-                         Array<double>& xi,
-                         Array<double>& N,
-                         Array3<double>& Nx,
-                         const std::string& basis_failure = "")
+void set_point_face_shape_data(const int gaus_pt, faceType& face)
 {
-  try {
-    get_element_shape_data[eType](insd, eNoN, g, xi, N, Nx);
-  } catch (const std::bad_function_call&) {
-    std::string message = "[get_gnn] No FE Basis or legacy shape support for element " +
-        solver_element_name(eType) + "; legacy fallback was attempted";
-    if (!basis_failure.empty()) {
-      message += " after FE Basis failure: " + basis_failure;
+  face.N(0, gaus_pt) = 1.0;
+  for (int row = 0; row < face.Nx.nrows(); ++row) {
+    for (int col = 0; col < face.Nx.ncols(); ++col) {
+      face.Nx(row, col, gaus_pt) = 0.0;
     }
-    throw fe::InvalidElementException(message, solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
   }
 }
 
-void call_legacy_get_gn_nxx(const int insd,
-                            const int ind2,
-                            consts::ElementType eType,
-                            const int eNoN,
-                            const int gaus_pt,
-                            const Array<double>& xi,
-                            Array3<double>& Nxx,
-                            const std::string& basis_failure = "",
-                            const bool allow_missing_legacy_table = false)
-{
-  try {
-    get_element_2nd_derivs[eType](insd, ind2, eNoN, gaus_pt, xi, Nxx);
-  } catch (const std::bad_function_call&) {
-    if (allow_missing_legacy_table) {
-      return;
-    }
-
-    std::string message = "[get_gn_nxx] No FE Basis or legacy second-derivative support for element " +
-        solver_element_name(eType) + "; legacy fallback was attempted";
-    if (!basis_failure.empty()) {
-      message += " after FE Basis failure: " + basis_failure;
-    }
-    throw fe::InvalidElementException(message, solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
-  }
-}
-
-void call_legacy_face_shape_data(const int gaus_pt, faceType& face)
-{
-  auto legacy_shape = set_face_shape_data.find(face.eType);
-  if (legacy_shape == set_face_shape_data.end()) {
-    throw fe::InvalidElementException(
-        "[get_gnn(face)] No FE Basis or legacy face shape support",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
-  }
-
-  legacy_shape->second(gaus_pt, face);
-}
-
 } // namespace
 
 void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<double>& w, Array<double>& xi) 
 {
-  log_basis_mode_once();
-
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
@@ -546,8 +408,6 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
 //
 void get_gip(mshType& mesh)
 {
-  log_basis_mode_once();
-
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
@@ -559,8 +419,6 @@ void get_gip(mshType& mesh)
 
 void get_gip(Simulation* simulation, faceType& face)
 {
-  log_basis_mode_once();
-
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
@@ -575,30 +433,16 @@ void get_gip(Simulation* simulation, faceType& face)
 void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const int g, Array<double>& xi, 
     Array<double>& N, Array3<double>& Nx)
 {
-  log_basis_mode_once();
-
-  if (use_basis_adapter_for(eType)) {
-    try {
-      evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
-      return;
-    } catch (const fe::NotImplementedException& exception) {
-      call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx, exception.what());
-      return;
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gnn] FE Basis adapter failed for element " +
-              solver_element_name(eType) +
-              "; legacy fallback was not attempted for this approved element: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (!use_basis_adapter_for(eType)) {
+    throw febasis::BasisElementCompatibilityException(
+        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
   }
 
-  call_legacy_get_gnn(insd, eType, eNoN, g, xi, N, Nx);
+  evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
 }
 
-/// @brief A big fat hack because the Fortran GETNN() operates on primitive types but
-/// the C++ version does not, uses Array and Vector objects.
+/// @brief Adapter overload for vector-style callers.
 //
 void get_gnn(const int nsd, consts::ElementType eType, const int eNoN, Vector<double>& xi, 
     Vector<double>& N, Array<double>& Nx)
@@ -625,86 +469,48 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  log_basis_mode_once();
-
-  if (active_basis_mode() == BasisMode::Legacy) {
-    call_legacy_face_shape_data(gaus_pt, face);
-    return;
-  }
-
   if (face.eType == ElementType::NRB) {
     throw fe::NotImplementedException(
-        "[get_gnn(face)] NRB face shape functions remain unsupported by FE Basis and the legacy face table",
+        "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis",
         __FILE__, __LINE__, __func__);
   }
 
-  if (supports_face_basis_adapter_for(face.eType)) {
-    try {
-      // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
-      evaluate_face_basis_values_and_gradients(gaus_pt, face);
-      return;
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gnn(face)] FE Basis face adapter failed for mapped face element " +
-              solver_element_name(face.eType) + "; legacy fallback was not attempted: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (face.eType == ElementType::PNT) {
+    set_point_face_shape_data(gaus_pt, face);
+    return;
   }
 
-  if (face.eType == ElementType::PNT) {
-    // Point faces have no mapped FE Basis representation in this pass; keep the legacy scalar value path.
-    call_legacy_face_shape_data(gaus_pt, face);
+  if (supports_face_basis_adapter_for(face.eType)) {
+    // FE Basis owns mapped face N/Nx formulas; faceType remains the solver-facing storage contract.
+    evaluate_face_basis_values_and_gradients(gaus_pt, face);
     return;
   }
 
-  // The legacy face table is retained only for explicitly unsupported paths and future cleanup.
-  call_legacy_face_shape_data(gaus_pt, face);
+  throw febasis::BasisElementCompatibilityException(
+      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType),
+      __FILE__, __LINE__, __func__);
 }
 
-/// @brief Returns second order derivatives at given natural coords
-///
-/// Replicates 'SUBROUTINE GETGNNxx(insd, ind2, eType, eNoN, xi, Nxx)'.
+/// @brief Returns second-order derivatives at the given natural coordinates.
 //
 void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const int eNoN, const int gaus_pt, 
     const Array<double>& xi, Array3<double>& Nxx)
 {
   using namespace consts;
 
-  log_basis_mode_once();
-
   // NRB/PNT and face-only Hessian paths remain intentionally unsupported here.
   if (eType == ElementType::NRB || eType == ElementType::PNT) {
     return;
   }
 
-  if (active_basis_mode() == BasisMode::Legacy) {
-    call_legacy_get_gn_nxx(
-        insd, ind2, eType, eNoN, gaus_pt, xi, Nxx, "", true);
-    return;
-  }
-
-  if (use_basis_adapter_for(eType)) {
-    try {
-      evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
-      return;
-    } catch (const fe::NotImplementedException& exception) {
-      throw fe::NotImplementedException(
-          "[get_gn_nxx] FE Basis Hessian support is required for mapped volume element " +
-              solver_element_name(eType) + " but is not implemented: " + exception.what(),
-          __FILE__, __LINE__, __func__);
-    } catch (const std::exception& exception) {
-      throw febasis::BasisEvaluationException(
-          "[get_gn_nxx] FE Basis Hessian adapter failed for element " +
-              solver_element_name(eType) +
-              "; legacy fallback was not attempted for this approved element: " +
-              exception.what(),
-          __FILE__, __LINE__, __func__);
-    }
+  if (!use_basis_adapter_for(eType)) {
+    throw febasis::BasisElementCompatibilityException(
+        "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
+            solver_element_name(eType),
+        __FILE__, __LINE__, __func__);
   }
 
-  // Legacy Hessian tables are reserved for intentionally unsupported families.
-  call_legacy_get_gn_nxx(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
+  evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
 }
 
 /// @brief Sets bounds on Gauss integration points in parametric space and
diff --git a/Code/Source/solver/nn_elem_gnn.h b/Code/Source/solver/nn_elem_gnn.h
deleted file mode 100644
index 33564d45b..000000000
--- a/Code/Source/solver/nn_elem_gnn.h
+++ /dev/null
@@ -1,1586 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-/// @brief Define a map type used to set element shape function data.
-///
-/// Reproduces the Fortran 'GETGNN' subroutine.
-//
-using GetElementShapeMapType = std::map<ElementType, std::function<void(const int, const int, const int, 
-    Array<double>&, Array<double>&, Array3<double>&)>>;
-
-GetElementShapeMapType get_element_shape_data = {
-
-  {ElementType::HEX8, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    N(0,g) = lx*ly*lz/8.0;
-    N(1,g) = ux*ly*lz/8.0;
-    N(2,g) = ux*uy*lz/8.0;
-    N(3,g) = lx*uy*lz/8.0;
-    N(4,g) = lx*ly*uz/8.0;
-    N(5,g) = ux*ly*uz/8.0;
-    N(6,g) = ux*uy*uz/8.0;
-    N(7,g) = lx*uy*uz/8.0;
-
-    Nx(0,0,g) = -ly*lz/8.0;
-    Nx(1,0,g) = -lx*lz/8.0;
-    Nx(2,0,g) = -lx*ly/8.0;
-
-    Nx(0,1,g) =  ly*lz/8.0;
-    Nx(1,1,g) = -ux*lz/8.0;
-    Nx(2,1,g) = -ux*ly/8.0;
-
-    Nx(0,2,g) =  uy*lz/8.0;
-    Nx(1,2,g) =  ux*lz/8.0;
-    Nx(2,2,g) = -ux*uy/8.0;
-
-    Nx(0,3,g) = -uy*lz/8.0;
-    Nx(1,3,g) =  lx*lz/8.0;
-    Nx(2,3,g) = -lx*uy/8.0;
-
-    Nx(0,4,g) = -ly*uz/8.0;
-    Nx(1,4,g) = -lx*uz/8.0;
-    Nx(2,4,g) =  lx*ly/8.0;
-
-    Nx(0,5,g) =  ly*uz/8.0;
-    Nx(1,5,g) = -ux*uz/8.0;
-    Nx(2,5,g) =  ux*ly/8.0;
-
-    Nx(0,6,g) =  uy*uz/8.0;
-    Nx(1,6,g) =  ux*uz/8.0;
-    Nx(2,6,g) =  ux*uy/8.0;
-
-    Nx(0,7,g) = -uy*uz/8.0;
-    Nx(1,7,g) =  lx*uz/8.0;
-    Nx(2,7,g) =  lx*uy/8.0;
-    }
-  },
-
-  {ElementType::HEX20, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N,
-      Array3<double>& Nx) -> void
-    {
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = lx*ux;
-    double my = ly*uy;
-    double mz = lz*uz;
-
-    N(0, g) = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    N(1, g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    N(2, g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0;
-    N(3, g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0;
-    N(4, g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0;
-    N(5, g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0;
-    N(6, g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0;
-    N(7, g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0;
-    N(8, g) = mx*ly*lz/4.0;
-    N(9, g) = ux*my*lz/4.0;
-    N(10, g) = mx*uy*lz/4.0;
-    N(11, g) = lx*my*lz/4.0;
-    N(12, g) = mx*ly*uz/4.0;
-    N(13, g) = ux*my*uz/4.0;
-    N(14, g) = mx*uy*uz/4.0;
-    N(15, g) = lx*my*uz/4.0;
-    N(16, g) = lx*ly*mz/4.0;
-    N(17, g) = ux*ly*mz/4.0;
-    N(18, g) = ux*uy*mz/4.0;
-    N(19, g) = lx*uy*mz/4.0;
-
-    // N(1)  = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    int n = 0;
-    Nx(0,n,g) = -ly*lz*(lx+ly+lz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*lz*(lx+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -lx*ly*(lx+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    n += 1;
-    Nx(0,n,g) =  ly*lz*(ux+ly+lz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*lz*(ux+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -ux*ly*(ux+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*lz*(ux+uy+lz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*lz*(ux+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -ux*uy*(ux+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*lz*(lx+uy+lz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*lz*(lx+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -lx*uy*(lx+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -ly*uz*(lx+ly+uz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*uz*(lx+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  lx*ly*(lx+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  ly*uz*(ux+ly+uz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*uz*(ux+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  ux*ly*(ux+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*uz*(ux+uy+uz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*uz*(ux+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  ux*uy*(ux+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*uz*(lx+uy+uz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*uz*(lx+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  lx*uy*(lx+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = mx*ly*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*lz/4.0;
-    Nx(1,n,g) = -mx*lz/4.0;
-    Nx(2,n,g) = -mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*lz/4.0;
-    Nx(2,n,g) = -ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*lz/4.0;
-    Nx(1,n,g) =  mx*lz/4.0;
-    Nx(2,n,g) = -mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*lz/4.0;
-    Nx(2,n,g) = -lx*my/4.0;
-
-//c   N(0n,g) = mx*ly*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uz/4.0;
-    Nx(1,n,g) = -mx*uz/4.0;
-    Nx(2,n,g) =  mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*uz/4.0;
-    Nx(2,n,g) =  ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*uz/4.0;
-    Nx(1,n,g) =  mx*uz/4.0;
-    Nx(2,n,g) =  mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*uz/4.0;
-    Nx(2,n,g) =  lx*my/4.0;
-
-//c   N(0n,g) = lx*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -ly*mz/4.0;
-    Nx(1,n,g) = -lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ly/4.0;
-
-//c   N(0n,g) = ux*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  ly*mz/4.0;
-    Nx(1,n,g) = -ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*ly/4.0;
-
-//c   N(0n,g) = ux*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  uy*mz/4.0;
-    Nx(1,n,g) =  ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*uy/4.0;
-
-//c   N(n,g) = lx*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -uy*mz/4.0;
-    Nx(1,n,g) =  lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*uy/4.0;
-    }
-  },
-
-  {ElementType::HEX27, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N,
-      Array3<double>& Nx) -> void
-    {
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = xi(0,g);
-    double my = xi(1,g);
-    double mz = xi(2,g);
-
-    N(0,g)  = -mx*lx*my*ly*mz*lz/8.0;
-    N(1,g)  =  mx*ux*my*ly*mz*lz/8.0;
-    N(2,g)  = -mx*ux*my*uy*mz*lz/8.0;
-    N(3,g)  =  mx*lx*my*uy*mz*lz/8.0;
-    N(4,g)  =  mx*lx*my*ly*mz*uz/8.0;
-    N(5,g)  = -mx*ux*my*ly*mz*uz/8.0;
-    N(6,g)  =  mx*ux*my*uy*mz*uz/8.0;
-    N(7,g)  = -mx*lx*my*uy*mz*uz/8.0;
-    N(8,g)  =  lx*ux*my*ly*mz*lz/4.0;
-    N(9,g)  = -mx*ux*ly*uy*mz*lz/4.0;
-    N(10,g) = -lx*ux*my*uy*mz*lz/4.0;
-    N(11,g) =  mx*lx*ly*uy*mz*lz/4.0;
-    N(12,g) = -lx*ux*my*ly*mz*uz/4.0;
-    N(13,g) =  mx*ux*ly*uy*mz*uz/4.0;
-    N(14,g) =  lx*ux*my*uy*mz*uz/4.0;
-    N(15,g) = -mx*lx*ly*uy*mz*uz/4.0;
-    N(16,g) =  mx*lx*my*ly*lz*uz/4.0;
-    N(17,g) = -mx*ux*my*ly*lz*uz/4.0;
-    N(18,g) =  mx*ux*my*uy*lz*uz/4.0;
-    N(19,g) = -mx*lx*my*uy*lz*uz/4.0;
-
-    N(20,g) = -mx*lx*ly*uy*lz*uz/2.0;
-    N(21,g) =  mx*ux*ly*uy*lz*uz/2.0;
-    N(22,g) = -lx*ux*my*ly*lz*uz/2.0;
-    N(23,g) =  lx*ux*my*uy*lz*uz/2.0;
-    N(24,g) = -lx*ux*ly*uy*mz*lz/2.0;
-    N(25,g) =  lx*ux*ly*uy*mz*uz/2.0;
-
-    N(26,g) =  lx*ux*ly*uy*lz*uz;
-
-    // N(0)  = -mx*lx*my*ly*mz*lz/8.0
-    int n = 0;
-    Nx(0,n,g)  = -(lx - mx)*my*ly*mz*lz/8.0;
-    Nx(1,n,g)  = -(ly - my)*mx*lx*mz*lz/8.0;
-    Nx(2,n,g)  = -(lz - mz)*mx*lx*my*ly/8.0;
-
-    // N(n,g)  =  mx*ux*my*ly*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (mx + ux)*my*ly*mz*lz/8.0;
-    Nx(1,n,g)  =  (ly - my)*mx*ux*mz*lz/8.0;
-    Nx(2,n,g)  =  (lz - mz)*mx*ux*my*ly/8.0;
-
-    // N(n,g)  = -mx*ux*my*uy*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(mx + ux)*my*uy*mz*lz/8.0;
-    Nx(1,n,g)  = -(my + uy)*mx*ux*mz*lz/8.0;
-    Nx(2,n,g)  = -(lz - mz)*mx*ux*my*uy/8.0;
-
-    // N(n,g)  =  mx*lx*my*uy*mz*lz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - mx)*my*uy*mz*lz/8.0;
-    Nx(1,n,g)  =  (my + uy)*mx*lx*mz*lz/8.0;
-    Nx(2,n,g)  =  (lz - mz)*mx*lx*my*uy/8.0;
-
-    // N(n,g)  =  mx*lx*my*ly*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - mx)*my*ly*mz*uz/8.0;
-    Nx(1,n,g)  =  (ly - my)*mx*lx*mz*uz/8.0;
-    Nx(2,n,g)  =  (mz + uz)*mx*lx*my*ly/8.0;
-
-    // N(n,g)  = -mx*ux*my*ly*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(mx + ux)*my*ly*mz*uz/8.0;
-    Nx(1,n,g)  = -(ly - my)*mx*ux*mz*uz/8.0;
-    Nx(2,n,g)  = -(mz + uz)*mx*ux*my*ly/8.0;
-
-    // N(n,g)  =  mx*ux*my*uy*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  =  (mx + ux)*my*uy*mz*uz/8.0;
-    Nx(1,n,g)  =  (my + uy)*mx*ux*mz*uz/8.0;
-    Nx(2,n,g)  =  (mz + uz)*mx*ux*my*uy/8.0;
-
-    // N(n,g)  = -mx*lx*my*uy*mz*uz/8.0
-    n += 1;
-    Nx(0,n,g)  = -(lx - mx)*my*uy*mz*uz/8.0;
-    Nx(1,n,g)  = -(my + uy)*mx*lx*mz*uz/8.0;
-    Nx(2,n,g)  = -(mz + uz)*mx*lx*my*uy/8.0;
-
-    // N(n,g)  =  lx*ux*my*ly*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g)  =  (lx - ux)*my*ly*mz*lz/4.0;
-    Nx(1,n,g)  =  (ly - my)*lx*ux*mz*lz/4.0;
-    Nx(2,n,g)  =  (lz - mz)*lx*ux*my*ly/4.0;
-
-    // N(n,g) = -mx*ux*ly*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -(mx + ux)*ly*uy*mz*lz/4.0;
-    Nx(1,n,g) = -(ly - uy)*mx*ux*mz*lz/4.0;
-    Nx(2,n,g) = -(lz - mz)*mx*ux*ly*uy/4.0;
-
-    //   N(n,g) = -lx*ux*my*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*uy*mz*lz/4.0;
-    Nx(1,n,g) = -(my + uy)*lx*ux*mz*lz/4.0;
-    Nx(2,n,g) = -(lz - mz)*lx*ux*my*uy/4.0;
-
-    //   N(n,g) =  mx*lx*ly*uy*mz*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - mx)*ly*uy*mz*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*mx*lx*mz*lz/4.0;
-    Nx(2,n,g) =  (lz - mz)*mx*lx*ly*uy/4.0;
-
-    //   N(n,g) = -lx*ux*my*ly*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*ly*mz*uz/4.0;
-    Nx(1,n,g) = -(ly - my)*lx*ux*mz*uz/4.0;
-    Nx(2,n,g) = -(mz + uz)*lx*ux*my*ly/4.0;
-
-    //   N(n,g) =  mx*ux*ly*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*ly*uy*mz*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*mx*ux*mz*uz/4.0;
-    Nx(2,n,g) =  (mz + uz)*mx*ux*ly*uy/4.0;
-
-    //   N(n,g) =  lx*ux*my*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*my*uy*mz*uz/4.0;
-    Nx(1,n,g) =  (my + uy)*lx*ux*mz*uz/4.0;
-    Nx(2,n,g) =  (mz + uz)*lx*ux*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*ly*uy*mz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*ly*uy*mz*uz/4.0;
-    Nx(1,n,g) = -(ly - uy)*mx*lx*mz*uz/4.0;
-    Nx(2,n,g) = -(mz + uz)*mx*lx*ly*uy/4.0;
-
-    //   N(n,g) =  mx*lx*my*ly*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - mx)*my*ly*lz*uz/4.0;
-    Nx(1,n,g) =  (ly - my)*mx*lx*lz*uz/4.0;
-    Nx(2,n,g) =  (lz - uz)*mx*lx*my*ly/4.0;
-
-    //   N(n,g) = -mx*ux*my*ly*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(mx + ux)*my*ly*lz*uz/4.0;
-    Nx(1,n,g) = -(ly - my)*mx*ux*lz*uz/4.0;
-    Nx(2,n,g) = -(lz - uz)*mx*ux*my*ly/4.0;
-
-    //   N(n,g) =  mx*ux*my*uy*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*my*uy*lz*uz/4.0;
-    Nx(1,n,g) =  (my + uy)*mx*ux*lz*uz/4.0;
-    Nx(2,n,g) =  (lz - uz)*mx*ux*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*my*uy*lz*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*my*uy*lz*uz/4.0;
-    Nx(1,n,g) = -(my + uy)*mx*lx*lz*uz/4.0;
-    Nx(2,n,g) = -(lz - uz)*mx*lx*my*uy/4.0;
-
-    //   N(n,g) = -mx*lx*ly*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - mx)*ly*uy*lz*uz/2.0;
-    Nx(1,n,g) = -(ly - uy)*mx*lx*lz*uz/2.0;
-    Nx(2,n,g) = -(lz - uz)*mx*lx*ly*uy/2.0;
-
-    //   N(n,g) =  mx*ux*ly*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (mx + ux)*ly*uy*lz*uz/2.0;
-    Nx(1,n,g) =  (ly - uy)*mx*ux*lz*uz/2.0;
-    Nx(2,n,g) =  (lz - uz)*mx*ux*ly*uy/2.0;
-
-    //   N(n,g) = -lx*ux*my*ly*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*my*ly*lz*uz/2.0;
-    Nx(1,n,g) = -(ly - my)*lx*ux*lz*uz/2.0;
-    Nx(2,n,g) = -(lz - uz)*lx*ux*my*ly/2.0;
-
-    //   N(n,g) =  lx*ux*my*uy*lz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*my*uy*lz*uz/2.0;
-    Nx(1,n,g) =  (my + uy)*lx*ux*lz*uz/2.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ux*my*uy/2.0;
-
-    //   N(n,g) = -lx*ux*ly*uy*mz*lz/2.0
-    n += 1;
-    Nx(0,n,g) = -(lx - ux)*ly*uy*mz*lz/2.0;
-    Nx(1,n,g) = -(ly - uy)*lx*ux*mz*lz/2.0;
-    Nx(2,n,g) = -(lz - mz)*lx*ux*ly*uy/2.0;
-
-    //   N(n,g) =  lx*ux*ly*uy*mz*uz/2.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uy*mz*uz/2.0;
-    Nx(1,n,g) =  (ly - uy)*lx*ux*mz*uz/2.0;
-    Nx(2,n,g) =  (mz + uz)*lx*ux*ly*uy/2.0;
-
-    //   N(n,g) =  lx*ux*ly*uy*lz*uz
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uy*lz*uz;
-    Nx(1,n,g) =  (ly - uy)*lx*ux*lz*uz;
-    Nx(2,n,g) =  (lz - uz)*lx*ux*ly*uy;
-    }
-  },
-
-  {ElementType::LIN1, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    N(0,g) = (1.0 - xi(0,g))*0.5;
-    N(1,g) = (1.0 + xi(0,g))*0.5;
-
-    Nx(0,0,g) = -0.5;
-    Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    N(0,g) = lx*ly / 4.0;
-    N(1,g) = ux*ly / 4.0;
-    N(2,g) = ux*uy / 4.0;
-    N(3,g) = lx*uy / 4.0;
-
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
-  {ElementType::QUD9, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::TET4, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    //std::cout << "[get_element_shape_data] TET4 " << std::endl;
-
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = xi(2,g);
-    N(3,g) = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(2,1,g) =  0.0;
-    Nx(0,2,g) =  0.0;
-    Nx(1,2,g) =  0.0;
-    Nx(2,2,g) =  1.0;
-    Nx(0,3,g) = -1.0;
-    Nx(1,3,g) = -1.0;
-    Nx(2,3,g) = -1.0;
-    }
-  },
-
-  {ElementType::TET10, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double s = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-    N(0,g) = xi(0,g)*(2.0*xi(0,g) - 1.0);
-    N(1,g) = xi(1,g)*(2.0*xi(1,g) - 1.0);
-    N(2,g) = xi(2,g)*(2.0*xi(2,g) - 1.0);
-    N(3,g) = s * (2.0*s - 1.0);
-    N(4,g) = 4.0*xi(0,g)*xi(1,g);
-    N(5,g) = 4.0*xi(1,g)*xi(2,g);
-    N(6,g) = 4.0*xi(0,g)*xi(2,g);
-    N(7,g) = 4.0*xi(0,g)*s;
-    N(8,g) = 4.0*xi(1,g)*s;
-    N(9,g) = 4.0*xi(2,g)*s;
-
-    Nx(0,0,g)  =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g)  =  0.0;
-    Nx(2,0,g)  =  0.0;
-
-    Nx(0,1,g)  =  0.0;
-    Nx(1,1,g)  =  4.0*xi(1,g) - 1.0;
-    Nx(2,1,g)  =  0.0;
-
-    Nx(0,2,g)  =  0.0;
-    Nx(1,2,g)  =  0.0;
-    Nx(2,2,g)  =  4.0*xi(2,g) - 1.0;
-
-    Nx(0,3,g)  =  1.0 - 4.0*s;
-    Nx(1,3,g)  =  1.0 - 4.0*s;
-    Nx(2,3,g)  =  1.0 - 4.0*s;
-
-    Nx(0,4,g)  =  4.0*xi(1,g);
-    Nx(1,4,g)  =  4.0*xi(0,g);
-    Nx(2,4,g)  =  0.0;
-
-    Nx(0,5,g)  =  0.0;
-    Nx(1,5,g)  =  4.0*xi(2,g);
-    Nx(2,5,g)  =  4.0*xi(1,g);
-
-    Nx(0,6,g)  =  4.0*xi(2,g);
-    Nx(1,6,g)  =  0.0;
-    Nx(2,6,g)  =  4.0*xi(0,g);
-
-    Nx(0,7,g)  =  4.0*( s - xi(0,g));
-    Nx(1,7,g)  = -4.0*xi(0,g);
-    Nx(2,7,g)  = -4.0*xi(0,g);
-
-    Nx(0,8,g)  = -4.0*xi(1,g);
-    Nx(1,8,g)  =  4.0*( s - xi(1,g));
-    Nx(2,8,g)  = -4.0*xi(1,g);
-
-    Nx(0,9,g) = -4.0*xi(2,g);
-    Nx(1,9,g) = -4.0*xi(2,g);
-    Nx(2,9,g) =  4.0*( s - xi(2,g));
-    }
-  },
-
-  {ElementType::TRI3, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    //std::cout << "[get_element_shape_data] TRI3 " << std::endl;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = 1.0 - xi(0,g) - xi(1,g);
-
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(0,2,g) = -1.0;
-    Nx(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g) * (2.0*xi(0,g) - 1.0);
-    N(1,g) = xi(1,g) * (2.0*xi(1,g) - 1.0);
-    N(2,g) = s * (2.0*s - 1.0);
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    Nx(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g) =  0.0;
-
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  4.0*xi(1,g) - 1.0;
-
-    Nx(0,2,g) =  1.0 - 4.0*s;
-    Nx(1,2,g) =  1.0 - 4.0*s;
-
-    Nx(0,3,g) =  4.0*xi(1,g);
-    Nx(1,3,g) =  4.0*xi(0,g);
-
-    Nx(0,4,g) = -4.0*xi(1,g);
-    Nx(1,4,g) =  4.0*( s - xi(1,g) );
-
-    Nx(0,5,g) =  4.0*( s - xi(0,g) );
-    Nx(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-  {ElementType::WDG, [](const int insd, const int eNoN, const int g, Array<double>& xi, Array<double>& N, 
-      Array3<double>& Nx) -> void 
-    { 
-    double ux = xi(0,g);
-    double uy = xi(1,g);
-    double uz = 1.0 - ux - uy;
-    double s = (1.0 + xi(2,g))*0.5;
-    double t = (1.0 - xi(2,g))*0.5;
-    N(0,g) = ux*t;
-    N(1,g) = uy*t;
-    N(2,g) = uz*t;
-    N(3,g) = ux*s;
-    N(4,g) = uy*s;
-    N(5,g) = uz*s;
-
-    Nx(0,0,g) =  t;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) = -ux*0.50;
-
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  t;
-    Nx(2,1,g) = -uy*0.50;
-
-    Nx(0,2,g) = -t;
-    Nx(1,2,g) = -t;
-    Nx(2,2,g) = -uz*0.50;
-
-    Nx(0,3,g) =  s;
-    Nx(1,3,g) =  0.0;
-    Nx(2,3,g) =  ux*0.50;
-
-    Nx(0,4,g) =  0.0;
-    Nx(1,4,g) =  s;
-    Nx(2,4,g) =  uy*0.50;
-
-    Nx(0,5,g) = -s;
-    Nx(1,5,g) = -s;
-    Nx(2,5,g) =  uz*0.50;
-    }
-  },
-
-
-
-};
-
-
-//------------------------
-// set_element_shape_data 
-//------------------------
-// Replicates 'SUBROUTINE GETGNN(insd, eType, eNoN, xi, N, Nxi)' defined in NN.f.
-//
-using SetElementShapeMapType = std::map<ElementType, std::function<void(int, mshType&)>>;
-
-SetElementShapeMapType set_element_shape_data = {
-
-  {ElementType::HEX8, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    auto& N = mesh.N;
-    N(0,g) = lx*ly*lz/8.0;
-    N(1,g) = ux*ly*lz/8.0;
-    N(2,g) = ux*uy*lz/8.0;
-    N(3,g) = lx*uy*lz/8.0;
-    N(4,g) = lx*ly*uz/8.0;
-    N(5,g) = ux*ly*uz/8.0;
-    N(6,g) = ux*uy*uz/8.0;
-    N(7,g) = lx*uy*uz/8.0;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -ly*lz/8.0;
-    Nx(1,0,g) = -lx*lz/8.0;
-    Nx(2,0,g) = -lx*ly/8.0;
-
-    Nx(0,1,g) =  ly*lz/8.0;
-    Nx(1,1,g) = -ux*lz/8.0;
-    Nx(2,1,g) = -ux*ly/8.0;
-
-    Nx(0,2,g) =  uy*lz/8.0;
-    Nx(1,2,g) =  ux*lz/8.0;
-    Nx(2,2,g) = -ux*uy/8.0;
-
-    Nx(0,3,g) = -uy*lz/8.0;
-    Nx(1,3,g) =  lx*lz/8.0;
-    Nx(2,3,g) = -lx*uy/8.0;
-
-    Nx(0,4,g) = -ly*uz/8.0;
-    Nx(1,4,g) = -lx*uz/8.0;
-    Nx(2,4,g) =  lx*ly/8.0;
-
-    Nx(0,5,g) =  ly*uz/8.0;
-    Nx(1,5,g) = -ux*uz/8.0;
-    Nx(2,5,g) =  ux*ly/8.0;
-
-    Nx(0,6,g) =  uy*uz/8.0;
-    Nx(1,6,g) =  ux*uz/8.0;
-    Nx(2,6,g) =  ux*uy/8.0;
-
-    Nx(0,7,g) = -uy*uz/8.0;
-    Nx(1,7,g) =  lx*uz/8.0;
-    Nx(2,7,g) =  lx*uy/8.0;
-    }
-  },
-
-  {ElementType::HEX20, [](int g, mshType& mesh) -> void {
-
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = lx*ux;
-    double my = ly*uy;
-    double mz = lz*uz;
-
-    auto& N = mesh.N;
-    N(0, g) = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    N(1, g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    N(2, g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0;
-    N(3, g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0;
-    N(4, g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0;
-    N(5, g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0;
-    N(6, g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0;
-    N(7, g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0;
-    N(8, g) = mx*ly*lz/4.0;
-    N(9, g) = ux*my*lz/4.0;
-    N(10, g) = mx*uy*lz/4.0;
-    N(11, g) = lx*my*lz/4.0;
-    N(12, g) = mx*ly*uz/4.0;
-    N(13, g) = ux*my*uz/4.0;
-    N(14, g) = mx*uy*uz/4.0;
-    N(15, g) = lx*my*uz/4.0;
-    N(16, g) = lx*ly*mz/4.0;
-    N(17, g) = ux*ly*mz/4.0;
-    N(18, g) = ux*uy*mz/4.0;
-    N(19, g) = lx*uy*mz/4.0;
-
-    // N(1)  = lx*ly*lz*(lx+ly+lz-5.0)/8.0;
-    auto& Nx = mesh.Nx;
-    int n = 0;
-    Nx(0,n,g) = -ly*lz*(lx+ly+lz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*lz*(lx+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -lx*ly*(lx+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*ly*lz*(ux+ly+lz-5.0)/8.0;
-    n += 1;
-    Nx(0,n,g) =  ly*lz*(ux+ly+lz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*lz*(ux+ly+lz-5.0+ly)/8.0;
-    Nx(2,n,g) = -ux*ly*(ux+ly+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = ux*uy*lz*(ux+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*lz*(ux+uy+lz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*lz*(ux+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -ux*uy*(ux+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*uy*lz*(lx+uy+lz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*lz*(lx+uy+lz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*lz*(lx+uy+lz-5.0+uy)/8.0;
-    Nx(2,n,g) = -lx*uy*(lx+uy+lz-5.0+lz)/8.0;
-
-//c   N(n,g) = lx*ly*uz*(lx+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -ly*uz*(lx+ly+uz-5.0+lx)/8.0;
-    Nx(1,n,g) = -lx*uz*(lx+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  lx*ly*(lx+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*ly*uz*(ux+ly+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  ly*uz*(ux+ly+uz-5.0+ux)/8.0;
-    Nx(1,n,g) = -ux*uz*(ux+ly+uz-5.0+ly)/8.0;
-    Nx(2,n,g) =  ux*ly*(ux+ly+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = ux*uy*uz*(ux+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) =  uy*uz*(ux+uy+uz-5.0+ux)/8.0;
-    Nx(1,n,g) =  ux*uz*(ux+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  ux*uy*(ux+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = lx*uy*uz*(lx+uy+uz-5.0)/8.0
-    n += 1;
-    Nx(0,n,g) = -uy*uz*(lx+uy+uz-5.0+lx)/8.0;
-    Nx(1,n,g) =  lx*uz*(lx+uy+uz-5.0+uy)/8.0;
-    Nx(2,n,g) =  lx*uy*(lx+uy+uz-5.0+uz)/8.0;
-
-//c   N(n,g) = mx*ly*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*lz/4.0;
-    Nx(1,n,g) = -mx*lz/4.0;
-    Nx(2,n,g) = -mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*lz/4.0;
-    Nx(2,n,g) = -ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*lz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*lz/4.0;
-    Nx(1,n,g) =  mx*lz/4.0;
-    Nx(2,n,g) = -mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*lz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*lz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*lz/4.0;
-    Nx(2,n,g) = -lx*my/4.0;
-
-//c   N(0n,g) = mx*ly*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*ly*uz/4.0;
-    Nx(1,n,g) = -mx*uz/4.0;
-    Nx(2,n,g) =  mx*ly/4.0;
-
-//c   N(0n,g) = ux*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*ux*uz/4.0;
-    Nx(2,n,g) =  ux*my/4.0;
-
-//c   N(0n,g) = mx*uy*uz/4.0
-    n += 1;
-    Nx(0,n,g) =  (lx - ux)*uy*uz/4.0;
-    Nx(1,n,g) =  mx*uz/4.0;
-    Nx(2,n,g) =  mx*uy/4.0;
-
-//c   N(0n,g) = lx*my*uz/4.0
-    n += 1;
-    Nx(0,n,g) = -my*uz/4.0;
-    Nx(1,n,g) =  (ly - uy)*lx*uz/4.0;
-    Nx(2,n,g) =  lx*my/4.0;
-
-//c   N(0n,g) = lx*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -ly*mz/4.0;
-    Nx(1,n,g) = -lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*ly/4.0;
-
-//c   N(0n,g) = ux*ly*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  ly*mz/4.0;
-    Nx(1,n,g) = -ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*ly/4.0;
-
-//c   N(0n,g) = ux*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) =  uy*mz/4.0;
-    Nx(1,n,g) =  ux*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*ux*uy/4.0;
-
-//c   N(n,g) = lx*uy*mz/4.0
-    n += 1;
-    Nx(0,n,g) = -uy*mz/4.0;
-    Nx(1,n,g) =  lx*mz/4.0;
-    Nx(2,n,g) =  (lz - uz)*lx*uy/4.0;
-    }
-  },
-
-  {ElementType::HEX27, [](int g, mshType& mesh) -> void {
-
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double lz = 1.0 - xi(2,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double uz = 1.0 + xi(2,g);
-
-    double mx = xi(0,g);
-    double my = xi(1,g);
-    double mz = xi(2,g);
-
-    auto& N = mesh.N;
-    N(0,g)  = -mx*lx*my*ly*mz*lz/8.0;
-    N(1,g)  =  mx*ux*my*ly*mz*lz/8.0;
-    N(2,g)  = -mx*ux*my*uy*mz*lz/8.0;
-    N(3,g)  =  mx*lx*my*uy*mz*lz/8.0;
-    N(4,g)  =  mx*lx*my*ly*mz*uz/8.0;
-    N(5,g)  = -mx*ux*my*ly*mz*uz/8.0;
-    N(6,g)  =  mx*ux*my*uy*mz*uz/8.0;
-    N(7,g)  = -mx*lx*my*uy*mz*uz/8.0;
-    N(8,g)  =  lx*ux*my*ly*mz*lz/4.0;
-    N(9,g) = -mx*ux*ly*uy*mz*lz/4.0;
-    N(10,g) = -lx*ux*my*uy*mz*lz/4.0;
-    N(11,g) =  mx*lx*ly*uy*mz*lz/4.0;
-    N(12,g) = -lx*ux*my*ly*mz*uz/4.0;
-    N(13,g) =  mx*ux*ly*uy*mz*uz/4.0;
-    N(14,g) =  lx*ux*my*uy*mz*uz/4.0;
-    N(15,g) = -mx*lx*ly*uy*mz*uz/4.0;
-    N(16,g) =  mx*lx*my*ly*lz*uz/4.0;
-    N(17,g) = -mx*ux*my*ly*lz*uz/4.0;
-    N(18,g) =  mx*ux*my*uy*lz*uz/4.0;
-    N(19,g) = -mx*lx*my*uy*lz*uz/4.0;
-
-    N(20,g) = -mx*lx*ly*uy*lz*uz/2.0;
-    N(21,g) =  mx*ux*ly*uy*lz*uz/2.0;
-    N(22,g) = -lx*ux*my*ly*lz*uz/2.0;
-    N(23,g) =  lx*ux*my*uy*lz*uz/2.0;
-    N(24,g) = -lx*ux*ly*uy*mz*lz/2.0;
-    N(25,g) =  lx*ux*ly*uy*mz*uz/2.0;
-
-    N(26,g) =  lx*ux*ly*uy*lz*uz;
-
-    auto& Nxi = mesh.Nx;
-    int n = 0;
-    Nxi(0,n,g)  = -(lx - mx)*my*ly*mz*lz/8.0;
-    Nxi(1,n,g)  = -(ly - my)*mx*lx*mz*lz/8.0;
-    Nxi(2,n,g)  = -(lz - mz)*mx*lx*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (mx + ux)*my*ly*mz*lz/8.0;
-    Nxi(1,n,g)  =  (ly - my)*mx*ux*mz*lz/8.0;
-    Nxi(2,n,g)  =  (lz - mz)*mx*ux*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(mx + ux)*my*uy*mz*lz/8.0;
-    Nxi(1,n,g)  = -(my + uy)*mx*ux*mz*lz/8.0;
-    Nxi(2,n,g)  = -(lz - mz)*mx*ux*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - mx)*my*uy*mz*lz/8.0;
-    Nxi(1,n,g)  =  (my + uy)*mx*lx*mz*lz/8.0;
-    Nxi(2,n,g)  =  (lz - mz)*mx*lx*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - mx)*my*ly*mz*uz/8.0;
-    Nxi(1,n,g)  =  (ly - my)*mx*lx*mz*uz/8.0;
-    Nxi(2,n,g)  =  (mz + uz)*mx*lx*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(mx + ux)*my*ly*mz*uz/8.0;
-    Nxi(1,n,g)  = -(ly - my)*mx*ux*mz*uz/8.0;
-    Nxi(2,n,g)  = -(mz + uz)*mx*ux*my*ly/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (mx + ux)*my*uy*mz*uz/8.0;
-    Nxi(1,n,g)  =  (my + uy)*mx*ux*mz*uz/8.0;
-    Nxi(2,n,g)  =  (mz + uz)*mx*ux*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  = -(lx - mx)*my*uy*mz*uz/8.0;
-    Nxi(1,n,g)  = -(my + uy)*mx*lx*mz*uz/8.0;
-    Nxi(2,n,g)  = -(mz + uz)*mx*lx*my*uy/8.0;
-
-    n += 1;
-    Nxi(0,n,g)  =  (lx - ux)*my*ly*mz*lz/4.0;
-    Nxi(1,n,g)  =  (ly - my)*lx*ux*mz*lz/4.0;
-    Nxi(2,n,g)  =  (lz - mz)*lx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(mx + ux)*ly*uy*mz*lz/4.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*ux*mz*lz/4.0;
-    Nxi(2,n,g) = -(lz - mz)*mx*ux*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*uy*mz*lz/4.0;
-    Nxi(1,n,g) = -(my + uy)*lx*ux*mz*lz/4.0;
-    Nxi(2,n,g) = -(lz - mz)*lx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - mx)*ly*uy*mz*lz/4.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*lx*mz*lz/4.0;
-    Nxi(2,n,g) =  (lz - mz)*mx*lx*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*ly*mz*uz/4.0;
-    Nxi(1,n,g) = -(ly - my)*lx*ux*mz*uz/4.0;
-    Nxi(2,n,g) = -(mz + uz)*lx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*ly*uy*mz*uz/4.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*ux*mz*uz/4.0;
-    Nxi(2,n,g) =  (mz + uz)*mx*ux*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*my*uy*mz*uz/4.0;
-    Nxi(1,n,g) =  (my + uy)*lx*ux*mz*uz/4.0;
-    Nxi(2,n,g) =  (mz + uz)*lx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*ly*uy*mz*uz/4.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*lx*mz*uz/4.0;
-    Nxi(2,n,g) = -(mz + uz)*mx*lx*ly*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - mx)*my*ly*lz*uz/4.0;
-    Nxi(1,n,g) =  (ly - my)*mx*lx*lz*uz/4.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*lx*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(mx + ux)*my*ly*lz*uz/4.0;
-    Nxi(1,n,g) = -(ly - my)*mx*ux*lz*uz/4.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*ux*my*ly/4.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*my*uy*lz*uz/4.0;
-    Nxi(1,n,g) =  (my + uy)*mx*ux*lz*uz/4.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*ux*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*my*uy*lz*uz/4.0;
-    Nxi(1,n,g) = -(my + uy)*mx*lx*lz*uz/4.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*lx*my*uy/4.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - mx)*ly*uy*lz*uz/2.0;
-    Nxi(1,n,g) = -(ly - uy)*mx*lx*lz*uz/2.0;
-    Nxi(2,n,g) = -(lz - uz)*mx*lx*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (mx + ux)*ly*uy*lz*uz/2.0;
-    Nxi(1,n,g) =  (ly - uy)*mx*ux*lz*uz/2.0;
-    Nxi(2,n,g) =  (lz - uz)*mx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*my*ly*lz*uz/2.0;
-    Nxi(1,n,g) = -(ly - my)*lx*ux*lz*uz/2.0;
-    Nxi(2,n,g) = -(lz - uz)*lx*ux*my*ly/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*my*uy*lz*uz/2.0;
-    Nxi(1,n,g) =  (my + uy)*lx*ux*lz*uz/2.0;
-    Nxi(2,n,g) =  (lz - uz)*lx*ux*my*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) = -(lx - ux)*ly*uy*mz*lz/2.0;
-    Nxi(1,n,g) = -(ly - uy)*lx*ux*mz*lz/2.0;
-    Nxi(2,n,g) = -(lz - mz)*lx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*ly*uy*mz*uz/2.0;
-    Nxi(1,n,g) =  (ly - uy)*lx*ux*mz*uz/2.0;
-    Nxi(2,n,g) =  (mz + uz)*lx*ux*ly*uy/2.0;
-
-    n += 1;
-    Nxi(0,n,g) =  (lx - ux)*ly*uy*lz*uz;
-    Nxi(1,n,g) =  (ly - uy)*lx*ux*lz*uz;
-    Nxi(2,n,g) =  (lz - uz)*lx*ux*ly*uy;
-    }
-  },
-
-  {ElementType::LIN1, [](int g, mshType& mesh) -> void { 
-    //std::cout << "[set_element_shape_data] **************************" << std::endl;
-    //std::cout << "[set_element_shape_data] ERROR: LIN1 not supported." << std::endl;
-    //std::cout << "[set_element_shape_data] **************************" << std::endl;
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = (1.0 - xi(0,g))*0.5;
-    N(1,g) = (1.0 + xi(0,g))*0.5;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -0.5;
-    Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    auto& N = mesh.N;
-    N(0,g) = lx*ly / 4.0;    
-    N(1,g) = ux*ly / 4.0;    
-    N(2,g) = ux*uy / 4.0;    
-    N(3,g) = lx*uy / 4.0;    
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
- {ElementType::QUD9, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    auto& N = mesh.N;
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::TET4, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = xi(2,g);
-    N(3,g) = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g) =  1.0;
-    Nx(1,0,g) =  0.0;
-    Nx(2,0,g) =  0.0;
-    Nx(0,1,g) =  0.0;
-    Nx(1,1,g) =  1.0;
-    Nx(2,1,g) =  0.0;
-    Nx(0,2,g) =  0.0;
-    Nx(1,2,g) =  0.0;
-    Nx(2,2,g) =  1.0;
-    Nx(0,3,g) = -1.0;
-    Nx(1,3,g) = -1.0;
-    Nx(2,3,g) = -1.0;
-    }
-  },
-
-  {ElementType::TET10, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    double s = 1.0 - xi(0,g) - xi(1,g) - xi(2,g);
-    N(0,g)  = xi(0,g)*(2.0*xi(0,g) - 1.0);
-    N(1,g)  = xi(1,g)*(2.0*xi(1,g) - 1.0);
-    N(2,g)  = xi(2,g)*(2.0*xi(2,g) - 1.0);
-    N(3,g)  = s    *(2.0*s     - 1.0);
-    N(4,g)  = 4.0*xi(0,g)*xi(1,g);
-    N(5,g)  = 4.0*xi(1,g)*xi(2,g);
-    N(6,g)  = 4.0*xi(0,g)*xi(2,g);
-    N(7,g)  = 4.0*xi(0,g)*s;
-    N(8,g)  = 4.0*xi(1,g)*s;
-    N(9,g) = 4.0*xi(2,g)*s;
-
-    auto& Nx = mesh.Nx;
-    Nx(0,0,g)  =  4.0*xi(0,g) - 1.0;
-    Nx(1,0,g)  =  0.0;
-    Nx(2,0,g)  =  0.0;
-
-    Nx(0,1,g)  =  0.0;
-    Nx(1,1,g)  =  4.0*xi(1,g) - 1.0;
-    Nx(2,1,g)  =  0.0;
-
-    Nx(0,2,g)  =  0.0;
-    Nx(1,2,g)  =  0.0;
-    Nx(2,2,g)  =  4.0*xi(2,g) - 1.0;
-
-    Nx(0,3,g)  =  1.0 - 4.0*s;
-    Nx(1,3,g)  =  1.0 - 4.0*s;
-    Nx(2,3,g)  =  1.0 - 4.0*s;
-
-    Nx(0,4,g)  =  4.0*xi(1,g);
-    Nx(1,4,g)  =  4.0*xi(0,g);
-    Nx(2,4,g)  =  0.0;
-
-    Nx(0,5,g)  =  0.0;
-    Nx(1,5,g)  =  4.0*xi(2,g);
-    Nx(2,5,g)  =  4.0*xi(1,g);
-
-    Nx(0,6,g)  =  4.0*xi(2,g);
-    Nx(1,6,g)  =  0.0;
-    Nx(2,6,g)  =  4.0*xi(0,g);
-
-    Nx(0,7,g)  =  4.0*( s - xi(0,g));
-    Nx(1,7,g)  = -4.0*xi(0,g);
-    Nx(2,7,g)  = -4.0*xi(0,g);
-
-    Nx(0,8,g)  = -4.0*xi(1,g);
-    Nx(1,8,g)  =  4.0*( s - xi(1,g));
-    Nx(2,8,g)  = -4.0*xi(1,g);
-
-    Nx(0,9,g) = -4.0*xi(2,g);
-    Nx(1,9,g) = -4.0*xi(2,g);
-    Nx(2,9,g) =  4.0*( s - xi(2,g));
-    }
-  },
-
-  {ElementType::TRI3, [](int g, mshType& mesh) -> void { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    N(0,g) = xi(0,g);
-    N(1,g) = xi(1,g);
-    N(2,g) = 1.0 - xi(0,g) - xi(1,g);
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  1.0;
-    Nxi(1,0,g) =  0.0;
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  1.0;
-    Nxi(0,2,g) = -1.0;
-    Nxi(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](int g, mshType& mesh) -> void {
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g)*( 2.0*xi(0,g) - 1.0 );
-    N(1,g) = xi(1,g)*( 2.0*xi(1,g) - 1.0 );
-    N(2,g) = s    *( 2.0*s     - 1.0 );
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nxi(1,0,g) =  0.0;
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  4.0*xi(1,g) - 1.0;
-    Nxi(0,2,g) =  1.0 - 4.0*s;
-    Nxi(1,2,g) =  1.0 - 4.0*s;
-    Nxi(0,3,g) =  4.0*xi(1,g);
-    Nxi(1,3,g) =  4.0*xi(0,g);
-    Nxi(0,4,g) = -4.0*xi(1,g);
-    Nxi(1,4,g) =  4.0*( s - xi(1,g) );
-    Nxi(0,5,g) =  4.0*( s - xi(0,g) );
-    Nxi(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-  {ElementType::WDG, [](int g, mshType& mesh) -> void 
-    { 
-    auto& xi = mesh.xi;
-    auto& N = mesh.N;
-    double ux = xi(0,g);
-    double uy = xi(1,g);
-    double uz = 1.0 - ux - uy;
-    double s = (1.0 + xi(2,g))*0.5;
-    double t = (1.0 - xi(2,g))*0.5;
-    N(0,g) = ux*t;
-    N(1,g) = uy*t;
-    N(2,g) = uz*t;
-    N(3,g) = ux*s;
-    N(4,g) = uy*s;
-    N(5,g) = uz*s;
-
-    auto& Nxi = mesh.Nx;
-    Nxi(0,0,g) =  t;
-    Nxi(1,0,g) =  0.0;
-    Nxi(2,0,g) = -ux*0.50;
-
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  t;
-    Nxi(2,1,g) = -uy*0.50;
-
-    Nxi(0,2,g) = -t;
-    Nxi(1,2,g) = -t;
-    Nxi(2,2,g) = -uz*0.50;
-
-    Nxi(0,3,g) =  s;
-    Nxi(1,3,g) =  0.0;
-    Nxi(2,3,g) =  ux*0.50;
-
-    Nxi(0,4,g) =  0.0;
-    Nxi(1,4,g) =  s;
-    Nxi(2,4,g) =  uy*0.50;
-
-    Nxi(0,5,g) = -s;
-    Nxi(1,5,g) = -s;
-    Nxi(2,5,g) =  uz*0.50;
-    }
-  },
-
-};
-
-//---------------------
-// set_face_shape_data
-//---------------------
-// Define a map type used to face element shape function data.
-//
-// This reproduces 'SUBROUTINE GETGNN(insd, eType, eNoN, xi, N, Nxi)' in NN.f.
-//
-using SetFaceShapeMapType = std::map<ElementType, std::function<void(int, faceType&)>>;
-
-SetFaceShapeMapType set_face_shape_data = {
-
-  {ElementType::PNT, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = 1.0;
-    }
-  },
-
-  {ElementType::QUD8, [](int g, faceType& face) -> void 
-    {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = lx*ux;
-    double my = ly*uy;
-
-    auto& N = face.N;
-    N(0,g) = lx*ly*(lx+ly-3.0)/4.0;
-    N(1,g) = ux*ly*(ux+ly-3.0)/4.0;
-    N(2,g) = ux*uy*(ux+uy-3.0)/4.0;
-    N(3,g) = lx*uy*(lx+uy-3.0)/4.0;
-    N(4,g) = mx*ly*0.50;
-    N(5,g) = ux*my*0.50;
-    N(6,g) = mx*uy*0.50;
-    N(7,g) = lx*my*0.50;
-
-    auto& Nxi = face.Nx;
-    Nxi(0,0,g) = -ly*(lx+ly-3.0+lx)/4.0;
-    Nxi(1,0,g) = -lx*(lx+ly-3.0+ly)/4.0;
-
-    Nxi(0,1,g) =  ly*(ux+ly-3.0+ux)/4.0;
-    Nxi(1,1,g) = -ux*(ux+ly-3.0+ly)/4.0;
-
-    Nxi(0,2,g) =  uy*(ux+uy-3.0+ux)/4.0;
-    Nxi(1,2,g) =  ux*(ux+uy-3.0+uy)/4.0;
-
-    Nxi(0,3,g) = -uy*(lx+uy-3.0+lx)/4.0;
-    Nxi(1,3,g) =  lx*(lx+uy-3.0+uy)/4.0;
-
-    Nxi(0,4,g) =  (lx - ux)*ly*0.50;
-    Nxi(1,4,g) = -mx*0.50;
-
-    Nxi(0,5,g) =  my*0.50;
-    Nxi(1,5,g) =  (ly - uy)*ux*0.50;
-
-    Nxi(0,6,g) =  (lx - ux)*uy*0.50;
-    Nxi(1,6,g) =  mx*0.50;
-
-    Nxi(0,7,g) = -my*0.50;
-    Nxi(1,7,g) =  (ly - uy)*lx*0.50;
-    }
-  },
-
-  {ElementType::QUD9, [](int g, faceType& face) -> void 
-    {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    auto& N = face.N;
-    N(0,g) =  mx*lx*my*ly/4.0;
-    N(1,g) = -mx*ux*my*ly/4.0;
-    N(2,g) =  mx*ux*my*uy/4.0;
-    N(3,g) = -mx*lx*my*uy/4.0;
-    N(4,g) = -lx*ux*my*ly*0.50;
-    N(5,g) =  mx*ux*ly*uy*0.50;
-    N(6,g) =  lx*ux*my*uy*0.50;
-    N(7,g) = -mx*lx*ly*uy*0.50;
-    N(8,g) =  lx*ux*ly*uy;
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) =  (lx - mx)*my*ly/4.0;
-    Nx(1,0,g) =  (ly - my)*mx*lx/4.0;
-    Nx(0,1,g) = -(ux + mx)*my*ly/4.0;
-    Nx(1,1,g) = -(ly - my)*mx*ux/4.0;
-    Nx(0,2,g) =  (ux + mx)*my*uy/4.0;
-    Nx(1,2,g) =  (uy + my)*mx*ux/4.0;
-    Nx(0,3,g) = -(lx - mx)*my*uy/4.0;
-    Nx(1,3,g) = -(uy + my)*mx*lx/4.0;
-    Nx(0,4,g) = -(lx - ux)*my*ly*0.50;
-    Nx(1,4,g) = -(ly - my)*lx*ux*0.50;
-    Nx(0,5,g) =  (ux + mx)*ly*uy*0.50;
-    Nx(1,5,g) =  (ly - uy)*mx*ux*0.50;
-    Nx(0,6,g) =  (lx - ux)*my*uy*0.50;
-    Nx(1,6,g) =  (uy + my)*lx*ux*0.50;
-    Nx(0,7,g) = -(lx - mx)*ly*uy*0.50;
-    Nx(1,7,g) = -(ly - uy)*mx*lx*0.50;
-    Nx(0,8,g) =  (lx - ux)*ly*uy;
-    Nx(1,8,g) =  (ly - uy)*lx*ux;
-    }
-  },
-
-  {ElementType::LIN1, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = 0.5 * (1.0 - face.xi(0,g));
-    face.N(1,g) = 0.5 * (1.0 + face.xi(0,g));
-
-    face.Nx(0,0,g) = -0.5;
-    face.Nx(0,1,g) =  0.5;
-    }
-  },
-
-  {ElementType::LIN2, [](int g, faceType& face) -> void
-    {
-    auto& xi = face.xi;
-    auto& N = face.N;
-    N(0,g) = -xi(0,g)*(1.0 - xi(0,g))*0.50;
-    N(1,g) =  xi(0,g)*(1.0 + xi(0,g))*0.50;
-    N(2,g) = (1.0 - xi(0,g))*(1.0 + xi(0,g));
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) = -0.50 + xi(0,g);
-    Nx(0,1,g) =  0.50 + xi(0,g);
-    Nx(0,2,g) = -2.0*xi(0,g);
-    }
-  },
-
-  {ElementType::QUD4, [](int g, faceType& face) -> void {
-    auto& xi = face.xi;
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-
-    auto& N =face.N;
-    N(0,g) = lx*ly / 4.0;
-    N(1,g) = ux*ly / 4.0;
-    N(2,g) = ux*uy / 4.0;
-    N(3,g) = lx*uy / 4.0;
-
-    auto& Nx = face.Nx;
-    Nx(0,0,g) = -ly / 4.0;
-    Nx(1,0,g) = -lx / 4.0;
-    Nx(0,1,g) =  ly / 4.0;
-    Nx(1,1,g) = -ux / 4.0;
-    Nx(0,2,g) =  uy / 4.0;
-    Nx(1,2,g) =  ux / 4.0;
-    Nx(0,3,g) = -uy / 4.0;
-    Nx(1,3,g) =  lx / 4.0;
-    }
-  },
-
-  {ElementType::TRI3, [](int g, faceType& face) -> void 
-    {
-    face.N(0,g) = face.xi(0,g);
-    face.N(1,g) = face.xi(1,g);
-    face.N(2,g) = 1.0 - face.xi(0,g) - face.xi(1,g);
-
-    face.Nx(0,0,g) = 1.0;
-    face.Nx(1,0,g) = 0.0;
-
-    face.Nx(0,1,g) = 0.0;
-    face.Nx(1,1,g) = 1.0;
-
-    face.Nx(0,2,g) = -1.0;
-    face.Nx(1,2,g) = -1.0;
-    }
-  },
-
-  {ElementType::TRI6, [](int g, faceType& face) -> void
-    {
-    auto& xi = face.xi;
-    auto& N = face.N;
-
-    double s = 1.0 - xi(0,g) - xi(1,g);
-    N(0,g) = xi(0,g)*( 2.0*xi(0,g) - 1.0 );
-    N(1,g) = xi(1,g)*( 2.0*xi(1,g) - 1.0 );
-    N(2,g) = s    *( 2.0*s     - 1.0 );
-    N(3,g) = 4.0*xi(0,g)*xi(1,g);
-    N(4,g) = 4.0*xi(1,g)*s;
-    N(5,g) = 4.0*xi(0,g)*s;
-
-    auto& Nxi = face.Nx;
-    Nxi(0,0,g) =  4.0*xi(0,g) - 1.0;
-    Nxi(1,0,g) =  0.0;
-
-    Nxi(0,1,g) =  0.0;
-    Nxi(1,1,g) =  4.0*xi(1,g) - 1.0;
-
-    Nxi(0,2,g) =  1.0 - 4.0*s;
-    Nxi(1,2,g) =  1.0 - 4.0*s;
-
-    Nxi(0,3,g) =  4.0*xi(1,g);
-    Nxi(1,3,g) =  4.0*xi(0,g);
-
-    Nxi(0,4,g) = -4.0*xi(1,g);
-    Nxi(1,4,g) =  4.0*( s - xi(1,g) );
-
-    Nxi(0,5,g) =  4.0*( s - xi(0,g) );
-    Nxi(1,5,g) = -4.0*xi(0,g);
-    }
-  },
-
-
-};
diff --git a/Code/Source/solver/nn_elem_gnnxx.h b/Code/Source/solver/nn_elem_gnnxx.h
deleted file mode 100644
index 7b40a783b..000000000
--- a/Code/Source/solver/nn_elem_gnnxx.h
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-/// @brief Define a map type used to compute 2nd direivatives of element shape function data.
-///
-/// Replicates 'SUBROUTINE GETGNNxx(insd, ind2, eType, eNoN, xi, Nxx)'
-//
-static double fp = 4.0;
-static double fn = -4.0;
-static double en = -8.0;
-static double ze =  0.0;
-
-using GetElement2ndDerivMapType = std::map<ElementType, std::function<void(const int, const int, const int, 
-    const int, const Array<double>&, Array3<double>&)>>;
-
-GetElement2ndDerivMapType get_element_2nd_derivs = {
-
-  {ElementType::QUD8, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void {
-
-    double lx = 1.0 - xi(0);
-    double ly = 1.0 - xi(1);
-    double ux = 1.0 + xi(0);
-    double uy = 1.0 + xi(1);
-    double mx = xi(0);
-    double my = xi(1);
-
-    Nxx(0,0,g) =  ly*0.50;
-    Nxx(1,0,g) =  lx*0.50;
-    Nxx(2,0,g) =  (lx+lx+ly+ly-3.0)/4.0;
-
-    Nxx(0,1,g) =  ly*0.50;
-    Nxx(1,1,g) =  ux*0.50;
-    Nxx(2,1,g) = -(ux+ux+ly+ly-3.0)/4.0;
-
-    Nxx(0,2,g) =  uy*0.50;
-    Nxx(1,2,g) =  ux*0.50;
-    Nxx(2,3,g) =  (ux+ux+uy+uy-3.0)/4.0;
-
-    Nxx(0,3,g) =  uy*0.50;
-    Nxx(1,3,g) =  lx*0.50;
-    Nxx(2,3,g) = -(lx+lx+uy+uy-3.0)/4.0;
-
-    Nxx(0,4,g) = -ly;
-    Nxx(1,4,g) =  0.0;
-    Nxx(2,4,g) =  mx;
-
-    Nxx(0,5,g) =  0.0;
-    Nxx(1,5,g) = -ux;
-    Nxx(2,5,g) = -my;
-
-    Nxx(0,6,g) = -uy;
-    Nxx(1,6,g) =  0.0;
-    Nxx(2,6,g) = -mx;
-
-    Nxx(0,7,g) =  0.0;
-    Nxx(1,7,g) = -lx;
-    Nxx(2,7,g) =  my;
-    }
-  },
-
-  {ElementType::QUD9, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void { 
-
-    double lx = 1.0 - xi(0,g);
-    double ly = 1.0 - xi(1,g);
-    double ux = 1.0 + xi(0,g);
-    double uy = 1.0 + xi(1,g);
-    double mx = xi(0,g);
-    double my = xi(1,g);
-
-    Nxx(0,0,g) = -ly*my*0.5;
-    Nxx(1,0,g) = -lx*mx*0.5;
-    Nxx(2,0,g) =  (lx-mx)*(ly-my)/4.0;
-
-    Nxx(0,1,g) = -ly*my*0.5;
-    Nxx(1,1,g) =  ux*mx*0.5;
-    Nxx(2,1,g) = -(ux+mx)*(ly-my)/4.0;
-
-    Nxx(0,2,g) =  uy*my*0.5;
-    Nxx(1,2,g) =  ux*mx*0.5;
-    Nxx(2,2,g) =  (ux+mx)*(uy+my)/4.0;
-
-    Nxx(0,3,g) =  uy*my*0.5;
-    Nxx(1,3,g) = -lx*mx*0.5;
-    Nxx(2,3,g) = -(lx-mx)*(uy+my)/4.0;
-
-    Nxx(0,4,g) =  ly*my;
-    Nxx(1,4,g) =  lx*ux;
-    Nxx(2,4,g) =  mx*(ly-my);
-
-    Nxx(0,5,g) =  ly*uy;
-    Nxx(1,5,g) = -ux*mx;
-    Nxx(2,5,g) = -(ux+mx)*my;
-
-    Nxx(0,6,g) = -uy*my;
-    Nxx(1,6,g) =  lx*ux;
-    Nxx(2,6,g) = -mx*(uy+my);
-
-    Nxx(0,7,g) =  ly*uy;
-    Nxx(1,7,g) =  lx*mx;
-    Nxx(2,7,g) =  (lx-mx)*my;
-
-    Nxx(0,8,g) = -ly*uy*2.0;
-    Nxx(1,8,g) = -lx*ux*2.0;
-    Nxx(2,8,g) =  mx*my*4.0;
-    }
-  },
-
-  {ElementType::TET10, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi, 
-       Array3<double>& Nxx) -> void { 
-    Nxx.set_row(0, g, {fp, ze, ze, ze, ze, ze});
-    Nxx.set_row(1, g, {ze, fp, ze, ze, ze, ze});
-    Nxx.set_row(2, g, {ze, ze, fp, ze, ze, ze});
-    Nxx.set_row(3, g, {fp, fp, fp, fp, fp, fp});
-    Nxx.set_row(4, g, {ze, ze, ze, fp, ze, ze});
-    Nxx.set_row(5, g, {ze, ze, ze, ze, fp, ze});
-    Nxx.set_row(6, g, {ze, ze, ze, ze, ze, fp});
-    Nxx.set_row(7, g, {en, ze, ze, fn, ze, fn});
-    Nxx.set_row(8, g, {ze, en, ze, fn, fn, ze});
-    Nxx.set_row(9, g, {ze, ze, en, ze, fn, fn});
-    }
-  },
-
-  {ElementType::TRI6, [](const int insd, const int ind2, const int eNoN, const int g, const Array<double>& xi,
-       Array3<double>& Nxx) -> void {
-
-    Nxx.set_row(0, g, {fp, ze, ze});
-    Nxx.set_row(1, g, {ze, fp, ze});
-    Nxx.set_row(2, g, {fp, fp, fp});
-    Nxx.set_row(3, g, {ze, ze, fp});
-    Nxx.set_row(4, g, {ze, en, fn});
-    Nxx.set_row(5, g, {en, ze, fn});
-    }
-  },
-
-};
-
-

From 36046f8a2f0baa006dcd2fa896bd66498c7032b0 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 8 Jun 2026 14:25:44 -0700
Subject: [PATCH 10/22] fixing the licensing and copyright comments

---
 Code/Source/solver/FE/Basis/BasisExceptions.h |  8 ++---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |  8 ++---
 Code/Source/solver/FE/Basis/BasisFactory.h    |  8 ++---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  8 ++---
 Code/Source/solver/FE/Basis/BasisFunction.h   |  8 ++---
 Code/Source/solver/FE/Basis/BasisTraits.h     |  8 ++---
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  8 ++---
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  8 ++---
 .../FE/Basis/NodeOrderingConventions.cpp      |  8 ++---
 .../solver/FE/Basis/NodeOrderingConventions.h |  8 ++---
 .../solver/FE/Basis/SerendipityBasis.cpp      |  8 ++---
 .../Source/solver/FE/Basis/SerendipityBasis.h |  8 ++---
 Code/Source/solver/FE/Common/Types.h          | 31 ++-----------------
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  8 ++---
 .../solver/FE/Math/DenseLinearAlgebra.h       |  8 ++---
 .../solver/FE/Math/DenseTransformKernels.h    |  8 ++---
 Code/Source/solver/FE/Math/Matrix.h           |  3 ++
 Code/Source/solver/FE/Math/MatrixExpr.h       |  3 ++
 Code/Source/solver/FE/Math/Vector.h           |  3 ++
 Code/Source/solver/FE/Math/VectorExpr.h       |  3 ++
 20 files changed, 44 insertions(+), 119 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index 8ee92a3dd..c1af17049 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISEXCEPTIONS_H
 #define SVMP_FE_BASIS_BASISEXCEPTIONS_H
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index 9f0867959..bc01be0ed 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "BasisFactory.h"
 
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index c937dd4a0..b188b3aa2 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISFACTORY_H
 #define SVMP_FE_BASIS_BASISFACTORY_H
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 578c46c88..3d95671f4 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "BasisFunction.h"
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index bf6ac5de7..5ad65f35d 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISFUNCTION_H
 #define SVMP_FE_BASIS_BASISFUNCTION_H
diff --git a/Code/Source/solver/FE/Basis/BasisTraits.h b/Code/Source/solver/FE/Basis/BasisTraits.h
index d97b59f1f..eca5c1c69 100644
--- a/Code/Source/solver/FE/Basis/BasisTraits.h
+++ b/Code/Source/solver/FE/Basis/BasisTraits.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_BASISTRAITS_H
 #define SVMP_FE_BASIS_BASISTRAITS_H
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 372209722..ece2d9cb5 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index dae149872..43304a263 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_LAGRANGEBASIS_H
 #define SVMP_FE_BASIS_LAGRANGEBASIS_H
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index ae3ea8ed3..76662abe1 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "NodeOrderingConventions.h"
 #include "BasisExceptions.h"
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
index 8a43cc4e3..4b11cca32 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
 #define SVMP_FE_BASIS_NODEORDERINGCONVENTIONS_H
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 237f8c2ce..d551419a8 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "SerendipityBasis.h"
 #include "LagrangeBasis.h"
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 10e426164..e0289f82d 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_BASIS_SERENDIPITYBASIS_H
 #define SVMP_FE_BASIS_SERENDIPITYBASIS_H
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index bb3f23bca..e3d5a46e9 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -1,32 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See Copyright-SimVascular.txt for additional details.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
- * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_TYPES_H
 #define SVMP_FE_TYPES_H
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 7d909fa0c..8be9a7560 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "DenseLinearAlgebra.h"
 
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 7684439b5..6c81755f4 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_MATH_DENSELINEARALGEBRA_H
 #define SVMP_FE_MATH_DENSELINEARALGEBRA_H
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 8bf83ec0b..50f1002de 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -1,9 +1,5 @@
-/* Copyright (c) Stanford University, The Regents of the University of California, and others.
- *
- * All Rights Reserved.
- *
- * See License file.
- */
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 #define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 8cb28e5d5..3f3a9d9b6 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_MATRIX_H
 #define SVMP_FE_MATH_MATRIX_H
 
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
index 13010bddf..288bbc5ca 100644
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ b/Code/Source/solver/FE/Math/MatrixExpr.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_MATRIX_EXPR_H
 #define SVMP_FE_MATH_MATRIX_EXPR_H
 
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 777f9945b..a1214f9aa 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_VECTOR_H
 #define SVMP_FE_MATH_VECTOR_H
 
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
index 178b66b8a..aa712dd63 100644
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ b/Code/Source/solver/FE/Math/VectorExpr.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
 #ifndef SVMP_FE_MATH_VECTOR_EXPR_H
 #define SVMP_FE_MATH_VECTOR_EXPR_H
 

From 3691503eed8da410083633ed831ec7c350bd433f Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 10:00:33 -0700
Subject: [PATCH 11/22] including doxygen documentation for Basis and Math
 submodules

---
 .github/workflows/documentation.yml           |   2 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  86 ++++++++++
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |  19 ++-
 Code/Source/solver/FE/Basis/LagrangeBasis.h   | 158 ++++++++++++++++++
 .../Source/solver/FE/Basis/SerendipityBasis.h | 116 +++++++++++++
 Code/Source/solver/FE/Math/Matrix.h           |  27 ++-
 Code/Source/solver/FE/Math/Vector.h           |  14 ++
 Documentation/Doxyfile                        |   8 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |   2 +-
 9 files changed, 423 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index db67bbbdb..c1f8a3b5d 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -10,7 +10,7 @@ jobs:
       - uses: actions/checkout@v4
       - name: Build doxygen documentation
         run: |
-          sudo apt install -y doxygen
+          sudo apt install -y doxygen graphviz
           doxygen Documentation/Doxyfile
       - name: Save documentation
         uses: actions/upload-artifact@v4
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 5ad65f35d..f8f78d7b6 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -12,11 +12,25 @@
 #include <cstddef>
 #include <vector>
 
+/// \defgroup FE FE Library
+/// \brief Finite-element interfaces and utilities used by the solver.
+///
+/// The FE library groups basis functions, math utilities, assembly interfaces,
+/// and related support code that can be built and consumed as a coherent
+/// finite-element component.
+
+/// \defgroup FE_Basis Basis
+/// \ingroup FE
+/// \brief Basis-function interfaces, concrete basis families, and reference-node conventions.
+
 namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \brief Gradient vector type used by basis evaluators.
 using Gradient = math::Vector<Real, 3>;
+
+/// \brief Hessian matrix type used by basis evaluators.
 using Hessian  = math::Matrix<Real, 3, 3>;
 
 [[nodiscard]] inline Hessian make_symmetric_hessian(Real xx,
@@ -71,38 +85,110 @@ inline void add_scaled_hessian(Hessian& target,
     }
 }
 
+/// \brief Abstract interface for finite-element basis-function families.
+/// \ingroup FE_Basis
+///
+/// BasisFunction defines the common query and evaluation API used by solver
+/// code that does not need to know the concrete basis implementation. Derived
+/// classes provide values at minimum and can override analytical gradients,
+/// Hessians, combined evaluation, and flat-buffer output paths.
 class BasisFunction {
 public:
+    /// \brief Destroy a basis function through the abstract interface.
     virtual ~BasisFunction() = default;
 
+    /// \brief Return the concrete basis family.
+    /// \return Basis family identifier.
     virtual BasisType basis_type() const noexcept = 0;
+
+    /// \brief Return the canonical element type represented by this basis.
+    /// \return Element type used for node layout and evaluation.
     virtual ElementType element_type() const noexcept = 0;
+
+    /// \brief Return the reference-space dimension of the basis.
+    /// \return Reference dimension, from zero for points through three for volume elements.
     virtual int dimension() const noexcept = 0;
+
+    /// \brief Return the polynomial order represented by this basis.
+    /// \return Effective polynomial order after any element-family normalization.
     virtual int order() const noexcept = 0;
+
+    /// \brief Return the number of basis functions and reference nodes.
+    /// \return Basis function count.
     virtual std::size_t size() const noexcept = 0;
 
+    /// \brief Evaluate basis function values at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     virtual void evaluate_values(const math::Vector<Real, 3>& xi,
                                  std::vector<Real>& values) const = 0;
+
+    /// \brief Evaluate basis gradients at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \throws BasisEvaluationException If gradients are not available for the basis.
     virtual void evaluate_gradients(const math::Vector<Real, 3>& xi,
                                     std::vector<Gradient>& gradients) const;
+
+    /// \brief Evaluate basis Hessians at a reference coordinate.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    /// \throws BasisEvaluationException If Hessians are not available for the basis.
     virtual void evaluate_hessians(const math::Vector<Real, 3>& xi,
                                    std::vector<Hessian>& hessians) const;
+
+    /// \brief Evaluate values, gradients, and Hessians together.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     virtual void evaluate_all(const math::Vector<Real, 3>& xi,
                               std::vector<Real>& values,
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
+    /// \brief Evaluate basis values into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
                                     Real* SVMP_RESTRICT values_out) const;
+
+    /// \brief Evaluate basis gradients into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                        Real* SVMP_RESTRICT gradients_out) const;
+
+    /// \brief Evaluate basis Hessians into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                                       Real* SVMP_RESTRICT hessians_out) const;
 
 protected:
+    /// \brief Approximate gradients by centered finite differences of values.
+    ///
+    /// \details This helper exists as a development and fallback utility for
+    /// basis implementations that do not yet provide analytical gradients. It
+    /// is useful for prototyping new basis families and for checking analytical
+    /// derivative formulas in tests. Production element assembly should prefer
+    /// analytical gradients when available because finite differences introduce
+    /// truncation/roundoff sensitivity and require multiple value evaluations
+    /// per reference coordinate.
     void numerical_gradient(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients,
                             Real eps = Real(1e-6)) const;
+
+    /// \brief Approximate Hessians by centered finite differences of gradients.
+    ///
+    /// \details This helper exists for the same reason as numerical_gradient:
+    /// it provides a simple reference implementation for prototyping and
+    /// derivative verification when analytical second derivatives are not yet
+    /// implemented. It depends on evaluate_gradients(), so it is only available
+    /// for basis implementations that can already provide gradients. Analytical
+    /// Hessians should be used in performance-sensitive solver paths because
+    /// finite-difference Hessians amplify numerical error and require repeated
+    /// gradient evaluations.
     void numerical_hessian(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians,
                            Real eps = Real(1e-5)) const;
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index ece2d9cb5..d777447cb 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -16,6 +16,7 @@ namespace {
 
 using Vec3 = math::Vector<Real, 3>;
 
+// Return the equispaced 1D reference coordinate in [-1, 1].
 inline constexpr Real equispaced_pm_one_coord(int i, int order) {
     if (order <= 0) {
         return Real(0);
@@ -40,6 +41,7 @@ struct NormalizedLagrangeRequest {
     int order;
 };
 
+// Validate and return the supported basis topology for a Lagrange element type.
 BasisTopology supported_lagrange_topology(ElementType type) {
     const BasisTopology top = topology(type);
     if (top == BasisTopology::Unknown) {
@@ -49,6 +51,7 @@ BasisTopology supported_lagrange_topology(ElementType type) {
     return top;
 }
 
+// Normalize named higher-order element requests to base Lagrange topologies.
 NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, int order) {
     switch (element_type) {
         case ElementType::Line3:
@@ -79,13 +82,14 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
             throw BasisElementCompatibilityException(
-                "LagrangeBasis: pyramid support has been removed from the current solver basis scope",
+                "LagrangeBasis: pyramid support is not within the current solver basis scope",
                 __FILE__, __LINE__, __func__);
         default:
             return {element_type, order};
     }
 }
 
+// Convert a coordinate on [-1, 1] to an equispaced axis node index.
 std::size_t axis_index_pm_one(Real coord, int order) {
     if (order <= 0) {
         return 0u;
@@ -94,6 +98,7 @@ std::size_t axis_index_pm_one(Real coord, int order) {
     return static_cast<std::size_t>(std::llround(scaled));
 }
 
+// Convert a simplex barycentric coordinate to a lattice index.
 int simplex_lattice_index(Real value, int order) {
     if (order <= 0) {
         return 0;
@@ -101,6 +106,7 @@ int simplex_lattice_index(Real value, int order) {
     return static_cast<int>(std::llround(value * Real(order)));
 }
 
+// Compute simplex interpolation exponents from a reference node.
 LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
                                                            BasisTopology top,
                                                            int order) {
@@ -121,6 +127,7 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
     return e;
 }
 
+// Evaluate 1D Lagrange polynomials and derivatives at a point.
 void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
     const std::size_t n = nodes.size();
     out.value.assign(n, Real(0));
@@ -185,6 +192,7 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
     }
 }
 
+// Evaluate one barycentric polynomial factor and derivatives.
 std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
     Real value = Real(1);
     Real first = Real(0);
@@ -204,6 +212,7 @@ std::array<Real, 3> simplex_factor(int alpha, Real lambda, int order) {
     return {value, first, second};
 }
 
+// Evaluate simplex Lagrange basis functions and derivatives.
 void evaluate_simplex(const Vec3& xi,
                       BasisTopology top,
                       int order,
@@ -291,6 +300,7 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
+// Store a gradient in the flat buffer layout used by fast evaluators.
 void store_gradient(const Gradient& gradient, Real* dst) {
     dst[0] = gradient[0];
     dst[1] = gradient[1];
@@ -314,6 +324,7 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     init_nodes();
 }
 
+// Initialize equispaced 1D interpolation nodes for tensor-product axes.
 void LagrangeBasis::init_equispaced_1d_nodes() {
     nodes_1d_.resize(static_cast<std::size_t>(order_ + 1));
     for (int i = 0; i <= order_; ++i) {
@@ -322,6 +333,7 @@ void LagrangeBasis::init_equispaced_1d_nodes() {
     }
 }
 
+// Initialize reference nodes and topology-specific lookup data.
 void LagrangeBasis::init_nodes() {
     nodes_.clear();
     nodes_1d_.clear();
@@ -357,10 +369,12 @@ void LagrangeBasis::init_nodes() {
                                              __FILE__, __LINE__, __func__);
 }
 
+// Build the single reference node for a point basis.
 void LagrangeBasis::build_point_nodes() {
     nodes_.push_back(Vec3{Real(0), Real(0), Real(0)});
 }
 
+// Build nodes and axis indices for tensor-product elements.
 void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
@@ -378,6 +392,7 @@ void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
     }
 }
 
+// Build nodes and barycentric exponents for simplex elements.
 void LagrangeBasis::build_simplex_nodes() {
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
     simplex_exponents_.reserve(nodes_.size());
@@ -386,6 +401,7 @@ void LagrangeBasis::build_simplex_nodes() {
     }
 }
 
+// Build nodes and mixed triangle-axis lookup data for wedge elements.
 void LagrangeBasis::build_wedge_nodes() {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
@@ -412,6 +428,7 @@ void LagrangeBasis::build_wedge_nodes() {
     }
 }
 
+// Evaluate requested basis quantities into caller-provided flat buffers.
 void LagrangeBasis::evaluate_all_to(const Vec3& xi,
                                     Real* SVMP_RESTRICT values_out,
                                     Real* SVMP_RESTRICT gradients_out,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 43304a263..3bb1a5e74 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -14,37 +14,193 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \defgroup FE_LagrangeBasis LagrangeBasis
+/// \ingroup FE_Basis
+/// \brief Construction and evaluation API for nodal Lagrange finite-element bases.
+///
+/// \details This group documents the complete nodal Lagrange basis evaluator
+/// used by the FE library. The implementation covers tensor-product,
+/// simplex, and wedge reference topologies with exact analytical first and
+/// second derivatives in reference coordinates.
+/// @{
+
+/// \brief Nodal Lagrange basis on supported reference finite elements.
+///
+/// \details LagrangeBasis represents the nodal interpolation basis associated
+/// with an equispaced reference-node lattice. It supports point, line,
+/// quadrilateral, hexahedron, triangle, tetrahedron, and wedge reference
+/// elements. Named complete quadratic elements such as Line3, Triangle6,
+/// Quad9, Tetra10, Hex27, and Wedge18 are normalized to their canonical
+/// linear topology plus effective order 2.
+///
+/// Tensor-product elements use the one-dimensional nodal polynomials
+/// \f[
+///   l_i(x) = \prod_{j \ne i} \frac{x - x_j}{x_i - x_j}
+/// \f]
+/// on equispaced coordinates in \f$[-1, 1]\f$. Multi-dimensional basis
+/// functions are products of the active axis polynomials, for example
+/// \f$N_{ijk}(r,s,t) = l_i(r)l_j(s)l_k(t)\f$ on a hexahedron.
+///
+/// Simplex elements use barycentric coordinates and integer lattice
+/// exponents. For a node with exponent tuple \f$\alpha\f$, where
+/// \f$\sum_a \alpha_a = p\f$, the basis is assembled from scaled
+/// falling-factorial factors,
+/// \f[
+///   N_\alpha(\lambda) =
+///   \prod_a \prod_{m=0}^{\alpha_a-1}
+///   \frac{p\lambda_a - m}{m + 1}.
+/// \f]
+/// Gradients and Hessians are evaluated analytically by differentiating these
+/// factors and applying the barycentric-coordinate chain rule.
+///
+/// Wedge elements are treated as a tensor product between a triangle simplex
+/// basis and a one-dimensional through-axis basis:
+/// \f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)\f$.
+///
+/// The vector-returning evaluators are convenient API wrappers. The `*_to`
+/// methods write to caller-provided flat buffers and are intended for assembly
+/// paths that avoid temporary allocations.
 class LagrangeBasis : public BasisFunction {
 public:
+    /// \brief Axis-index tuple for tensor-product reference nodes.
     using TensorNodeIndex = std::array<std::size_t, 3>;
+
+    /// \brief Barycentric exponent tuple for simplex reference nodes.
     using SimplexExponent = std::array<int, 4>;
+
+    /// \brief Triangle-node and axis-node tuple for wedge reference nodes.
     using WedgeNodeIndex = std::array<std::size_t, 2>;
 
+    /// \brief Construct a Lagrange basis for an element type and polynomial order.
+    ///
+    /// \details The constructor normalizes complete higher-order aliases to the
+    /// canonical topology and effective polynomial order, builds the reference
+    /// node coordinates, and precomputes topology-specific lookup data used by
+    /// evaluation. Tensor-product bases store per-axis node indices, simplex
+    /// bases store barycentric exponent tuples, and wedge bases store the
+    /// triangle-node/axis-node decomposition.
+    ///
+    /// \param type Element type used to determine topology and reference-node layout.
+    /// \param order Requested polynomial order.
+    /// \throws BasisConfigurationException If the effective order is negative.
+    /// \throws BasisElementCompatibilityException If the element type is unsupported.
     LagrangeBasis(ElementType type, int order);
 
+    /// \copydoc BasisFunction::basis_type()
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
+
+    /// \copydoc BasisFunction::element_type()
     ElementType element_type() const noexcept override { return element_type_; }
+
+    /// \copydoc BasisFunction::dimension()
     int dimension() const noexcept override { return dimension_; }
+
+    /// \copydoc BasisFunction::order()
     int order() const noexcept override { return order_; }
+
+    /// \copydoc BasisFunction::size()
     std::size_t size() const noexcept override { return nodes_.size(); }
 
+    /// \brief Return the reference interpolation nodes in basis ordering.
+    ///
+    /// \details The returned node order matches the basis-function order used
+    /// by all evaluators. Coordinates are reference-element coordinates:
+    /// tensor-product axes use \f$[-1,1]\f$, triangles and tetrahedra use the
+    /// repository's simplex reference coordinates, and wedges combine triangle
+    /// reference coordinates with a \f$[-1,1]\f$ through-axis coordinate.
+    ///
+    /// \return Reference node coordinates, one per basis function.
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
+    /// \brief Evaluate Lagrange basis function values at a reference coordinate.
+    ///
+    /// \details Values satisfy the nodal interpolation property
+    /// \f$N_i(x_j)=\delta_{ij}\f$ at the basis nodes. Tensor-product values are
+    /// products of one-dimensional Lagrange polynomials. Simplex values are
+    /// products of barycentric falling-factorial factors. Wedge values are
+    /// products of triangle simplex values and through-axis Lagrange values.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const final;
+
+    /// \brief Evaluate analytical Lagrange basis gradients at a reference coordinate.
+    ///
+    /// \details Gradients are derivatives with respect to reference
+    /// coordinates, not physical coordinates. Tensor-product gradients apply
+    /// the product rule to the active axis polynomials. Simplex gradients
+    /// differentiate the barycentric factors and multiply by the constant
+    /// gradients of the barycentric coordinates. Wedge gradients combine the
+    /// triangle gradient in the first two components with the through-axis
+    /// derivative in the third component.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients) const final;
+
+    /// \brief Evaluate analytical Lagrange basis Hessians at a reference coordinate.
+    ///
+    /// \details Hessians are second derivatives in reference coordinates and
+    /// are stored as 3-by-3 matrices. Tensor-product Hessians contain pure
+    /// second axis derivatives on the diagonal and mixed product-rule terms
+    /// off diagonal. Simplex Hessians are assembled from first and second
+    /// derivatives of the barycentric factors. Wedge Hessians contain triangle
+    /// Hessian terms, through-axis second derivatives, and mixed
+    /// triangle/through-axis derivative products.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate Lagrange values, gradients, and Hessians together.
+    ///
+    /// \details This is the allocation-friendly vector API for callers that
+    /// need all basis quantities at the same quadrature point. The underlying
+    /// evaluator computes the topology-local polynomial data only once and
+    /// then fills all requested outputs.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_all(const math::Vector<Real, 3>& xi,
                       std::vector<Real>& values,
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
+    /// \brief Evaluate Lagrange basis values into a flat caller-provided buffer.
+    ///
+    /// \details This is the low-allocation API intended for element assembly
+    /// loops. The buffer is filled in basis-node order and no vector resizing
+    /// is performed.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
                             Real* SVMP_RESTRICT values_out) const final;
+
+    /// \brief Evaluate Lagrange basis gradients into a flat caller-provided buffer.
+    ///
+    /// \details Gradients are written in node-major order with three
+    /// reference-coordinate components per node. For node \f$i\f$ and component
+    /// \f$c\f$, the entry is `gradients_out[i * 3 + c]`.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
                                Real* SVMP_RESTRICT gradients_out) const final;
+
+    /// \brief Evaluate Lagrange basis Hessians into a flat caller-provided buffer.
+    ///
+    /// \details Hessians are written in node-major row-major order. For node
+    /// \f$i\f$ and Hessian component \f$(r,c)\f$, the entry is
+    /// `hessians_out[i * 9 + r * 3 + c]`.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
                               Real* SVMP_RESTRICT hessians_out) const final;
 
@@ -73,6 +229,8 @@ class LagrangeBasis : public BasisFunction {
                          Real* SVMP_RESTRICT hessians_out) const;
 };
 
+/// @}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index e0289f82d..fc0b897cf 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -17,23 +17,137 @@ namespace svmp {
 namespace FE {
 namespace basis {
 
+/// \defgroup FE_SerendipityBasis SerendipityBasis
+/// \ingroup FE_Basis
+/// \brief Construction and evaluation API for reduced serendipity finite-element bases.
+///
+/// \details This group documents reduced degree-of-freedom basis families that
+/// preserve nodal interpolation on supported element boundaries while omitting
+/// selected interior tensor-product modes. These bases are used for standard
+/// serendipity elements and geometry-mode mappings that intentionally use a
+/// lower-order interpolation space.
+/// @{
+
+/// \brief Reduced-degree-of-freedom serendipity basis on supported reference elements.
+///
+/// \details SerendipityBasis implements nodal bases for Quad4/Quad8,
+/// Hex8/Hex20, and Wedge15. Compared with a complete tensor-product Lagrange
+/// basis of the same nominal order, a serendipity basis removes selected
+/// interior modes while retaining nodal interpolation on the supported node
+/// layout.
+///
+/// Quadrilateral serendipity bases are built from monomials
+/// \f$x^{a_x}y^{a_y}\f$ whose superlinear degree is at most the requested
+/// order. In this implementation the superlinear degree is
+/// \f[
+///   \mathrm{sldeg}(x^{a_x}y^{a_y}) =
+///   \begin{cases} a_x, & a_x > 1 \\ 0, & a_x \le 1 \end{cases}
+///   +
+///   \begin{cases} a_y, & a_y > 1 \\ 0, & a_y \le 1 \end{cases}.
+/// \f]
+/// The nodal basis is recovered by inverting the Vandermonde interpolation
+/// matrix at the selected reference nodes. Values, gradients, and Hessians are
+/// then evaluated by differentiating the monomial vector and applying the
+/// inverse Vandermonde coefficients.
+///
+/// Hex8 uses the standard trilinear corner basis
+/// \f$(1 \pm r)(1 \pm s)(1 \pm t)/8\f$. Hex20 and Wedge15 use tabulated
+/// polynomial coefficients over monomial bases; analytical gradients and
+/// Hessians are obtained by differentiating those monomials. Hex20 evaluation
+/// is reordered through ReferenceNodeLayout so the output matches the public
+/// basis ordering.
+///
+/// When `geometry_mode` is enabled for Hex20, the basis uses the trilinear
+/// Hex8 corner functions for geometry mapping and assigns zero contribution to
+/// the quadratic edge nodes. This preserves the public Hex20 node count while
+/// intentionally reducing the geometry interpolation order.
 class SerendipityBasis : public BasisFunction {
 public:
+    /// \brief Construct a serendipity basis for an element type and polynomial order.
+    ///
+    /// \details The constructor selects the topology-specific interpolation
+    /// space, computes the reference node coordinates, and initializes any
+    /// coefficient tables needed for evaluation. Quadrilateral bases build and
+    /// invert a Vandermonde matrix for the selected serendipity monomials.
+    /// Hex20 and Wedge15 use fixed coefficient tables. For hexahedra, only
+    /// linear Hex8 and quadratic Hex20 serendipity spaces are supported. For
+    /// wedges, only quadratic Wedge15 is supported.
+    ///
+    /// \param type Element type used to determine topology and reference-node layout.
+    /// \param order Requested polynomial order.
+    /// \param geometry_mode When true, allow reduced geometry-mapping behavior for supported elements.
+    /// \throws BasisConfigurationException If the requested order or mode is invalid.
+    /// \throws BasisElementCompatibilityException If the element type is unsupported.
     SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
 
+    /// \copydoc BasisFunction::basis_type()
     BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+
+    /// \copydoc BasisFunction::element_type()
     ElementType element_type() const noexcept override { return element_type_; }
+
+    /// \copydoc BasisFunction::dimension()
     int dimension() const noexcept override { return dimension_; }
+
+    /// \copydoc BasisFunction::order()
     int order() const noexcept override { return order_; }
+
+    /// \copydoc BasisFunction::size()
     std::size_t size() const noexcept override { return size_; }
+
+    /// \brief Return the reference interpolation nodes in basis ordering.
+    ///
+    /// \details Node coordinates are the points at which the serendipity basis
+    /// satisfies the nodal interpolation property. Quadrilateral nodes are
+    /// placed first on the boundary and then, for higher-order requests, at the
+    /// selected interior points needed to make the reduced monomial space
+    /// unisolvent. Hexahedral and wedge nodes are taken from
+    /// ReferenceNodeLayout.
+    ///
+    /// \return Reference node coordinates, one per basis function.
     const std::vector<math::Vector<Real, 3>>& nodes() const noexcept { return nodes_; }
 
+    /// \brief Evaluate serendipity basis function values at a reference coordinate.
+    ///
+    /// \details For quadrilateral bases, this evaluates the serendipity
+    /// monomial vector and multiplies by the inverse Vandermonde matrix to
+    /// obtain nodal shape-function values. For Hex8, values are the standard
+    /// trilinear corner products. For Hex20 and Wedge15, values are evaluated
+    /// from the stored polynomial coefficient tables. In Hex20 geometry mode,
+    /// only the first eight corner values are nonzero and they match Hex8.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
                          std::vector<Real>& values) const override;
 
+    /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
+    ///
+    /// \details Gradients are derivatives with respect to reference
+    /// coordinates. Quadrilateral gradients differentiate the monomial vector
+    /// before applying the inverse Vandermonde coefficients. Hex8 gradients are
+    /// direct derivatives of the trilinear corner products. Hex20 and Wedge15
+    /// gradients are computed by differentiating the tabulated monomial
+    /// expansions. In Hex20 geometry mode, edge-node gradients are zero and the
+    /// corner gradients match Hex8.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
                             std::vector<Gradient>& gradients) const override;
 
+    /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
+    ///
+    /// \details Hessians are second derivatives in reference coordinates and
+    /// are stored as 3-by-3 matrices. Quadrilateral Hessians use second
+    /// derivatives of the monomial vector and inverse Vandermonde coefficients.
+    /// Hex8 Hessians are delegated to the linear Lagrange Hex8 basis. Hex20 and
+    /// Wedge15 Hessians are computed by differentiating their polynomial
+    /// coefficient tables twice. In Hex20 geometry mode, only the corner
+    /// Hessians from the Hex8 geometry mapping are populated.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const override;
 
@@ -52,6 +166,8 @@ class SerendipityBasis : public BasisFunction {
     bool geometry_mode_;
 };
 
+/// @}
+
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index 3f3a9d9b6..f7432f38c 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -25,12 +25,21 @@
 #include <stdexcept>
 #include <type_traits>
 
+/// \defgroup FE_MatrixMath Matrix
+/// \ingroup FE_Math
+/// \brief Fixed-size matrix types, matrix expressions, and small-matrix operations.
+///
+/// \details The Matrix submodule contains row-major fixed-size matrices used
+/// by FE kernels, expression-template support for matrix algebra, and direct
+/// determinant/inverse implementations for common element-level sizes.
+
 namespace svmp {
 namespace FE {
 namespace math {
 
 /**
  * @brief Fixed-size matrix for element-level computations
+ * @ingroup FE_MatrixMath
  * @tparam T Scalar type (float, double)
  * @tparam M Number of rows
  * @tparam N Number of columns
@@ -770,7 +779,14 @@ inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
     return adj * inv_det;
 }
 
-// Template specializations for 2x2 Matrix determinant and inverse
+/**
+ * @brief Specialized fixed-size 2-by-2 matrix for element-level computations.
+ * @ingroup FE_MatrixMath
+ * @tparam T Scalar type.
+ *
+ * This specialization preserves the Matrix API while using direct formulas for
+ * 2-by-2 determinant and inverse operations.
+ */
 template<typename T>
 class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
     static constexpr std::size_t M = 2;
@@ -1006,7 +1022,14 @@ class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
     const T* end() const { return data_ + 4; }
 };
 
-// Template specialization for 3x3 Matrix
+/**
+ * @brief Specialized fixed-size 3-by-3 matrix for element-level computations.
+ * @ingroup FE_MatrixMath
+ * @tparam T Scalar type.
+ *
+ * This specialization preserves the Matrix API while using direct formulas for
+ * 3-by-3 determinant and inverse operations.
+ */
 template<typename T>
 class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
     static constexpr std::size_t M = 3;
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index a1214f9aa..0ec99c81f 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -24,6 +24,19 @@
 #include <stdexcept>
 #include <type_traits>
 
+/// \defgroup FE_Math Math
+/// \ingroup FE
+/// \brief Fixed-size and dense linear algebra utilities for finite-element computations.
+///
+/// \details The Math module provides small fixed-size vector and matrix types
+/// used in element-level kernels, expression-template infrastructure for
+/// allocation-free algebraic expressions, and dense linear algebra utilities
+/// used by basis construction and local transforms.
+///
+/// \defgroup FE_VectorMath Vector
+/// \ingroup FE_Math
+/// \brief Fixed-size vector types and vector expression utilities.
+
 namespace svmp {
 namespace FE {
 namespace math {
@@ -47,6 +60,7 @@ inline bool approx_equal(T a, T b, T tol = tolerance<T>) {
 
 /**
  * @brief Fixed-size vector for element-level computations
+ * @ingroup FE_VectorMath
  * @tparam T Scalar type (float, double)
  * @tparam N Vector dimension
  *
diff --git a/Documentation/Doxyfile b/Documentation/Doxyfile
index acd5ba21c..3c29a08f1 100644
--- a/Documentation/Doxyfile
+++ b/Documentation/Doxyfile
@@ -191,10 +191,10 @@ TREEVIEW_WIDTH         = 250
 EXT_LINKS_IN_WINDOW    = NO
 FORMULA_FONTSIZE       = 10
 USE_MATHJAX            = YES
-MATHJAX_VERSION        = MathJax_3
-MATHJAX_FORMAT         = chtml
-MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@3
-MATHJAX_EXTENSIONS     = ams
+MATHJAX_VERSION        = MathJax_2
+MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@2
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
 MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 SERVER_BASED_SEARCH    = NO
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 430390e54..d4bf1d6e5 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -1,6 +1,6 @@
 /**
  * @file test_BasisErrorPaths.cpp
- * @brief Error-path coverage for the migrated Lagrange-focused Basis subset.
+ * @brief Error-path coverage for the Lagrange-focused Basis subset.
  */
 
 #include <gtest/gtest.h>

From c53e0e06ef4165b3b8b4069f2fbec246bbd4ab54 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 11:34:34 -0700
Subject: [PATCH 12/22] updating serendipity basis to be concrete terminal
 classes with `final`

---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 442 ++++++++----------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  56 ++-
 2 files changed, 238 insertions(+), 260 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index d551419a8..358e76123 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "SerendipityBasis.h"
-#include "LagrangeBasis.h"
 #include "NodeOrderingConventions.h"
 #include "Math/DenseLinearAlgebra.h"
 
@@ -19,6 +18,61 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<Real, 3>;
 
+void store_gradient(const Gradient& gradient, Real* dst) {
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
+}
+
+void evaluate_hex8_reference(Real r,
+                             Real s,
+                             Real t,
+                             Real* values,
+                             Real* gradients,
+                             Real* hessians) {
+    static constexpr int signs[8][3] = {
+        {-1, -1, -1},
+        { 1, -1, -1},
+        { 1,  1, -1},
+        {-1,  1, -1},
+        {-1, -1,  1},
+        { 1, -1,  1},
+        { 1,  1,  1},
+        {-1,  1,  1},
+    };
+
+    for (std::size_t i = 0; i < 8u; ++i) {
+        const Real a = Real(signs[i][0]);
+        const Real b = Real(signs[i][1]);
+        const Real c = Real(signs[i][2]);
+        const Real ar = Real(1) + a * r;
+        const Real bs = Real(1) + b * s;
+        const Real ct = Real(1) + c * t;
+
+        if (values) {
+            values[i] = Real(0.125) * ar * bs * ct;
+        }
+        if (gradients) {
+            Real* g = gradients + i * 3u;
+            g[0] = Real(0.125) * a * bs * ct;
+            g[1] = Real(0.125) * b * ar * ct;
+            g[2] = Real(0.125) * c * ar * bs;
+        }
+        if (hessians) {
+            Real* h = hessians + i * 9u;
+            h[0] = Real(0);
+            h[1] = Real(0.125) * a * b * ct;
+            h[2] = Real(0.125) * a * c * bs;
+            h[3] = h[1];
+            h[4] = Real(0);
+            h[5] = Real(0.125) * b * c * ar;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = Real(0);
+        }
+    }
+}
+
 int quad_serendipity_superlinear_degree(int ax, int ay) {
     return (ax > 1 ? ax : 0) + (ay > 1 ? ay : 0);
 }
@@ -496,96 +550,24 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
     }
 }
 
-void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
-                                       std::vector<Real>& values) const {
-    values.assign(size_, Real(0));
-    const Real x = xi[0];
-    const Real y = xi[1];
-    const Real z = xi[2];
-
-    if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
-                __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> monomials(size_, Real(0));
-        for (std::size_t j = 0; j < size_; ++j) {
-            const auto [ax, ay] = quad_monomial_exponents_[j];
-            monomials[j] = std::pow(x, ax) * std::pow(y, ay);
-        }
-
-        for (std::size_t i = 0; i < size_; ++i) {
-            Real value = Real(0);
-            for (std::size_t j = 0; j < size_; ++j) {
-                value += monomials[j] * quad_inv_vandermonde_[j * size_ + i];
-            }
-            values[i] = value;
-        }
+void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
+                                       Real* SVMP_RESTRICT values_out,
+                                       Real* SVMP_RESTRICT gradients_out,
+                                       Real* SVMP_RESTRICT hessians_out) const {
+    if (!values_out && !gradients_out && !hessians_out) {
         return;
     }
 
-    if (dimension_ == 3 && order_ == 1) {
-        // Hex8 trilinear shape functions
-        const Real r = x;
-        const Real s = y;
-        const Real t = z;
-        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
-        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
-        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
-        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
-        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
-        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
-        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
-        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
-        return;
+    if (values_out) {
+        std::fill_n(values_out, size_, Real(0));
     }
-
-    const Real r = x;
-    const Real s = y;
-    const Real t = z;
-
-    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
-        // Hex20 geometry mode: use trilinear Hex8 shape functions on corners, edges zero.
-        values[0] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) - t);
-        values[1] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) - t);
-        values[2] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) - t);
-        values[3] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) - t);
-        values[4] = Real(0.125) * (Real(1) - r) * (Real(1) - s) * (Real(1) + t);
-        values[5] = Real(0.125) * (Real(1) + r) * (Real(1) - s) * (Real(1) + t);
-        values[6] = Real(0.125) * (Real(1) + r) * (Real(1) + s) * (Real(1) + t);
-        values[7] = Real(0.125) * (Real(1) - r) * (Real(1) + s) * (Real(1) + t);
-        for (std::size_t i = 8; i < 20; ++i) {
-            values[i] = Real(0);
-        }
-        return;
+    if (gradients_out) {
+        std::fill_n(gradients_out, size_ * 3u, Real(0));
     }
-
-    if (element_type_ == ElementType::Hex20) {
-        Real internal_vals[20];
-        eval_hex20_internal(r, s, t, internal_vals);
-        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            values[i] = internal_vals[mesh_to_basis[i]];
-        }
-        return;
+    if (hessians_out) {
+        std::fill_n(hessians_out, size_ * 9u, Real(0));
     }
 
-    if (element_type_ == ElementType::Wedge15) {
-        eval_wedge15_polynomial(r, s, t, values.data(), nullptr, nullptr);
-        return;
-    }
-
-}
-
-void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
-                                          std::vector<Gradient>& gradients) const {
-    gradients.assign(size_, Gradient{});
-
     const Real x = xi[0];
     const Real y = xi[1];
     const Real z = xi[2];
@@ -594,216 +576,174 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
         if (quad_monomial_exponents_.size() != size_ ||
             quad_inv_vandermonde_.size() != size_ * size_) {
             throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for gradient evaluation",
+                "SerendipityBasis: quadrilateral interpolation tables are not initialized for basis evaluation",
                 __FILE__, __LINE__, __func__);
         }
 
-        std::vector<Real> dmon_dx(size_, Real(0));
-        std::vector<Real> dmon_dy(size_, Real(0));
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
-            dmon_dx[j] =
+            const Real value = std::pow(x, ax) * std::pow(y, ay);
+            const Real dx =
                 (ax > 0) ? Real(ax) * std::pow(x, ax - 1) * std::pow(y, ay) : Real(0);
-            dmon_dy[j] =
+            const Real dy =
                 (ay > 0) ? std::pow(x, ax) * Real(ay) * std::pow(y, ay - 1) : Real(0);
-        }
+            const Real dxx =
+                (ax > 1) ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
+                         : Real(0);
+            const Real dxy =
+                (ax > 0 && ay > 0)
+                    ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
+                    : Real(0);
+            const Real dyy =
+                (ay > 1) ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
+                         : Real(0);
 
-        for (std::size_t i = 0; i < size_; ++i) {
-            Real gx = Real(0);
-            Real gy = Real(0);
-            for (std::size_t j = 0; j < size_; ++j) {
+            for (std::size_t i = 0; i < size_; ++i) {
                 const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                gx += dmon_dx[j] * coeff;
-                gy += dmon_dy[j] * coeff;
+                if (values_out) {
+                    values_out[i] += value * coeff;
+                }
+                if (gradients_out) {
+                    Real* g = gradients_out + i * 3u;
+                    g[0] += dx * coeff;
+                    g[1] += dy * coeff;
+                }
+                if (hessians_out) {
+                    Real* h = hessians_out + i * 9u;
+                    h[0] += dxx * coeff;
+                    h[1] += dxy * coeff;
+                    h[3] += dxy * coeff;
+                    h[4] += dyy * coeff;
+                }
             }
-            gradients[i][0] = gx;
-            gradients[i][1] = gy;
         }
         return;
     }
 
-    // 3D linear hex (Hex8)
     if (dimension_ == 3 && order_ == 1) {
-        const Real r = x, s = y, t = z;
-        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
-
-        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
+        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
 
-    // Hex20 geometry mode: use Hex8 gradients
-    if (dimension_ == 3 && order_ == 2 && geometry_mode_ &&
-        (element_type_ == ElementType::Hex20 || element_type_ == ElementType::Quad8)) {
-        const Real r = x, s = y, t = z;
-        gradients[0][0] = -Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[0][1] = -Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[0][2] = -Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[1][0] =  Real(0.125) * (Real(1) - s) * (Real(1) - t);
-        gradients[1][1] = -Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[1][2] = -Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[2][0] =  Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[2][1] =  Real(0.125) * (Real(1) + r) * (Real(1) - t);
-        gradients[2][2] = -Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[3][0] = -Real(0.125) * (Real(1) + s) * (Real(1) - t);
-        gradients[3][1] =  Real(0.125) * (Real(1) - r) * (Real(1) - t);
-        gradients[3][2] = -Real(0.125) * (Real(1) - r) * (Real(1) + s);
-
-        gradients[4][0] = -Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[4][1] = -Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[4][2] =  Real(0.125) * (Real(1) - r) * (Real(1) - s);
-
-        gradients[5][0] =  Real(0.125) * (Real(1) - s) * (Real(1) + t);
-        gradients[5][1] = -Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[5][2] =  Real(0.125) * (Real(1) + r) * (Real(1) - s);
-
-        gradients[6][0] =  Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[6][1] =  Real(0.125) * (Real(1) + r) * (Real(1) + t);
-        gradients[6][2] =  Real(0.125) * (Real(1) + r) * (Real(1) + s);
-
-        gradients[7][0] = -Real(0.125) * (Real(1) + s) * (Real(1) + t);
-        gradients[7][1] =  Real(0.125) * (Real(1) - r) * (Real(1) + t);
-        gradients[7][2] =  Real(0.125) * (Real(1) - r) * (Real(1) + s);
-        // Edge-node gradients remain zero
+    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
+        evaluate_hex8_reference(x, y, z, values_out, gradients_out, hessians_out);
         return;
     }
 
-    // Hex20 analytical gradients using monomial differentiation
-    if (element_type_ == ElementType::Hex20 && order_ == 2) {
-        const Real r = x, s = y, t = z;
-        Gradient internal_grads[20];
-        eval_hex20_grad_internal(r, s, t, internal_grads);
+    if (element_type_ == ElementType::Hex20) {
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
         BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
                          "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            gradients[i] = internal_grads[mesh_to_basis[i]];
+
+        if (values_out) {
+            Real internal_vals[20];
+            eval_hex20_internal(x, y, z, internal_vals);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                values_out[i] = internal_vals[mesh_to_basis[i]];
+            }
+        }
+        if (gradients_out) {
+            Gradient internal_grads[20];
+            eval_hex20_grad_internal(x, y, z, internal_grads);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                store_gradient(internal_grads[mesh_to_basis[i]], gradients_out + i * 3u);
+            }
+        }
+        if (hessians_out) {
+            Hessian internal_hessians[20];
+            eval_hex20_hess_internal(x, y, z, internal_hessians);
+            for (std::size_t i = 0; i < 20u; ++i) {
+                store_hessian(internal_hessians[mesh_to_basis[i]], hessians_out + i * 9u);
+            }
         }
         return;
     }
 
-    // Wedge15 analytical gradients using monomial differentiation
-    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
-        eval_wedge15_polynomial(x, y, z, nullptr, gradients.data(), nullptr);
+    if (element_type_ == ElementType::Wedge15) {
+        std::array<Gradient, 15u> wedge_gradients{};
+        std::array<Hessian, 15u> wedge_hessians{};
+        eval_wedge15_polynomial(x,
+                                 y,
+                                 z,
+                                 values_out,
+                                 gradients_out ? wedge_gradients.data() : nullptr,
+                                 hessians_out ? wedge_hessians.data() : nullptr);
+        if (gradients_out) {
+            for (std::size_t i = 0; i < 15u; ++i) {
+                store_gradient(wedge_gradients[i], gradients_out + i * 3u);
+            }
+        }
+        if (hessians_out) {
+            for (std::size_t i = 0; i < 15u; ++i) {
+                store_hessian(wedge_hessians[i], hessians_out + i * 9u);
+            }
+        }
         return;
     }
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_gradients: unsupported serendipity configuration",
+    throw BasisEvaluationException("SerendipityBasis::evaluate_all_to: unsupported serendipity configuration",
                                    __FILE__, __LINE__, __func__);
 }
 
-void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
-                                         std::vector<Hessian>& hessians) const {
-    hessians.assign(size_, Hessian{});
-    const Real x = xi[0];
-    const Real y = xi[1];
-    const Real z = xi[2];
-
-    if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for Hessian evaluation",
-                __FILE__, __LINE__, __func__);
-        }
-
-        std::vector<Real> dxx(size_, Real(0));
-        std::vector<Real> dxy(size_, Real(0));
-        std::vector<Real> dyy(size_, Real(0));
-        for (std::size_t j = 0; j < size_; ++j) {
-            const auto [ax, ay] = quad_monomial_exponents_[j];
-            dxx[j] = (ax > 1)
-                         ? Real(ax * (ax - 1)) * std::pow(x, ax - 2) * std::pow(y, ay)
-                         : Real(0);
-            dxy[j] = (ax > 0 && ay > 0)
-                         ? Real(ax * ay) * std::pow(x, ax - 1) * std::pow(y, ay - 1)
-                         : Real(0);
-            dyy[j] = (ay > 1)
-                         ? Real(ay * (ay - 1)) * std::pow(x, ax) * std::pow(y, ay - 2)
-                         : Real(0);
-        }
+void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
+                                       std::vector<Real>& values) const {  // vector API over the flat evaluator
+    values.resize(size_);  // one entry per basis function
+    evaluate_values_to(xi, values.data());  // zero-fills the buffer, then writes the values in place
+}
 
-        for (std::size_t i = 0; i < size_; ++i) {
-            for (std::size_t j = 0; j < size_; ++j) {
-                const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                hessians[i](0, 0) += dxx[j] * coeff;
-                hessians[i](0, 1) += dxy[j] * coeff;
-                hessians[i](1, 1) += dyy[j] * coeff;
-            }
-            hessians[i](1, 0) = hessians[i](0, 1);
-        }
-        return;
+void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
+                                          std::vector<Gradient>& gradients) const {  // vector API over the flat evaluator
+    gradients.resize(size_);
+    std::vector<Real> flat(size_ * 3u, Real(0));  // scratch buffer, layout: node * 3 + component
+    evaluate_gradients_to(xi, flat.data());
+    for (std::size_t i = 0; i < size_; ++i) {  // repack each flat triple into its Gradient
+        gradients[i][0] = flat[i * 3u + 0u];
+        gradients[i][1] = flat[i * 3u + 1u];
+        gradients[i][2] = flat[i * 3u + 2u];
     }
+}
 
-    if (element_type_ == ElementType::Hex8 && order_ == 1) {
-        static const LagrangeBasis parent(ElementType::Hex8, 1);
-        parent.evaluate_hessians(xi, hessians);
-        return;
+void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
+                                         std::vector<Hessian>& hessians) const {  // vector API over the flat evaluator
+    hessians.resize(size_);
+    std::vector<Real> flat(size_ * 9u, Real(0));  // scratch buffer, nine entries per node
+    evaluate_hessians_to(xi, flat.data());
+    for (std::size_t i = 0; i < size_; ++i) {
+        hessians[i] = load_hessian(flat.data() + i * 9u);  // rebuild the 3x3 matrix for node i
     }
+}
 
-    if (geometry_mode_ && element_type_ == ElementType::Hex20) {
-        static const LagrangeBasis parent(ElementType::Hex8, 1);
-        std::array<Real, 8u * 9u> parent_hessians{};
-        parent.evaluate_hessians_to(xi, parent_hessians.data());
-        for (std::size_t i = 0; i < 8; ++i) {
-            for (std::size_t r = 0; r < 3; ++r) {
-                for (std::size_t c = 0; c < 3; ++c) {
-                    hessians[i](r, c) = parent_hessians[i * 9u + r * 3u + c];
-                }
-            }
-        }
-        return;
+void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
+                                    std::vector<Real>& values,
+                                    std::vector<Gradient>& gradients,
+                                    std::vector<Hessian>& hessians) const {  // one flat-evaluator pass fills all three outputs
+    values.resize(size_);
+    gradients.resize(size_);
+    hessians.resize(size_);
+    std::vector<Real> flat_gradients(size_ * 3u, Real(0));  // scratch, layout: node * 3 + component
+    std::vector<Real> flat_hessians(size_ * 9u, Real(0));  // scratch, nine entries per node
+    evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());  // values are written directly
+    for (std::size_t i = 0; i < size_; ++i) {  // repack the flat scratch data into per-node objects
+        gradients[i][0] = flat_gradients[i * 3u + 0u];
+        gradients[i][1] = flat_gradients[i * 3u + 1u];
+        gradients[i][2] = flat_gradients[i * 3u + 2u];
+        hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
     }
+}
 
-    if (element_type_ == ElementType::Hex20 && order_ == 2) {
-        Hessian internal_hessians[20];
-        eval_hex20_hess_internal(x, y, z, internal_hessians);
-        const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
-        for (std::size_t i = 0; i < 20; ++i) {
-            hessians[i] = internal_hessians[mesh_to_basis[i]];
-        }
-        return;
-    }
+void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
+                                          Real* SVMP_RESTRICT values_out) const {
+    evaluate_all_to(xi, values_out, nullptr, nullptr);  // values only; null outputs are skipped
+}
 
-    if (element_type_ == ElementType::Wedge15 && order_ == 2) {
-        eval_wedge15_polynomial(x, y, z, nullptr, nullptr, hessians.data());
-        return;
-    }
+void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                                             Real* SVMP_RESTRICT gradients_out) const {
+    evaluate_all_to(xi, nullptr, gradients_out, nullptr);  // gradients only; null outputs are skipped
+}
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_hessians: unsupported serendipity configuration",
-                                   __FILE__, __LINE__, __func__);
+void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                                            Real* SVMP_RESTRICT hessians_out) const {
+    evaluate_all_to(xi, nullptr, nullptr, hessians_out);  // Hessians only; null outputs are skipped
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index fc0b897cf..9c55c8eec 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -61,7 +61,7 @@ namespace basis {
 /// Hex8 corner functions for geometry mapping and assigns zero contribution to
 /// the quadratic edge nodes. This preserves the public Hex20 node count while
 /// intentionally reducing the geometry interpolation order.
-class SerendipityBasis : public BasisFunction {
+class SerendipityBasis final : public BasisFunction {
 public:
     /// \brief Construct a serendipity basis for an element type and polynomial order.
     ///
@@ -81,19 +81,19 @@ class SerendipityBasis : public BasisFunction {
     SerendipityBasis(ElementType type, int order, bool geometry_mode = false);
 
     /// \copydoc BasisFunction::basis_type()
-    BasisType basis_type() const noexcept override { return BasisType::Serendipity; }
+    BasisType basis_type() const noexcept final { return BasisType::Serendipity; }
 
     /// \copydoc BasisFunction::element_type()
-    ElementType element_type() const noexcept override { return element_type_; }
+    ElementType element_type() const noexcept final { return element_type_; }
 
     /// \copydoc BasisFunction::dimension()
-    int dimension() const noexcept override { return dimension_; }
+    int dimension() const noexcept final { return dimension_; }
 
     /// \copydoc BasisFunction::order()
-    int order() const noexcept override { return order_; }
+    int order() const noexcept final { return order_; }
 
     /// \copydoc BasisFunction::size()
-    std::size_t size() const noexcept override { return size_; }
+    std::size_t size() const noexcept final { return size_; }
 
     /// \brief Return the reference interpolation nodes in basis ordering.
     ///
@@ -119,7 +119,7 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param values Receives one value per basis function.
     void evaluate_values(const math::Vector<Real, 3>& xi,
-                         std::vector<Real>& values) const override;
+                         std::vector<Real>& values) const final;
 
     /// \brief Evaluate analytical serendipity basis gradients at a reference coordinate.
     ///
@@ -134,7 +134,7 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param gradients Receives one three-component gradient per basis function.
     void evaluate_gradients(const math::Vector<Real, 3>& xi,
-                            std::vector<Gradient>& gradients) const override;
+                            std::vector<Gradient>& gradients) const final;
 
     /// \brief Evaluate analytical serendipity basis Hessians at a reference coordinate.
     ///
@@ -149,7 +149,40 @@ class SerendipityBasis : public BasisFunction {
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
     /// \param hessians Receives one 3-by-3 Hessian per basis function.
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
-                           std::vector<Hessian>& hessians) const override;
+                           std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate serendipity values, gradients, and Hessians together.
+    ///
+    /// \details This vector API is backed by the same flat-buffer evaluator as
+    /// the assembly-oriented `*_to` methods, so topology-specific polynomial
+    /// setup can be shared for a quadrature point.
+    ///
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values Receives one value per basis function.
+    /// \param gradients Receives one three-component gradient per basis function.
+    /// \param hessians Receives one 3-by-3 Hessian per basis function.
+    void evaluate_all(const math::Vector<Real, 3>& xi,
+                      std::vector<Real>& values,
+                      std::vector<Gradient>& gradients,
+                      std::vector<Hessian>& hessians) const final;
+
+    /// \brief Evaluate serendipity basis values into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param values_out Output buffer with at least size() entries.
+    void evaluate_values_to(const math::Vector<Real, 3>& xi,
+                            Real* SVMP_RESTRICT values_out) const final;
+
+    /// \brief Evaluate serendipity basis gradients into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param gradients_out Output buffer with at least 3 * size() entries in node-major layout: node * 3 + component.
+    void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
+                               Real* SVMP_RESTRICT gradients_out) const final;
+
+    /// \brief Evaluate serendipity basis Hessians into a flat caller-provided buffer.
+    /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
+    /// \param hessians_out Output buffer with at least 9 * size() entries in node-major, row-major layout: node * 9 + row * 3 + col.
+    void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
+                              Real* SVMP_RESTRICT hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -164,6 +197,11 @@ class SerendipityBasis : public BasisFunction {
     // When true, this basis is used purely for geometry mapping and may use
     // reduced polynomial order (e.g., Hex20 geometry as Hex8).
     bool geometry_mode_;
+
+    void evaluate_all_to(const math::Vector<Real, 3>& xi,
+                         Real* SVMP_RESTRICT values_out,
+                         Real* SVMP_RESTRICT gradients_out,
+                         Real* SVMP_RESTRICT hessians_out) const;
 };
 
 /// @}

From 1289c086f637cdc1544aff0bfe99eb78ad3b9f1c Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 16:33:06 -0700
Subject: [PATCH 13/22] adding switch cases for converting consts element types
 to fe element types. replaced custom math vector/matrix implementations for
 Eigen-backed implementations

---
 Code/Source/solver/FE/Basis/BasisFactory.cpp  |   27 +
 Code/Source/solver/FE/Basis/BasisFactory.h    |   24 +
 Code/Source/solver/FE/Basis/BasisFunction.cpp |    4 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |    4 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp |    7 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      |    4 +-
 .../solver/FE/Math/DenseLinearAlgebra.cpp     |  278 +---
 .../solver/FE/Math/DenseLinearAlgebra.h       |    9 +-
 .../solver/FE/Math/DenseTransformKernels.h    |   70 +-
 Code/Source/solver/FE/Math/Matrix.h           | 1472 +----------------
 Code/Source/solver/FE/Math/MatrixExpr.h       |  630 -------
 Code/Source/solver/FE/Math/Vector.h           |  826 +--------
 Code/Source/solver/FE/Math/VectorExpr.h       |  476 ------
 Code/Source/solver/nn.cpp                     |  115 +-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  106 +-
 .../unitTests/FE/Basis/test_BasisHessians.cpp |  141 +-
 .../FE/Basis/test_HigherOrderWedge.cpp        |   22 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  207 ++-
 .../FE/Basis/test_SerendipityTensorModal.cpp  |  185 ++-
 .../FE/Math/test_DenseLinearAlgebra.cpp       |  143 +-
 tests/unitTests/FE/Math/test_Matrix.cpp       |  593 -------
 tests/unitTests/FE/Math/test_MatrixExpr.cpp   |  527 ------
 tests/unitTests/FE/Math/test_Vector.cpp       |  588 -------
 tests/unitTests/FE/Math/test_VectorExpr.cpp   |  408 -----
 24 files changed, 1038 insertions(+), 5828 deletions(-)
 delete mode 100644 Code/Source/solver/FE/Math/MatrixExpr.h
 delete mode 100644 Code/Source/solver/FE/Math/VectorExpr.h
 delete mode 100644 tests/unitTests/FE/Math/test_Matrix.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_MatrixExpr.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_Vector.cpp
 delete mode 100644 tests/unitTests/FE/Math/test_VectorExpr.cpp

diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index bc01be0ed..b48e25536 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -3,6 +3,7 @@
 
 #include "BasisFactory.h"
 
+#include "BasisTraits.h"
 #include "LagrangeBasis.h"
 #include "SerendipityBasis.h"
 
@@ -74,6 +75,32 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
     }
 }
 
+BasisRequest default_basis_request(ElementType element_type) {
+    // Reduced serendipity node layouts have no complete Lagrange basis at
+    // their node count; they always use the quadratic serendipity space.
+    const bool reduced_layout = element_type == ElementType::Quad8 ||
+                                element_type == ElementType::Hex20 ||
+                                element_type == ElementType::Wedge15;
+    if (reduced_layout) {
+        return BasisRequest{element_type, BasisType::Serendipity, 2};
+    }
+    // Point1 is requested as an order-0 Lagrange basis.
+    if (element_type == ElementType::Point1) {
+        return BasisRequest{element_type, BasisType::Lagrange, 0};
+    }
+    const int alias_order = complete_lagrange_alias_order(element_type);
+    if (alias_order < 0) {
+        throw BasisElementCompatibilityException(
+            "BasisFactory: no default basis is defined for the requested element type",
+            __FILE__, __LINE__, __func__);
+    }
+    return BasisRequest{element_type, BasisType::Lagrange, alias_order};
+}
+
+std::shared_ptr<BasisFunction> create_default_for(ElementType element_type) {
+    return create(default_basis_request(element_type));  // pick the default family/order, then build it
+}
+
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.h b/Code/Source/solver/FE/Basis/BasisFactory.h
index b188b3aa2..3922d5ced 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.h
+++ b/Code/Source/solver/FE/Basis/BasisFactory.h
@@ -38,6 +38,30 @@ namespace basis_factory {
 
 [[nodiscard]] std::shared_ptr<BasisFunction> create(const BasisRequest& req);
 
+/// \brief Return the default basis request (family and order) for an element type.
+///
+/// \details This is the single source of truth for which basis family and
+/// polynomial order a given element type uses by default: serendipity node
+/// layouts (Quad8, Hex20, Wedge15) select the quadratic serendipity family,
+/// Point1 selects an order-0 Lagrange basis, and every other complete Lagrange
+/// element selects the Lagrange family at the order given by its node layout.
+/// Solver-facing adapters should translate their element names to ElementType
+/// and delegate the basis choice here rather than tabulating it themselves.
+///
+/// \param element_type Element type to select a default basis for.
+/// \return Basis request suitable for create().
+/// \throws BasisElementCompatibilityException If no default basis is defined
+///         for the element type.
+[[nodiscard]] BasisRequest default_basis_request(ElementType element_type);
+
+/// \brief Create the default basis for an element type.
+///
+/// \details Equivalent to create(default_basis_request(element_type)).
+///
+/// \param element_type Element type to create a default basis for.
+/// \return Shared basis instance.
+[[nodiscard]] std::shared_ptr<BasisFunction> create_default_for(ElementType element_type);
+
 } // namespace basis_factory
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 3d95671f4..b98a36292 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -84,7 +84,7 @@ void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
                                        Real eps) const {
     std::vector<Real> base;
     evaluate_values(xi, base);
-    gradients.assign(base.size(), Gradient{});
+    gradients.assign(base.size(), Gradient::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
@@ -109,7 +109,7 @@ void BasisFunction::numerical_hessian(const math::Vector<Real, 3>& xi,
                                       Real eps) const {
     std::vector<Gradient> base_grad;
     evaluate_gradients(xi, base_grad);
-    hessians.assign(base_grad.size(), Hessian{});
+    hessians.assign(base_grad.size(), Hessian::Zero());
 
     for (int d = 0; d < dimension(); ++d) {
         math::Vector<Real, 3> forward = xi;
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index f8f78d7b6..e7de2bf01 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -39,7 +39,7 @@ using Hessian  = math::Matrix<Real, 3, 3>;
                                                     Real xy,
                                                     Real xz,
                                                     Real yz) {
-    Hessian hessian{};
+    Hessian hessian = Hessian::Zero();
     hessian(0, 0) = xx;
     hessian(1, 1) = yy;
     hessian(2, 2) = zz;
@@ -62,7 +62,7 @@ inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
 }
 
 [[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
-    Hessian hessian{};
+    Hessian hessian = Hessian::Zero();
     hessian(0, 0) = src[0];
     hessian(0, 1) = src[1];
     hessian(0, 2) = src[2];
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index d777447cb..4f8c15bb1 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -220,8 +220,8 @@ void evaluate_simplex(const Vec3& xi,
                       SimplexEval& out) {
     const std::size_t n = exponents.size();
     out.value.assign(n, Real(0));
-    out.gradient.assign(n, Gradient{});
-    out.hessian.assign(n, Hessian{});
+    out.gradient.assign(n, Gradient::Zero());
+    out.hessian.assign(n, Hessian::Zero());
 
     if (n == 1u && order == 0) {
         out.value[0] = Real(1);
@@ -230,7 +230,8 @@ void evaluate_simplex(const Vec3& xi,
 
     const int bary_count = top == BasisTopology::Triangle ? 3 : 4;
     std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
-    std::array<Gradient, 4> lambda_grad{};
+    std::array<Gradient, 4> lambda_grad;
+    lambda_grad.fill(Gradient::Zero());
 
     lambda[1] = xi[0];
     lambda[2] = xi[1];
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 358e76123..30eac9c38 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -377,7 +377,7 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians
     }
 
     for (int i = 0; i < 20; ++i) {
-        Hessian H{};
+        Hessian H = Hessian::Zero();
         for (int j = 0; j < 20; ++j) {
             H(0, 0) += hex20_coeffs[j][i] * d2phi_drr[j];
             H(1, 1) += hex20_coeffs[j][i] * d2phi_dss[j];
@@ -450,7 +450,7 @@ void eval_wedge15_polynomial(Real r,
         Real gr = Real(0);
         Real gs = Real(0);
         Real gt = Real(0);
-        Hessian H{};
+        Hessian H = Hessian::Zero();
         for (int j = 0; j < 15; ++j) {
             const Real coefficient =
                 kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
index 8be9a7560..fb27ad7bf 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.cpp
@@ -5,9 +5,7 @@
 
 #include "FEException.h"
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
 #include <Eigen/Dense>
-#endif
 
 #include <algorithm>
 #include <cmath>
@@ -24,16 +22,24 @@ namespace math {
 
 namespace {
 
-constexpr std::size_t kDenseSolveRhsBlock = 32u;
+using DenseMatrix = DenseLUSolver::DenseMatrix;
+using RowMajorMatrix =
+    Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+using ConstRowMajorMap = Eigen::Map<const RowMajorMatrix>;
+
+// View caller-owned row-major storage as a read-only Eigen matrix (no copy).
+ConstRowMajorMap map_row_major(std::span<const Real> matrix,
+                               std::size_t rows, std::size_t cols) {
+    const auto row_count = static_cast<Eigen::Index>(rows);
+    const auto col_count = static_cast<Eigen::Index>(cols);
+    return ConstRowMajorMap(matrix.data(), row_count, col_count);
+}
 
-void materialize_inverse_from_solver(const DenseLUSolver& solver,
-                                     std::vector<Real>& inverse) {
-    const std::size_t n = solver.n;
-    inverse.assign(n * n, Real(0));
-    for (std::size_t diag = 0; diag < n; ++diag) {
-        inverse[diag * n + diag] = Real(1);
-    }
-    solver.solve_in_place(std::span<Real>(inverse.data(), inverse.size()), n);
+// Resize dest to rows*cols and store source into it in row-major order.
+void copy_to_row_major(const DenseMatrix& source, std::vector<Real>& dest) {
+    dest.resize(static_cast<std::size_t>(source.size()));
+    Eigen::Map<RowMajorMatrix> view(dest.data(), source.rows(), source.cols());
+    view = source;
 }
 
 } // namespace
@@ -84,59 +90,18 @@ void DenseLUSolver::solve_in_place(std::span<Real> rhs,
                              label + ": dense solve requires at least one right-hand side");
     DENSE_LINALG_CHECK(rhs.size() == n * rhs_count,
                              label + ": dense multi-RHS solve size mismatch");
-    DENSE_LINALG_CHECK(lu.size() == n * n && pivots.size() == n,
+    DENSE_LINALG_CHECK(lu.rows() == static_cast<Eigen::Index>(n),
                              label + ": dense solver is not factorized");
-
-    for (std::size_t k = 0; k < n; ++k) {
-        if (pivots[k] != k) {
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    std::swap(rhs[k * rhs_count + r],
-                              rhs[pivots[k] * rhs_count + r]);
-                }
-            }
-        }
-    }
-
-    for (std::size_t row = 0; row < n; ++row) {
-        for (std::size_t col = 0; col < row; ++col) {
-            const Real factor = lu[row * n + col];
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
-                }
-            }
-        }
+    if (n == 0) {
+        return;
     }
 
-    for (std::size_t rev = 0; rev < n; ++rev) {
-        const std::size_t row = n - 1u - rev;
-        for (std::size_t col = row + 1u; col < n; ++col) {
-            const Real factor = lu[row * n + col];
-            for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-                const std::size_t end =
-                    std::min(rhs_count, block + kDenseSolveRhsBlock);
-                for (std::size_t r = block; r < end; ++r) {
-                    rhs[row * rhs_count + r] -= factor * rhs[col * rhs_count + r];
-                }
-            }
-        }
-        const Real pivot = lu[row * n + row];
-        DENSE_LINALG_CHECK(
-            std::abs(pivot) > pivot_tolerance,
-            label + ": zero pivot during dense solve");
-        for (std::size_t block = 0; block < rhs_count; block += kDenseSolveRhsBlock) {
-            const std::size_t end =
-                std::min(rhs_count, block + kDenseSolveRhsBlock);
-            for (std::size_t r = block; r < end; ++r) {
-                rhs[row * rhs_count + r] /= pivot;
-            }
-        }
-    }
+    Eigen::Map<RowMajorMatrix> rhs_map(rhs.data(),
+                                       static_cast<Eigen::Index>(n),
+                                       static_cast<Eigen::Index>(rhs_count));
+    // Evaluate into a temporary: lu.solve cannot alias its argument.
+    const DenseMatrix solution = lu.solve(rhs_map);
+    rhs_map = solution;
 }
 
 std::vector<Real> DenseLUSolver::solve(std::span<const Real> rhs) const {
@@ -155,14 +120,8 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
     DENSE_LINALG_CHECK(rows > 0 && cols > 0,
                              std::string(label) + ": diagnostics require a nonempty matrix");
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
-    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                             static_cast<Eigen::Index>(rows),
-                                             static_cast<Eigen::Index>(cols));
-    const Matrix dense = A;
-    Eigen::JacobiSVD<Matrix> svd(dense);
+    const DenseMatrix dense = map_row_major(matrix, rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense);
 
     DenseMatrixDiagnostics diagnostics;
     const auto& singular_values = svd.singularValues();
@@ -189,22 +148,6 @@ DenseMatrixDiagnostics dense_matrix_diagnostics(
             diagnostics.smallest_retained_singular_value;
     }
     return diagnostics;
-#else
-    DenseMatrixDiagnostics diagnostics;
-    diagnostics.largest_singular_value = dense_matrix_max_abs(matrix);
-    diagnostics.tolerance =
-        dense_matrix_pivot_tolerance(rows, cols, diagnostics.largest_singular_value);
-    diagnostics.rank =
-        dense_matrix_rank(std::vector<Real>(matrix.begin(), matrix.end()), rows, cols);
-    const std::size_t full_rank = std::min(rows, cols);
-    if (diagnostics.rank == full_rank) {
-        diagnostics.smallest_retained_singular_value = diagnostics.tolerance;
-    }
-    // Exact condition estimates require SVD diagnostics. In Eigen-disabled
-    // builds this stays explicit instead of relying on a misleading estimate.
-    diagnostics.condition_estimate = std::numeric_limits<Real>::infinity();
-    return diagnostics;
-#endif
 }
 
 DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
@@ -215,55 +158,28 @@ DenseLUSolver factor_dense_matrix(std::vector<Real> matrix,
 
     DenseLUSolver solver;
     solver.n = n;
-    solver.lu = std::move(matrix);
-    solver.pivots.resize(n);
-    const Real max_abs = dense_matrix_max_abs(solver.lu);
-    solver.pivot_tolerance =
-        dense_matrix_pivot_tolerance(n, n, max_abs);
     solver.label = std::string(label);
+    const Real max_abs =
+        dense_matrix_max_abs(std::span<const Real>(matrix.data(), matrix.size()));
+    solver.pivot_tolerance = dense_matrix_pivot_tolerance(n, n, max_abs);
 
+    solver.lu.compute(map_row_major(matrix, n, n));
+
+    // Partial pivoting leaves the pivots on the diagonal of the packed LU
+    // factor; a pivot below the scale-aware tolerance marks rank deficiency.
     Real max_pivot_abs = Real(0);
     Real min_pivot_abs = std::numeric_limits<Real>::infinity();
-    for (std::size_t col = 0; col < n; ++col) {
-        std::size_t pivot_row = col;
-        Real pivot_abs = std::abs(solver.lu[col * n + col]);
-        for (std::size_t row = col + 1; row < n; ++row) {
-            const Real candidate = std::abs(solver.lu[row * n + col]);
-            if (candidate > pivot_abs) {
-                pivot_abs = candidate;
-                pivot_row = row;
-            }
-        }
-
+    const auto diagonal = solver.lu.matrixLU().diagonal();
+    for (Eigen::Index col = 0; col < diagonal.size(); ++col) {
+        const Real pivot_magnitude = std::abs(diagonal[col]);
         DENSE_LINALG_CHECK(
-            pivot_abs > solver.pivot_tolerance,
+            pivot_magnitude > solver.pivot_tolerance,
             solver.label + ": rank-deficient matrix (rank " +
                 std::to_string(col) + " of " + std::to_string(n) +
                 ", pivot below scale-aware tolerance " +
                 std::to_string(solver.pivot_tolerance) + ")");
-
-        solver.pivots[col] = pivot_row;
-        if (pivot_row != col) {
-            for (std::size_t j = 0; j < n; ++j) {
-                std::swap(solver.lu[col * n + j], solver.lu[pivot_row * n + j]);
-            }
-        }
-
-        const Real pivot = solver.lu[col * n + col];
-        DENSE_LINALG_CHECK(
-            std::abs(pivot) > solver.pivot_tolerance,
-            solver.label + ": zero pivot after row exchange");
-        const Real pivot_magnitude = std::abs(pivot);
         max_pivot_abs = std::max(max_pivot_abs, pivot_magnitude);
         min_pivot_abs = std::min(min_pivot_abs, pivot_magnitude);
-
-        for (std::size_t row = col + 1; row < n; ++row) {
-            const Real factor = solver.lu[row * n + col] / pivot;
-            solver.lu[row * n + col] = factor;
-            for (std::size_t j = col + 1; j < n; ++j) {
-                solver.lu[row * n + j] -= factor * solver.lu[col * n + j];
-            }
-        }
     }
 
     solver.diagnostics.rank = n;
@@ -293,20 +209,14 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
         dense_matrix_diagnostics(std::span<const Real>(matrix.data(), matrix.size()),
                                  n, n, label);
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     if (std::isfinite(solver.diagnostics.condition_estimate) &&
         std::isfinite(result.diagnostics.condition_estimate) &&
         result.diagnostics.condition_estimate > dense_matrix_condition_fallback_threshold()) {
-        using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-        using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-        const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                                 static_cast<Eigen::Index>(n),
-                                                 static_cast<Eigen::Index>(n));
-        const Matrix dense = A;
-        Eigen::JacobiSVD<Matrix> svd(dense,
-                                     Eigen::ComputeFullU | Eigen::ComputeFullV);
-        Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(n),
-                                            static_cast<Eigen::Index>(n));
+        const DenseMatrix dense = map_row_major(matrix, n, n);
+        Eigen::JacobiSVD<DenseMatrix> svd(dense,
+                                          Eigen::ComputeFullU | Eigen::ComputeFullV);
+        DenseMatrix sigma_inverse = DenseMatrix::Zero(static_cast<Eigen::Index>(n),
+                                                      static_cast<Eigen::Index>(n));
         const auto& singular_values = svd.singularValues();
         for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
             DENSE_LINALG_CHECK(
@@ -314,20 +224,14 @@ DenseInverseResult invert_dense_matrix_with_diagnostics(
                 std::string(label) + ": high-condition SVD fallback encountered a dropped singular value");
             sigma_inverse(i, i) = Real(1) / singular_values[i];
         }
-        const Matrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
-        result.inverse.assign(n * n, Real(0));
-        for (std::size_t row = 0; row < n; ++row) {
-            for (std::size_t col = 0; col < n; ++col) {
-                result.inverse[row * n + col] =
-                    inverse(static_cast<Eigen::Index>(row), static_cast<Eigen::Index>(col));
-            }
-        }
+        const DenseMatrix inverse = svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
+        copy_to_row_major(inverse, result.inverse);
         result.used_svd_fallback = true;
         return result;
     }
-#endif
 
-    materialize_inverse_from_solver(solver, result.inverse);
+    const DenseMatrix inverse = solver.lu.inverse();
+    copy_to_row_major(inverse, result.inverse);
     return result;
 }
 
@@ -357,9 +261,10 @@ std::vector<Real> invert_dense_matrix(std::vector<Real> matrix,
                                       std::size_t n,
                                       std::string_view label) {
     const DenseLUSolver solver = factor_dense_matrix(std::move(matrix), n, label);
-    std::vector<Real> inverse;
-    materialize_inverse_from_solver(solver, inverse);
-    return inverse;
+    const DenseMatrix inverse = solver.lu.inverse();
+    std::vector<Real> result;
+    copy_to_row_major(inverse, result);
+    return result;
 }
 
 std::size_t dense_matrix_rank(std::vector<Real> matrix,
@@ -367,46 +272,33 @@ std::size_t dense_matrix_rank(std::vector<Real> matrix,
                               std::size_t cols) {
     DENSE_LINALG_CHECK(matrix.size() == rows * cols,
                              "dense_matrix_rank: size mismatch");
-    const Real tolerance =
-        dense_matrix_pivot_tolerance(rows, cols, dense_matrix_max_abs(matrix));
+
+    // A matrix with no rows or no columns has rank zero. Answer that here:
+    // Eigen's JacobiSVD runs a max-coefficient reduction over its input,
+    // which asserts (and is undefined with assertions off) on an empty matrix.
+    if (rows == 0 || cols == 0) {
+        return 0;
+    }
 
-    std::size_t rank = 0;
-    std::size_t pivot_row = 0;
-    for (std::size_t col = 0; col < cols && pivot_row < rows; ++col) {
-        std::size_t best_row = pivot_row;
-        Real best_abs = std::abs(matrix[pivot_row * cols + col]);
-        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
-            const Real candidate = std::abs(matrix[row * cols + col]);
-            if (candidate > best_abs) {
-                best_abs = candidate;
-                best_row = row;
-            }
-        }
-        if (best_abs <= tolerance) {
-            continue;
-        }
+    // No U/V options are passed, so only the singular values are computed.
+    const DenseMatrix dense =
+        map_row_major(std::span<const Real>(matrix.data(), matrix.size()), rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense);
 
-        if (best_row != pivot_row) {
-            for (std::size_t c = col; c < cols; ++c) {
-                std::swap(matrix[pivot_row * cols + c], matrix[best_row * cols + c]);
-            }
-        }
+    // JacobiSVD reports singular values in decreasing order, so the first
+    // entry is the largest one and sets the scale of the tolerance.
+    const auto& singular_values = svd.singularValues();
+    const Real largest =
+        (singular_values.size() > 0) ? singular_values[0] : Real(0);
+    const Real tolerance =
+        dense_matrix_singular_value_tolerance(rows, cols, largest);
 
-        const Real pivot = matrix[pivot_row * cols + col];
-        for (std::size_t row = pivot_row + 1; row < rows; ++row) {
-            const Real factor = matrix[row * cols + col] / pivot;
-            if (std::abs(factor) <= tolerance) {
-                matrix[row * cols + col] = Real(0);
-                continue;
-            }
-            matrix[row * cols + col] = Real(0);
-            for (std::size_t c = col + 1; c < cols; ++c) {
-                matrix[row * cols + c] -= factor * matrix[pivot_row * cols + c];
-            }
+    // The numerical rank is the number of singular values above tolerance.
+    std::size_t rank = 0;
+    for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
+        if (singular_values[i] > tolerance) {
+            ++rank;
         }
-
-        ++rank;
-        ++pivot_row;
     }
     return rank;
 }
@@ -421,17 +302,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     DENSE_LINALG_CHECK(rows > 0 && cols > 0,
                              std::string(label) + ": pseudo-inverse requires a nonempty matrix");
 
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
-    using RowMajorMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using Matrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
-    const Eigen::Map<const RowMajorMatrix> A(matrix.data(),
-                                             static_cast<Eigen::Index>(rows),
-                                             static_cast<Eigen::Index>(cols));
-    const Matrix dense = A;
-    Eigen::JacobiSVD<Matrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
+    const DenseMatrix dense = map_row_major(matrix, rows, cols);
+    Eigen::JacobiSVD<DenseMatrix> svd(dense, Eigen::ComputeFullU | Eigen::ComputeFullV);
 
     DensePseudoInverseResult result;
-    result.inverse.assign(cols * rows, Real(0));
 
     const auto& singular_values = svd.singularValues();
     result.largest_singular_value =
@@ -439,8 +313,8 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
     result.tolerance =
         dense_matrix_singular_value_tolerance(rows, cols, result.largest_singular_value);
 
-    Matrix sigma_inverse = Matrix::Zero(static_cast<Eigen::Index>(cols),
-                                        static_cast<Eigen::Index>(rows));
+    DenseMatrix sigma_inverse = DenseMatrix::Zero(static_cast<Eigen::Index>(cols),
+                                                  static_cast<Eigen::Index>(rows));
     for (Eigen::Index i = 0; i < singular_values.size(); ++i) {
         const Real sigma = singular_values[i];
         if (sigma <= result.tolerance) {
@@ -451,22 +325,10 @@ DensePseudoInverseResult rank_revealing_pseudo_inverse(
         result.smallest_retained_singular_value = sigma;
     }
 
-    const Matrix pseudo_inverse =
+    const DenseMatrix pseudo_inverse =
         svd.matrixV() * sigma_inverse * svd.matrixU().transpose();
-    for (std::size_t r = 0; r < cols; ++r) {
-        for (std::size_t c = 0; c < rows; ++c) {
-            result.inverse[r * rows + c] =
-                pseudo_inverse(static_cast<Eigen::Index>(r), static_cast<Eigen::Index>(c));
-        }
-    }
+    copy_to_row_major(pseudo_inverse, result.inverse);
     return result;
-#else
-    DENSE_LINALG_CHECK(
-        false,
-        std::string(label) +
-            ": rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN");
-    return {};
-#endif
 }
 
 } // namespace math
diff --git a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
index 6c81755f4..d322ef958 100644
--- a/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
+++ b/Code/Source/solver/FE/Math/DenseLinearAlgebra.h
@@ -6,6 +6,8 @@
 
 #include "Types.h"
 
+#include <Eigen/Dense>
+
 #include <cstddef>
 #include <limits>
 #include <span>
@@ -18,7 +20,7 @@ namespace FE {
 namespace math {
 
 // Dense solve, inverse, rank, and pseudo-inverse support for FE construction
-// utilities. Matrices are row-major: matrix[row * cols + col].
+// utilities, backed by Eigen. Matrices are row-major: matrix[row * cols + col].
 [[nodiscard]] Real dense_matrix_max_abs(std::span<const Real> matrix) noexcept;
 
 [[nodiscard]] Real dense_matrix_pivot_tolerance(std::size_t rows,
@@ -57,9 +59,10 @@ struct DenseInverseResult {
 [[nodiscard]] Real dense_matrix_condition_error_threshold() noexcept;
 
 struct DenseLUSolver {
+    using DenseMatrix = Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic>;
+
     std::size_t n{0};
-    std::vector<Real> lu;
-    std::vector<std::size_t> pivots;
+    Eigen::PartialPivLU<DenseMatrix> lu;
     DenseMatrixDiagnostics diagnostics;
     Real pivot_tolerance{0};
     std::string label;
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 50f1002de..2ddb9cefa 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -6,17 +6,21 @@
 
 #include "Types.h"
 
-#include <algorithm>
-#include <array>
+#include <Eigen/Core>
+
 #include <cstddef>
 
 namespace svmp {
 namespace FE {
 namespace math {
 
-constexpr std::size_t dense_transform_blocked_min_rows() noexcept { return 32u; }
-constexpr std::size_t dense_transform_blocked_min_rhs() noexcept { return 4u; }
-
+/// \brief Apply a row-major dense matrix to a batch of right-hand sides.
+///
+/// Computes output = matrix * input, where matrix is a rows-by-cols row-major
+/// array, input is a cols-by-rhs_count row-major array whose consecutive rows
+/// start input_row_stride values apart, and output is a rows-by-rhs_count
+/// array whose rows start output_row_stride values apart. Strides may exceed
+/// rhs_count for padded layouts; padding entries are left untouched.
 inline void dense_transform_batched_row_major(
     const Real* SVMP_RESTRICT matrix,
     std::size_t rows,
@@ -30,41 +34,29 @@ inline void dense_transform_batched_row_major(
         return;
     }
 
-    if (rows < dense_transform_blocked_min_rows() ||
-        rhs_count < dense_transform_blocked_min_rhs()) {
-        for (std::size_t row = 0; row < rows; ++row) {
-            const Real* matrix_row = matrix + row * cols;
-            Real* output_row = output + row * output_row_stride;
-            for (std::size_t rhs = 0; rhs < rhs_count; ++rhs) {
-                Real value = Real(0);
-                for (std::size_t col = 0; col < cols; ++col) {
-                    value += matrix_row[col] * input[col * input_row_stride + rhs];
-                }
-                output_row[rhs] = value;
-            }
-        }
-        return;
-    }
+    using RowMajorMatrix =
+        Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using ConstMap = Eigen::Map<const RowMajorMatrix>;
+    using ConstStridedMap =
+        Eigen::Map<const RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
+    using StridedMap =
+        Eigen::Map<RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
 
-    constexpr std::size_t kRhsBlock = 32u;
-    for (std::size_t row = 0; row < rows; ++row) {
-        const Real* matrix_row = matrix + row * cols;
-        Real* output_row = output + row * output_row_stride;
-        for (std::size_t rhs_base = 0; rhs_base < rhs_count; rhs_base += kRhsBlock) {
-            const std::size_t block_size = std::min(kRhsBlock, rhs_count - rhs_base);
-            std::array<Real, kRhsBlock> accum{};
-            for (std::size_t col = 0; col < cols; ++col) {
-                const Real coeff = matrix_row[col];
-                const Real* input_row = input + col * input_row_stride + rhs_base;
-                for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
-                    accum[rhs] += coeff * input_row[rhs];
-                }
-            }
-            for (std::size_t rhs = 0; rhs < block_size; ++rhs) {
-                output_row[rhs_base + rhs] = accum[rhs];
-            }
-        }
-    }
+    const ConstMap matrix_map(matrix,
+                              static_cast<Eigen::Index>(rows),
+                              static_cast<Eigen::Index>(cols));
+    const ConstStridedMap input_map(
+        input,
+        static_cast<Eigen::Index>(cols),
+        static_cast<Eigen::Index>(rhs_count),
+        Eigen::OuterStride<>(static_cast<Eigen::Index>(input_row_stride)));
+    StridedMap output_map(
+        output,
+        static_cast<Eigen::Index>(rows),
+        static_cast<Eigen::Index>(rhs_count),
+        Eigen::OuterStride<>(static_cast<Eigen::Index>(output_row_stride)));
+
+    output_map.noalias() = matrix_map * input_map;
 }
 
 } // namespace math
diff --git a/Code/Source/solver/FE/Math/Matrix.h b/Code/Source/solver/FE/Math/Matrix.h
index f7432f38c..ce1d4a612 100644
--- a/Code/Source/solver/FE/Math/Matrix.h
+++ b/Code/Source/solver/FE/Math/Matrix.h
@@ -6,32 +6,25 @@
 
 /**
  * @file Matrix.h
- * @brief Fixed-size matrices with expression templates and specializations for FE computations
+ * @brief Fixed-size matrix types for FE computations, backed by Eigen.
  *
- * This header provides optimized fixed-size matrix operations for element-level
- * computations. Includes specialized analytical formulas for 2x2 and 3x3 matrices
- * (determinant, inverse using Cramer's rule) and Gauss elimination for larger matrices.
- * All operations use expression templates to eliminate temporaries.
+ * The FE library standardizes on Eigen for linear algebra. These aliases give
+ * element-level code a stable vocabulary type without re-exporting all of
+ * Eigen. Storage is Eigen's default (column-major); element access through
+ * operator()(row, col) is unchanged. Note that, unlike the previous in-house
+ * implementation, Eigen types are NOT zero-initialized by default
+ * construction; use Matrix::Zero() where a zeroed value is required.
  */
 
-#include "MatrixExpr.h"
 #include "Vector.h"
-#include "../Common/Types.h"
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <initializer_list>
-#include <ostream>
-#include <stdexcept>
-#include <type_traits>
+
+#include <Eigen/Core>
+
+#include <cstddef>
 
 /// \defgroup FE_MatrixMath Matrix
 /// \ingroup FE_Math
-/// \brief Fixed-size matrix types, matrix expressions, and small-matrix operations.
-///
-/// \details The Matrix submodule contains row-major fixed-size matrices used
-/// by FE kernels, expression-template support for matrix algebra, and direct
-/// determinant/inverse implementations for common element-level sizes.
+/// \brief Fixed-size matrix type aliases.
 
 namespace svmp {
 namespace FE {
@@ -43,1266 +36,9 @@ namespace math {
  * @tparam T Scalar type (float, double)
  * @tparam M Number of rows
  * @tparam N Number of columns
- *
- * Storage is row-major for cache efficiency. Memory is aligned for SIMD operations.
- * Specializations exist for 2x2, 3x3, 4x4 matrices with analytical algorithms.
  */
 template<typename T, std::size_t M, std::size_t N>
-class Matrix : public MatrixExpr<Matrix<T, M, N>> {
-    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
-    static_assert(M > 0 && N > 0, "Matrix dimensions must be positive");
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[M * N];  // Row-major, SIMD-friendly storage
-
-    // Helper to compute linear index from (i,j)
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * N + j;
-    }
-
-public:
-    // Type definitions
-    using value_type = T;
-    using size_type = std::size_t;
-    using reference = T&;
-    using const_reference = const T&;
-    using pointer = T*;
-    using const_pointer = const T*;
-
-    /**
-     * @brief Default constructor - zero initializes all elements
-     */
-    constexpr Matrix() : data_{} {}
-
-    /**
-     * @brief Fill constructor - initializes all elements with same value
-     * @param value Value to fill matrix with
-     */
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Initializer list constructor for row-wise initialization
-     * @param init Nested initializer lists {{row0}, {row1}, ...}
-     */
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= M) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= N) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    /**
-     * @brief Constructor from expression template
-     * @tparam Expr Expression type
-     * @param expr Matrix expression to evaluate
-     */
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    /**
-     * @brief Copy constructor
-     */
-    constexpr Matrix(const Matrix&) = default;
-
-    /**
-     * @brief Move constructor
-     */
-    constexpr Matrix(Matrix&&) noexcept = default;
-
-    /**
-     * @brief Copy assignment
-     */
-    Matrix& operator=(const Matrix&) = default;
-
-    /**
-     * @brief Move assignment
-     */
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    /**
-     * @brief Assignment from expression template
-     * @tparam Expr Expression type
-     * @param expr Matrix expression to evaluate
-     * @return Reference to this
-     */
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Get number of rows (compile-time constant)
-     * @return Number of rows
-     */
-    static constexpr size_type rows() { return M; }
-
-    /**
-     * @brief Get number of columns (compile-time constant)
-     * @return Number of columns
-     */
-    static constexpr size_type cols() { return N; }
-
-    /**
-     * @brief Get total number of elements
-     * @return M * N
-     */
-    static constexpr size_type size() { return M * N; }
-
-    /**
-     * @brief Element access (no bounds checking)
-     * @param i Row index
-     * @param j Column index
-     * @return Reference to element
-     */
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-
-    /**
-     * @brief Element access (no bounds checking) - const version
-     * @param i Row index
-     * @param j Column index
-     * @return Const reference to element
-     */
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    /**
-     * @brief Element access with bounds checking
-     * @param i Row index
-     * @param j Column index
-     * @return Reference to element
-     * @throws std::out_of_range if indices are out of bounds
-     */
-    T& at(size_type i, size_type j) {
-        if (i >= M || j >= N) {
-            throw std::out_of_range("Matrix::at: index out of range");
-        }
-        return (*this)(i, j);
-    }
-
-    /**
-     * @brief Element access with bounds checking - const version
-     * @param i Row index
-     * @param j Column index
-     * @return Const reference to element
-     * @throws std::out_of_range if indices are out of bounds
-     */
-    const T& at(size_type i, size_type j) const {
-        if (i >= M || j >= N) {
-            throw std::out_of_range("Matrix::at: index out of range");
-        }
-        return (*this)(i, j);
-    }
-
-    /**
-     * @brief Get row as vector
-     * @param i Row index
-     * @return Vector containing row elements
-     */
-    Vector<T, N> row(size_type i) const {
-        Vector<T, N> result;
-        for (size_type j = 0; j < N; ++j) {
-            result[j] = (*this)(i, j);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get column as vector
-     * @param j Column index
-     * @return Vector containing column elements
-     */
-    Vector<T, M> column(size_type j) const {
-        Vector<T, M> result;
-        for (size_type i = 0; i < M; ++i) {
-            result[i] = (*this)(i, j);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get column as vector (alias for column)
-     * @param j Column index
-     * @return Vector containing column elements
-     */
-    Vector<T, M> col(size_type j) const {
-        return column(j);
-    }
-
-    /**
-     * @brief Set row from vector
-     * @param i Row index
-     * @param v Vector of values
-     */
-    void set_row(size_type i, const Vector<T, N>& v) {
-        for (size_type j = 0; j < N; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    /**
-     * @brief Set column from vector
-     * @param j Column index
-     * @param v Vector of values
-     */
-    void set_column(size_type j, const Vector<T, M>& v) {
-        for (size_type i = 0; i < M; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    /**
-     * @brief Set column from vector (alias for set_column)
-     * @param j Column index
-     * @param v Vector of values
-     */
-    void set_col(size_type j, const Vector<T, M>& v) {
-        set_column(j, v);
-    }
-
-    /**
-     * @brief Get pointer to underlying data
-     * @return Pointer to first element
-     */
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    /**
-     * @brief Fill matrix with value
-     * @param value Value to fill with
-     */
-    void fill(T value) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Set all elements to zero
-     */
-    void set_zero() {
-        fill(T{0});
-    }
-
-    // Arithmetic operators
-
-    /**
-     * @brief In-place addition
-     * @param other Matrix to add
-     * @return Reference to this
-     */
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place subtraction
-     * @param other Matrix to subtract
-     * @return Reference to this
-     */
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar multiplication
-     * @param scalar Scalar to multiply by
-     * @return Reference to this
-     */
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < M * N; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar division
-     * @param scalar Scalar to divide by
-     * @return Reference to this
-     */
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    // Matrix operations
-
-    /**
-     * @brief Compute transpose
-     * @return Transposed matrix
-     */
-    Matrix<T, N, M> transpose() const {
-        Matrix<T, N, M> result;
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                result(j, i) = (*this)(i, j);
-            }
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute trace (sum of diagonal elements)
-     * @return Trace (only valid for square matrices)
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, T> trace() const {
-        T result = T(0);
-        for (size_type i = 0; i < M; ++i) {
-            result += (*this)(i, i);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute Frobenius norm squared
-     * @return Sum of squares of all elements
-     */
-    T frobenius_norm_squared() const {
-        T result = T(0);
-        for (size_type i = 0; i < M * N; ++i) {
-            result += data_[i] * data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute Frobenius norm
-     * @return Square root of sum of squares
-     */
-    T frobenius_norm() const {
-        using std::sqrt;
-        return sqrt(frobenius_norm_squared());
-    }
-
-    /**
-     * @brief Compute infinity norm (maximum absolute row sum)
-     * @return Infinity norm
-     */
-    T infinity_norm() const {
-        T max_row_sum = T(0);
-        for (size_type i = 0; i < M; ++i) {
-            T row_sum = T(0);
-            for (size_type j = 0; j < N; ++j) {
-                using std::abs;
-                row_sum += abs((*this)(i, j));
-            }
-            max_row_sum = std::max(max_row_sum, row_sum);
-        }
-        return max_row_sum;
-    }
-
-    /**
-     * @brief Compute one norm (maximum absolute column sum)
-     * @return One norm
-     */
-    T one_norm() const {
-        T max_col_sum = T(0);
-        for (size_type j = 0; j < N; ++j) {
-            T col_sum = T(0);
-            for (size_type i = 0; i < M; ++i) {
-                using std::abs;
-                col_sum += abs((*this)(i, j));
-            }
-            max_col_sum = std::max(max_col_sum, col_sum);
-        }
-        return max_col_sum;
-    }
-
-    /**
-     * @brief Get minimum element
-     * @return Minimum value
-     */
-    T min() const {
-        return *std::min_element(data_, data_ + M * N);
-    }
-
-    /**
-     * @brief Get maximum element
-     * @return Maximum value
-     */
-    T max() const {
-        return *std::max_element(data_, data_ + M * N);
-    }
-
-    /**
-     * @brief Get sum of all elements
-     * @return Sum of elements
-     */
-    T sum() const {
-        T result = T(0);
-        for (size_type i = 0; i < M * N; ++i) {
-            result += data_[i];
-        }
-        return result;
-    }
-
-    // Static factory functions
-
-    /**
-     * @brief Create zero matrix
-     * @return Matrix with all elements zero
-     */
-    static constexpr Matrix zeros() {
-        return Matrix();
-    }
-
-    /**
-     * @brief Create matrix with all elements one
-     * @return Matrix with all elements one
-     */
-    static constexpr Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    /**
-     * @brief Create identity matrix (only for square matrices)
-     * @return Identity matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    static std::enable_if_t<M2 == N2, Matrix> identity() {
-        Matrix result;
-        for (size_type i = 0; i < M; ++i) {
-            result(i, i) = T(1);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Create diagonal matrix from vector (only for square matrices)
-     * @param diag Vector of diagonal elements
-     * @return Diagonal matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    static std::enable_if_t<M2 == N2, Matrix> diagonal(const Vector<T, M>& diag) {
-        Matrix result;
-        for (size_type i = 0; i < M; ++i) {
-            result(i, i) = diag[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Create zero matrix (static factory)
-     * @return Zero matrix
-     */
-    static Matrix zero() {
-        return zeros();
-    }
-
-    // Property checking methods
-
-    /**
-     * @brief Check if matrix is symmetric (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if symmetric
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_symmetric(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = i + 1; j < N; ++j) {
-                using std::abs;
-                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Check if matrix is skew-symmetric (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if skew-symmetric
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_skew_symmetric(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            // Diagonal must be zero
-            using std::abs;
-            if (abs((*this)(i, i)) > tol) {
-                return false;
-            }
-            for (size_type j = i + 1; j < N; ++j) {
-                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Check if matrix is diagonal (only for square matrices)
-     * @param tol Tolerance for comparison
-     * @return true if diagonal
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2, bool> is_diagonal(T tol = tolerance<T>) const {
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < N; ++j) {
-                if (i != j) {
-                    using std::abs;
-                    if (abs((*this)(i, j)) > tol) {
-                        return false;
-                    }
-                }
-            }
-        }
-        return true;
-    }
-
-    // Determinant (general template, specialized for 2x2, 3x3)
-    /**
-     * @brief Compute determinant (only for square matrices)
-     * @return Determinant value
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, T> determinant() const {
-        // For 4x4 and larger, use LU decomposition
-        return determinant_lu();
-    }
-
-    // Inverse (general template, specialized for 2x2, 3x3)
-    /**
-     * @brief Compute matrix inverse (only for square matrices)
-     * @return Inverse matrix
-     */
-    template<std::size_t M2 = M, std::size_t N2 = N>
-    std::enable_if_t<M2 == N2 && M2 != 2 && M2 != 3, Matrix> inverse() const {
-        // For 4x4 and larger, use Gauss-Jordan elimination
-        return inverse_gauss_jordan();
-    }
-
-private:
-    // LU decomposition for determinant (4x4 and larger)
-    T determinant_lu() const {
-        Matrix<T, M, M> lu = *this;
-        T det = T(1);
-
-        for (size_type k = 0; k < M - 1; ++k) {
-            // Find pivot
-            size_type pivot = k;
-            T max_val = std::abs(lu(k, k));
-            for (size_type i = k + 1; i < M; ++i) {
-                T val = std::abs(lu(i, k));
-                if (val > max_val) {
-                    max_val = val;
-                    pivot = i;
-                }
-            }
-
-            // Swap rows if needed
-            if (pivot != k) {
-                for (size_type j = 0; j < M; ++j) {
-                    std::swap(lu(k, j), lu(pivot, j));
-                }
-                det = -det;  // Row swap changes sign
-            }
-
-            // Check for singularity
-            if (approx_zero(lu(k, k))) {
-                return T(0);
-            }
-
-            // Eliminate column
-            for (size_type i = k + 1; i < M; ++i) {
-                T factor = lu(i, k) / lu(k, k);
-                for (size_type j = k + 1; j < M; ++j) {
-                    lu(i, j) -= factor * lu(k, j);
-                }
-            }
-
-            det *= lu(k, k);
-        }
-        det *= lu(M - 1, M - 1);
-
-        return det;
-    }
-
-    // Gauss-Jordan elimination for inverse (4x4 and larger)
-    Matrix inverse_gauss_jordan() const {
-        Matrix<T, M, M> aug;  // Augmented matrix [A | I]
-        Matrix<T, M, M> result = Matrix::identity();
-
-        // Copy this matrix to augmented matrix
-        for (size_type i = 0; i < M; ++i) {
-            for (size_type j = 0; j < M; ++j) {
-                aug(i, j) = (*this)(i, j);
-            }
-        }
-
-        // Forward elimination with partial pivoting
-        for (size_type k = 0; k < M; ++k) {
-            // Find pivot
-            size_type pivot = k;
-            T max_val = std::abs(aug(k, k));
-            for (size_type i = k + 1; i < M; ++i) {
-                T val = std::abs(aug(i, k));
-                if (val > max_val) {
-                    max_val = val;
-                    pivot = i;
-                }
-            }
-
-            // Swap rows
-            if (pivot != k) {
-                for (size_type j = 0; j < M; ++j) {
-                    std::swap(aug(k, j), aug(pivot, j));
-                    std::swap(result(k, j), result(pivot, j));
-                }
-            }
-
-            // Check for singularity
-            if (approx_zero(aug(k, k))) {
-                throw std::runtime_error("Matrix is singular");
-            }
-
-            // Scale pivot row
-            T pivot_val = aug(k, k);
-            for (size_type j = 0; j < M; ++j) {
-                aug(k, j) /= pivot_val;
-                result(k, j) /= pivot_val;
-            }
-
-            // Eliminate column
-            for (size_type i = 0; i < M; ++i) {
-                if (i != k) {
-                    T factor = aug(i, k);
-                    for (size_type j = 0; j < M; ++j) {
-                        aug(i, j) -= factor * aug(k, j);
-                        result(i, j) -= factor * result(k, j);
-                    }
-                }
-            }
-        }
-
-        return result;
-    }
-
-    // Iterators
-public:
-    T* begin() { return data_; }
-    T* end() { return data_ + M * N; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + M * N; }
-    const T* cbegin() const { return data_; }
-    const T* cend() const { return data_ + M * N; }
-};
-
-// Specialization for 2x2 determinant (analytical formula)
-template<typename T>
-inline T determinant_2x2(const Matrix<T, 2, 2>& m) {
-    return m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0);
-}
-
-// Specialization for 2x2 inverse (Cramer's rule)
-template<typename T>
-inline Matrix<T, 2, 2> inverse_2x2(const Matrix<T, 2, 2>& m) {
-    T det = determinant_2x2(m);
-    if (approx_zero(det)) {
-        throw std::runtime_error("Matrix is singular");
-    }
-
-    T inv_det = T(1) / det;
-    return Matrix<T, 2, 2>{
-        { m(1, 1) * inv_det, -m(0, 1) * inv_det},
-        {-m(1, 0) * inv_det,  m(0, 0) * inv_det}
-    };
-}
-
-// Specialization for 3x3 determinant (Sarrus rule)
-template<typename T>
-inline T determinant_3x3(const Matrix<T, 3, 3>& m) {
-    return m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1))
-         - m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0))
-         + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
-}
-
-// Specialization for 3x3 inverse (Cramer's rule / adjugate method)
-template<typename T>
-inline Matrix<T, 3, 3> inverse_3x3(const Matrix<T, 3, 3>& m) {
-    T det = determinant_3x3(m);
-    if (approx_zero(det)) {
-        throw std::runtime_error("Matrix is singular");
-    }
-
-    T inv_det = T(1) / det;
-
-    // Compute adjugate matrix (transpose of cofactor matrix)
-    Matrix<T, 3, 3> adj;
-    adj(0, 0) =  (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1));
-    adj(0, 1) = -(m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1));
-    adj(0, 2) =  (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1));
-
-    adj(1, 0) = -(m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0));
-    adj(1, 1) =  (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0));
-    adj(1, 2) = -(m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0));
-
-    adj(2, 0) =  (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
-    adj(2, 1) = -(m(0, 0) * m(2, 1) - m(0, 1) * m(2, 0));
-    adj(2, 2) =  (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0));
-
-    return adj * inv_det;
-}
-
-/**
- * @brief Specialized fixed-size 2-by-2 matrix for element-level computations.
- * @ingroup FE_MatrixMath
- * @tparam T Scalar type.
- *
- * This specialization preserves the Matrix API while using direct formulas for
- * 2-by-2 determinant and inverse operations.
- */
-template<typename T>
-class Matrix<T, 2, 2> : public MatrixExpr<Matrix<T, 2, 2>> {
-    static constexpr std::size_t M = 2;
-    static constexpr std::size_t N = 2;
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[4];
-
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * 2 + j;
-    }
-
-public:
-    using value_type = T;
-    using size_type = std::size_t;
-
-    // Include all the same constructors and methods as the general template
-    constexpr Matrix() : data_{} {}
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] = value;
-        }
-    }
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= 2) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= 2) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 2; ++i) {
-            for (size_type j = 0; j < 2; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    constexpr Matrix(const Matrix&) = default;
-    constexpr Matrix(Matrix&&) noexcept = default;
-    Matrix& operator=(const Matrix&) = default;
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 2; ++i) {
-            for (size_type j = 0; j < 2; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    static constexpr size_type rows() { return 2; }
-    static constexpr size_type cols() { return 2; }
-    static constexpr size_type size() { return 4; }
-
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    void fill(T value) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    void set_zero() { fill(T{0}); }
-
-    void set_row(size_type i, const Vector<T, 2>& v) {
-        for (size_type j = 0; j < 2; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    void set_column(size_type j, const Vector<T, 2>& v) {
-        for (size_type i = 0; i < 2; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    void set_col(size_type j, const Vector<T, 2>& v) {
-        set_column(j, v);
-    }
-
-    Vector<T, 2> col(size_type j) const {
-        return column(j);
-    }
-
-    static Matrix zero() {
-        return zeros();
-    }
-
-    static Matrix diagonal(const Vector<T, 2>& diag) {
-        Matrix result;
-        result(0, 0) = diag[0];
-        result(1, 1) = diag[1];
-        return result;
-    }
-
-    bool is_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        return abs((*this)(0, 1) - (*this)(1, 0)) <= tol;
-    }
-
-    bool is_skew_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        // Diagonal must be zero
-        if (abs((*this)(0, 0)) > tol || abs((*this)(1, 1)) > tol) {
-            return false;
-        }
-        // Off-diagonal must be opposite
-        return abs((*this)(0, 1) + (*this)(1, 0)) <= tol;
-    }
-
-    bool is_diagonal(T tol = tolerance<T>) const {
-        using std::abs;
-        return abs((*this)(0, 1)) <= tol && abs((*this)(1, 0)) <= tol;
-    }
-
-    T frobenius_norm() const {
-        using std::sqrt;
-        T sum = T(0);
-        for (size_type i = 0; i < 4; ++i) {
-            sum += data_[i] * data_[i];
-        }
-        return sqrt(sum);
-    }
-
-    T infinity_norm() const {
-        using std::abs;
-        T row0 = abs((*this)(0, 0)) + abs((*this)(0, 1));
-        T row1 = abs((*this)(1, 0)) + abs((*this)(1, 1));
-        return std::max(row0, row1);
-    }
-
-    T one_norm() const {
-        using std::abs;
-        T col0 = abs((*this)(0, 0)) + abs((*this)(1, 0));
-        T col1 = abs((*this)(0, 1)) + abs((*this)(1, 1));
-        return std::max(col0, col1);
-    }
-
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < 4; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    Matrix<T, 2, 2> transpose() const {
-        return Matrix<T, 2, 2>{
-            {(*this)(0, 0), (*this)(1, 0)},
-            {(*this)(0, 1), (*this)(1, 1)}
-        };
-    }
-
-    T trace() const {
-        return (*this)(0, 0) + (*this)(1, 1);
-    }
-
-    static Matrix identity() {
-        Matrix result;
-        result(0, 0) = T(1);
-        result(1, 1) = T(1);
-        return result;
-    }
-
-    static Matrix zeros() {
-        return Matrix();
-    }
-
-    static Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    // Specialized 2x2 determinant
-    T determinant() const {
-        return determinant_2x2(*this);
-    }
-
-    // Specialized 2x2 inverse
-    Matrix inverse() const {
-        return inverse_2x2(*this);
-    }
-
-    Vector<T, 2> row(size_type i) const {
-        return Vector<T, 2>{(*this)(i, 0), (*this)(i, 1)};
-    }
-
-    Vector<T, 2> column(size_type j) const {
-        return Vector<T, 2>{(*this)(0, j), (*this)(1, j)};
-    }
-
-    T* begin() { return data_; }
-    T* end() { return data_ + 4; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + 4; }
-};
-
-/**
- * @brief Specialized fixed-size 3-by-3 matrix for element-level computations.
- * @ingroup FE_MatrixMath
- * @tparam T Scalar type.
- *
- * This specialization preserves the Matrix API while using direct formulas for
- * 3-by-3 determinant and inverse operations.
- */
-template<typename T>
-class Matrix<T, 3, 3> : public MatrixExpr<Matrix<T, 3, 3>> {
-    static constexpr std::size_t M = 3;
-    static constexpr std::size_t N = 3;
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[9];
-
-    static constexpr std::size_t index(std::size_t i, std::size_t j) {
-        return i * 3 + j;
-    }
-
-public:
-    using value_type = T;
-    using size_type = std::size_t;
-
-    constexpr Matrix() : data_{} {}
-    constexpr explicit Matrix(T value) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] = value;
-        }
-    }
-    constexpr Matrix(std::initializer_list<std::initializer_list<T>> init) : data_{} {
-        size_type row = 0;
-        for (auto row_init : init) {
-            if (row >= 3) break;
-            size_type col = 0;
-            for (auto val : row_init) {
-                if (col >= 3) break;
-                (*this)(row, col) = val;
-                ++col;
-            }
-            ++row;
-        }
-    }
-
-    template<typename Expr>
-    Matrix(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-    }
-
-    constexpr Matrix(const Matrix&) = default;
-    constexpr Matrix(Matrix&&) noexcept = default;
-    Matrix& operator=(const Matrix&) = default;
-    Matrix& operator=(Matrix&&) noexcept = default;
-
-    template<typename Expr>
-    Matrix& operator=(const MatrixExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                (*this)(i, j) = e(i, j);
-            }
-        }
-        return *this;
-    }
-
-    static constexpr size_type rows() { return 3; }
-    static constexpr size_type cols() { return 3; }
-    static constexpr size_type size() { return 9; }
-
-    constexpr T& operator()(size_type i, size_type j) {
-        return data_[index(i, j)];
-    }
-    constexpr const T& operator()(size_type i, size_type j) const {
-        return data_[index(i, j)];
-    }
-
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    void fill(T value) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    void set_zero() { fill(T{0}); }
-
-    void set_row(size_type i, const Vector<T, 3>& v) {
-        for (size_type j = 0; j < 3; ++j) {
-            (*this)(i, j) = v[j];
-        }
-    }
-
-    void set_column(size_type j, const Vector<T, 3>& v) {
-        for (size_type i = 0; i < 3; ++i) {
-            (*this)(i, j) = v[i];
-        }
-    }
-
-    void set_col(size_type j, const Vector<T, 3>& v) {
-        set_column(j, v);
-    }
-
-    Vector<T, 3> col(size_type j) const {
-        return column(j);
-    }
-
-    static Matrix zero() {
-        return zeros();
-    }
-
-    static Matrix diagonal(const Vector<T, 3>& diag) {
-        Matrix result;
-        result(0, 0) = diag[0];
-        result(1, 1) = diag[1];
-        result(2, 2) = diag[2];
-        return result;
-    }
-
-    bool is_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = i + 1; j < 3; ++j) {
-                if (abs((*this)(i, j) - (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    bool is_skew_symmetric(T tol = tolerance<T>) const {
-        using std::abs;
-        // Diagonal must be zero
-        for (size_type i = 0; i < 3; ++i) {
-            if (abs((*this)(i, i)) > tol) {
-                return false;
-            }
-        }
-        // Off-diagonal must be opposite
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = i + 1; j < 3; ++j) {
-                if (abs((*this)(i, j) + (*this)(j, i)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    bool is_diagonal(T tol = tolerance<T>) const {
-        using std::abs;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                if (i != j && abs((*this)(i, j)) > tol) {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-
-    T frobenius_norm() const {
-        using std::sqrt;
-        T sum = T(0);
-        for (size_type i = 0; i < 9; ++i) {
-            sum += data_[i] * data_[i];
-        }
-        return sqrt(sum);
-    }
-
-    T infinity_norm() const {
-        using std::abs;
-        T max_row_sum = T(0);
-        for (size_type i = 0; i < 3; ++i) {
-            T row_sum = T(0);
-            for (size_type j = 0; j < 3; ++j) {
-                row_sum += abs((*this)(i, j));
-            }
-            max_row_sum = std::max(max_row_sum, row_sum);
-        }
-        return max_row_sum;
-    }
-
-    T one_norm() const {
-        using std::abs;
-        T max_col_sum = T(0);
-        for (size_type j = 0; j < 3; ++j) {
-            T col_sum = T(0);
-            for (size_type i = 0; i < 3; ++i) {
-                col_sum += abs((*this)(i, j));
-            }
-            max_col_sum = std::max(max_col_sum, col_sum);
-        }
-        return max_col_sum;
-    }
-
-    Matrix& operator+=(const Matrix& other) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator-=(const Matrix& other) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    Matrix& operator*=(T scalar) {
-        for (size_type i = 0; i < 9; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    Matrix& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    Matrix<T, 3, 3> transpose() const {
-        Matrix<T, 3, 3> result;
-        for (size_type i = 0; i < 3; ++i) {
-            for (size_type j = 0; j < 3; ++j) {
-                result(j, i) = (*this)(i, j);
-            }
-        }
-        return result;
-    }
-
-    T trace() const {
-        return (*this)(0, 0) + (*this)(1, 1) + (*this)(2, 2);
-    }
-
-    static Matrix identity() {
-        Matrix result;
-        result(0, 0) = T(1);
-        result(1, 1) = T(1);
-        result(2, 2) = T(1);
-        return result;
-    }
-
-    static Matrix zeros() {
-        return Matrix();
-    }
-
-    static Matrix ones() {
-        return Matrix(T(1));
-    }
-
-    // Specialized 3x3 determinant
-    T determinant() const {
-        return determinant_3x3(*this);
-    }
-
-    // Specialized 3x3 inverse
-    Matrix inverse() const {
-        return inverse_3x3(*this);
-    }
-
-    Vector<T, 3> row(size_type i) const {
-        return Vector<T, 3>{(*this)(i, 0), (*this)(i, 1), (*this)(i, 2)};
-    }
-
-    Vector<T, 3> column(size_type j) const {
-        return Vector<T, 3>{(*this)(0, j), (*this)(1, j), (*this)(2, j)};
-    }
-
-    T* begin() { return data_; }
-    T* end() { return data_ + 9; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + 9; }
-};
+using Matrix = Eigen::Matrix<T, static_cast<int>(M), static_cast<int>(N)>;  // Eigen's compile-time dimensions are int
 
 // Type aliases for common matrix types
 template<typename T> using Matrix2x2 = Matrix<T, 2, 2>;
@@ -1323,188 +59,6 @@ using Matrix2x2f = Matrix2x2<float>;
 using Matrix3x3f = Matrix3x3<float>;
 using Matrix4x4f = Matrix4x4<float>;
 
-// Matrix-vector multiplication
-template<typename T, std::size_t M, std::size_t N>
-inline Vector<T, M> operator*(const Matrix<T, M, N>& A, const Vector<T, N>& x) {
-    Vector<T, M> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        T sum = T(0);
-        for (std::size_t j = 0; j < N; ++j) {
-            sum += A(i, j) * x[j];
-        }
-        result[i] = sum;
-    }
-    return result;
-}
-
-// Vector-matrix multiplication (row vector * matrix)
-template<typename T, std::size_t M, std::size_t N>
-inline Vector<T, N> operator*(const Vector<T, M>& x, const Matrix<T, M, N>& A) {
-    Vector<T, N> result;
-    for (std::size_t j = 0; j < N; ++j) {
-        T sum = T(0);
-        for (std::size_t i = 0; i < M; ++i) {
-            sum += x[i] * A(i, j);
-        }
-        result[j] = sum;
-    }
-    return result;
-}
-
-// Matrix-matrix multiplication
-template<typename T, std::size_t M, std::size_t N, std::size_t P>
-inline Matrix<T, M, P> operator*(const Matrix<T, M, N>& A, const Matrix<T, N, P>& B) {
-    Matrix<T, M, P> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t k = 0; k < N; ++k) {
-            T a_ik = A(i, k);
-            for (std::size_t j = 0; j < P; ++j) {
-                result(i, j) += a_ik * B(k, j);
-            }
-        }
-    }
-    return result;
-}
-
-// Free functions
-
-/**
- * @brief Compute matrix transpose
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, N, M> transpose(const Matrix<T, M, N>& m) {
-    return m.transpose();
-}
-
-/**
- * @brief Compute matrix trace
- */
-template<typename T, std::size_t N>
-inline T trace(const Matrix<T, N, N>& m) {
-    return m.trace();
-}
-
-/**
- * @brief Compute matrix determinant
- */
-template<typename T, std::size_t N>
-inline T determinant(const Matrix<T, N, N>& m) {
-    return m.determinant();
-}
-
-/**
- * @brief Compute matrix inverse
- */
-template<typename T, std::size_t N>
-inline Matrix<T, N, N> inverse(const Matrix<T, N, N>& m) {
-    return m.inverse();
-}
-
-/**
- * @brief Compute Frobenius norm
- */
-template<typename T, std::size_t M, std::size_t N>
-inline T frobenius_norm(const Matrix<T, M, N>& m) {
-    return m.frobenius_norm();
-}
-
-/**
- * @brief Component-wise absolute value
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> abs(const Matrix<T, M, N>& m) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            using std::abs;
-            result(i, j) = abs(m(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise minimum
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> min(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = std::min(a(i, j), b(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise maximum
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> max(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = std::max(a(i, j), b(i, j));
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Outer product of two vectors
- */
-template<typename T, std::size_t M, std::size_t N>
-inline Matrix<T, M, N> outer_product(const Vector<T, M>& u, const Vector<T, N>& v) {
-    Matrix<T, M, N> result;
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            result(i, j) = u[i] * v[j];
-        }
-    }
-    return result;
-}
-
-/**
- * @brief Check if two matrices are approximately equal
- */
-template<typename T, std::size_t M, std::size_t N>
-inline bool approx_equal(const Matrix<T, M, N>& a, const Matrix<T, M, N>& b, T tol = tolerance<T>) {
-    for (std::size_t i = 0; i < M; ++i) {
-        for (std::size_t j = 0; j < N; ++j) {
-            if (!approx_equal(a(i, j), b(i, j), tol)) {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-/**
- * @brief Stream output operator for matrices
- * @tparam T Scalar type
- * @tparam M Number of rows
- * @tparam N Number of columns
- * @param os Output stream
- * @param m Matrix to output
- * @return Reference to output stream
- */
-template<typename T, std::size_t M, std::size_t N>
-inline std::ostream& operator<<(std::ostream& os, const Matrix<T, M, N>& m) {
-    os << "[";
-    for (std::size_t i = 0; i < M; ++i) {
-        if (i > 0) os << "\n ";
-        os << "[";
-        for (std::size_t j = 0; j < N; ++j) {
-            if (j > 0) os << ", ";
-            os << m(i, j);
-        }
-        os << "]";
-    }
-    os << "]";
-    return os;
-}
-
 } // namespace math
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/MatrixExpr.h b/Code/Source/solver/FE/Math/MatrixExpr.h
deleted file mode 100644
index 288bbc5ca..000000000
--- a/Code/Source/solver/FE/Math/MatrixExpr.h
+++ /dev/null
@@ -1,630 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef SVMP_FE_MATH_MATRIX_EXPR_H
-#define SVMP_FE_MATH_MATRIX_EXPR_H
-
-/**
- * @file MatrixExpr.h
- * @brief Expression template infrastructure for lazy evaluation of matrix operations
- *
- * This header provides expression templates that enable compound matrix operations
- * without creating temporary objects. Operations are evaluated lazily at the point
- * of assignment, eliminating intermediate allocations and improving performance.
- */
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <type_traits>
-
-#include "VectorExpr.h"
-
-namespace svmp {
-namespace FE {
-namespace math {
-
-/**
- * @brief Base class for all matrix expressions using CRTP
- * @tparam Derived The derived expression type
- *
- * This uses the Curiously Recurring Template Pattern (CRTP) to provide
- * static polymorphism for expression templates.
- */
-template<typename Derived>
-class MatrixExpr {
-public:
-    /**
-     * @brief Get the derived expression
-     * @return Reference to the derived type
-     */
-    const Derived& derived() const {
-        return static_cast<const Derived&>(*this);
-    }
-
-    /**
-     * @brief Get the derived expression (non-const)
-     * @return Reference to the derived type
-     */
-    Derived& derived() {
-        return static_cast<Derived&>(*this);
-    }
-
-    /**
-     * @brief Access element by row and column indices
-     * @param i Row index
-     * @param j Column index
-     * @return Value at (i,j)
-     */
-    auto operator()(std::size_t i, std::size_t j) const {
-        return derived()(i, j);
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    std::size_t rows() const {
-        return derived().rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    std::size_t cols() const {
-        return derived().cols();
-    }
-};
-
-/**
- * @brief Binary expression for element-wise operations between two matrix expressions
- * @tparam LHS Left-hand side expression type
- * @tparam RHS Right-hand side expression type
- * @tparam Op Binary operation functor
- */
-template<typename LHS, typename RHS, typename Op>
-class MatrixBinaryExpr : public MatrixExpr<MatrixBinaryExpr<LHS, RHS, Op>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct binary expression
-     * @param lhs Left operand
-     * @param rhs Right operand
-     * @param op Operation to apply
-     */
-    constexpr MatrixBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
-        : lhs_(lhs), rhs_(rhs), op_(op) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Result of operation on elements at (i,j)
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return op_(lhs_(i, j), rhs_(i, j));
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return lhs_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return lhs_.cols();
-    }
-};
-
-/**
- * @brief Unary expression for element-wise operations on a single matrix expression
- * @tparam Expr Expression type
- * @tparam Op Unary operation functor
- */
-template<typename Expr, typename Op>
-class MatrixUnaryExpr : public MatrixExpr<MatrixUnaryExpr<Expr, Op>> {
-private:
-    Expr expr_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct unary expression
-     * @param expr Operand expression
-     * @param op Operation to apply
-     */
-    constexpr MatrixUnaryExpr(const Expr& expr, Op op = Op{})
-        : expr_(expr), op_(op) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Result of operation on element at (i,j)
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return op_(expr_(i, j));
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Scalar multiplication expression
- * @tparam Expr Matrix expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class MatrixScalarExpr : public MatrixExpr<MatrixScalarExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar multiplication expression
-     * @param expr Matrix expression
-     * @param scalar Scalar value
-     */
-    constexpr MatrixScalarExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Element multiplied by scalar
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(i, j) * scalar_;
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Scalar division expression
- * @tparam Expr Matrix expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class MatrixScalarDivExpr : public MatrixExpr<MatrixScalarDivExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar division expression
-     * @param expr Matrix expression
-     * @param scalar Scalar divisor
-     */
-    constexpr MatrixScalarDivExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Element divided by scalar
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(i, j) / scalar_;
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.rows();
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.cols();
-    }
-};
-
-/**
- * @brief Matrix multiplication expression (lazy evaluation)
- * @tparam LHS Left matrix expression type
- * @tparam RHS Right matrix expression type
- *
- * Computes matrix multiplication A*B lazily
- */
-template<typename LHS, typename RHS>
-class MatrixMulExpr : public MatrixExpr<MatrixMulExpr<LHS, RHS>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-
-public:
-    /**
-     * @brief Construct matrix multiplication expression
-     * @param lhs Left matrix
-     * @param rhs Right matrix
-     */
-    constexpr MatrixMulExpr(const LHS& lhs, const RHS& rhs)
-        : lhs_(lhs), rhs_(rhs) {}
-
-    /**
-     * @brief Compute element at (i,j)
-     * @param i Row index
-     * @param j Column index
-     * @return Dot product of row i of lhs and column j of rhs
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        using result_type = decltype(lhs_(0, 0) * rhs_(0, 0));
-        result_type sum = result_type{0};
-        const auto n = lhs_.cols();
-        for (std::size_t k = 0; k < n; ++k) {
-            sum += lhs_(i, k) * rhs_(k, j);
-        }
-        return sum;
-    }
-
-    /**
-     * @brief Get number of rows (from left matrix)
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return lhs_.rows();
-    }
-
-    /**
-     * @brief Get number of columns (from right matrix)
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return rhs_.cols();
-    }
-};
-
-/**
- * @brief Transpose expression (lazy evaluation)
- * @tparam Expr Matrix expression type
- */
-template<typename Expr>
-class TransposeExpr : public MatrixExpr<TransposeExpr<Expr>> {
-private:
-    Expr expr_;
-
-public:
-    /**
-     * @brief Construct transpose expression
-     * @param expr Matrix expression to transpose
-     */
-    constexpr explicit TransposeExpr(const Expr& expr)
-        : expr_(expr) {}
-
-    /**
-     * @brief Access transposed element
-     * @param i Row index (becomes column in original)
-     * @param j Column index (becomes row in original)
-     * @return Element at (j,i) of original matrix
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        return expr_(j, i);
-    }
-
-    /**
-     * @brief Get number of rows (columns of original)
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return expr_.cols();
-    }
-
-    /**
-     * @brief Get number of columns (rows of original)
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return expr_.rows();
-    }
-};
-
-/**
- * @brief Diagonal matrix expression (creates diagonal matrix from vector)
- * @tparam VecExpr Vector expression type
- */
-template<typename VecExpr>
-class DiagonalExpr : public MatrixExpr<DiagonalExpr<VecExpr>> {
-private:
-    VecExpr vec_;
-    std::size_t n_;
-
-public:
-    /**
-     * @brief Construct diagonal matrix from vector
-     * @param vec Vector of diagonal elements
-     * @param n Matrix dimension (default: vector size)
-     */
-    constexpr explicit DiagonalExpr(const VecExpr& vec, std::size_t n = 0)
-        : vec_(vec), n_(n > 0 ? n : vec.size()) {}
-
-    /**
-     * @brief Access element
-     * @param i Row index
-     * @param j Column index
-     * @return Diagonal element if i==j, zero otherwise
-     */
-    constexpr auto operator()(std::size_t i, std::size_t j) const {
-        using result_type = decltype(vec_[0]);
-        return (i == j && i < vec_.size()) ? vec_[i] : result_type{0};
-    }
-
-    /**
-     * @brief Get number of rows
-     * @return Number of rows
-     */
-    constexpr std::size_t rows() const {
-        return n_;
-    }
-
-    /**
-     * @brief Get number of columns
-     * @return Number of columns
-     */
-    constexpr std::size_t cols() const {
-        return n_;
-    }
-};
-
-/**
- * @brief Addition operator for matrix expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator+(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Add>(
-        lhs.derived(), rhs.derived(), detail::ops::Add{}
-    );
-}
-
-/**
- * @brief Subtraction operator for matrix expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator-(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Sub>(
-        lhs.derived(), rhs.derived(), detail::ops::Sub{}
-    );
-}
-
-/**
- * @brief Matrix multiplication operator
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto operator*(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixMulExpr<LHS, RHS>(lhs.derived(), rhs.derived());
-}
-
-/**
- * @brief Element-wise multiplication (Hadamard product)
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Mul>(
-        lhs.derived(), rhs.derived(), detail::ops::Mul{}
-    );
-}
-
-/**
- * @brief Element-wise division
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<LHS>, LHS> &&
-             std::is_base_of_v<MatrixExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard_div(const MatrixExpr<LHS>& lhs, const MatrixExpr<RHS>& rhs) {
-    return MatrixBinaryExpr<LHS, RHS, detail::ops::Div>(
-        lhs.derived(), rhs.derived(), detail::ops::Div{}
-    );
-}
-
-/**
- * @brief Negation operator for matrix expressions
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto operator-(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Negate>(
-        expr.derived(), detail::ops::Negate{}
-    );
-}
-
-/**
- * @brief Scalar multiplication operator (matrix * scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator*(const MatrixExpr<Expr>& expr, Scalar scalar) {
-    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar multiplication operator (scalar * matrix)
- */
-template<typename Scalar, typename Expr,
-         typename = std::enable_if_t<
-             std::is_arithmetic_v<Scalar> &&
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto operator*(Scalar scalar, const MatrixExpr<Expr>& expr) {
-    return MatrixScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar division operator (matrix / scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator/(const MatrixExpr<Expr>& expr, Scalar scalar) {
-    return MatrixScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Transpose function
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto transpose(const MatrixExpr<Expr>& expr) {
-    return TransposeExpr<Expr>(expr.derived());
-}
-
-/**
- * @brief Element-wise absolute value
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto abs(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
-}
-
-/**
- * @brief Element-wise square root
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto sqrt(const MatrixExpr<Expr>& expr) {
-    return MatrixUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
-}
-
-/**
- * @brief Compute Frobenius norm squared of matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Square of the Frobenius norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto frobenius_norm_squared(const MatrixExpr<Expr>& expr) {
-    using result_type = decltype(expr.derived()(0, 0) * expr.derived()(0, 0));
-    result_type sum = result_type{0};
-    const auto m = expr.rows();
-    const auto n = expr.cols();
-    for (std::size_t i = 0; i < m; ++i) {
-        for (std::size_t j = 0; j < n; ++j) {
-            auto val = expr.derived()(i, j);
-            sum += val * val;
-        }
-    }
-    return sum;
-}
-
-/**
- * @brief Compute Frobenius norm of matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Frobenius norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto frobenius_norm(const MatrixExpr<Expr>& expr) {
-    using std::sqrt;
-    return sqrt(frobenius_norm_squared(expr));
-}
-
-/**
- * @brief Compute trace of square matrix expression
- * @tparam Expr Matrix expression type
- * @param expr Matrix expression
- * @return Sum of diagonal elements
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<MatrixExpr<Expr>, Expr>
-         >>
-constexpr auto trace(const MatrixExpr<Expr>& expr) {
-    using result_type = decltype(expr.derived()(0, 0));
-    result_type sum = result_type{0};
-    const auto n = std::min(expr.rows(), expr.cols());
-    for (std::size_t i = 0; i < n; ++i) {
-        sum += expr.derived()(i, i);
-    }
-    return sum;
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_MATRIX_EXPR_H
diff --git a/Code/Source/solver/FE/Math/Vector.h b/Code/Source/solver/FE/Math/Vector.h
index 0ec99c81f..b234bac49 100644
--- a/Code/Source/solver/FE/Math/Vector.h
+++ b/Code/Source/solver/FE/Math/Vector.h
@@ -6,574 +6,43 @@
 
 /**
  * @file Vector.h
- * @brief Fixed-size vectors with expression templates for FE computations
+ * @brief Fixed-size vector types for FE computations, backed by Eigen.
  *
- * This header provides optimized fixed-size vector operations for element-level
- * computations. All operations use expression templates to eliminate temporaries
- * and are header-only for maximum inlining. Memory is aligned for SIMD operations.
+ * The FE library standardizes on Eigen for linear algebra. These aliases give
+ * element-level code a stable vocabulary type without re-exporting all of
+ * Eigen. Note that, unlike the previous in-house implementation, a
+ * default-constructed Eigen vector is NOT zero-initialized (by default its
+ * coefficients are uninitialized); use Vector<T, N>::Zero() to obtain zeros.
  */
 
-#include "VectorExpr.h"
-#include "../Common/Types.h"
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <initializer_list>
-#include <limits>
-#include <ostream>
-#include <stdexcept>
-#include <type_traits>
+#include <Eigen/Core>
+
+#include <cstddef>
 
 /// \defgroup FE_Math Math
 /// \ingroup FE
-/// \brief Fixed-size and dense linear algebra utilities for finite-element computations.
+/// \brief Linear algebra vocabulary types and dense utilities for finite-element computations.
 ///
-/// \details The Math module provides small fixed-size vector and matrix types
-/// used in element-level kernels, expression-template infrastructure for
-/// allocation-free algebraic expressions, and dense linear algebra utilities
-/// used by basis construction and local transforms.
+/// \details The Math module defines the fixed-size vector and matrix types
+/// used in element-level kernels (as aliases of Eigen types) and dense linear
+/// algebra utilities used by basis construction and local transforms.
 ///
 /// \defgroup FE_VectorMath Vector
 /// \ingroup FE_Math
-/// \brief Fixed-size vector types and vector expression utilities.
+/// \brief Fixed-size vector type aliases.
 
 namespace svmp {
 namespace FE {
 namespace math {
 
-template<typename T>
-inline constexpr T tolerance =
-    std::is_floating_point_v<T> ? T(1000) * std::numeric_limits<T>::epsilon() : T(0);
-
-template<typename T>
-inline bool approx_zero(T value, T tol = tolerance<T>) {
-    using std::abs;
-    return abs(value) <= tol;
-}
-
-template<typename T>
-inline bool approx_equal(T a, T b, T tol = tolerance<T>) {
-    using std::abs;
-    const T scale = std::max({abs(a), abs(b), T(1)});
-    return abs(a - b) <= tol * scale;
-}
-
 /**
- * @brief Fixed-size vector for element-level computations
+ * @brief Fixed-size column vector for element-level computations
  * @ingroup FE_VectorMath
  * @tparam T Scalar type (float, double)
  * @tparam N Vector dimension
- *
- * This class provides small vector operations optimized for
- * compile-time known dimensions. Memory is aligned for SIMD operations.
  */
 template<typename T, std::size_t N>
-class Vector : public VectorExpr<Vector<T, N>> {
-    static_assert(std::is_arithmetic_v<T>, "T must be an arithmetic type");
-    static_assert(N > 0, "Vector dimension must be positive");
-
-private:
-    alignas(kFEFixedObjectAlignmentBytes) T data_[N];  // SIMD-friendly alignment
-
-public:
-    // Type definitions
-    using value_type = T;
-    using size_type = std::size_t;
-    using reference = T&;
-    using const_reference = const T&;
-    using pointer = T*;
-    using const_pointer = const T*;
-
-    /**
-     * @brief Default constructor - zero initializes all components
-     */
-    constexpr Vector() : data_{} {}
-
-    /**
-     * @brief Fill constructor - initializes all components with same value
-     * @param value Value to fill vector with
-     */
-    constexpr explicit Vector(T value) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Initializer list constructor
-     * @param init List of values
-     */
-    constexpr Vector(std::initializer_list<T> init) : data_{} {
-        auto it = init.begin();
-        for (size_type i = 0; i < N && it != init.end(); ++i, ++it) {
-            data_[i] = *it;
-        }
-    }
-
-    /**
-     * @brief Constructor from expression template
-     * @tparam Expr Expression type
-     * @param expr Vector expression to evaluate
-     */
-    template<typename Expr>
-    Vector(const VectorExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = e[i];
-        }
-    }
-
-    /**
-     * @brief Copy constructor
-     */
-    constexpr Vector(const Vector&) = default;
-
-    /**
-     * @brief Move constructor
-     */
-    constexpr Vector(Vector&&) noexcept = default;
-
-    /**
-     * @brief Copy assignment
-     */
-    Vector& operator=(const Vector&) = default;
-
-    /**
-     * @brief Move assignment
-     */
-    Vector& operator=(Vector&&) noexcept = default;
-
-    /**
-     * @brief Assignment from expression template
-     * @tparam Expr Expression type
-     * @param expr Vector expression to evaluate
-     * @return Reference to this
-     */
-    template<typename Expr>
-    Vector& operator=(const VectorExpr<Expr>& expr) {
-        const auto& e = expr.derived();
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = e[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Get vector size (compile-time constant)
-     * @return Number of elements
-     */
-    static constexpr size_type size() { return N; }
-
-    /**
-     * @brief Element access (no bounds checking)
-     * @param i Element index
-     * @return Reference to element
-     */
-    constexpr T& operator[](size_type i) {
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access (no bounds checking) - const version
-     * @param i Element index
-     * @return Const reference to element
-     */
-    constexpr const T& operator[](size_type i) const {
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access with bounds checking
-     * @param i Element index
-     * @return Reference to element
-     * @throws std::out_of_range if i >= N
-     */
-    T& at(size_type i) {
-        if (i >= N) {
-            throw std::out_of_range("Vector::at: index out of range");
-        }
-        return data_[i];
-    }
-
-    /**
-     * @brief Element access with bounds checking - const version
-     * @param i Element index
-     * @return Const reference to element
-     * @throws std::out_of_range if i >= N
-     */
-    const T& at(size_type i) const {
-        if (i >= N) {
-            throw std::out_of_range("Vector::at: index out of range");
-        }
-        return data_[i];
-    }
-
-    /**
-     * @brief Access first element
-     * @return Reference to first element
-     */
-    T& front() { return data_[0]; }
-    const T& front() const { return data_[0]; }
-
-    /**
-     * @brief Access last element
-     * @return Reference to last element
-     */
-    T& back() { return data_[N-1]; }
-    const T& back() const { return data_[N-1]; }
-
-    /**
-     * @brief Get pointer to underlying data
-     * @return Pointer to first element
-     */
-    T* data() { return data_; }
-    const T* data() const { return data_; }
-
-    /**
-     * @brief Fill vector with value
-     * @param value Value to fill with
-     */
-    void fill(T value) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] = value;
-        }
-    }
-
-    /**
-     * @brief Set all components to zero
-     */
-    void set_zero() {
-        fill(T{0});
-    }
-
-    // Arithmetic operators
-
-    /**
-     * @brief In-place addition
-     * @param other Vector to add
-     * @return Reference to this
-     */
-    Vector& operator+=(const Vector& other) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] += other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place subtraction
-     * @param other Vector to subtract
-     * @return Reference to this
-     */
-    Vector& operator-=(const Vector& other) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] -= other.data_[i];
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar multiplication
-     * @param scalar Scalar to multiply by
-     * @return Reference to this
-     */
-    Vector& operator*=(T scalar) {
-        for (size_type i = 0; i < N; ++i) {
-            data_[i] *= scalar;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief In-place scalar division
-     * @param scalar Scalar to divide by
-     * @return Reference to this
-     */
-    Vector& operator/=(T scalar) {
-        const T inv = T(1) / scalar;
-        return (*this) *= inv;
-    }
-
-    // Vector operations
-
-    /**
-     * @brief Compute dot product
-     * @param other Other vector
-     * @return Dot product
-     */
-    T dot(const Vector& other) const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            result += data_[i] * other.data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute squared Euclidean norm
-     * @return Squared norm
-     */
-    T norm_squared() const {
-        return dot(*this);
-    }
-
-    /**
-     * @brief Compute Euclidean norm
-     * @return Norm
-     */
-    T norm() const {
-        using std::sqrt;
-        return sqrt(norm_squared());
-    }
-
-    /**
-     * @brief Get normalized vector
-     * @return Unit vector in same direction
-     */
-    Vector normalized() const {
-        const T n = norm();
-        if (approx_zero(n)) {
-            return Vector();  // Return zero vector
-        }
-        return (*this) / n;
-    }
-
-    /**
-     * @brief Normalize this vector in place
-     * @return Reference to this
-     */
-    Vector& normalize() {
-        const T n = norm();
-        if (!approx_zero(n)) {
-            (*this) /= n;
-        }
-        return *this;
-    }
-
-    /**
-     * @brief Compute L1 norm (Manhattan norm)
-     * @return Sum of absolute values
-     */
-    T norm_l1() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            result += abs(data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Compute L-infinity norm (maximum norm)
-     * @return Maximum absolute value
-     */
-    T norm_inf() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            result = std::max(result, abs(data_[i]));
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get minimum component
-     * @return Minimum value
-     */
-    T min() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result = std::min(result, data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get maximum component
-     * @return Maximum value
-     */
-    T max() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result = std::max(result, data_[i]);
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get sum of all components
-     * @return Sum of components
-     */
-    T sum() const {
-        T result = T(0);
-        for (size_type i = 0; i < N; ++i) {
-            result += data_[i];
-        }
-        return result;
-    }
-
-    /**
-     * @brief Get product of all components
-     * @return Product of components
-     */
-    T product() const {
-        T result = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            result *= data_[i];
-        }
-        return result;
-    }
-
-    // Static factory functions
-
-    /**
-     * @brief Create zero vector
-     * @return Vector with all components zero
-     */
-    static constexpr Vector zeros() {
-        return Vector();
-    }
-
-    /**
-     * @brief Create vector with all components one
-     * @return Vector with all components one
-     */
-    static constexpr Vector ones() {
-        return Vector(T(1));
-    }
-
-    /**
-     * @brief Create unit vector along axis
-     * @param axis Axis index (0-based)
-     * @return Unit vector
-     */
-    static Vector unit(size_type axis) {
-        Vector v;
-        if (axis < N) {
-            v[axis] = T(1);
-        }
-        return v;
-    }
-
-    /**
-     * @brief Create basis vector (alias for unit)
-     * @param i Axis index (0-based)
-     * @return Basis vector
-     */
-    static Vector basis(size_type i) {
-        return unit(i);
-    }
-
-    /**
-     * @brief Create zero vector (alias for zeros)
-     * @return Zero vector
-     */
-    static constexpr Vector zero() {
-        return zeros();
-    }
-
-    /**
-     * @brief Get index of minimum element
-     * @return Index of minimum value
-     */
-    size_type min_index() const {
-        size_type idx = 0;
-        T min_val = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            if (data_[i] < min_val) {
-                min_val = data_[i];
-                idx = i;
-            }
-        }
-        return idx;
-    }
-
-    /**
-     * @brief Get index of maximum element
-     * @return Index of maximum value
-     */
-    size_type max_index() const {
-        size_type idx = 0;
-        T max_val = data_[0];
-        for (size_type i = 1; i < N; ++i) {
-            if (data_[i] > max_val) {
-                max_val = data_[i];
-                idx = i;
-            }
-        }
-        return idx;
-    }
-
-    /**
-     * @brief Compute mean of all components
-     * @return Average value
-     */
-    T mean() const {
-        return sum() / static_cast<T>(N);
-    }
-
-    /**
-     * @brief Cross product for 3D vectors
-     * @param other Other vector
-     * @return Cross product
-     * @note Only available for 3D vectors
-     */
-    template<typename U = T>
-    std::enable_if_t<N == 3, Vector<U, 3>> cross(const Vector<U, 3>& other) const {
-        return Vector<U, 3>{
-            data_[1] * other[2] - data_[2] * other[1],
-            data_[2] * other[0] - data_[0] * other[2],
-            data_[0] * other[1] - data_[1] * other[0]
-        };
-    }
-
-    /**
-     * @brief Check if vectors are approximately equal
-     * @param other Other vector
-     * @param tol Tolerance
-     * @return true if equal within tolerance
-     */
-    bool approx_equal(const Vector& other, T tol = tolerance<T>) const {
-        for (size_type i = 0; i < N; ++i) {
-            using std::abs;
-            if (abs(data_[i] - other.data_[i]) > tol) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Equality comparison
-     * @param other Other vector
-     * @return true if exactly equal
-     */
-    bool operator==(const Vector& other) const {
-        for (size_type i = 0; i < N; ++i) {
-            if (data_[i] != other.data_[i]) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * @brief Inequality comparison
-     * @param other Other vector
-     * @return true if not equal
-     */
-    bool operator!=(const Vector& other) const {
-        return !(*this == other);
-    }
-
-    // Iterators
-    T* begin() { return data_; }
-    T* end() { return data_ + N; }
-    const T* begin() const { return data_; }
-    const T* end() const { return data_ + N; }
-    const T* cbegin() const { return data_; }
-    const T* cend() const { return data_ + N; }
-};
+using Vector = Eigen::Matrix<T, static_cast<int>(N), 1>;
 
 // Type aliases for common vector types
 template<typename T> using Vector2 = Vector<T, 2>;
@@ -595,269 +64,6 @@ using Vector2i = Vector2<int>;
 using Vector3i = Vector3<int>;
 using Vector4i = Vector4<int>;
 
-/**
- * @brief 3D Cross product
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @return Cross product a × b
- */
-template<typename T>
-inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
-    return Vector3<T>{
-        a[1] * b[2] - a[2] * b[1],
-        a[2] * b[0] - a[0] * b[2],
-        a[0] * b[1] - a[1] * b[0]
-    };
-}
-
-/**
- * @brief 2D Cross product (returns scalar - z component of 3D cross)
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @return Scalar cross product
- */
-template<typename T>
-inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
-    return a[0] * b[1] - a[1] * b[0];
-}
-
-/**
- * @brief Triple scalar product (a · (b × c))
- * @tparam T Scalar type
- * @param a First vector
- * @param b Second vector
- * @param c Third vector
- * @return Scalar triple product
- */
-template<typename T>
-inline T triple_product(const Vector3<T>& a, const Vector3<T>& b, const Vector3<T>& c) {
-    return a.dot(cross(b, c));
-}
-
-// Free functions for common operations
-
-/**
- * @brief Compute dot product
- */
-template<typename T, std::size_t N>
-inline T dot(const Vector<T, N>& a, const Vector<T, N>& b) {
-    return a.dot(b);
-}
-
-/**
- * @brief Compute Euclidean norm
- */
-template<typename T, std::size_t N>
-inline T norm(const Vector<T, N>& v) {
-    return v.norm();
-}
-
-/**
- * @brief Compute squared Euclidean norm
- */
-template<typename T, std::size_t N>
-inline T norm_squared(const Vector<T, N>& v) {
-    return v.norm_squared();
-}
-
-/**
- * @brief Get normalized vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> normalize(const Vector<T, N>& v) {
-    return v.normalized();
-}
-
-/**
- * @brief Component-wise absolute value
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> abs(const Vector<T, N>& v) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        using std::abs;
-        result[i] = abs(v[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise minimum
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> min(const Vector<T, N>& a, const Vector<T, N>& b) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::min(a[i], b[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise maximum
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> max(const Vector<T, N>& a, const Vector<T, N>& b) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::max(a[i], b[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Component-wise clamp
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> clamp(const Vector<T, N>& v, const Vector<T, N>& min_v, const Vector<T, N>& max_v) {
-    Vector<T, N> result;
-    for (std::size_t i = 0; i < N; ++i) {
-        result[i] = std::clamp(v[i], min_v[i], max_v[i]);
-    }
-    return result;
-}
-
-/**
- * @brief Linear interpolation between vectors
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param t Interpolation parameter [0, 1]
- * @param a Start vector (at t=0)
- * @param b End vector (at t=1)
- * @return Interpolated vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> lerp(T t, const Vector<T, N>& a, const Vector<T, N>& b) {
-    return a + t * (b - a);
-}
-
-/**
- * @brief Spherical linear interpolation (for unit vectors)
- * @tparam T Scalar type
- * @param t Interpolation parameter [0, 1]
- * @param a Start unit vector
- * @param b End unit vector
- * @return Interpolated unit vector
- */
-template<typename T>
-inline Vector3<T> slerp(T t, const Vector3<T>& a, const Vector3<T>& b) {
-    T cos_angle = a.dot(b);
-
-    // Handle numerical issues
-    cos_angle = std::clamp(cos_angle, T(-1), T(1));
-
-    // If vectors are nearly parallel, use linear interpolation
-    if (cos_angle > T(0.9995)) {
-        return normalize(lerp(t, a, b));
-    }
-
-    T angle = std::acos(cos_angle);
-    T sin_angle = std::sin(angle);
-
-    T t0 = std::sin((T(1) - t) * angle) / sin_angle;
-    T t1 = std::sin(t * angle) / sin_angle;
-
-    return t0 * a + t1 * b;
-}
-
-/**
- * @brief Reflect vector about normal
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Incident vector
- * @param n Normal vector (should be unit)
- * @return Reflected vector
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> reflect(const Vector<T, N>& v, const Vector<T, N>& n) {
-    return v - T(2) * dot(v, n) * n;
-}
-
-/**
- * @brief Project vector onto another vector
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Vector to project
- * @param onto Vector to project onto
- * @return Projection of v onto 'onto'
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> project(const Vector<T, N>& v, const Vector<T, N>& onto) {
-    T denom = onto.norm_squared();
-    if (approx_zero(denom)) {
-        return Vector<T, N>::zeros();
-    }
-    return (dot(v, onto) / denom) * onto;
-}
-
-/**
- * @brief Get perpendicular component of vector
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param v Vector
- * @param direction Direction to remove
- * @return Component of v perpendicular to direction
- */
-template<typename T, std::size_t N>
-inline Vector<T, N> perpendicular(const Vector<T, N>& v, const Vector<T, N>& direction) {
-    return v - project(v, direction);
-}
-
-/**
- * @brief Compute angle between two vectors
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param a First vector
- * @param b Second vector
- * @return Angle in radians [0, π]
- */
-template<typename T, std::size_t N>
-inline T angle(const Vector<T, N>& a, const Vector<T, N>& b) {
-    T cos_angle = dot(a, b) / (norm(a) * norm(b));
-    cos_angle = std::clamp(cos_angle, T(-1), T(1));
-    return std::acos(cos_angle);
-}
-
-/**
- * @brief Check if two vectors are approximately equal
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param a First vector
- * @param b Second vector
- * @param tol Tolerance
- * @return true if vectors are equal within tolerance
- */
-template<typename T, std::size_t N>
-inline bool approx_equal(const Vector<T, N>& a, const Vector<T, N>& b, T tol = tolerance<T>) {
-    for (std::size_t i = 0; i < N; ++i) {
-        if (!approx_equal(a[i], b[i], tol)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-/**
- * @brief Stream output operator
- * @tparam T Scalar type
- * @tparam N Vector dimension
- * @param os Output stream
- * @param v Vector to output
- * @return Reference to output stream
- */
-template<typename T, std::size_t N>
-inline std::ostream& operator<<(std::ostream& os, const Vector<T, N>& v) {
-    os << "[";
-    for (std::size_t i = 0; i < N; ++i) {
-        if (i > 0) os << ", ";
-        os << v[i];
-    }
-    os << "]";
-    return os;
-}
-
 } // namespace math
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Math/VectorExpr.h b/Code/Source/solver/FE/Math/VectorExpr.h
deleted file mode 100644
index aa712dd63..000000000
--- a/Code/Source/solver/FE/Math/VectorExpr.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef SVMP_FE_MATH_VECTOR_EXPR_H
-#define SVMP_FE_MATH_VECTOR_EXPR_H
-
-/**
- * @file VectorExpr.h
- * @brief Expression template infrastructure for lazy evaluation of vector operations
- *
- * This header provides expression templates that enable compound vector operations
- * without creating temporary objects. Operations are evaluated lazily at the point
- * of assignment, eliminating intermediate allocations and improving performance.
- */
-
-#include <cmath>
-#include <cstddef>
-#include <type_traits>
-
-namespace svmp {
-namespace FE {
-namespace math {
-namespace detail {
-namespace ops {
-
-struct Add {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a + b;
-    }
-};
-
-struct Sub {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a - b;
-    }
-};
-
-struct Mul {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a * b;
-    }
-};
-
-struct Div {
-    template<typename T1, typename T2>
-    constexpr auto operator()(const T1& a, const T2& b) const {
-        return a / b;
-    }
-};
-
-struct Negate {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        return -a;
-    }
-};
-
-struct Abs {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::abs;
-        return abs(a);
-    }
-};
-
-struct Sqrt {
-    template<typename T>
-    constexpr auto operator()(const T& a) const {
-        using std::sqrt;
-        return sqrt(a);
-    }
-};
-
-} // namespace ops
-} // namespace detail
-
-/**
- * @brief Base class for all vector expressions using CRTP
- * @tparam Derived The derived expression type
- *
- * This uses the Curiously Recurring Template Pattern (CRTP) to provide
- * static polymorphism for expression templates.
- */
-template<typename Derived>
-class VectorExpr {
-public:
-    /**
-     * @brief Get the derived expression
-     * @return Reference to the derived type
-     */
-    const Derived& derived() const {
-        return static_cast<const Derived&>(*this);
-    }
-
-    /**
-     * @brief Get the derived expression (non-const)
-     * @return Reference to the derived type
-     */
-    Derived& derived() {
-        return static_cast<Derived&>(*this);
-    }
-
-    /**
-     * @brief Access element by index
-     * @param i Element index
-     * @return Value at index i
-     */
-    auto operator[](std::size_t i) const {
-        return derived()[i];
-    }
-
-    /**
-     * @brief Get the size of the vector expression
-     * @return Number of elements
-     */
-    std::size_t size() const {
-        return derived().size();
-    }
-};
-
-/**
- * @brief Binary expression for element-wise operations between two vector expressions
- * @tparam LHS Left-hand side expression type
- * @tparam RHS Right-hand side expression type
- * @tparam Op Binary operation functor
- */
-template<typename LHS, typename RHS, typename Op>
-class VectorBinaryExpr : public VectorExpr<VectorBinaryExpr<LHS, RHS, Op>> {
-private:
-    LHS lhs_;
-    RHS rhs_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct binary expression
-     * @param lhs Left operand
-     * @param rhs Right operand
-     * @param op Operation to apply
-     */
-    constexpr VectorBinaryExpr(const LHS& lhs, const RHS& rhs, Op op = Op{})
-        : lhs_(lhs), rhs_(rhs), op_(op) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Result of operation on elements at index i
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return op_(lhs_[i], rhs_[i]);
-    }
-
-    /**
-     * @brief Get size of expression (from left operand)
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return lhs_.size();
-    }
-};
-
-/**
- * @brief Unary expression for element-wise operations on a single vector expression
- * @tparam Expr Expression type
- * @tparam Op Unary operation functor
- */
-template<typename Expr, typename Op>
-class VectorUnaryExpr : public VectorExpr<VectorUnaryExpr<Expr, Op>> {
-private:
-    Expr expr_;
-    Op op_;
-
-public:
-    /**
-     * @brief Construct unary expression
-     * @param expr Operand expression
-     * @param op Operation to apply
-     */
-    constexpr VectorUnaryExpr(const Expr& expr, Op op = Op{})
-        : expr_(expr), op_(op) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Result of operation on element at index i
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return op_(expr_[i]);
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Scalar multiplication expression
- * @tparam Expr Vector expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class VectorScalarExpr : public VectorExpr<VectorScalarExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar multiplication expression
-     * @param expr Vector expression
-     * @param scalar Scalar value
-     */
-    constexpr VectorScalarExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Element multiplied by scalar
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return expr_[i] * scalar_;
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Scalar division expression
- * @tparam Expr Vector expression type
- * @tparam Scalar Scalar type
- */
-template<typename Expr, typename Scalar>
-class VectorScalarDivExpr : public VectorExpr<VectorScalarDivExpr<Expr, Scalar>> {
-private:
-    Expr expr_;
-    Scalar scalar_;
-
-public:
-    /**
-     * @brief Construct scalar division expression
-     * @param expr Vector expression
-     * @param scalar Scalar divisor
-     */
-    constexpr VectorScalarDivExpr(const Expr& expr, Scalar scalar)
-        : expr_(expr), scalar_(scalar) {}
-
-    /**
-     * @brief Access element at index
-     * @param i Element index
-     * @return Element divided by scalar
-     */
-    constexpr auto operator[](std::size_t i) const {
-        return expr_[i] / scalar_;
-    }
-
-    /**
-     * @brief Get size of expression
-     * @return Number of elements
-     */
-    constexpr std::size_t size() const {
-        return expr_.size();
-    }
-};
-
-/**
- * @brief Addition operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto operator+(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Add>(
-        lhs.derived(), rhs.derived(), detail::ops::Add{}
-    );
-}
-
-/**
- * @brief Subtraction operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto operator-(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Sub>(
-        lhs.derived(), rhs.derived(), detail::ops::Sub{}
-    );
-}
-
-/**
- * @brief Element-wise multiplication operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Mul>(
-        lhs.derived(), rhs.derived(), detail::ops::Mul{}
-    );
-}
-
-/**
- * @brief Element-wise division operator for vector expressions
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto hadamard_div(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    return VectorBinaryExpr<LHS, RHS, detail::ops::Div>(
-        lhs.derived(), rhs.derived(), detail::ops::Div{}
-    );
-}
-
-/**
- * @brief Negation operator for vector expressions
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto operator-(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Negate>(
-        expr.derived(), detail::ops::Negate{}
-    );
-}
-
-/**
- * @brief Scalar multiplication operator (vector * scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator*(const VectorExpr<Expr>& expr, Scalar scalar) {
-    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar multiplication operator (scalar * vector)
- */
-template<typename Scalar, typename Expr,
-         typename = std::enable_if_t<
-             std::is_arithmetic_v<Scalar> &&
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto operator*(Scalar scalar, const VectorExpr<Expr>& expr) {
-    return VectorScalarExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Scalar division operator (vector / scalar)
- */
-template<typename Expr, typename Scalar,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr> &&
-             std::is_arithmetic_v<Scalar>
-         >>
-constexpr auto operator/(const VectorExpr<Expr>& expr, Scalar scalar) {
-    return VectorScalarDivExpr<Expr, Scalar>(expr.derived(), scalar);
-}
-
-/**
- * @brief Element-wise absolute value
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto abs(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Abs>(expr.derived(), detail::ops::Abs{});
-}
-
-/**
- * @brief Element-wise square root
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto sqrt(const VectorExpr<Expr>& expr) {
-    return VectorUnaryExpr<Expr, detail::ops::Sqrt>(expr.derived(), detail::ops::Sqrt{});
-}
-
-/**
- * @brief Dot product for vector expressions
- * @tparam LHS Left vector expression type
- * @tparam RHS Right vector expression type
- * @param lhs Left operand
- * @param rhs Right operand
- * @return Dot product result
- */
-template<typename LHS, typename RHS,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<LHS>, LHS> &&
-             std::is_base_of_v<VectorExpr<RHS>, RHS>
-         >>
-constexpr auto dot(const VectorExpr<LHS>& lhs, const VectorExpr<RHS>& rhs) {
-    using result_type = decltype(lhs.derived()[0] * rhs.derived()[0]);
-    result_type sum = result_type{0};
-    const auto n = lhs.size();
-    for (std::size_t i = 0; i < n; ++i) {
-        sum += lhs.derived()[i] * rhs.derived()[i];
-    }
-    return sum;
-}
-
-/**
- * @brief Compute norm squared of vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Square of the Euclidean norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto norm_squared(const VectorExpr<Expr>& expr) {
-    return dot(expr, expr);
-}
-
-/**
- * @brief Compute norm of vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Euclidean norm
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto norm(const VectorExpr<Expr>& expr) {
-    using std::sqrt;
-    return sqrt(norm_squared(expr));
-}
-
-/**
- * @brief Normalize vector expression
- * @tparam Expr Vector expression type
- * @param expr Vector expression
- * @return Normalized vector expression
- */
-template<typename Expr,
-         typename = std::enable_if_t<
-             std::is_base_of_v<VectorExpr<Expr>, Expr>
-         >>
-constexpr auto normalize(const VectorExpr<Expr>& expr) {
-    return expr / norm(expr);
-}
-
-} // namespace math
-} // namespace FE
-} // namespace svmp
-
-#endif // SVMP_FE_MATH_VECTOR_EXPR_H
diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index eb6d35106..60fcddf81 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -28,8 +28,10 @@
 
 #include <array>
 #include <functional>
-#include <math.h> 
+#include <map>
+#include <math.h>
 #include <memory>
+#include <mutex>
 #include <optional>
 #include <span>
 #include <string>
@@ -55,12 +57,6 @@ namespace {
 namespace fe = svmp::FE;
 namespace febasis = svmp::FE::basis;
 
-struct BasisSelection {
-  fe::ElementType element;
-  fe::BasisType basis;
-  int order;
-};
-
 std::string solver_element_name(consts::ElementType eType)
 {
   auto it = consts::element_type_to_string.find(eType);
@@ -70,34 +66,42 @@ std::string solver_element_name(consts::ElementType eType)
   return "unknown (" + std::to_string(static_cast<int>(eType)) + ")";
 }
 
-std::optional<BasisSelection> to_basis_selection(consts::ElementType eType)
+/// Translate a solver element type into its FE library counterpart; returns
+/// std::nullopt for types without an FE basis mapping (NA, PNT, NRB). This is
+/// a pure renaming between enum vocabularies: the FE library owns the basis
+/// family and polynomial order (basis_factory::default_basis_request). The
+/// switch deliberately has no default case so that -Wswitch builds flag any
+/// newly added solver element type that is missing a mapping here.
+std::optional<fe::ElementType> to_fe_element_type(consts::ElementType eType)
 {
-  static constexpr std::array supported{
-      BasisSelection{fe::ElementType::Line2,     fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Line3,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Triangle3, fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Triangle6, fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Quad4,     fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Quad8,     fe::BasisType::Serendipity, 2},
-      BasisSelection{fe::ElementType::Quad9,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Tetra4,    fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Tetra10,   fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Hex8,      fe::BasisType::Lagrange,    1},
-      BasisSelection{fe::ElementType::Hex20,     fe::BasisType::Serendipity, 2},
-      BasisSelection{fe::ElementType::Hex27,     fe::BasisType::Lagrange,    2},
-      BasisSelection{fe::ElementType::Wedge6,    fe::BasisType::Lagrange,    1},
-  };
-
-  const int index = static_cast<int>(eType) - static_cast<int>(consts::ElementType::LIN1);
-  if (index >= 0 && static_cast<std::size_t>(index) < supported.size()) {
-    return supported[static_cast<std::size_t>(index)];
+  switch (eType) {
+    case consts::ElementType::LIN1:  return fe::ElementType::Line2;
+    case consts::ElementType::LIN2:  return fe::ElementType::Line3;
+    case consts::ElementType::TRI3:  return fe::ElementType::Triangle3;
+    case consts::ElementType::TRI6:  return fe::ElementType::Triangle6;
+    case consts::ElementType::QUD4:  return fe::ElementType::Quad4;
+    case consts::ElementType::QUD8:  return fe::ElementType::Quad8;
+    case consts::ElementType::QUD9:  return fe::ElementType::Quad9;
+    case consts::ElementType::TET4:  return fe::ElementType::Tetra4;
+    case consts::ElementType::TET10: return fe::ElementType::Tetra10;
+    case consts::ElementType::HEX8:  return fe::ElementType::Hex8;
+    case consts::ElementType::HEX20: return fe::ElementType::Hex20;
+    case consts::ElementType::HEX27: return fe::ElementType::Hex27;
+    case consts::ElementType::WDG:   return fe::ElementType::Wedge6;
+
+    // No FE basis mapping: points use dedicated shape data in get_gnn and
+    // NURBS are outside the current FE Basis scope.
+    case consts::ElementType::NA:
+    case consts::ElementType::PNT:
+    case consts::ElementType::NRB:
+      return std::nullopt;
   }
   return std::nullopt;
 }
 
 bool use_basis_adapter_for(consts::ElementType eType)
 {
-  return to_basis_selection(eType).has_value();
+  return to_fe_element_type(eType).has_value();
 }
 
 bool supports_face_basis_adapter_for(consts::ElementType eType)
@@ -110,23 +114,36 @@ bool supports_face_basis_adapter_for(consts::ElementType eType)
     case consts::ElementType::QUD4:
     case consts::ElementType::QUD8:
     case consts::ElementType::QUD9:
-      return to_basis_selection(eType).has_value();
+      return use_basis_adapter_for(eType);
     default:
       return false;
   }
 }
 
-std::shared_ptr<febasis::BasisFunction> make_basis_for_solver_element(consts::ElementType eType)
+/// Return the shared FE basis for a solver element type, constructing it on
+/// first use. Basis construction is not free (node-lattice generation, and a
+/// Vandermonde inversion for quadrilateral serendipity), while callers invoke
+/// this per Gauss point or per probe point, so instances are cached per
+/// element type. Sharing is safe: bases are immutable after construction,
+/// evaluation is const, and BasisFunction scratch state is thread_local.
+const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType)
 {
-  auto selection = to_basis_selection(eType);
-  if (!selection) {
+  static std::mutex cache_mutex;  // serializes every lookup and insertion below
+  static std::map<consts::ElementType, std::shared_ptr<febasis::BasisFunction>> cache;
+  // Reject unmapped element types before taking the lock or touching the cache.
+  const auto fe_type = to_fe_element_type(eType);  // empty when eType has no FE mapping
+  if (!fe_type) {
     throw febasis::BasisElementCompatibilityException(
         "No FE Basis selection for solver element " + solver_element_name(eType),
         __FILE__, __LINE__, __func__);
   }
 
-  return febasis::basis_factory::create(
-      {selection->element, selection->basis, selection->order});
+  const std::lock_guard<std::mutex> lock(cache_mutex);  // held until the function returns
+  auto it = cache.find(eType);
+  if (it == cache.end()) {
+    it = cache.emplace(eType, febasis::basis_factory::create_default_for(*fe_type)).first;
+  }
+  return *it->second;  // entries are never erased here, so the reference outlives the lock
 }
 
 std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
@@ -192,7 +209,9 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
         __FILE__, __LINE__, __func__);
   }
 
-  fe::math::Vector<fe::Real, 3> point{};
+  // Inactive trailing components must be zero for lower-dimensional elements;
+  // Eigen-backed vectors are not zero-initialized by default construction.
+  fe::math::Vector<fe::Real, 3> point = fe::math::Vector<fe::Real, 3>::Zero();
   for (int d = 0; d < basis.dimension(); ++d) {
     point[static_cast<std::size_t>(d)] = xi(d, g);
   }
@@ -250,19 +269,19 @@ void evaluate_basis_values_and_gradients(const int insd,
                                          Array<double>& N,
                                          Array3<double>& Nx)
 {
-  auto basis = make_basis_for_solver_element(eType);
-  if (insd < basis->dimension()) {
+  const auto& basis = basis_for_solver_element(eType);
+  if (insd < basis.dimension()) {
     throw febasis::BasisConfigurationException(
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
         __FILE__, __LINE__, __func__);
   }
 
-  const auto point = make_basis_point(*basis, g, xi);
+  const auto point = make_basis_point(basis, g, xi);
   std::vector<fe::Real> values;
   std::vector<febasis::Gradient> gradients;
-  basis->evaluate_values(point, values);
-  basis->evaluate_gradients(point, gradients);
+  basis.evaluate_values(point, values);
+  basis.evaluate_gradients(point, gradients);
 
   // FE Basis owns the formulas; fsType and mshType remain the solver-facing storage contract.
   copy_basis_values_to_solver_arrays(eType, eNoN, g, values, gradients, N, Nx);
@@ -355,15 +374,15 @@ void evaluate_basis_hessians(const int insd,
                              const Array<double>& xi,
                              Array3<double>& Nxx)
 {
-  auto basis = make_basis_for_solver_element(eType);
-  if (insd < basis->dimension()) {
+  const auto& basis = basis_for_solver_element(eType);
+  if (insd < basis.dimension()) {
     throw febasis::BasisConfigurationException(
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis->dimension()),
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
         __FILE__, __LINE__, __func__);
   }
 
-  const int required_components = required_nxx_components_for_dimension(basis->dimension());
+  const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
     throw febasis::BasisConfigurationException(
         "solver ind2 " + std::to_string(ind2) +
@@ -371,12 +390,12 @@ void evaluate_basis_hessians(const int insd,
         __FILE__, __LINE__, __func__);
   }
 
-  const auto point = make_basis_point(*basis, gaus_pt, xi);
+  const auto point = make_basis_point(basis, gaus_pt, xi);
   std::vector<febasis::Hessian> hessians;
-  basis->evaluate_hessians(point, hessians);
+  basis.evaluate_hessians(point, hessians);
 
   // Solver Nxx packing is dxx, dyy, dxy in 2D and dxx, dyy, dzz, dxy, dyz, dxz in 3D.
-  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis->dimension(), hessians, Nxx);
+  copy_basis_hessians_to_solver_nxx(eType, eNoN, gaus_pt, basis.dimension(), hessians, Nxx);
 }
 
 void set_point_face_shape_data(const int gaus_pt, faceType& face)
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index d4bf1d6e5..60ca72114 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -34,6 +34,61 @@ class MinimalScalarBasis : public BasisFunction {
     }
 };
 
+// Quadratic scalar basis with exact analytic derivatives, used to verify the
+// protected numerical_gradient/numerical_hessian development helpers. Centered
+// differences are exact (up to roundoff) on quadratics, so any mismatch is a
+// bug in the helpers themselves.
+class ExactQuadraticBasis : public BasisFunction {
+public:
+    using BasisFunction::numerical_gradient;  // re-exported publicly so the test can call it
+    using BasisFunction::numerical_hessian;   // re-exported publicly so the test can call it
+    // Fixed metadata: a three-dimensional, order-two custom basis with two functions.
+    BasisType basis_type() const noexcept override { return BasisType::Custom; }
+    ElementType element_type() const noexcept override { return ElementType::Hex8; }
+    int dimension() const noexcept override { return 3; }
+    int order() const noexcept override { return 2; }
+    std::size_t size() const noexcept override { return 2u; }
+    // Values: two full quadratics in (x, y, z), including all three mixed terms.
+    void evaluate_values(const math::Vector<Real, 3>& xi,
+                         std::vector<Real>& values) const override
+    {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        values.resize(size());
+        values[0] = Real(1) + Real(2) * x - y + Real(0.5) * z +
+                    x * x + Real(0.75) * y * y - Real(0.25) * z * z +
+                    Real(0.2) * x * y - Real(0.3) * x * z + Real(0.4) * y * z;
+        values[1] = Real(3) - x + Real(2) * y + z +
+                    Real(0.5) * x * x - y * y + z * z +
+                    x * y + x * z - y * z;
+    }
+    // Gradients: closed-form partial derivatives of the two polynomials above.
+    void evaluate_gradients(const math::Vector<Real, 3>& xi,
+                            std::vector<Gradient>& gradients) const override
+    {
+        const Real x = xi[0];
+        const Real y = xi[1];
+        const Real z = xi[2];
+        gradients.assign(size(), Gradient::Zero());
+        gradients[0][0] = Real(2) + Real(2) * x + Real(0.2) * y - Real(0.3) * z;
+        gradients[0][1] = Real(-1) + Real(1.5) * y + Real(0.2) * x + Real(0.4) * z;
+        gradients[0][2] = Real(0.5) - Real(0.5) * z - Real(0.3) * x + Real(0.4) * y;
+        gradients[1][0] = Real(-1) + x + y + z;
+        gradients[1][1] = Real(2) - Real(2) * y + x - z;
+        gradients[1][2] = Real(1) + Real(2) * z + x - y;
+    }
+    // Constant Hessians; NOTE(review): arguments assume (xx, yy, zz, xy, xz, yz) order — confirm.
+    void exact_hessians(std::vector<Hessian>& hessians) const
+    {
+        hessians.assign(size(), Hessian::Zero());
+        hessians[0] = make_symmetric_hessian(Real(2), Real(1.5), Real(-0.5),
+                                             Real(0.2), Real(-0.3), Real(0.4));
+        hessians[1] = make_symmetric_hessian(Real(1), Real(-2), Real(2),
+                                             Real(1), Real(1), Real(-1));
+    }
+};
+
 class CompleteFallbackBasis : public BasisFunction {
 public:
     BasisType basis_type() const noexcept override { return BasisType::Lagrange; }
@@ -53,7 +108,7 @@ class CompleteFallbackBasis : public BasisFunction {
     void evaluate_gradients(const math::Vector<Real, 3>&,
                             std::vector<Gradient>& gradients) const override
     {
-        gradients.assign(size(), Gradient{});
+        gradients.assign(size(), Gradient::Zero());
         gradients[0][0] = Real(1);
         gradients[1][1] = Real(1);
     }
@@ -61,7 +116,7 @@ class CompleteFallbackBasis : public BasisFunction {
     void evaluate_hessians(const math::Vector<Real, 3>& xi,
                            std::vector<Hessian>& hessians) const override
     {
-        hessians.assign(size(), Hessian{});
+        hessians.assign(size(), Hessian::Zero());
         for (std::size_t d = 0; d < hessians.size(); ++d) {
             for (std::size_t r = 0; r < 3u; ++r) {
                 for (std::size_t c = 0; c < 3u; ++c) {
@@ -96,6 +151,16 @@ TEST(BasisErrorPaths, SerendipityInvalidRequestsThrowBasisExceptions) {
                  BasisElementCompatibilityException);
 }
 
+TEST(BasisErrorPaths, BasisFactoryRejectsNonC0Continuity) {
+    BasisRequest smooth_request{ElementType::Line2, BasisType::Lagrange, 1};
+    smooth_request.continuity = Continuity::C1;
+    EXPECT_THROW((void)basis_factory::create(smooth_request), BasisConfigurationException);
+    // An L2 continuity request must be refused with the same exception type.
+    BasisRequest broken_request{ElementType::Quad8, BasisType::Serendipity, 2};
+    broken_request.continuity = Continuity::L2;
+    EXPECT_THROW((void)basis_factory::create(broken_request), BasisConfigurationException);
+}
+
 TEST(BasisErrorPaths, BasisFactoryInvalidRequestsThrowBasisExceptions) {
     EXPECT_THROW((void)basis_factory::create(
                      BasisRequest{ElementType::Line2, BasisType::Lagrange}),
@@ -153,6 +218,43 @@ TEST(BasisErrorPaths, BasisFunctionDefaultsThrowForMissingDerivatives) {
     EXPECT_THROW(basis.evaluate_hessians(xi, hessians), BasisEvaluationException);
 }
 
+TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
+    ExactQuadraticBasis quadratic;
+    const math::Vector<Real, 3> probe{Real(0.2), Real(-0.35), Real(0.4)};
+    const Real tolerance = Real(1e-8);
+
+    // First derivatives: finite differences against the closed-form gradients.
+    std::vector<Gradient> analytic_gradients;
+    std::vector<Gradient> fd_gradients;
+    quadratic.evaluate_gradients(probe, analytic_gradients);
+    quadratic.numerical_gradient(probe, fd_gradients);
+    ASSERT_EQ(fd_gradients.size(), quadratic.size());
+    for (std::size_t fn = 0; fn < quadratic.size(); ++fn) {
+        for (int axis = 0; axis < quadratic.dimension(); ++axis) {
+            const auto a = static_cast<std::size_t>(axis);
+            EXPECT_NEAR(fd_gradients[fn][a], analytic_gradients[fn][a], tolerance)
+                << "basis=" << fn << " component=" << axis;
+        }
+    }
+
+    // Second derivatives: finite differences against the constant Hessians.
+    std::vector<Hessian> analytic_hessians;
+    std::vector<Hessian> fd_hessians;
+    quadratic.exact_hessians(analytic_hessians);
+    quadratic.numerical_hessian(probe, fd_hessians);
+    ASSERT_EQ(fd_hessians.size(), quadratic.size());
+    for (std::size_t fn = 0; fn < quadratic.size(); ++fn) {
+        for (int row = 0; row < quadratic.dimension(); ++row) {
+            for (int col = 0; col < quadratic.dimension(); ++col) {
+                const auto r = static_cast<std::size_t>(row);
+                const auto c = static_cast<std::size_t>(col);
+                EXPECT_NEAR(fd_hessians[fn](r, c), analytic_hessians[fn](r, c), tolerance)
+                    << "basis=" << fn << " component=(" << row << "," << col << ")";
+            }
+        }
+    }
+}
+
 TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
diff --git a/tests/unitTests/FE/Basis/test_BasisHessians.cpp b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
index f786b07cd..9ad458c0b 100644
--- a/tests/unitTests/FE/Basis/test_BasisHessians.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisHessians.cpp
@@ -18,12 +18,39 @@ using namespace svmp::FE::basis;
 
 namespace {
 
+void numerical_gradient_helper(const BasisFunction& basis,
+                               const math::Vector<Real, 3>& xi,
+                               std::vector<Gradient>& gradients,
+                               Real eps = Real(1e-6))
+{
+    std::vector<Real> center_values;
+    basis.evaluate_values(xi, center_values);
+    const std::size_t count = center_values.size();
+    gradients.assign(count, Gradient::Zero());
+
+    for (int axis = 0; axis < basis.dimension(); ++axis) {
+        const auto slot = static_cast<std::size_t>(axis);
+        math::Vector<Real, 3> forward = xi;
+        math::Vector<Real, 3> backward = xi;
+        forward[slot] += eps;
+        backward[slot] -= eps;
+        // Centered difference along this axis for every basis function.
+        std::vector<Real> plus_values;
+        std::vector<Real> minus_values;
+        basis.evaluate_values(forward, plus_values);
+        basis.evaluate_values(backward, minus_values);
+        for (std::size_t fn = 0; fn < count; ++fn) {
+            gradients[fn][slot] = (plus_values[fn] - minus_values[fn]) / (Real(2) * eps);
+        }
+    }
+}
+
 void numerical_hessian_helper(const BasisFunction& basis,
                               const math::Vector<Real, 3>& xi,
                               std::vector<Hessian>& hessians,
                               Real eps = Real(1e-5))
 {
-    hessians.assign(basis.size(), Hessian{});
+    hessians.assign(basis.size(), Hessian::Zero());
     const int dim = basis.dimension();
 
     for (int i = 0; i < dim; ++i) {
@@ -66,7 +93,31 @@ std::vector<math::Vector<Real, 3>> sample_points_for(ElementType type) {
     }
 }
 
-void expect_hessians_match_numerical(const LagrangeBasis& basis,
+void expect_gradients_match_numerical(const BasisFunction& basis,
+                                      const std::vector<math::Vector<Real, 3>>& points,
+                                      Real tol,
+                                      Real eps = Real(1e-6))
+{
+    for (const auto& sample : points) {
+        std::vector<Gradient> closed_form;
+        std::vector<Gradient> finite_difference;
+        basis.evaluate_gradients(sample, closed_form);
+        numerical_gradient_helper(basis, sample, finite_difference, eps);
+        // A size mismatch returns from this helper before any entry is indexed.
+        ASSERT_EQ(closed_form.size(), finite_difference.size());
+        for (std::size_t fn = 0; fn < closed_form.size(); ++fn) {
+            for (int axis = 0; axis < basis.dimension(); ++axis) {
+                const auto slot = static_cast<std::size_t>(axis);
+                EXPECT_NEAR(closed_form[fn][slot], finite_difference[fn][slot], tol)
+                    << "basis " << fn << ", component " << axis
+                    << ", element " << static_cast<int>(basis.element_type())
+                    << ", order " << basis.order();
+            }
+        }
+    }
+}
+
+void expect_hessians_match_numerical(const BasisFunction& basis,
                                      const std::vector<math::Vector<Real, 3>>& points,
                                      Real tol,
                                      Real eps = Real(1e-5))
@@ -100,7 +151,7 @@ void expect_partition_hessian_sum_zero(const LagrangeBasis& basis,
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
 
-    Hessian sum{};
+    Hessian sum = Hessian::Zero();
     for (const auto& hessian : hessians) {
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
@@ -145,7 +196,7 @@ void expect_partition_hessian_sum_zero(const BasisFunction& basis,
     std::vector<Hessian> hessians;
     basis.evaluate_hessians(xi, hessians);
 
-    Hessian sum{};
+    Hessian sum = Hessian::Zero();
     for (const auto& hessian : hessians) {
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
@@ -183,6 +234,16 @@ void expect_hessians_symmetric(const BasisFunction& basis,
     }
 }
 
+std::vector<math::Vector<Real, 3>> serendipity_sample_points(ElementType type) {
+    switch (type) {
+        case ElementType::Quad4: case ElementType::Quad8:  // planar samples, third component zero
+            return {{Real(0.17), Real(-0.31), Real(0)}, {Real(-0.45), Real(0.25), Real(0)}};
+        case ElementType::Hex8: case ElementType::Hex20:
+            return {{Real(0.2), Real(-0.1), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}};
+        default: return {{Real(0.2), Real(0.3), Real(0.1)}, {Real(0.12), Real(0.16), Real(-0.2)}};
+    }
+}
+
 } // namespace
 
 TEST(BasisHessians, LagrangeCanonicalTopologiesMatchNumericalHessians) {
@@ -280,3 +341,75 @@ TEST(BasisHessians, SolverMappedVolumeSelectionsSatisfyInvariants) {
 
     EXPECT_EQ(covered, 13);
 }
+
+// Gradients must match centered finite differences of values. This is the only
+// check that ties the gradient code path back to the value code path; partition
+// sums and Hessian-vs-FD(gradient) comparisons cannot catch a systematic error
+// shared by the first- and second-derivative recurrences.
+TEST(BasisGradients, LagrangeCanonicalTopologiesMatchNumericalGradients) {
+    struct Scenario {
+        ElementType element;
+        int degree;
+        Real tolerance;
+    };
+    const Scenario scenarios[] = {
+        {ElementType::Line2, 3, Real(1e-8)},
+        {ElementType::Triangle3, 3, Real(1e-7)},
+        {ElementType::Quad4, 3, Real(1e-7)},
+        {ElementType::Tetra4, 2, Real(1e-7)},
+        {ElementType::Hex8, 2, Real(1e-7)},
+        {ElementType::Wedge6, 2, Real(1e-7)},
+    };
+    for (const auto& [element, degree, tolerance] : scenarios) {
+        const LagrangeBasis basis(element, degree);
+        expect_gradients_match_numerical(basis, sample_points_for(element), tolerance);
+    }
+}
+
+// The serendipity coefficient tables (Hex20 20x20, Wedge15 15x15) and the quad
+// inverse-Vandermonde path each differentiate values through hand-written code
+// that is independent of the value evaluation. Partition sums only verify that
+// the constant function differentiates to zero, and symmetry is assigned
+// structurally, so neither can detect a wrong derivative formula. Finite
+// differences of values are the authoritative check.
+TEST(BasisGradients, SerendipityFamiliesMatchNumericalGradients) {
+    struct Scenario {
+        ElementType element;
+        int degree;
+        Real tolerance;
+    };
+    const Scenario scenarios[] = {
+        {ElementType::Quad4, 1, Real(1e-8)},
+        {ElementType::Quad8, 2, Real(1e-7)},
+        {ElementType::Quad4, 3, Real(1e-7)},
+        {ElementType::Quad4, 4, Real(5e-7)},
+        {ElementType::Hex8, 1, Real(1e-8)},
+        {ElementType::Hex20, 2, Real(1e-7)},
+        {ElementType::Wedge15, 2, Real(1e-7)},
+    };
+    for (const auto& [element, degree, tolerance] : scenarios) {
+        const SerendipityBasis basis(element, degree);
+        expect_gradients_match_numerical(basis, serendipity_sample_points(element), tolerance);
+    }
+}
+
+TEST(BasisHessians, SerendipityFamiliesMatchNumericalHessians) {
+    struct Scenario {
+        ElementType element;
+        int degree;
+        Real tolerance;
+    };
+    const Scenario scenarios[] = {
+        {ElementType::Quad4, 1, Real(1e-6)},
+        {ElementType::Quad8, 2, Real(1e-6)},
+        {ElementType::Quad4, 3, Real(1e-6)},
+        {ElementType::Quad4, 4, Real(5e-6)},
+        {ElementType::Hex8, 1, Real(1e-6)},
+        {ElementType::Hex20, 2, Real(1e-6)},
+        {ElementType::Wedge15, 2, Real(1e-6)},
+    };
+    for (const auto& [element, degree, tolerance] : scenarios) {
+        const SerendipityBasis basis(element, degree);
+        expect_hessians_match_numerical(basis, serendipity_sample_points(element), tolerance);
+    }
+}
diff --git a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
index 3faffd9e0..8827eebb0 100644
--- a/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
+++ b/tests/unitTests/FE/Basis/test_HigherOrderWedge.cpp
@@ -57,8 +57,8 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
         basis.evaluate_all(xi, values, gradients, hessians);
 
         Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
+        Gradient gradient_sum = Gradient::Zero();
+        Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
@@ -137,3 +137,21 @@ TEST(HigherOrderWedge, OrderFourEvaluationsRemainFinite) {
     expect_all_entries_finite(wedge, {Real(0.2), Real(0.1), Real(-0.6)});
     expect_all_entries_finite(wedge, {Real(0.05), Real(0.8), Real(0.3)});
 }
+
+// Finiteness alone cannot detect a wrong triangle-index or axis-index lookup;
+// the Kronecker property validates the order-four node lattice and its inverse
+// index mapping end to end.
+TEST(HigherOrderWedge, OrderFourIsNodalAndPartitionsUnity) {
+    LagrangeBasis wedge(ElementType::Wedge6, 4);
+    // 75 = 15 triangle-lattice nodes x 5 axis nodes for an order-four tensor wedge.
+    EXPECT_EQ(wedge.size(), 75u);
+    expect_kronecker_at_nodes(wedge, Real(1e-9));
+    expect_partition_gradient_hessian_sums(
+        wedge,
+        {
+            {Real(0.18), Real(0.22), Real(-0.2)},
+            {Real(0.25), Real(0.15), Real(0.45)},
+        },
+        Real(1e-12),  // NOTE(review): presumably the value-sum tolerance — confirm against the helper
+        Real(1e-7));  // NOTE(review): presumably the derivative-sum tolerance — confirm
+}
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 9d93f8931..8a1f43c58 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -10,6 +10,7 @@
 #include "FE/Basis/LagrangeBasis.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 
+#include <algorithm>
 #include <array>
 #include <tuple>
 #include <vector>
@@ -90,8 +91,8 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
         basis.evaluate_all(xi, values, gradients, hessians);
 
         Real value_sum = Real(0);
-        Gradient gradient_sum{};
-        Hessian hessian_sum{};
+        Gradient gradient_sum = Gradient::Zero();
+        Hessian hessian_sum = Hessian::Zero();
         for (std::size_t i = 0; i < values.size(); ++i) {
             value_sum += values[i];
             for (std::size_t d = 0; d < 3u; ++d) {
@@ -190,7 +191,7 @@ Real linear_function(const Point& p) {
 }
 
 Gradient linear_gradient() {
-    Gradient g{};
+    Gradient g = Gradient::Zero();
     g[0] = Real(3);
     g[1] = Real(-4);
     g[2] = Real(5);
@@ -204,6 +205,18 @@ Real quadratic_function(const Point& p) {
            Real(0.4) * p[1] * p[2];
 }
 
+// Total degree three, so it lies in both the P3 simplex space and the Q3
+// tensor-product space.
+Real cubic_function(const Point& p) {
+    // Cubic terms are appended to the quadratic in the same left-to-right order.
+    const Real x = p[0];
+    const Real y = p[1];
+    const Real z = p[2];
+    return quadratic_function(p) + Real(0.1) * x * x * x - Real(0.2) * y * y * y +
+           Real(0.3) * z * z * z + Real(0.15) * x * x * y -
+           Real(0.12) * x * z * z + Real(0.08) * x * y * z;
+}
+
 template<typename Function>
 Real interpolate_value(const LagrangeBasis& basis,
                        const std::vector<Real>& values,
@@ -338,7 +351,7 @@ TEST(LagrangeBasis, LinearPolynomialReproductionAcrossLinearTopologies) {
             interpolate_value(basis, values, linear_function);
         EXPECT_NEAR(interpolated, linear_function(point), Real(1e-12));
 
-        Gradient interpolated_gradient{};
+        Gradient interpolated_gradient = Gradient::Zero();
         for (std::size_t i = 0; i < gradients.size(); ++i) {
             const Real nodal_value = linear_function(basis.nodes()[i]);
             for (int d = 0; d < basis.dimension(); ++d) {
@@ -376,6 +389,192 @@ TEST(LagrangeBasis, QuadraticPolynomialReproductionAcrossQuadraticAliases) {
     }
 }
 
+// Tetra order >= 3 activates the face-interior node loops, tetra order >= 4
+// activates the volume-interior lattice, and hex order >= 3 activates the six
+// orientation-specific face traversals in NodeOrderingConventions. None of
+// those generation paths run at the orders covered elsewhere; the Kronecker
+// test is what validates the node lattice together with its llround-based
+// inverse index mapping (a duplicated or missing node makes the basis
+// non-nodal here).
+TEST(LagrangeBasis, HigherOrderLatticesAreNodalAndPartitionUnity) {
+    struct Scenario {
+        ElementType element;
+        int degree;
+        std::size_t node_count;
+        Real nodal_tol;
+        Real sum_tol;
+        std::vector<Point> probes;
+    };
+    const Scenario scenarios[] = {
+        {ElementType::Tetra4, 3, 20u, Real(5e-10), Real(1e-8),
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.3), Real(0.2), Real(0.25)}}},
+        {ElementType::Tetra4, 4, 35u, Real(1e-9), Real(1e-7),
+         {{Real(0.12), Real(0.18), Real(0.16)}, {Real(0.2), Real(0.1), Real(0.18)}}},
+        {ElementType::Hex8, 3, 64u, Real(5e-10), Real(1e-8),
+         {{Real(0.1), Real(-0.2), Real(0.3)}, {Real(-0.35), Real(0.25), Real(-0.15)}}},
+    };
+    for (const auto& scenario : scenarios) {
+        LagrangeBasis basis(scenario.element, scenario.degree);
+        EXPECT_EQ(basis.size(), scenario.node_count);
+        expect_kronecker_at_nodes(basis, scenario.nodal_tol);
+        expect_partition_gradient_hessian_sums(basis, scenario.probes, scenario.sum_tol);
+    }
+}
+
+TEST(LagrangeBasis, CubicPolynomialReproductionAtOrderThree) {
+    using Sample = std::pair<ElementType, Point>;
+    const std::vector<Sample> samples = {
+        {ElementType::Tetra4, {Real(0.15), Real(0.2), Real(0.25)}},
+        {ElementType::Hex8, {Real(0.15), Real(-0.2), Real(0.25)}},
+    };
+    for (const auto& [element, xi] : samples) {
+        LagrangeBasis cubic_basis(element, 3);
+        std::vector<Real> shape_values;
+        cubic_basis.evaluate_values(xi, shape_values);
+        // Nodal interpolation of a cubic must reproduce it at an arbitrary point.
+        const Real reproduced = interpolate_value(cubic_basis, shape_values, cubic_function);
+        EXPECT_NEAR(reproduced, cubic_function(xi), Real(1e-10))
+            << "element=" << static_cast<int>(element);
+    }
+}
+
+TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
+    LagrangeBasis basis(ElementType::Point1, 0);
+    EXPECT_EQ(basis.element_type(), ElementType::Point1);
+    EXPECT_EQ(basis.size(), 1u);
+    EXPECT_EQ(basis.dimension(), 0);
+    ASSERT_EQ(basis.nodes().size(), 1u);
+    const Point xi{Real(0.3), Real(-0.4), Real(0.1)};
+    std::vector<Real> values;
+    std::vector<Gradient> gradients;
+    std::vector<Hessian> hessians;
+    basis.evaluate_all(xi, values, gradients, hessians);
+    // Every output is read at [0] below, so each size is asserted before indexing.
+    ASSERT_EQ(values.size(), 1u);
+    ASSERT_EQ(gradients.size(), 1u);
+    ASSERT_EQ(hessians.size(), 1u);
+    EXPECT_EQ(values[0], Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(gradients[0][d], Real(0));
+        for (std::size_t e = 0; e < 3u; ++e) {
+            EXPECT_EQ(hessians[0](d, e), Real(0));
+        }
+    }
+    // The raw-pointer overloads must overwrite the -1 sentinels with the same results.
+    Real flat_value = Real(-1);
+    Real flat_gradient[3] = {Real(-1), Real(-1), Real(-1)};
+    Real flat_hessian[9];
+    std::fill_n(flat_hessian, 9u, Real(-1));
+    basis.evaluate_values_to(xi, &flat_value);
+    basis.evaluate_gradients_to(xi, flat_gradient);
+    basis.evaluate_hessians_to(xi, flat_hessian);
+    EXPECT_EQ(flat_value, Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(flat_gradient[d], Real(0));
+    }
+    for (std::size_t e = 0; e < 9u; ++e) {
+        EXPECT_EQ(flat_hessian[e], Real(0));
+    }
+}
+
+// P0 bases back piecewise-constant fields (e.g. pressure in mixed elements);
+// the order-zero branches in node generation and the simplex/tensor/wedge
+// evaluators have no other coverage.
+TEST(LagrangeBasis, OrderZeroBasesAreConstantUnity) {
+    const std::array<ElementType, 6> types = {
+        ElementType::Line2,
+        ElementType::Triangle3,
+        ElementType::Quad4,
+        ElementType::Tetra4,
+        ElementType::Hex8,
+        ElementType::Wedge6,
+    };
+    for (const auto type : types) {
+        LagrangeBasis basis(type, 0);
+        EXPECT_EQ(basis.order(), 0) << "element=" << static_cast<int>(type);
+        EXPECT_EQ(basis.size(), 1u) << "element=" << static_cast<int>(type);
+        for (const auto& xi : sample_points_for(type)) {
+            std::vector<Real> values;
+            std::vector<Gradient> gradients;
+            std::vector<Hessian> hessians;
+            basis.evaluate_all(xi, values, gradients, hessians);
+            // Every output is read at [0] below, so each size is asserted before indexing.
+            ASSERT_EQ(values.size(), 1u);
+            ASSERT_EQ(gradients.size(), 1u) << "element=" << static_cast<int>(type);
+            ASSERT_EQ(hessians.size(), 1u) << "element=" << static_cast<int>(type);
+            EXPECT_NEAR(values[0], Real(1), Real(1e-14))
+                << "element=" << static_cast<int>(type);
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(gradients[0][d], Real(0), Real(1e-14));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(hessians[0](d, e), Real(0), Real(1e-14));
+                }
+            }
+        }
+    }
+}
+
+// Pins the default basis selection for every supported element type. The
+// solver adapter (nn.cpp) translates solver element names to ElementType and
+// delegates the family/order choice to default_basis_request; a silent change
+// here would change the discretization of every simulation using that element.
+TEST(BasisFactoryDefaults, SelectionsArePinnedForAllSupportedElements) {
+    struct Pinned {
+        ElementType type;
+        BasisType family;
+        int order;
+        std::size_t size;
+    };
+    const std::vector<Pinned> table = {
+        {ElementType::Point1,    BasisType::Lagrange,    0, 1u},
+        {ElementType::Line2,     BasisType::Lagrange,    1, 2u},
+        {ElementType::Line3,     BasisType::Lagrange,    2, 3u},
+        {ElementType::Triangle3, BasisType::Lagrange,    1, 3u},
+        {ElementType::Triangle6, BasisType::Lagrange,    2, 6u},
+        {ElementType::Quad4,     BasisType::Lagrange,    1, 4u},
+        {ElementType::Quad8,     BasisType::Serendipity, 2, 8u},
+        {ElementType::Quad9,     BasisType::Lagrange,    2, 9u},
+        {ElementType::Tetra4,    BasisType::Lagrange,    1, 4u},
+        {ElementType::Tetra10,   BasisType::Lagrange,    2, 10u},
+        {ElementType::Hex8,      BasisType::Lagrange,    1, 8u},
+        {ElementType::Hex20,     BasisType::Serendipity, 2, 20u},
+        {ElementType::Hex27,     BasisType::Lagrange,    2, 27u},
+        {ElementType::Wedge6,    BasisType::Lagrange,    1, 6u},
+        {ElementType::Wedge15,   BasisType::Serendipity, 2, 15u},
+        {ElementType::Wedge18,   BasisType::Lagrange,    2, 18u},
+    };
+    for (const auto& row : table) {
+        const int label = static_cast<int>(row.type);  // printed with every failure
+        const auto request = basis_factory::default_basis_request(row.type);
+        EXPECT_EQ(request.element_type, row.type)
+            << "element=" << label;
+        EXPECT_EQ(request.basis_type, row.family)
+            << "element=" << label;
+        ASSERT_TRUE(request.order.has_value())
+            << "element=" << label;
+        EXPECT_EQ(*request.order, row.order)
+            << "element=" << label;
+
+        auto created = basis_factory::create_default_for(row.type);
+        ASSERT_NE(created, nullptr);
+        EXPECT_EQ(created->basis_type(), row.family)
+            << "element=" << label;
+        EXPECT_EQ(created->order(), row.order)
+            << "element=" << label;
+        EXPECT_EQ(created->size(), row.size)
+            << "element=" << label;
+    }
+}
+
+TEST(BasisFactoryDefaults, RejectsElementsWithoutDefaultBasis) {
+    for (const auto pyramid : {ElementType::Pyramid5, ElementType::Pyramid13}) {
+        EXPECT_THROW((void)basis_factory::default_basis_request(pyramid),
+                     BasisElementCompatibilityException);
+    }
+    EXPECT_THROW((void)basis_factory::create_default_for(ElementType::Unknown),
+                 BasisElementCompatibilityException);
+}
+
 TEST(LagrangeBasis, FactoryCreatesReducedScalarBasisFamilies) {
     auto lagrange =
         basis_factory::create(BasisRequest{ElementType::Hex27, BasisType::Lagrange, 1});
diff --git a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
index 30f876420..235dc8c40 100644
--- a/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
+++ b/tests/unitTests/FE/Basis/test_SerendipityTensorModal.cpp
@@ -5,9 +5,11 @@
 
 #include <gtest/gtest.h>
 
+#include "FE/Basis/LagrangeBasis.h"
 #include "FE/Basis/NodeOrderingConventions.h"
 #include "FE/Basis/SerendipityBasis.h"
 
+#include <cmath>
 #include <vector>
 
 using namespace svmp::FE;
@@ -25,7 +27,7 @@ void expect_partition_of_unity(const SerendipityBasis& basis,
     basis.evaluate_gradients(xi, gradients);
 
     Real value_sum = Real(0);
-    Gradient gradient_sum{};
+    Gradient gradient_sum = Gradient::Zero();
     for (std::size_t i = 0; i < values.size(); ++i) {
         value_sum += values[i];
         for (std::size_t component = 0; component < 3u; ++component) {
@@ -68,6 +70,36 @@ std::vector<math::Vector<Real, 3>> reference_nodes(ElementType type,
     return nodes;
 }
 
+template<typename Function>
+Real interpolate_nodal_function(const SerendipityBasis& basis,
+                                const math::Vector<Real, 3>& xi,
+                                Function&& nodal_function)
+{
+    // Nodal interpolant at xi: sum over i of N_i(xi) * f(node_i).
+    std::vector<Real> shape_values;
+    basis.evaluate_values(xi, shape_values);
+    Real interpolant = Real(0);
+    const auto& sample_nodes = basis.nodes();
+    for (std::size_t node = 0; node < shape_values.size(); ++node) {
+        interpolant += shape_values[node] * nodal_function(sample_nodes[node]);
+    }
+    return interpolant;
+}
+
+// Every monomial has superlinear degree (degree ignoring variables that enter
+// linearly) at most three, so it lies in the order-three quad serendipity space.
+Real cubic_serendipity_function(const math::Vector<Real, 3>& p) {
+    const Real x = p[0];
+    const Real y = p[1];
+    return Real(1) + Real(2) * x - y + Real(3) * x * y +
+           x * x * x - Real(2) * y * y * y +
+           Real(0.5) * x * x * x * y - Real(0.25) * x * y * y * y;
+}
+
+Real bilinear_function(const math::Vector<Real, 3>& p) {
+    return Real(2) - Real(3) * p[0] + Real(4) * p[1] + Real(0.5) * p[0] * p[1];
+}
+
 } // namespace
 
 TEST(SerendipityBasis, Quad8IsNodalAndPartitionsUnity) {
@@ -104,3 +136,154 @@ TEST(SerendipityBasis, RejectsUnsupportedSerendipityAliases) {
     EXPECT_THROW(SerendipityBasis(ElementType::Pyramid14, 2), FEException);
     EXPECT_THROW(SerendipityBasis(ElementType::Quad8, 3), FEException);
 }
+
+// Orders other than two run the generic quadrilateral path: serendipity
+// monomial selection, boundary plus interior node placement, and a runtime
+// Vandermonde inversion whose unisolvence is assumed rather than tabulated.
+// Order four is the first order that selects an interior node.
+TEST(SerendipityBasis, QuadrilateralOrdersOneThreeFourAreNodalAndPartitionUnity) {
+    const struct Case {
+        int order;
+        std::size_t size;
+    } cases[] = {{1, 4u}, {3, 12u}, {4, 17u}};
+
+    for (const auto& c : cases) {
+        // Label every failure in this iteration, including those reported from
+        // inside the shared expect_* helpers, with the order under test.
+        SCOPED_TRACE(::testing::Message() << "order=" << c.order);
+        SerendipityBasis basis(ElementType::Quad4, c.order);
+        EXPECT_EQ(basis.size(), c.size);
+        EXPECT_EQ(basis.order(), c.order);
+        EXPECT_EQ(basis.dimension(), 2);
+        ASSERT_EQ(basis.nodes().size(), c.size);
+
+        // Every node must lie inside the [-1, 1] x [-1, 1] reference square.
+        for (const auto& node : basis.nodes()) {
+            EXPECT_LE(std::abs(node[0]), Real(1));
+            EXPECT_LE(std::abs(node[1]), Real(1));
+        }
+
+        expect_nodal_delta(basis, basis.nodes(), Real(1e-9));
+        expect_partition_of_unity(basis, {Real(0.17), Real(-0.31), Real(0)}, Real(1e-9));
+        expect_partition_of_unity(basis, {Real(-0.45), Real(0.25), Real(0)}, Real(1e-9));
+    }
+}
+
+TEST(SerendipityBasis, QuadrilateralOrderOneReproducesBilinearFunctions) {
+    // Interpolating a bilinear function from its nodal values must reproduce
+    // it (to round-off) at each sampled point.
+    SerendipityBasis basis(ElementType::Quad4, 1);
+    const std::vector<math::Vector<Real, 3>> sample_points = {
+        {Real(0.25), Real(-0.4), Real(0)},
+        {Real(-0.7), Real(0.6), Real(0)},
+    };
+    for (const auto& point : sample_points) {
+        EXPECT_NEAR(interpolate_nodal_function(basis, point, bilinear_function),
+                    bilinear_function(point), Real(1e-12));
+    }
+}
+
+TEST(SerendipityBasis, QuadrilateralOrderThreeReproducesSerendipityCubics) {
+    // cubic_serendipity_function is built from order-three serendipity
+    // monomials, so its nodal interpolant must match it to round-off.
+    SerendipityBasis basis(ElementType::Quad4, 3);
+    const std::vector<math::Vector<Real, 3>> sample_points = {
+        {Real(0.25), Real(-0.4), Real(0)},
+        {Real(-0.7), Real(0.6), Real(0)},
+    };
+    for (const auto& point : sample_points) {
+        EXPECT_NEAR(interpolate_nodal_function(basis, point, cubic_serendipity_function),
+                    cubic_serendipity_function(point), Real(1e-11));
+    }
+}
+
+// SerendipityBasis(Hex8, 1) is the only route to the hand-written trilinear
+// corner evaluator (values, gradients, and Hessians); it must agree with the
+// trilinear Lagrange basis on the same element.
+TEST(SerendipityBasis, TrilinearHexMatchesLagrangeHex8) {
+    SerendipityBasis serendipity(ElementType::Hex8, 1);
+    LagrangeBasis lagrange(ElementType::Hex8, 1);
+    EXPECT_EQ(serendipity.size(), 8u);
+    EXPECT_EQ(serendipity.dimension(), 3);
+    expect_nodal_delta(
+        serendipity, reference_nodes(ElementType::Hex8, serendipity.size()), Real(1e-12));
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.2), Real(-0.1), Real(0.3)},
+        {Real(-0.35), Real(0.25), Real(-0.15)},
+    };
+    for (const auto& xi : points) {
+        std::vector<Real> s_values, l_values;
+        std::vector<Gradient> s_gradients, l_gradients;
+        std::vector<Hessian> s_hessians, l_hessians;
+        serendipity.evaluate_all(xi, s_values, s_gradients, s_hessians);
+        lagrange.evaluate_all(xi, l_values, l_gradients, l_hessians);
+        // Fix every output length before indexing; empty results must not pass.
+        ASSERT_EQ(s_values.size(), 8u);
+        ASSERT_EQ(l_values.size(), 8u);
+        ASSERT_EQ(s_gradients.size(), 8u);
+        ASSERT_EQ(l_gradients.size(), 8u);
+        ASSERT_EQ(s_hessians.size(), 8u);
+        ASSERT_EQ(l_hessians.size(), 8u);
+        for (std::size_t i = 0; i < s_values.size(); ++i) {
+            EXPECT_NEAR(s_values[i], l_values[i], Real(1e-13));
+            for (std::size_t d = 0; d < 3u; ++d) {
+                EXPECT_NEAR(s_gradients[i][d], l_gradients[i][d], Real(1e-13));
+                for (std::size_t e = 0; e < 3u; ++e) {
+                    EXPECT_NEAR(s_hessians[i](d, e), l_hessians[i](d, e), Real(1e-13));
+                }
+            }
+        }
+    }
+}
+
+// Geometry mode keeps the public Hex20 node count while mapping geometry with
+// the trilinear corner functions: corners must match the Hex8 basis exactly
+// and the quadratic edge nodes must contribute nothing.
+TEST(SerendipityBasis, Hex20GeometryModeUsesTrilinearCornersOnly) {
+    SerendipityBasis geometry(ElementType::Hex20, 2, true);
+    SerendipityBasis trilinear(ElementType::Hex8, 1);
+    EXPECT_EQ(geometry.size(), 20u);
+    EXPECT_EQ(geometry.order(), 2);
+
+    const std::vector<math::Vector<Real, 3>> points = {
+        {Real(0.2), Real(-0.1), Real(0.3)},
+        {Real(-0.35), Real(0.25), Real(-0.15)},
+    };
+    for (const auto& xi : points) {
+        std::vector<Real> g_values, t_values;
+        std::vector<Gradient> g_gradients, t_gradients;
+        std::vector<Hessian> g_hessians, t_hessians;
+        geometry.evaluate_all(xi, g_values, g_gradients, g_hessians);
+        trilinear.evaluate_all(xi, t_values, t_gradients, t_hessians);
+        // Fix every output length before the loop below indexes into it.
+        ASSERT_EQ(g_values.size(), 20u);
+        ASSERT_EQ(g_gradients.size(), 20u);
+        ASSERT_EQ(g_hessians.size(), 20u);
+        ASSERT_EQ(t_values.size(), 8u);
+        ASSERT_EQ(t_gradients.size(), 8u);
+        ASSERT_EQ(t_hessians.size(), 8u);
+        Real value_sum = Real(0);
+        for (std::size_t i = 0; i < 20u; ++i) {
+            value_sum += g_values[i];
+            if (i < 8u) {
+                EXPECT_NEAR(g_values[i], t_values[i], Real(1e-13)) << "corner=" << i;
+                for (std::size_t d = 0; d < 3u; ++d) {
+                    EXPECT_NEAR(g_gradients[i][d], t_gradients[i][d], Real(1e-13));
+                    for (std::size_t e = 0; e < 3u; ++e) {
+                        EXPECT_NEAR(g_hessians[i](d, e), t_hessians[i](d, e), Real(1e-13));
+                    }
+                }
+            } else {
+                EXPECT_EQ(g_values[i], Real(0)) << "edge node=" << i;
+                for (std::size_t d = 0; d < 3u; ++d) {
+                    EXPECT_EQ(g_gradients[i][d], Real(0));
+                    for (std::size_t e = 0; e < 3u; ++e) {
+                        EXPECT_EQ(g_hessians[i](d, e), Real(0));
+                    }
+                }
+            }
+        }
+        EXPECT_NEAR(value_sum, Real(1), Real(1e-13));
+    }
+}
diff --git a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
index 2b44ad2bf..9e9e08e95 100644
--- a/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
+++ b/tests/unitTests/FE/Math/test_DenseLinearAlgebra.cpp
@@ -108,6 +108,132 @@ TEST(DenseLinearAlgebra, FactorizationSolvesDenseRightHandSideBlock) {
     }
 }
 
+// Every other matrix in this file already has its largest pivot on the
+// diagonal, so without these cases the row-exchange branch in
+// factor_dense_matrix and the permutation replay in solve_in_place never
+// execute. SerendipityBasis inverts its Vandermonde matrices through this
+// code in production.
+TEST(DenseLinearAlgebra, FactorizationPivotsThroughZeroLeadingDiagonal) {
+    const std::vector<Real> swap_2x2{
+        Real(0), Real(1),
+        Real(1), Real(0)
+    };
+    // The swap matrix exchanges the two right-hand-side entries: x = (7, 3).
+    const auto solver = factor_dense_matrix(swap_2x2, 2u, "swap 2x2");
+    const std::vector<Real> rhs{Real(3), Real(7)};
+    const auto x = solver.solve(std::span<const Real>(rhs.data(), rhs.size()));
+    ASSERT_EQ(x.size(), 2u);
+    EXPECT_NEAR(x[0], Real(7), Real(1.0e-14));
+    EXPECT_NEAR(x[1], Real(3), Real(1.0e-14));
+    // A swap matrix is its own inverse, so inv is compared against swap_2x2.
+    const auto inv = invert_dense_matrix(swap_2x2, 2u, "swap 2x2");
+    for (std::size_t row = 0; row < 2u; ++row) {
+        for (std::size_t col = 0; col < 2u; ++col) {
+            EXPECT_NEAR(inv[row * 2u + col], swap_2x2[row * 2u + col], Real(1.0e-14));
+        }
+    }
+
+    // With partial row pivoting, columns 0-2 each force a row exchange.
+    const std::vector<Real> permuted_scaled{
+        Real(0), Real(0), Real(1), Real(0),
+        Real(1), Real(0), Real(0), Real(0),
+        Real(0), Real(0), Real(0), Real(2),
+        Real(0), Real(3), Real(0), Real(0)
+    };
+
+    const auto inv4 = invert_dense_matrix(permuted_scaled, 4u, "permuted scaled 4x4");
+    for (std::size_t row = 0; row < 4u; ++row) {
+        for (std::size_t col = 0; col < 4u; ++col) {
+            const Real expected = (row == col) ? Real(1) : Real(0);
+            EXPECT_NEAR(multiply_entry(permuted_scaled, inv4, 4u, row, col),
+                        expected,
+                        Real(1.0e-14));
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, WideMultiRhsSolveWithPivoting) {
+    // Column 0 has a zero on the diagonal, so factorization must swap rows; the
+    // wide block exercises the row-interleaved multi-RHS layout end to end.
+    constexpr std::size_t kDim = 3u;
+    constexpr std::size_t kRhsCount = 33u;
+    const std::vector<Real> A{
+        Real(0), Real(2), Real(1),
+        Real(4), Real(1), Real(0),
+        Real(1), Real(0), Real(3)
+    };
+    const auto solver = factor_dense_matrix(A, kDim, "pivoting 3x3");
+
+    std::vector<Real> block(kDim * kRhsCount, Real(0));
+    for (std::size_t i = 0; i < kDim; ++i) {
+        for (std::size_t c = 0; c < kRhsCount; ++c) {
+            block[i * kRhsCount + c] =
+                Real(1) + static_cast<Real>(i) - Real(0.25) * static_cast<Real>(c % 7u);
+        }
+    }
+    const auto original_rhs = block;
+    solver.solve_in_place(std::span<Real>(block.data(), block.size()), kRhsCount);
+
+    // Residual check: A times each solved column must reproduce the original.
+    for (std::size_t c = 0; c < kRhsCount; ++c) {
+        for (std::size_t i = 0; i < kDim; ++i) {
+            Real ax = Real(0);
+            for (std::size_t j = 0; j < kDim; ++j) {
+                ax += A[i * kDim + j] * block[j * kRhsCount + c];
+            }
+            EXPECT_NEAR(ax, original_rhs[i * kRhsCount + c], Real(1.0e-12))
+                << "rhs column " << c << ", row " << i;
+        }
+    }
+}
+
+TEST(DenseLinearAlgebra, SolveInPlaceValidatesInputs) {
+    const std::vector<Real> identity{
+        Real(1), Real(0),
+        Real(0), Real(1)
+    };
+    const auto solver = factor_dense_matrix(identity, 2u, "identity 2x2");
+    // A right-hand-side count of zero must be rejected.
+    std::vector<Real> rhs{Real(1), Real(2)};
+    EXPECT_THROW(solver.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 0u),
+                 FEException);
+    // Three entries cannot hold one right-hand side of a 2x2 system.
+    std::vector<Real> wrong_size{Real(1), Real(2), Real(3)};
+    EXPECT_THROW(
+        solver.solve_in_place(std::span<Real>(wrong_size.data(), wrong_size.size()), 1u),
+        FEException);
+    // Default-constructed solver with only n and label set: non-empty, yet unusable.
+    DenseLUSolver unfactored;
+    unfactored.n = 2u;
+    unfactored.label = "unfactored";
+    EXPECT_FALSE(unfactored.empty());
+    EXPECT_THROW(unfactored.solve_in_place(std::span<Real>(rhs.data(), rhs.size()), 1u),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, DiagnosticValidationRejectsRankMismatch) {
+    DenseInverseResult result;
+    result.diagnostics.rank = 1u;
+    // The reported rank (1) disagrees with the size passed to validation (2).
+    EXPECT_THROW(validate_dense_inverse_diagnostics(result, 2u, "rank mismatch"),
+                 FEException);
+}
+
+TEST(DenseLinearAlgebra, RankHandlesNonSquareMatrices) {
+    const std::vector<Real> wide_full{
+        Real(1), Real(0), Real(2),
+        Real(0), Real(1), Real(-1)
+    };
+    EXPECT_EQ(dense_matrix_rank(wide_full, 2u, 3u), 2u);
+    // Rows two and three are multiples of row one, so only one is independent.
+    const std::vector<Real> tall_rank_one{
+        Real(1), Real(2),
+        Real(2), Real(4),
+        Real(3), Real(6)
+    };
+    EXPECT_EQ(dense_matrix_rank(tall_rank_one, 3u, 2u), 1u);
+}
+
 TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
     const std::vector<Real> high_condition{
         Real(1), Real(0),
@@ -117,13 +243,9 @@ TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
     const auto result =
         invert_dense_matrix_with_diagnostics(high_condition, 2u, "high-condition diagonal");
     EXPECT_EQ(result.diagnostics.rank, 2u);
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     EXPECT_GT(result.diagnostics.condition_estimate,
               dense_matrix_condition_fallback_threshold());
     EXPECT_TRUE(result.used_svd_fallback);
-#else
-    EXPECT_FALSE(result.used_svd_fallback);
-#endif
 
     for (std::size_t row = 0; row < 2u; ++row) {
         for (std::size_t col = 0; col < 2u; ++col) {
@@ -136,9 +258,6 @@ TEST(DenseLinearAlgebra, HighConditionInverseUsesSvdFallback) {
 }
 
 TEST(DenseLinearAlgebra, DiagnosticValidationRejectsUnsupportedCondition) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "condition rejection requires FE_ENABLE_EIGEN diagnostics";
-#endif
     DenseInverseResult result;
     result.diagnostics.rank = 2u;
     result.diagnostics.condition_estimate =
@@ -193,13 +312,9 @@ TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
     const auto full =
         dense_matrix_diagnostics(diagonal, 2u, 2u, "diagonal 2x2");
     EXPECT_EQ(full.rank, 2u);
-#if defined(FE_HAS_EIGEN) && FE_HAS_EIGEN
     EXPECT_NEAR(full.largest_singular_value, Real(4), Real(1.0e-14));
     EXPECT_NEAR(full.smallest_retained_singular_value, Real(0.5), Real(1.0e-14));
     EXPECT_NEAR(full.condition_estimate, Real(8), Real(1.0e-14));
-#else
-    EXPECT_TRUE(std::isinf(full.condition_estimate));
-#endif
 
     const std::vector<Real> rank_one{
         Real(1), Real(2),
@@ -212,9 +327,6 @@ TEST(DenseLinearAlgebra, DiagnosticsReportRankAndConditionEstimate) {
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquations) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
-#endif
     const std::vector<Real> rank_one{
         Real(1), Real(2),
         Real(2), Real(4)
@@ -246,9 +358,6 @@ TEST(DenseLinearAlgebra, PseudoInverseHandlesSingularMatrixWithoutNormalEquation
 }
 
 TEST(DenseLinearAlgebra, PseudoInverseDropsNearZeroSingularValues) {
-#if !(defined(FE_HAS_EIGEN) && FE_HAS_EIGEN)
-    GTEST_SKIP() << "rank-revealing pseudo-inverse requires FE_ENABLE_EIGEN";
-#endif
     const std::vector<Real> near_singular{
         Real(1), Real(0),
         Real(0), Real(1.0e-18)
diff --git a/tests/unitTests/FE/Math/test_Matrix.cpp b/tests/unitTests/FE/Math/test_Matrix.cpp
deleted file mode 100644
index 3b2fe664a..000000000
--- a/tests/unitTests/FE/Math/test_Matrix.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-/**
- * @file test_Matrix.cpp
- * @brief Unit tests for Matrix.h - fixed-size matrices with expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Matrix.h"
-#include "FE/Math/Vector.h"
-#include "FE/Math/MatrixExpr.h"
-#include <limits>
-#include <cmath>
-#include <thread>
-#include <vector>
-
-using namespace svmp::FE::math;
-
-// Test fixture for Matrix tests
-class MatrixTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    // Helper function to check if two values are approximately equal
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Construction and Initialization Tests
-// =============================================================================
-
-TEST_F(MatrixTest, DefaultConstruction) {
-    Matrix<double, 3, 3> m;
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(m(i, j), 0.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, FillConstruction) {
-    Matrix<double, 2, 3> m(5.0);
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(m(i, j), 5.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, InitializerListConstruction) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    EXPECT_EQ(m(0, 0), 1.0);
-    EXPECT_EQ(m(0, 1), 2.0);
-    EXPECT_EQ(m(0, 2), 3.0);
-    EXPECT_EQ(m(1, 0), 4.0);
-    EXPECT_EQ(m(1, 1), 5.0);
-    EXPECT_EQ(m(1, 2), 6.0);
-}
-
-TEST_F(MatrixTest, CopyConstruction) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0},
-                            {3.0, 4.0}};
-    Matrix<double, 2, 2> m2(m1);
-
-    EXPECT_EQ(m2(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 1), 2.0);
-    EXPECT_EQ(m2(1, 0), 3.0);
-    EXPECT_EQ(m2(1, 1), 4.0);
-
-    // Ensure deep copy
-    m2(0, 0) = 10.0;
-    EXPECT_EQ(m1(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 0), 10.0);
-}
-
-TEST_F(MatrixTest, MoveConstruction) {
-    Matrix<double, 2, 2> m1{{1.0, 2.0},
-                            {3.0, 4.0}};
-    Matrix<double, 2, 2> m2(std::move(m1));
-
-    EXPECT_EQ(m2(0, 0), 1.0);
-    EXPECT_EQ(m2(0, 1), 2.0);
-    EXPECT_EQ(m2(1, 0), 3.0);
-    EXPECT_EQ(m2(1, 1), 4.0);
-}
-
-// =============================================================================
-// Element Access Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ElementAccess) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    // Non-const access using operator()
-    EXPECT_EQ(m(0, 0), 1.0);
-    EXPECT_EQ(m(0, 2), 3.0);
-    EXPECT_EQ(m(1, 1), 5.0);
-
-    // Modification
-    m(1, 2) = 7.0;
-    EXPECT_EQ(m(1, 2), 7.0);
-
-    // Const access
-    const Matrix<double, 2, 3> cm{{1.0, 2.0, 3.0},
-                                  {4.0, 5.0, 6.0}};
-    EXPECT_EQ(cm(0, 1), 2.0);
-    EXPECT_EQ(cm(1, 0), 4.0);
-}
-
-TEST_F(MatrixTest, ElementAccessBounds) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    // at() with bounds checking
-    EXPECT_EQ(m.at(0, 0), 1.0);
-    EXPECT_EQ(m.at(1, 2), 6.0);
-
-    // Test out of bounds throws
-    EXPECT_THROW(m.at(2, 0), std::out_of_range);
-    EXPECT_THROW(m.at(0, 3), std::out_of_range);
-    EXPECT_THROW(m.at(10, 10), std::out_of_range);
-}
-
-TEST_F(MatrixTest, RowColumnAccess) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-
-    // Get row
-    auto row1 = m.row(1);
-    EXPECT_EQ(row1[0], 4.0);
-    EXPECT_EQ(row1[1], 5.0);
-    EXPECT_EQ(row1[2], 6.0);
-
-    // Get column
-    auto col2 = m.col(2);
-    EXPECT_EQ(col2[0], 3.0);
-    EXPECT_EQ(col2[1], 6.0);
-    EXPECT_EQ(col2[2], 9.0);
-
-    // Set row
-    Vector<double, 3> new_row{10.0, 11.0, 12.0};
-    m.set_row(0, new_row);
-    EXPECT_EQ(m(0, 0), 10.0);
-    EXPECT_EQ(m(0, 1), 11.0);
-    EXPECT_EQ(m(0, 2), 12.0);
-
-    // Set column
-    Vector<double, 3> new_col{20.0, 21.0, 22.0};
-    m.set_col(1, new_col);
-    EXPECT_EQ(m(0, 1), 20.0);
-    EXPECT_EQ(m(1, 1), 21.0);
-    EXPECT_EQ(m(2, 1), 22.0);
-}
-
-// =============================================================================
-// Arithmetic Operations Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Addition) {
-    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
-                           {10.0, 11.0, 12.0}};
-
-    Matrix<double, 2, 3> c = a + b;
-    EXPECT_EQ(c(0, 0), 8.0);
-    EXPECT_EQ(c(0, 1), 10.0);
-    EXPECT_EQ(c(0, 2), 12.0);
-    EXPECT_EQ(c(1, 0), 14.0);
-    EXPECT_EQ(c(1, 1), 16.0);
-    EXPECT_EQ(c(1, 2), 18.0);
-}
-
-TEST_F(MatrixTest, Subtraction) {
-    Matrix<double, 2, 3> a{{8.0, 10.0, 12.0},
-                           {14.0, 16.0, 18.0}};
-    Matrix<double, 2, 3> b{{7.0, 8.0, 9.0},
-                           {10.0, 11.0, 12.0}};
-
-    Matrix<double, 2, 3> c = a - b;
-    EXPECT_EQ(c(0, 0), 1.0);
-    EXPECT_EQ(c(0, 1), 2.0);
-    EXPECT_EQ(c(0, 2), 3.0);
-    EXPECT_EQ(c(1, 0), 4.0);
-    EXPECT_EQ(c(1, 1), 5.0);
-    EXPECT_EQ(c(1, 2), 6.0);
-}
-
-TEST_F(MatrixTest, ScalarMultiplication) {
-    Matrix<double, 2, 2> a{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> b = 2.0 * a;
-    EXPECT_EQ(b(0, 0), 2.0);
-    EXPECT_EQ(b(0, 1), 4.0);
-    EXPECT_EQ(b(1, 0), 6.0);
-    EXPECT_EQ(b(1, 1), 8.0);
-
-    Matrix<double, 2, 2> c = a * 3.0;
-    EXPECT_EQ(c(0, 0), 3.0);
-    EXPECT_EQ(c(0, 1), 6.0);
-    EXPECT_EQ(c(1, 0), 9.0);
-    EXPECT_EQ(c(1, 1), 12.0);
-}
-
-TEST_F(MatrixTest, ScalarDivision) {
-    Matrix<double, 2, 2> a{{2.0, 4.0},
-                           {6.0, 8.0}};
-
-    Matrix<double, 2, 2> b = a / 2.0;
-    EXPECT_EQ(b(0, 0), 1.0);
-    EXPECT_EQ(b(0, 1), 2.0);
-    EXPECT_EQ(b(1, 0), 3.0);
-    EXPECT_EQ(b(1, 1), 4.0);
-}
-
-TEST_F(MatrixTest, MatrixMultiplication) {
-    Matrix<double, 2, 3> a{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-    Matrix<double, 3, 2> b{{7.0, 8.0},
-                           {9.0, 10.0},
-                           {11.0, 12.0}};
-
-    Matrix<double, 2, 2> c = a * b;
-    EXPECT_EQ(c(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
-    EXPECT_EQ(c(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
-    EXPECT_EQ(c(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
-    EXPECT_EQ(c(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
-}
-
-TEST_F(MatrixTest, MatrixVectorMultiplication) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    Vector<double, 3> result = m * v;
-    EXPECT_EQ(result[0], 14.0);  // 1*1 + 2*2 + 3*3
-    EXPECT_EQ(result[1], 32.0);  // 4*1 + 5*2 + 6*3
-    EXPECT_EQ(result[2], 50.0);  // 7*1 + 8*2 + 9*3
-}
-
-// =============================================================================
-// Special Matrix Operations Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Transpose) {
-    Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0}};
-
-    Matrix<double, 3, 2> mt = m.transpose();
-    EXPECT_EQ(mt(0, 0), 1.0);
-    EXPECT_EQ(mt(0, 1), 4.0);
-    EXPECT_EQ(mt(1, 0), 2.0);
-    EXPECT_EQ(mt(1, 1), 5.0);
-    EXPECT_EQ(mt(2, 0), 3.0);
-    EXPECT_EQ(mt(2, 1), 6.0);
-}
-
-TEST_F(MatrixTest, Determinant2x2) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, -2.0);  // 1*4 - 2*3 = 4 - 6 = -2
-}
-
-TEST_F(MatrixTest, Determinant3x3) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {0.0, 1.0, 4.0},
-                           {5.0, 6.0, 0.0}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, 1.0);  // Using Sarrus rule
-}
-
-TEST_F(MatrixTest, Determinant4x4) {
-    Matrix<double, 4, 4> m{{1, 0, 0, 0},
-                           {0, 2, 0, 0},
-                           {0, 0, 3, 0},
-                           {0, 0, 0, 4}};
-
-    double det = m.determinant();
-    EXPECT_EQ(det, 24.0);  // 1*2*3*4 = 24 (diagonal matrix)
-}
-
-TEST_F(MatrixTest, Inverse2x2) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> inv = m.inverse();
-
-    // Check inverse properties
-    EXPECT_NEAR(inv(0, 0), -2.0, tolerance);
-    EXPECT_NEAR(inv(0, 1), 1.0, tolerance);
-    EXPECT_NEAR(inv(1, 0), 1.5, tolerance);
-    EXPECT_NEAR(inv(1, 1), -0.5, tolerance);
-
-    // Verify M * M^-1 = I
-    Matrix<double, 2, 2> identity = m * inv;
-    EXPECT_NEAR(identity(0, 0), 1.0, tolerance);
-    EXPECT_NEAR(identity(0, 1), 0.0, tolerance);
-    EXPECT_NEAR(identity(1, 0), 0.0, tolerance);
-    EXPECT_NEAR(identity(1, 1), 1.0, tolerance);
-}
-
-TEST_F(MatrixTest, Inverse3x3) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {0.0, 1.0, 4.0},
-                           {5.0, 6.0, 0.0}};
-
-    Matrix<double, 3, 3> inv = m.inverse();
-
-    // Verify M * M^-1 = I
-    Matrix<double, 3, 3> identity = m * inv;
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_NEAR(identity(i, j), expected, tolerance);
-        }
-    }
-}
-
-TEST_F(MatrixTest, Trace) {
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-
-    double trace = m.trace();
-    EXPECT_EQ(trace, 15.0);  // 1 + 5 + 9 = 15
-}
-
-// =============================================================================
-// Special Matrix Types Tests
-// =============================================================================
-
-TEST_F(MatrixTest, IdentityMatrix) {
-    Matrix<double, 3, 3> I = Matrix<double, 3, 3>::identity();
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            double expected = (i == j) ? 1.0 : 0.0;
-            EXPECT_EQ(I(i, j), expected);
-        }
-    }
-
-    // Test identity property
-    Matrix<double, 3, 3> m{{1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0},
-                           {7.0, 8.0, 9.0}};
-    Matrix<double, 3, 3> result = m * I;
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(result(i, j), m(i, j));
-        }
-    }
-}
-
-TEST_F(MatrixTest, ZeroMatrix) {
-    Matrix<double, 2, 3> Z = Matrix<double, 2, 3>::zero();
-
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < 3; ++j) {
-            EXPECT_EQ(Z(i, j), 0.0);
-        }
-    }
-}
-
-TEST_F(MatrixTest, DiagonalMatrix) {
-    Vector<double, 3> diag{1.0, 2.0, 3.0};
-    Matrix<double, 3, 3> D = Matrix<double, 3, 3>::diagonal(diag);
-
-    EXPECT_EQ(D(0, 0), 1.0);
-    EXPECT_EQ(D(1, 1), 2.0);
-    EXPECT_EQ(D(2, 2), 3.0);
-
-    // Off-diagonal elements should be zero
-    EXPECT_EQ(D(0, 1), 0.0);
-    EXPECT_EQ(D(0, 2), 0.0);
-    EXPECT_EQ(D(1, 0), 0.0);
-    EXPECT_EQ(D(1, 2), 0.0);
-    EXPECT_EQ(D(2, 0), 0.0);
-    EXPECT_EQ(D(2, 1), 0.0);
-}
-
-// =============================================================================
-// Expression Template Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ExpressionTemplatesNoTemporaries) {
-    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
-    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
-    Matrix<double, 2, 2> c{{9, 10}, {11, 12}};
-
-    // Complex expression should create no temporaries
-    Matrix<double, 2, 2> result = a + b - c;
-
-    EXPECT_EQ(result(0, 0), -3.0);   // 1 + 5 - 9
-    EXPECT_EQ(result(0, 1), -2.0);   // 2 + 6 - 10
-    EXPECT_EQ(result(1, 0), -1.0);   // 3 + 7 - 11
-    EXPECT_EQ(result(1, 1), 0.0);    // 4 + 8 - 12
-}
-
-TEST_F(MatrixTest, LazyEvaluation) {
-    Matrix<double, 2, 2> a{{1, 2}, {3, 4}};
-    Matrix<double, 2, 2> b{{5, 6}, {7, 8}};
-
-    // Expression should not be evaluated until assignment
-    auto expr = a + b;  // No computation yet
-
-    Matrix<double, 2, 2> result = expr;  // Evaluation happens here
-    EXPECT_EQ(result(0, 0), 6.0);
-    EXPECT_EQ(result(0, 1), 8.0);
-}
-
-// =============================================================================
-// Edge Cases and Error Handling Tests
-// =============================================================================
-
-TEST_F(MatrixTest, SingularMatrixInverse) {
-    Matrix<double, 2, 2> singular{{1.0, 2.0},
-                                  {2.0, 4.0}};  // det = 0
-
-    EXPECT_THROW(singular.inverse(), std::runtime_error);
-}
-
-TEST_F(MatrixTest, DivisionByZero) {
-    Matrix<double, 2, 2> m{{1.0, 2.0},
-                           {3.0, 4.0}};
-
-    Matrix<double, 2, 2> result = m / 0.0;
-    EXPECT_TRUE(std::isinf(result(0, 0)));
-    EXPECT_TRUE(std::isinf(result(0, 1)));
-}
-
-TEST_F(MatrixTest, ExtremeLargeValues) {
-    double large = 1e308;
-    Matrix<double, 2, 2> m{{large, 0}, {0, large}};
-
-    Matrix<double, 2, 2> half = m / 2.0;
-    EXPECT_FALSE(std::isinf(half(0, 0)));
-    EXPECT_EQ(half(0, 0), large / 2.0);
-}
-
-// =============================================================================
-// Numerical Precision Tests
-// =============================================================================
-
-TEST_F(MatrixTest, NumericalStability) {
-    // Test near-singular matrix
-    double eps = 1e-15;
-    Matrix<double, 2, 2> m{{1.0, 1.0},
-                           {1.0, 1.0 + eps}};
-
-    double det = m.determinant();
-    // Relax tolerance due to floating-point arithmetic in determinant calculation
-    EXPECT_NEAR(det, eps, 1e-14);
-}
-
-TEST_F(MatrixTest, OrthogonalMatrixProperties) {
-    // Create rotation matrix (orthogonal)
-    double angle = M_PI / 4;
-    Matrix<double, 2, 2> R{{cos(angle), -sin(angle)},
-                           {sin(angle), cos(angle)}};
-
-    // Check orthogonality: R * R^T = I
-    Matrix<double, 2, 2> RRt = R * R.transpose();
-    EXPECT_NEAR(RRt(0, 0), 1.0, tolerance);
-    EXPECT_NEAR(RRt(0, 1), 0.0, tolerance);
-    EXPECT_NEAR(RRt(1, 0), 0.0, tolerance);
-    EXPECT_NEAR(RRt(1, 1), 1.0, tolerance);
-
-    // Check determinant = ±1
-    EXPECT_NEAR(std::abs(R.determinant()), 1.0, tolerance);
-}
-
-// =============================================================================
-// Matrix Properties Tests
-// =============================================================================
-
-TEST_F(MatrixTest, IsSymmetric) {
-    Matrix<double, 3, 3> sym{{1, 2, 3},
-                             {2, 4, 5},
-                             {3, 5, 6}};
-    EXPECT_TRUE(sym.is_symmetric(tolerance));
-
-    Matrix<double, 3, 3> nonsym{{1, 2, 3},
-                                {4, 5, 6},
-                                {7, 8, 9}};
-    EXPECT_FALSE(nonsym.is_symmetric(tolerance));
-}
-
-TEST_F(MatrixTest, IsSkewSymmetric) {
-    Matrix<double, 3, 3> skew{{0, -1, 2},
-                              {1, 0, -3},
-                              {-2, 3, 0}};
-    EXPECT_TRUE(skew.is_skew_symmetric(tolerance));
-
-    Matrix<double, 3, 3> nonskew{{1, 2, 3},
-                                 {4, 5, 6},
-                                 {7, 8, 9}};
-    EXPECT_FALSE(nonskew.is_skew_symmetric(tolerance));
-}
-
-TEST_F(MatrixTest, IsDiagonal) {
-    Matrix<double, 3, 3> diag{{1, 0, 0},
-                              {0, 2, 0},
-                              {0, 0, 3}};
-    EXPECT_TRUE(diag.is_diagonal(tolerance));
-
-    Matrix<double, 3, 3> nondiag{{1, 0.1, 0},
-                                 {0, 2, 0},
-                                 {0, 0, 3}};
-    EXPECT_FALSE(nondiag.is_diagonal(tolerance));
-}
-
-// =============================================================================
-// Thread Safety Tests
-// =============================================================================
-
-TEST_F(MatrixTest, ThreadSafetyReadOnly) {
-    Matrix<double, 3, 3> m{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
-
-    std::vector<std::thread> threads;
-    std::vector<double> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&m, &results, i]() {
-            results[static_cast<std::size_t>(i)] = m.trace();
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    for (double r : results) {
-        EXPECT_EQ(r, 15.0);
-    }
-}
-
-// =============================================================================
-// Memory Alignment Tests
-// =============================================================================
-
-TEST_F(MatrixTest, MemoryAlignment) {
-    Matrix<double, 3, 3> m;
-
-    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(m.data());
-    EXPECT_EQ(addr % 32, 0) << "Matrix data should be 32-byte aligned for AVX";
-}
-
-// =============================================================================
-// Utility Function Tests
-// =============================================================================
-
-TEST_F(MatrixTest, Norms) {
-    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
-
-    // Frobenius norm: sqrt(1^2 + 2^2 + 3^2 + 4^2) = sqrt(30)
-    EXPECT_NEAR(m.frobenius_norm(), std::sqrt(30.0), tolerance);
-
-    // Infinity norm (max row sum)
-    EXPECT_EQ(m.infinity_norm(), 7.0);  // max(|1|+|2|, |3|+|4|) = max(3, 7)
-
-    // One norm (max column sum)
-    EXPECT_EQ(m.one_norm(), 6.0);  // max(|1|+|3|, |2|+|4|) = max(4, 6)
-}
-
-TEST_F(MatrixTest, MinMaxElements) {
-    Matrix<double, 2, 3> m{{3, -1, 4}, {1, -2, 5}};
-
-    EXPECT_EQ(m.min(), -2.0);
-    EXPECT_EQ(m.max(), 5.0);
-}
-
-TEST_F(MatrixTest, ToString) {
-    Matrix<double, 2, 2> m{{1, 2}, {3, 4}};
-    std::stringstream ss;
-    ss << m;
-
-    std::string expected = "[[1, 2]\n [3, 4]]";
-    EXPECT_EQ(ss.str(), expected);
-}
diff --git a/tests/unitTests/FE/Math/test_MatrixExpr.cpp b/tests/unitTests/FE/Math/test_MatrixExpr.cpp
deleted file mode 100644
index b17bce928..000000000
--- a/tests/unitTests/FE/Math/test_MatrixExpr.cpp
+++ /dev/null
@@ -1,527 +0,0 @@
-/**
- * @file test_MatrixExpr.cpp
- * @brief Unit tests for MatrixExpr.h - matrix expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Matrix.h"
-#include "FE/Math/MatrixExpr.h"
-#include "FE/Math/Vector.h"
-#include <limits>
-#include <cmath>
-#include <memory>
-#include <atomic>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for MatrixExpr tests
-class MatrixExprTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    // Custom allocator to track memory allocations
-    template<typename T>
-    class TrackingAllocator {
-    public:
-        using value_type = T;
-
-        static std::atomic<size_t> allocations;
-        static std::atomic<size_t> deallocations;
-        static std::atomic<size_t> bytes_allocated;
-
-        TrackingAllocator() = default;
-
-        template<typename U>
-        TrackingAllocator(const TrackingAllocator<U>&) {}
-
-        T* allocate(size_t n) {
-            allocations.fetch_add(1);
-            bytes_allocated.fetch_add(n * sizeof(T));
-            return static_cast<T*>(::operator new(n * sizeof(T)));
-        }
-
-        void deallocate(T* p, size_t n) {
-            deallocations.fetch_add(1);
-            ::operator delete(p);
-        }
-
-        static void reset() {
-            allocations = 0;
-            deallocations = 0;
-            bytes_allocated = 0;
-        }
-    };
-
-    void SetUp() override {
-        TrackingAllocator<double>::reset();
-    }
-
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::allocations{0};
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::deallocations{0};
-template<typename T>
-std::atomic<size_t> MatrixExprTest::TrackingAllocator<T>::bytes_allocated{0};
-
-// =============================================================================
-// Lazy Evaluation Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, LazyEvaluationNoTemporaries) {
-    // Expression templates should not create temporary matrices
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Build expression without evaluation
-    auto expr = A + B - C;
-
-    // Expression type should not be Matrix, but an expression type
-    using ExprType = decltype(expr);
-    EXPECT_FALSE((std::is_same_v<ExprType, Matrix<double, 2, 2>>));
-
-    // Now evaluate
-    Matrix<double, 2, 2> result = expr;
-    EXPECT_DOUBLE_EQ(result(0, 0), -3.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), -2.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), -1.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 0.0);
-}
-
-TEST_F(MatrixExprTest, LazyEvaluationAccessPattern) {
-    Matrix<double, 3, 3> A;
-    Matrix<double, 3, 3> B;
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = i * 3 + j + 1;
-            B(i, j) = (i * 3 + j + 1) * 2;
-        }
-    }
-
-    auto expr = A + B;
-
-    // Access individual elements without full evaluation
-    EXPECT_DOUBLE_EQ(expr(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(expr(1, 1), 15.0);
-    EXPECT_DOUBLE_EQ(expr(2, 2), 27.0);
-
-    // Size should be accessible
-    EXPECT_EQ(expr.rows(), 3u);
-    EXPECT_EQ(expr.cols(), 3u);
-}
-
-// =============================================================================
-// Matrix Multiplication Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, MatrixMultiplicationExpression) {
-    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
-    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
-
-    // Matrix multiplication should produce 2x2 result
-    Matrix<double, 2, 2> C = A * B;
-
-    // Verify results
-    EXPECT_DOUBLE_EQ(C(0, 0), 58.0);   // 1*7 + 2*9 + 3*11
-    EXPECT_DOUBLE_EQ(C(0, 1), 64.0);   // 1*8 + 2*10 + 3*12
-    EXPECT_DOUBLE_EQ(C(1, 0), 139.0);  // 4*7 + 5*9 + 6*11
-    EXPECT_DOUBLE_EQ(C(1, 1), 154.0);  // 4*8 + 5*10 + 6*12
-}
-
-TEST_F(MatrixExprTest, ChainedMatrixMultiplication) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Chain matrix multiplications: (A * B) * C
-    Matrix<double, 2, 2> result = A * B * C;
-
-    // First compute A * B
-    Matrix<double, 2, 2> AB = A * B;
-    EXPECT_DOUBLE_EQ(AB(0, 0), 19.0);  // 1*5 + 2*7
-    EXPECT_DOUBLE_EQ(AB(0, 1), 22.0);  // 1*6 + 2*8
-    EXPECT_DOUBLE_EQ(AB(1, 0), 43.0);  // 3*5 + 4*7
-    EXPECT_DOUBLE_EQ(AB(1, 1), 50.0);  // 3*6 + 4*8
-
-    // Then (A * B) * C
-    EXPECT_DOUBLE_EQ(result(0, 0), 413.0);  // 19*9 + 22*11
-    EXPECT_DOUBLE_EQ(result(0, 1), 454.0);  // 19*10 + 22*12
-    EXPECT_DOUBLE_EQ(result(1, 0), 937.0);  // 43*9 + 50*11
-    EXPECT_DOUBLE_EQ(result(1, 1), 1030.0); // 43*10 + 50*12
-}
-
-// =============================================================================
-// Mixed Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, MixedMatrixOperations) {
-    Matrix<double, 3, 3> A, B, C, D;
-
-    // Initialize matrices
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = i + j + 1;
-            B(i, j) = (i + 1) * (j + 1);
-            C(i, j) = i * j + 1;
-            D(i, j) = 1.0;
-        }
-    }
-
-    // Complex expression: A * B + C * D
-    Matrix<double, 3, 3> result = A * B + C * D;
-
-    // Verify a few key elements
-    Matrix<double, 3, 3> AB = A * B;
-    Matrix<double, 3, 3> CD = C * D;
-
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            EXPECT_DOUBLE_EQ(result(i, j), AB(i, j) + CD(i, j));
-        }
-    }
-}
-
-TEST_F(MatrixExprTest, ScalarMultiplicationInExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-
-    Matrix<double, 2, 2> result = 2.0 * (A + B) / 3.0;
-
-    EXPECT_TRUE(approx_equal(result(0, 0), 4.0));
-    EXPECT_TRUE(approx_equal(result(0, 1), 16.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 0), 20.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 1), 8.0));
-}
-
-// =============================================================================
-// Transpose Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, TransposeExpression) {
-    Matrix<double, 2, 3> A{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}};
-
-    auto AT = transpose(A);
-
-    // Check dimensions
-    EXPECT_EQ(AT.rows(), 3u);
-    EXPECT_EQ(AT.cols(), 2u);
-
-    // Check values
-    EXPECT_DOUBLE_EQ(AT(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(AT(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(AT(1, 0), 2.0);
-    EXPECT_DOUBLE_EQ(AT(1, 1), 5.0);
-    EXPECT_DOUBLE_EQ(AT(2, 0), 3.0);
-    EXPECT_DOUBLE_EQ(AT(2, 1), 6.0);
-}
-
-TEST_F(MatrixExprTest, TransposeInExpression) {
-    Matrix<double, 3, 2> A{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}};
-    Matrix<double, 3, 2> B{{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}};
-
-    // Compute A^T * B (should be 2x2)
-    Matrix<double, 2, 2> result = transpose(A) * B;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 89.0);   // 1*7 + 3*9 + 5*11
-    EXPECT_DOUBLE_EQ(result(0, 1), 98.0);   // 1*8 + 3*10 + 5*12
-    EXPECT_DOUBLE_EQ(result(1, 0), 116.0);  // 2*7 + 4*9 + 6*11
-    EXPECT_DOUBLE_EQ(result(1, 1), 128.0);  // 2*8 + 4*10 + 6*12
-}
-
-// =============================================================================
-// Unary Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, NegationInExpression) {
-    Matrix<double, 2, 2> A{{1.0, -2.0}, {3.0, -4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {-7.0, 8.0}};
-
-    Matrix<double, 2, 2> result = -A + (-B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), -6.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), -4.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), -4.0);
-}
-
-TEST_F(MatrixExprTest, AbsoluteValueExpression) {
-    Matrix<double, 2, 3> M{{-1.5, 2.3, -4.7}, {0.0, -3.2, 5.1}};
-
-    Matrix<double, 2, 3> result = abs(M);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 1.5);
-    EXPECT_DOUBLE_EQ(result(0, 1), 2.3);
-    EXPECT_DOUBLE_EQ(result(0, 2), 4.7);
-    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 3.2);
-    EXPECT_DOUBLE_EQ(result(1, 2), 5.1);
-}
-
-TEST_F(MatrixExprTest, SqrtExpression) {
-    Matrix<double, 2, 2> M{{4.0, 9.0}, {16.0, 25.0}};
-
-    Matrix<double, 2, 2> result = sqrt(M);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 3.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 4.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 5.0);
-}
-
-// =============================================================================
-// Element-wise Operations Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, HadamardProductExpression) {
-    Matrix<double, 2, 3> A{{2.0, 3.0, 4.0}, {5.0, 6.0, 7.0}};
-    Matrix<double, 2, 3> B{{8.0, 9.0, 10.0}, {11.0, 12.0, 13.0}};
-
-    Matrix<double, 2, 3> result = hadamard(A, B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 16.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 27.0);
-    EXPECT_DOUBLE_EQ(result(0, 2), 40.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 55.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 72.0);
-    EXPECT_DOUBLE_EQ(result(1, 2), 91.0);
-}
-
-TEST_F(MatrixExprTest, HadamardDivisionExpression) {
-    Matrix<double, 2, 2> A{{10.0, 18.0}, {28.0, 36.0}};
-    Matrix<double, 2, 2> B{{2.0, 3.0}, {4.0, 6.0}};
-
-    Matrix<double, 2, 2> result = hadamard_div(A, B);
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 5.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 6.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 7.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 6.0);
-}
-
-// =============================================================================
-// Norm and Trace Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, FrobeniusNormOfExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{2.0, 2.0}, {2.0, 2.0}};
-
-    double norm_sq = frobenius_norm_squared(A - B);
-    double norm = frobenius_norm(A - B);
-
-    // (A - B) = [[-1, 0], [1, 2]]
-    // norm_squared = 1 + 0 + 1 + 4 = 6
-    EXPECT_DOUBLE_EQ(norm_sq, 6.0);
-    EXPECT_DOUBLE_EQ(norm, std::sqrt(6.0));
-}
-
-TEST_F(MatrixExprTest, TraceOfExpression) {
-    Matrix<double, 3, 3> A;
-    Matrix<double, 3, 3> B;
-
-    // Initialize as diagonal matrices
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = (i == j) ? (i + 1) : 0.0;  // diag(1, 2, 3)
-            B(i, j) = (i == j) ? (i + 4) : 0.0;  // diag(4, 5, 6)
-        }
-    }
-
-    double tr = trace(A + B);
-
-    // trace(A + B) = trace(diag(5, 7, 9)) = 21
-    EXPECT_DOUBLE_EQ(tr, 21.0);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, TypeDeductionCorrectness) {
-    Matrix<float, 2, 2> Mf{{1.0f, 2.0f}, {3.0f, 4.0f}};
-    Matrix<double, 2, 2> Md{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Float expression
-    auto expr = Mf + Mf;
-    using ExprType = decltype(expr(0, 0));
-    EXPECT_TRUE((std::is_same_v<ExprType, float>));
-
-    // Test that expression evaluates correctly
-    Matrix<float, 2, 2> result = expr;
-    EXPECT_FLOAT_EQ(result(0, 0), 2.0f);
-    EXPECT_FLOAT_EQ(result(1, 1), 8.0f);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SFINAEConstraints) {
-    // Test that MatrixExpr operators only work with MatrixExpr types
-    Matrix<double, 2, 2> M1{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> M2{{5.0, 6.0}, {7.0, 8.0}};
-
-    // This should compile
-    auto expr = M1 + M2;
-    Matrix<double, 2, 2> result = expr;
-
-    // Verify the constraint checking
-    EXPECT_TRUE((std::is_base_of_v<MatrixExpr<Matrix<double, 2, 2>>, Matrix<double, 2, 2>>));
-}
-
-// =============================================================================
-// Aliasing and Self-Assignment Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SelfAssignmentWithExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-
-    // Self-assignment through expression
-    A = A + B;
-
-    EXPECT_DOUBLE_EQ(A(0, 0), 6.0);
-    EXPECT_DOUBLE_EQ(A(0, 1), 8.0);
-    EXPECT_DOUBLE_EQ(A(1, 0), 10.0);
-    EXPECT_DOUBLE_EQ(A(1, 1), 12.0);
-}
-
-TEST_F(MatrixExprTest, AliasingInExpression) {
-    Matrix<double, 2, 2> A{{2.0, 3.0}, {4.0, 5.0}};
-    Matrix<double, 2, 2> B{{1.0, 1.0}, {1.0, 1.0}};
-
-    // A appears on both sides
-    A = B + A;
-
-    EXPECT_DOUBLE_EQ(A(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(A(0, 1), 4.0);
-    EXPECT_DOUBLE_EQ(A(1, 0), 5.0);
-    EXPECT_DOUBLE_EQ(A(1, 1), 6.0);
-}
-
-// =============================================================================
-// Edge Cases Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, SingleElementMatrix) {
-    Matrix<double, 1, 1> A{{5.0}};
-    Matrix<double, 1, 1> B{{3.0}};
-
-    Matrix<double, 1, 1> result = A + B - A * 0.5;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 5.5);
-}
-
-TEST_F(MatrixExprTest, NonSquareMatrixOperations) {
-    Matrix<double, 2, 4> A;
-    Matrix<double, 2, 4> B;
-
-    for (int i = 0; i < 2; ++i) {
-        for (int j = 0; j < 4; ++j) {
-            A(i, j) = i * 4 + j + 1;
-            B(i, j) = (i * 4 + j + 1) * 2;
-        }
-    }
-
-    Matrix<double, 2, 4> result = A + B - A * 0.5;
-
-    for (int i = 0; i < 2; ++i) {
-        for (int j = 0; j < 4; ++j) {
-            double expected = A(i, j) + B(i, j) - A(i, j) * 0.5;
-            EXPECT_DOUBLE_EQ(result(i, j), expected);
-        }
-    }
-}
-
-// =============================================================================
-// Diagonal Matrix Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, DiagonalMatrixExpression) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    auto diag = DiagonalExpr<Vector<double, 3>>(v);
-
-    // Check dimensions
-    EXPECT_EQ(diag.rows(), 3u);
-    EXPECT_EQ(diag.cols(), 3u);
-
-    // Check values
-    EXPECT_DOUBLE_EQ(diag(0, 0), 1.0);
-    EXPECT_DOUBLE_EQ(diag(1, 1), 2.0);
-    EXPECT_DOUBLE_EQ(diag(2, 2), 3.0);
-
-    // Off-diagonal should be zero
-    EXPECT_DOUBLE_EQ(diag(0, 1), 0.0);
-    EXPECT_DOUBLE_EQ(diag(1, 0), 0.0);
-}
-
-TEST_F(MatrixExprTest, DiagonalMatrixInExpression) {
-    Vector<double, 2> v{2.0, 3.0};
-    Matrix<double, 2, 2> A{{1.0, 1.0}, {1.0, 1.0}};
-
-    auto diag = DiagonalExpr<Vector<double, 2>>(v);
-    Matrix<double, 2, 2> result = A + diag;
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 3.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 1.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 1.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 4.0);
-}
-
-// =============================================================================
-// Complex Expression Pattern Tests
-// =============================================================================
-
-TEST_F(MatrixExprTest, ComplexNestedExpression) {
-    Matrix<double, 2, 2> A{{1.0, 2.0}, {3.0, 4.0}};
-    Matrix<double, 2, 2> B{{5.0, 6.0}, {7.0, 8.0}};
-    Matrix<double, 2, 2> C{{9.0, 10.0}, {11.0, 12.0}};
-
-    // Complex expression with multiple operation types
-    Matrix<double, 2, 2> result = 2.0 * abs(A - B) + sqrt(hadamard(C, C)) / 3.0;
-
-    // |A - B| = |[-4, -4], [-4, -4]| = [4, 4], [4, 4]
-    // 2 * [4, 4], [4, 4] = [8, 8], [8, 8]
-    // C * C (element-wise) = [81, 100], [121, 144]
-    // sqrt(C * C) = [9, 10], [11, 12]
-    // sqrt(C * C) / 3 = [3, 10/3], [11/3, 4]
-    // result = [11, 34/3], [35/3, 12]
-
-    EXPECT_DOUBLE_EQ(result(0, 0), 11.0);
-    EXPECT_TRUE(approx_equal(result(0, 1), 34.0/3.0));
-    EXPECT_TRUE(approx_equal(result(1, 0), 35.0/3.0));
-    EXPECT_DOUBLE_EQ(result(1, 1), 12.0);
-}
-
-TEST_F(MatrixExprTest, MatrixVectorMixedExpression) {
-    Matrix<double, 3, 3> A;
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Create identity matrix
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            A(i, j) = (i == j) ? 1.0 : 0.0;
-        }
-    }
-
-    // Create diagonal from vector and add to identity
-    auto diag = DiagonalExpr<Vector<double, 3>>(v);
-    Matrix<double, 3, 3> result = A + diag;
-
-    // Result should be diag(2, 3, 4)
-    EXPECT_DOUBLE_EQ(result(0, 0), 2.0);
-    EXPECT_DOUBLE_EQ(result(1, 1), 3.0);
-    EXPECT_DOUBLE_EQ(result(2, 2), 4.0);
-    EXPECT_DOUBLE_EQ(result(0, 1), 0.0);
-    EXPECT_DOUBLE_EQ(result(1, 0), 0.0);
-}
diff --git a/tests/unitTests/FE/Math/test_Vector.cpp b/tests/unitTests/FE/Math/test_Vector.cpp
deleted file mode 100644
index 754ad819d..000000000
--- a/tests/unitTests/FE/Math/test_Vector.cpp
+++ /dev/null
@@ -1,588 +0,0 @@
-/**
- * @file test_Vector.cpp
- * @brief Unit tests for Vector.h - fixed-size vectors with expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Vector.h"
-#include "FE/Math/VectorExpr.h"
-#include <limits>
-#include <cmath>
-#include <sstream>
-#include <thread>
-#include <vector>
-
-using namespace svmp::FE::math;
-
-// Test fixture for Vector tests
-class VectorTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    void SetUp() override {}
-    void TearDown() override {}
-
-    // Helper function to check if two values are approximately equal
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-// =============================================================================
-// Construction and Initialization Tests
-// =============================================================================
-
-TEST_F(VectorTest, DefaultConstruction) {
-    Vector<double, 3> v;
-    EXPECT_EQ(v[0], 0.0);
-    EXPECT_EQ(v[1], 0.0);
-    EXPECT_EQ(v[2], 0.0);
-
-    Vector<float, 4> vf;
-    for (size_t i = 0; i < 4; ++i) {
-        EXPECT_EQ(vf[i], 0.0f);
-    }
-}
-
-TEST_F(VectorTest, FillConstruction) {
-    Vector<double, 3> v(5.0);
-    EXPECT_EQ(v[0], 5.0);
-    EXPECT_EQ(v[1], 5.0);
-    EXPECT_EQ(v[2], 5.0);
-
-    Vector<int, 10> vi(-3);
-    for (size_t i = 0; i < 10; ++i) {
-        EXPECT_EQ(vi[i], -3);
-    }
-}
-
-TEST_F(VectorTest, InitializerListConstruction) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 2.0);
-    EXPECT_EQ(v[2], 3.0);
-
-    // Partial initialization
-    Vector<double, 5> v2{1.0, 2.0};
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 0.0);
-    EXPECT_EQ(v2[3], 0.0);
-    EXPECT_EQ(v2[4], 0.0);
-}
-
-TEST_F(VectorTest, CopyConstruction) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2(v1);
-
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 3.0);
-
-    // Ensure deep copy
-    v2[0] = 10.0;
-    EXPECT_EQ(v1[0], 1.0);
-    EXPECT_EQ(v2[0], 10.0);
-}
-
-TEST_F(VectorTest, MoveConstruction) {
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2(std::move(v1));
-
-    EXPECT_EQ(v2[0], 1.0);
-    EXPECT_EQ(v2[1], 2.0);
-    EXPECT_EQ(v2[2], 3.0);
-}
-
-// =============================================================================
-// Element Access Tests
-// =============================================================================
-
-TEST_F(VectorTest, ElementAccess) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Non-const access
-    EXPECT_EQ(v[0], 1.0);
-    EXPECT_EQ(v[1], 2.0);
-    EXPECT_EQ(v[2], 3.0);
-
-    // Modification
-    v[1] = 5.0;
-    EXPECT_EQ(v[1], 5.0);
-
-    // Const access
-    const Vector<double, 3> cv{4.0, 5.0, 6.0};
-    EXPECT_EQ(cv[0], 4.0);
-    EXPECT_EQ(cv[1], 5.0);
-    EXPECT_EQ(cv[2], 6.0);
-}
-
-TEST_F(VectorTest, ElementAccessBounds) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // at() with bounds checking
-    EXPECT_EQ(v.at(0), 1.0);
-    EXPECT_EQ(v.at(1), 2.0);
-    EXPECT_EQ(v.at(2), 3.0);
-
-    // Test out of bounds throws
-    EXPECT_THROW(v.at(3), std::out_of_range);
-    EXPECT_THROW(v.at(100), std::out_of_range);
-}
-
-TEST_F(VectorTest, DataPointerAccess) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    double* data = v.data();
-    EXPECT_EQ(data[0], 1.0);
-    EXPECT_EQ(data[1], 2.0);
-    EXPECT_EQ(data[2], 3.0);
-
-    // Const data access
-    const Vector<double, 3> cv{4.0, 5.0, 6.0};
-    const double* cdata = cv.data();
-    EXPECT_EQ(cdata[0], 4.0);
-    EXPECT_EQ(cdata[1], 5.0);
-    EXPECT_EQ(cdata[2], 6.0);
-}
-
-// =============================================================================
-// Arithmetic Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, Addition) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> c = a + b;
-    EXPECT_EQ(c[0], 5.0);
-    EXPECT_EQ(c[1], 7.0);
-    EXPECT_EQ(c[2], 9.0);
-}
-
-TEST_F(VectorTest, Subtraction) {
-    Vector<double, 3> a{5.0, 7.0, 9.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> c = a - b;
-    EXPECT_EQ(c[0], 1.0);
-    EXPECT_EQ(c[1], 2.0);
-    EXPECT_EQ(c[2], 3.0);
-}
-
-TEST_F(VectorTest, ScalarMultiplication) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-
-    // Scalar * Vector
-    Vector<double, 3> b = 2.0 * a;
-    EXPECT_EQ(b[0], 2.0);
-    EXPECT_EQ(b[1], 4.0);
-    EXPECT_EQ(b[2], 6.0);
-
-    // Vector * Scalar
-    Vector<double, 3> c = a * 3.0;
-    EXPECT_EQ(c[0], 3.0);
-    EXPECT_EQ(c[1], 6.0);
-    EXPECT_EQ(c[2], 9.0);
-}
-
-TEST_F(VectorTest, ScalarDivision) {
-    Vector<double, 3> a{2.0, 4.0, 6.0};
-
-    Vector<double, 3> b = a / 2.0;
-    EXPECT_EQ(b[0], 1.0);
-    EXPECT_EQ(b[1], 2.0);
-    EXPECT_EQ(b[2], 3.0);
-}
-
-TEST_F(VectorTest, UnaryNegation) {
-    Vector<double, 3> a{1.0, -2.0, 3.0};
-
-    Vector<double, 3> b = -a;
-    EXPECT_EQ(b[0], -1.0);
-    EXPECT_EQ(b[1], 2.0);
-    EXPECT_EQ(b[2], -3.0);
-}
-
-TEST_F(VectorTest, CompoundAssignment) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // +=
-    a += b;
-    EXPECT_EQ(a[0], 5.0);
-    EXPECT_EQ(a[1], 7.0);
-    EXPECT_EQ(a[2], 9.0);
-
-    // -=
-    a -= b;
-    EXPECT_EQ(a[0], 1.0);
-    EXPECT_EQ(a[1], 2.0);
-    EXPECT_EQ(a[2], 3.0);
-
-    // *=
-    a *= 2.0;
-    EXPECT_EQ(a[0], 2.0);
-    EXPECT_EQ(a[1], 4.0);
-    EXPECT_EQ(a[2], 6.0);
-
-    // /=
-    a /= 2.0;
-    EXPECT_EQ(a[0], 1.0);
-    EXPECT_EQ(a[1], 2.0);
-    EXPECT_EQ(a[2], 3.0);
-}
-
-// =============================================================================
-// Vector Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, DotProduct) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    double dot = a.dot(b);
-    EXPECT_EQ(dot, 32.0);  // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
-
-    // Test commutativity
-    EXPECT_EQ(b.dot(a), dot);
-
-    // Test orthogonal vectors
-    Vector<double, 3> x{1.0, 0.0, 0.0};
-    Vector<double, 3> y{0.0, 1.0, 0.0};
-    EXPECT_EQ(x.dot(y), 0.0);
-}
-
-TEST_F(VectorTest, CrossProduct3D) {
-    Vector<double, 3> x{1.0, 0.0, 0.0};
-    Vector<double, 3> y{0.0, 1.0, 0.0};
-    Vector<double, 3> z{0.0, 0.0, 1.0};
-
-    // Test basis vector cross products
-    Vector<double, 3> xy = x.cross(y);
-    EXPECT_EQ(xy[0], 0.0);
-    EXPECT_EQ(xy[1], 0.0);
-    EXPECT_EQ(xy[2], 1.0);
-
-    Vector<double, 3> yx = y.cross(x);
-    EXPECT_EQ(yx[0], 0.0);
-    EXPECT_EQ(yx[1], 0.0);
-    EXPECT_EQ(yx[2], -1.0);
-
-    // General cross product
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c = a.cross(b);
-
-    EXPECT_EQ(c[0], -3.0);  // 2*6 - 3*5 = 12 - 15 = -3
-    EXPECT_EQ(c[1], 6.0);   // 3*4 - 1*6 = 12 - 6 = 6
-    EXPECT_EQ(c[2], -3.0);  // 1*5 - 2*4 = 5 - 8 = -3
-}
-
-TEST_F(VectorTest, Norm) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    EXPECT_EQ(v.norm(), 5.0);
-
-    Vector<double, 3> unit{1.0, 0.0, 0.0};
-    EXPECT_EQ(unit.norm(), 1.0);
-
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-    EXPECT_EQ(zero.norm(), 0.0);
-}
-
-TEST_F(VectorTest, NormSquared) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    EXPECT_EQ(v.norm_squared(), 25.0);
-
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    EXPECT_EQ(a.norm_squared(), 14.0);  // 1 + 4 + 9 = 14
-}
-
-TEST_F(VectorTest, Normalize) {
-    Vector<double, 3> v{3.0, 4.0, 0.0};
-    Vector<double, 3> n = v.normalized();
-
-    EXPECT_NEAR(n[0], 0.6, tolerance);
-    EXPECT_NEAR(n[1], 0.8, tolerance);
-    EXPECT_NEAR(n[2], 0.0, tolerance);
-    EXPECT_NEAR(n.norm(), 1.0, tolerance);
-
-    // In-place normalization
-    v.normalize();
-    EXPECT_NEAR(v[0], 0.6, tolerance);
-    EXPECT_NEAR(v[1], 0.8, tolerance);
-    EXPECT_NEAR(v.norm(), 1.0, tolerance);
-}
-
-// =============================================================================
-// Expression Template Tests
-// =============================================================================
-
-TEST_F(VectorTest, ExpressionTemplatesNoTemporaries) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-    Vector<double, 3> d{10.0, 11.0, 12.0};
-
-    // Complex expression should create no temporaries
-    Vector<double, 3> result = a + b - c + d;
-
-    EXPECT_EQ(result[0], 8.0);   // 1 + 4 - 7 + 10
-    EXPECT_EQ(result[1], 10.0);  // 2 + 5 - 8 + 11
-    EXPECT_EQ(result[2], 12.0);  // 3 + 6 - 9 + 12
-}
-
-TEST_F(VectorTest, LazyEvaluation) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // Expression should not be evaluated until assignment
-    auto expr = a + b;  // No computation yet
-
-    Vector<double, 3> result = expr;  // Evaluation happens here
-    EXPECT_EQ(result[0], 5.0);
-    EXPECT_EQ(result[1], 7.0);
-    EXPECT_EQ(result[2], 9.0);
-}
-
-TEST_F(VectorTest, MixedExpressions) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    double scalar = 2.0;
-
-    // Complex mixed expression
-    Vector<double, 3> result = scalar * (a + b) - a / scalar;
-
-    EXPECT_NEAR(result[0], 9.5, tolerance);   // 2*(1+4) - 1/2
-    EXPECT_NEAR(result[1], 13.0, tolerance);  // 2*(2+5) - 2/2
-    EXPECT_NEAR(result[2], 16.5, tolerance);  // 2*(3+6) - 3/2
-}
-
-// =============================================================================
-// Special Values Tests
-// =============================================================================
-
-TEST_F(VectorTest, ZeroVector) {
-    Vector<double, 3> zero = Vector<double, 3>::zero();
-    EXPECT_EQ(zero[0], 0.0);
-    EXPECT_EQ(zero[1], 0.0);
-    EXPECT_EQ(zero[2], 0.0);
-    EXPECT_EQ(zero.norm(), 0.0);
-}
-
-TEST_F(VectorTest, OnesVector) {
-    Vector<double, 3> ones = Vector<double, 3>::ones();
-    EXPECT_EQ(ones[0], 1.0);
-    EXPECT_EQ(ones[1], 1.0);
-    EXPECT_EQ(ones[2], 1.0);
-}
-
-TEST_F(VectorTest, BasisVectors) {
-    auto e0 = Vector<double, 3>::basis(0);
-    EXPECT_EQ(e0[0], 1.0);
-    EXPECT_EQ(e0[1], 0.0);
-    EXPECT_EQ(e0[2], 0.0);
-
-    auto e1 = Vector<double, 3>::basis(1);
-    EXPECT_EQ(e1[0], 0.0);
-    EXPECT_EQ(e1[1], 1.0);
-    EXPECT_EQ(e1[2], 0.0);
-
-    auto e2 = Vector<double, 3>::basis(2);
-    EXPECT_EQ(e2[0], 0.0);
-    EXPECT_EQ(e2[1], 0.0);
-    EXPECT_EQ(e2[2], 1.0);
-}
-
-// =============================================================================
-// Edge Cases and Error Handling Tests
-// =============================================================================
-
-TEST_F(VectorTest, DivisionByZero) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Division by zero should produce inf
-    Vector<double, 3> result = v / 0.0;
-    EXPECT_TRUE(std::isinf(result[0]));
-    EXPECT_TRUE(std::isinf(result[1]));
-    EXPECT_TRUE(std::isinf(result[2]));
-}
-
-TEST_F(VectorTest, NormalizeZeroVector) {
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-
-    // Normalizing zero vector should handle gracefully
-    Vector<double, 3> n = zero.normalized();
-    EXPECT_TRUE(std::isnan(n[0]) || n[0] == 0.0);
-}
-
-TEST_F(VectorTest, ExtremeLargeValues) {
-    double large = 1e308;  // Near double max
-    Vector<double, 3> v{large, large, large};
-
-    // Operations should not overflow
-    Vector<double, 3> half = v / 2.0;
-    EXPECT_FALSE(std::isinf(half[0]));
-    EXPECT_EQ(half[0], large / 2.0);
-}
-
-TEST_F(VectorTest, ExtremeSmallValues) {
-    double tiny = 1e-308;  // Near double min
-    Vector<double, 3> v{tiny, tiny, tiny};
-
-    // Operations should maintain precision
-    Vector<double, 3> doubled = v * 2.0;
-    EXPECT_EQ(doubled[0], tiny * 2.0);
-}
-
-// =============================================================================
-// Numerical Precision Tests
-// =============================================================================
-
-TEST_F(VectorTest, NumericalStability) {
-    // Test Kahan summation for better precision
-    Vector<double, 4> v{1e16, 1.0, -1e16, 1.0};
-    // Computed for future validation - demonstrates numerical precision issues
-    [[maybe_unused]] double sum = v[0] + v[1] + v[2] + v[3];
-
-    // Direct summation might lose precision
-    // But vector operations should maintain it
-    Vector<double, 4> a{1e16, 0.0, -1e16, 0.0};
-    Vector<double, 4> b{0.0, 1.0, 0.0, 1.0};
-    Vector<double, 4> c = a + b;
-
-    EXPECT_EQ(c[0], 1e16);
-    EXPECT_EQ(c[1], 1.0);
-    EXPECT_EQ(c[2], -1e16);
-    EXPECT_EQ(c[3], 1.0);
-}
-
-TEST_F(VectorTest, OrthogonalityPreservation) {
-    // Create nearly orthogonal vectors
-    Vector<double, 3> a{1.0, 1e-15, 0.0};
-    Vector<double, 3> b{0.0, 1.0, 0.0};
-
-    double dot = a.dot(b);
-    EXPECT_NEAR(dot, 1e-15, 1e-16);
-}
-
-// =============================================================================
-// Comparison Operations Tests
-// =============================================================================
-
-TEST_F(VectorTest, Equality) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{1.0, 2.0, 3.0};
-    Vector<double, 3> c{1.0, 2.0, 3.1};
-
-    EXPECT_TRUE(a == b);
-    EXPECT_FALSE(a == c);
-    EXPECT_FALSE(a != b);
-    EXPECT_TRUE(a != c);
-}
-
-TEST_F(VectorTest, ApproximateEquality) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{1.0 + 1e-15, 2.0 - 1e-15, 3.0 + 1e-15};
-
-    EXPECT_TRUE(a.approx_equal(b, 1e-14));
-    EXPECT_FALSE(a.approx_equal(b, 1e-16));
-}
-
-// =============================================================================
-// Thread Safety Tests
-// =============================================================================
-
-TEST_F(VectorTest, ThreadSafetyReadOnly) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Multiple threads reading should be safe
-    std::vector<std::thread> threads;
-    std::vector<double> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&v, &results, i]() {
-            results[static_cast<std::size_t>(i)] = v.norm();
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    // All threads should get same result
-    double expected = v.norm();
-    for (double r : results) {
-        EXPECT_EQ(r, expected);
-    }
-}
-
-TEST_F(VectorTest, ThreadSafetyIsolated) {
-    // Each thread works on its own vector
-    std::vector<std::thread> threads;
-    std::vector<Vector<double, 3>> results(10);
-
-    for (int i = 0; i < 10; ++i) {
-        threads.emplace_back([&results, i]() {
-            Vector<double, 3> local{static_cast<double>(i), 0.0, 0.0};
-            results[static_cast<std::size_t>(i)] = local * 2.0;
-        });
-    }
-
-    for (auto& t : threads) {
-        t.join();
-    }
-
-    // Check each thread computed correctly
-    for (int i = 0; i < 10; ++i) {
-        EXPECT_EQ(results[static_cast<std::size_t>(i)][0], 2.0 * i);
-    }
-}
-
-// =============================================================================
-// Memory Alignment Tests
-// =============================================================================
-
-TEST_F(VectorTest, MemoryAlignment) {
-    Vector<double, 3> v;
-
-    // Check that data is properly aligned for SIMD
-    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(v.data());
-    EXPECT_EQ(addr % 32, 0) << "Vector data should be 32-byte aligned for AVX";
-}
-
-// =============================================================================
-// Utility Function Tests
-// =============================================================================
-
-TEST_F(VectorTest, MinMaxElements) {
-    Vector<double, 5> v{3.0, -1.0, 4.0, 1.0, -2.0};
-
-    EXPECT_EQ(v.min(), -2.0);
-    EXPECT_EQ(v.max(), 4.0);
-    EXPECT_EQ(v.min_index(), 4);
-    EXPECT_EQ(v.max_index(), 2);
-}
-
-TEST_F(VectorTest, Sum) {
-    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
-    EXPECT_EQ(v.sum(), 10.0);
-
-    Vector<double, 3> zero{0.0, 0.0, 0.0};
-    EXPECT_EQ(zero.sum(), 0.0);
-}
-
-TEST_F(VectorTest, Mean) {
-    Vector<double, 4> v{1.0, 2.0, 3.0, 4.0};
-    EXPECT_EQ(v.mean(), 2.5);
-}
-
-TEST_F(VectorTest, ToString) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-    std::stringstream ss;
-    ss << v;
-
-    std::string expected = "[1, 2, 3]";
-    EXPECT_EQ(ss.str(), expected);
-}
diff --git a/tests/unitTests/FE/Math/test_VectorExpr.cpp b/tests/unitTests/FE/Math/test_VectorExpr.cpp
deleted file mode 100644
index 0e7363c64..000000000
--- a/tests/unitTests/FE/Math/test_VectorExpr.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/**
- * @file test_VectorExpr.cpp
- * @brief Unit tests for VectorExpr.h - vector expression templates
- */
-
-#include <gtest/gtest.h>
-#include "FE/Math/Vector.h"
-#include "FE/Math/VectorExpr.h"
-#include <limits>
-#include <cmath>
-#include <memory>
-#include <atomic>
-#include <type_traits>
-
-using namespace svmp::FE::math;
-
-// Test fixture for VectorExpr tests
-class VectorExprTest : public ::testing::Test {
-protected:
-    static constexpr double tolerance = 1e-14;
-
-    // Custom allocator to track memory allocations
-    template<typename T>
-    class TrackingAllocator {
-    public:
-        using value_type = T;
-
-        static std::atomic<size_t> allocations;
-        static std::atomic<size_t> deallocations;
-        static std::atomic<size_t> bytes_allocated;
-
-        TrackingAllocator() = default;
-
-        template<typename U>
-        TrackingAllocator(const TrackingAllocator<U>&) {}
-
-        T* allocate(size_t n) {
-            allocations.fetch_add(1);
-            bytes_allocated.fetch_add(n * sizeof(T));
-            return static_cast<T*>(::operator new(n * sizeof(T)));
-        }
-
-        void deallocate(T* p, size_t n) {
-            deallocations.fetch_add(1);
-            ::operator delete(p);
-        }
-
-        static void reset() {
-            allocations = 0;
-            deallocations = 0;
-            bytes_allocated = 0;
-        }
-    };
-
-    void SetUp() override {
-        TrackingAllocator<double>::reset();
-    }
-
-    void TearDown() override {}
-
-    template<typename T>
-    bool approx_equal(T a, T b, T tol = tolerance) {
-        return std::abs(a - b) <= tol;
-    }
-};
-
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::allocations{0};
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::deallocations{0};
-template<typename T>
-std::atomic<size_t> VectorExprTest::TrackingAllocator<T>::bytes_allocated{0};
-
-// =============================================================================
-// Lazy Evaluation Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, LazyEvaluationNoTemporaries) {
-    // Expression templates should not create temporary vectors
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-
-    // Build expression without evaluation
-    auto expr = a + b - c;
-
-    // Expression type should not be Vector, but an expression type
-    using ExprType = decltype(expr);
-    EXPECT_FALSE((std::is_same_v<ExprType, Vector<double, 3>>));
-
-    // Now evaluate
-    Vector<double, 3> result = expr;
-    EXPECT_DOUBLE_EQ(result[0], -2.0);
-    EXPECT_DOUBLE_EQ(result[1], -1.0);
-    EXPECT_DOUBLE_EQ(result[2], 0.0);
-}
-
-TEST_F(VectorExprTest, LazyEvaluationAccessPattern) {
-    Vector<double, 4> a{1.0, 2.0, 3.0, 4.0};
-    Vector<double, 4> b{5.0, 6.0, 7.0, 8.0};
-
-    auto expr = a + b;
-
-    // Access individual elements without full evaluation
-    EXPECT_DOUBLE_EQ(expr[0], 6.0);
-    EXPECT_DOUBLE_EQ(expr[2], 10.0);
-
-    // Size should be accessible
-    EXPECT_EQ(expr.size(), 4u);
-}
-
-// =============================================================================
-// Expression Chaining Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ChainedAdditionSubtraction) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{2.0, 3.0, 4.0};
-    Vector<double, 3> d{1.0, 1.0, 1.0};
-
-    // Chain multiple operations
-    Vector<double, 3> result = a + b - c + d;
-
-    EXPECT_DOUBLE_EQ(result[0], 4.0);
-    EXPECT_DOUBLE_EQ(result[1], 5.0);
-    EXPECT_DOUBLE_EQ(result[2], 6.0);
-}
-
-TEST_F(VectorExprTest, DeepExpressionNesting) {
-    Vector<double, 2> v1{1.0, 2.0};
-    Vector<double, 2> v2{3.0, 4.0};
-    Vector<double, 2> v3{5.0, 6.0};
-    Vector<double, 2> v4{7.0, 8.0};
-    Vector<double, 2> v5{9.0, 10.0};
-
-    // Deep nesting
-    Vector<double, 2> result = ((v1 + v2) - (v3 - v4)) + v5;
-
-    EXPECT_DOUBLE_EQ(result[0], 15.0);
-    EXPECT_DOUBLE_EQ(result[1], 18.0);
-}
-
-// =============================================================================
-// Mixed Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ScalarMultiplicationInExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    Vector<double, 3> result = 2.0 * (a + b) / 3.0;
-
-    EXPECT_TRUE(approx_equal(result[0], 10.0/3.0));
-    EXPECT_TRUE(approx_equal(result[1], 14.0/3.0));
-    EXPECT_TRUE(approx_equal(result[2], 6.0));
-}
-
-TEST_F(VectorExprTest, MixedScalarVectorOperations) {
-    Vector<double, 4> v{2.0, 4.0, 6.0, 8.0};
-
-    // Complex mixed expression
-    Vector<double, 4> result = 3.0 * v / 2.0 + v * 0.5 - 1.0 * v;
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 4.0);
-    EXPECT_DOUBLE_EQ(result[2], 6.0);
-    EXPECT_DOUBLE_EQ(result[3], 8.0);
-}
-
-// =============================================================================
-// Unary Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, NegationInExpression) {
-    Vector<double, 3> a{1.0, -2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, -6.0};
-
-    Vector<double, 3> result = -a + (-b);
-
-    EXPECT_DOUBLE_EQ(result[0], -5.0);
-    EXPECT_DOUBLE_EQ(result[1], -3.0);
-    EXPECT_DOUBLE_EQ(result[2], 3.0);
-}
-
-TEST_F(VectorExprTest, AbsoluteValueExpression) {
-    Vector<double, 4> v{-1.5, 2.3, -4.7, 0.0};
-
-    Vector<double, 4> result = abs(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 1.5);
-    EXPECT_DOUBLE_EQ(result[1], 2.3);
-    EXPECT_DOUBLE_EQ(result[2], 4.7);
-    EXPECT_DOUBLE_EQ(result[3], 0.0);
-}
-
-TEST_F(VectorExprTest, SqrtExpression) {
-    Vector<double, 3> v{4.0, 9.0, 16.0};
-
-    Vector<double, 3> result = sqrt(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 3.0);
-    EXPECT_DOUBLE_EQ(result[2], 4.0);
-}
-
-// =============================================================================
-// Element-wise Operations Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, HadamardProductExpression) {
-    Vector<double, 3> a{2.0, 3.0, 4.0};
-    Vector<double, 3> b{5.0, 6.0, 7.0};
-
-    Vector<double, 3> result = hadamard(a, b);
-
-    EXPECT_DOUBLE_EQ(result[0], 10.0);
-    EXPECT_DOUBLE_EQ(result[1], 18.0);
-    EXPECT_DOUBLE_EQ(result[2], 28.0);
-}
-
-TEST_F(VectorExprTest, HadamardDivisionExpression) {
-    Vector<double, 3> a{10.0, 18.0, 28.0};
-    Vector<double, 3> b{2.0, 3.0, 4.0};
-
-    Vector<double, 3> result = hadamard_div(a, b);
-
-    EXPECT_DOUBLE_EQ(result[0], 5.0);
-    EXPECT_DOUBLE_EQ(result[1], 6.0);
-    EXPECT_DOUBLE_EQ(result[2], 7.0);
-}
-
-// =============================================================================
-// Dot Product and Norm Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, DotProductOfExpressions) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{2.0, 2.0, 2.0};
-
-    // Dot product of expressions
-    double result = dot(a + b, c);
-
-    EXPECT_DOUBLE_EQ(result, 42.0);
-}
-
-TEST_F(VectorExprTest, NormOfExpression) {
-    Vector<double, 2> a{3.0, 0.0};
-    Vector<double, 2> b{0.0, 4.0};
-
-    double result = norm(a + b);
-
-    EXPECT_DOUBLE_EQ(result, 5.0);  // norm of (3,4) = 5
-}
-
-TEST_F(VectorExprTest, NormalizeExpression) {
-    Vector<double, 3> v{3.0, 0.0, 4.0};
-
-    Vector<double, 3> result = normalize(v);
-
-    EXPECT_DOUBLE_EQ(result[0], 0.6);
-    EXPECT_DOUBLE_EQ(result[1], 0.0);
-    EXPECT_DOUBLE_EQ(result[2], 0.8);
-}
-
-// =============================================================================
-// Type Deduction Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, TypeDeductionCorrectness) {
-    Vector<float, 3> vf{1.0f, 2.0f, 3.0f};
-    Vector<double, 3> vd{4.0, 5.0, 6.0};
-
-    // Mixed type operations should promote to higher precision
-    auto expr = vf + vf;  // float expression
-    using ExprType = decltype(expr[0]);
-    EXPECT_TRUE((std::is_same_v<ExprType, float>));
-
-    // Test that expression evaluates correctly
-    Vector<float, 3> result = expr;
-    EXPECT_FLOAT_EQ(result[0], 2.0f);
-    EXPECT_FLOAT_EQ(result[1], 4.0f);
-    EXPECT_FLOAT_EQ(result[2], 6.0f);
-}
-
-// =============================================================================
-// SFINAE and Compile-time Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SFINAEConstraints) {
-    // Test that VectorExpr operators only work with VectorExpr types
-    Vector<double, 3> v1{1.0, 2.0, 3.0};
-    Vector<double, 3> v2{4.0, 5.0, 6.0};
-
-    // This should compile
-    auto expr = v1 + v2;
-    Vector<double, 3> result = expr;
-
-    // Verify the constraint checking
-    EXPECT_TRUE((std::is_base_of_v<VectorExpr<Vector<double, 3>>, Vector<double, 3>>));
-}
-
-// =============================================================================
-// Aliasing and Self-Assignment Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SelfAssignmentWithExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-
-    // Self-assignment through expression
-    a = a + b;
-
-    EXPECT_DOUBLE_EQ(a[0], 5.0);
-    EXPECT_DOUBLE_EQ(a[1], 7.0);
-    EXPECT_DOUBLE_EQ(a[2], 9.0);
-}
-
-TEST_F(VectorExprTest, AliasingInExpression) {
-    Vector<double, 3> a{2.0, 3.0, 4.0};
-    Vector<double, 3> b{1.0, 1.0, 1.0};
-
-    // a appears on both sides
-    a = b + a;
-
-    EXPECT_DOUBLE_EQ(a[0], 3.0);
-    EXPECT_DOUBLE_EQ(a[1], 4.0);
-    EXPECT_DOUBLE_EQ(a[2], 5.0);
-}
-
-// =============================================================================
-// Edge Cases Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, SingleElementVector) {
-    Vector<double, 1> a{5.0};
-    Vector<double, 1> b{3.0};
-
-    Vector<double, 1> result = a + b - a * 0.5;
-
-    EXPECT_DOUBLE_EQ(result[0], 5.5);
-}
-
-TEST_F(VectorExprTest, EmptyExpression) {
-    Vector<double, 3> v{1.0, 2.0, 3.0};
-
-    // Expression that evaluates to identity
-    Vector<double, 3> result = v + v * 0.0;
-
-    EXPECT_DOUBLE_EQ(result[0], 1.0);
-    EXPECT_DOUBLE_EQ(result[1], 2.0);
-    EXPECT_DOUBLE_EQ(result[2], 3.0);
-}
-
-TEST_F(VectorExprTest, LargeVectorExpression) {
-    const size_t N = 100;
-    Vector<double, N> a, b, c;
-
-    for (size_t i = 0; i < N; ++i) {
-        a[i] = static_cast<double>(i);
-        b[i] = static_cast<double>(i * 2);
-        c[i] = static_cast<double>(i * 3);
-    }
-
-    Vector<double, N> result = a + b - c / 2.0;
-
-    for (size_t i = 0; i < N; ++i) {
-        EXPECT_DOUBLE_EQ(result[i], i + 2.0 * i - 1.5 * i);
-    }
-}
-
-// =============================================================================
-// Complex Expression Pattern Tests
-// =============================================================================
-
-TEST_F(VectorExprTest, ComplexNestedExpression) {
-    Vector<double, 3> a{1.0, 2.0, 3.0};
-    Vector<double, 3> b{4.0, 5.0, 6.0};
-    Vector<double, 3> c{7.0, 8.0, 9.0};
-
-    // Complex expression with multiple operation types
-    Vector<double, 3> result = 2.0 * abs(a - b) + sqrt(hadamard(c, c)) / 3.0;
-
-    // Verify each component
-    // |a - b| = |(-3, -3, -3)| = (3, 3, 3)
-    // 2 * (3, 3, 3) = (6, 6, 6)
-    // c * c = (49, 64, 81)
-    // sqrt(c * c) = (7, 8, 9)
-    // sqrt(c * c) / 3 = (7/3, 8/3, 3)
-    // result = (6 + 7/3, 6 + 8/3, 6 + 3) = (25/3, 26/3, 9)
-
-    EXPECT_TRUE(approx_equal(result[0], 25.0/3.0));
-    EXPECT_TRUE(approx_equal(result[1], 26.0/3.0));
-    EXPECT_DOUBLE_EQ(result[2], 9.0);
-}
-
-TEST_F(VectorExprTest, ChainedUnaryOperations) {
-    Vector<double, 4> v{-4.0, -9.0, -16.0, -25.0};
-
-    // Chain of unary operations
-    Vector<double, 4> result = sqrt(abs(-v));
-
-    EXPECT_DOUBLE_EQ(result[0], 2.0);
-    EXPECT_DOUBLE_EQ(result[1], 3.0);
-    EXPECT_DOUBLE_EQ(result[2], 4.0);
-    EXPECT_DOUBLE_EQ(result[3], 5.0);
-}

From 82a1158eceeb4ad5c09591f9139bc29cea2e5e55 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 18:21:37 -0700
Subject: [PATCH 14/22] adding doxygen to Common submodule

---
 Code/Source/solver/FE/Common/FEException.h | 189 +++++++++++++++
 Code/Source/solver/FE/Common/Types.h       | 253 ++++++++++++++-------
 2 files changed, 354 insertions(+), 88 deletions(-)

diff --git a/Code/Source/solver/FE/Common/FEException.h b/Code/Source/solver/FE/Common/FEException.h
index 67b7da234..033b85eb1 100644
--- a/Code/Source/solver/FE/Common/FEException.h
+++ b/Code/Source/solver/FE/Common/FEException.h
@@ -22,8 +22,34 @@
 namespace svmp {
 namespace FE {
 
+/// \defgroup FE_CommonExceptions Exceptions
+/// \ingroup FE_Common
+/// \brief FE exception hierarchy and throw/check helper functions.
+///
+/// \details All FE-specific exceptions derive from FEException, which itself
+/// derives from the shared solver ExceptionBase. Specialized subclasses carry
+/// structured context (element type, DOF index, backend name and error code,
+/// iteration counts, Jacobian determinants) so call sites can report
+/// actionable diagnostics. The free helpers raise(), throw_if(), check_arg(),
+/// check_not_null(), check_index(), and not_implemented() wrap common throw
+/// and validation patterns, recording the caller-supplied source location.
+/// @{
+
+/**
+ * @brief Base exception type for errors originating in the FE library
+ *
+ * Carries a status code and source location alongside the message. Derived
+ * classes select an appropriate StatusCode and may attach additional
+ * structured context.
+ */
 class FEException : public ExceptionBase {
 public:
+    /// @brief Construct with a message and optional status code and source location.
+    /// @param message Human-readable error description.
+    /// @param status Status code classifying the failure.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     FEException(const std::string& message,
                 StatusCode status = StatusCode::Unknown,
                 const char* file = "",
@@ -38,6 +64,11 @@ class FEException : public ExceptionBase {
     {
     }
 
+    /// @brief Construct with a message and source location, using an Unknown status.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     FEException(const std::string& message,
                 const char* file,
                 int line,
@@ -46,11 +77,21 @@ class FEException : public ExceptionBase {
     {
     }
 
+    /// @brief Status code classifying the failure.
+    /// @return The status code recorded at construction.
     StatusCode status() const noexcept { return status_code(); }
 };
 
+/**
+ * @brief An argument failed validation
+ */
 class InvalidArgumentException : public FEException {
 public:
+    /// @brief Construct with a message and optional source location.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     InvalidArgumentException(const std::string& message,
                              const char* file = "",
                              int line = 0,
@@ -61,8 +102,19 @@ class InvalidArgumentException : public FEException {
     }
 };
 
+/**
+ * @brief Unsupported or malformed element request
+ *
+ * Records the offending element type so error reports can name it.
+ */
 class InvalidElementException : public FEException {
 public:
+    /// @brief Construct with a message and optional element-type context.
+    /// @param message Human-readable error description.
+    /// @param element_type Name of the offending element type; appended to the message when non-empty.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     InvalidElementException(const std::string& message,
                             std::string element_type = "",
                             const char* file = "",
@@ -77,6 +129,8 @@ class InvalidElementException : public FEException {
     {
     }
 
+    /// @brief Name of the offending element type.
+    /// @return Element-type name; empty when not provided.
     const std::string& element_type() const noexcept { return element_type_; }
 
 private:
@@ -93,8 +147,19 @@ class InvalidElementException : public FEException {
     std::string element_type_;
 };
 
+/**
+ * @brief Degree-of-freedom numbering or lookup failure
+ *
+ * Records the offending DOF index so error reports can name it.
+ */
 class DofException : public FEException {
 public:
+    /// @brief Construct with a message and optional DOF-index context.
+    /// @param message Human-readable error description.
+    /// @param dof_index Offending DOF index; appended to the message unless it equals invalid_dof_index().
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     DofException(const std::string& message,
                  long long dof_index = invalid_dof_index(),
                  const char* file = "",
@@ -109,7 +174,11 @@ class DofException : public FEException {
     {
     }
 
+    /// @brief Offending DOF index.
+    /// @return DOF index; invalid_dof_index() when not provided.
     long long dof_index() const noexcept { return dof_index_; }
+    /// @brief Sentinel meaning "no DOF index attached".
+    /// @return The sentinel value -1.
     static constexpr long long invalid_dof_index() noexcept { return -1; }
 
 private:
@@ -126,8 +195,16 @@ class DofException : public FEException {
     long long dof_index_ = invalid_dof_index();
 };
 
+/**
+ * @brief Global assembly failure
+ */
 class AssemblyException : public FEException {
 public:
+    /// @brief Construct with a message and optional source location.
+    /// @param message Human-readable error description.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     AssemblyException(const std::string& message,
                       const char* file = "",
                       int line = 0,
@@ -137,8 +214,21 @@ class AssemblyException : public FEException {
     }
 };
 
+/**
+ * @brief Failure reported by a linear-algebra or solver backend
+ *
+ * Records the backend name and its native error code so error reports can
+ * identify the failing dependency.
+ */
 class BackendException : public FEException {
 public:
+    /// @brief Construct with a message and optional backend context.
+    /// @param message Human-readable error description.
+    /// @param backend_name Name of the failing backend; appended to the message when non-empty.
+    /// @param error_code Backend-native error code; appended to the message when nonzero.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     BackendException(const std::string& message,
                      std::string backend_name = "",
                      int error_code = 0,
@@ -155,7 +245,11 @@ class BackendException : public FEException {
     {
     }
 
+    /// @brief Name of the failing backend.
+    /// @return Backend name; empty when not provided.
     const std::string& backend_name() const noexcept { return backend_name_; }
+    /// @brief Backend-native error code.
+    /// @return Error code; zero when not provided.
     int error_code() const noexcept { return error_code_; }
 
 private:
@@ -185,8 +279,16 @@ class BackendException : public FEException {
     int error_code_ = 0;
 };
 
+/**
+ * @brief Requested feature is not implemented
+ */
 class NotImplementedException : public FEException {
 public:
+    /// @brief Construct from the name of the missing feature.
+    /// @param feature Description of the unimplemented feature.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     NotImplementedException(const std::string& feature,
                             const char* file = "",
                             int line = 0,
@@ -200,8 +302,16 @@ class NotImplementedException : public FEException {
     }
 };
 
+/**
+ * @brief Required initialization step has not been performed
+ */
 class NotInitializedException : public FEException {
 public:
+  /// @brief Construct from the name of the uninitialized feature.
+  /// @param feature Description of the missing initialization.
+  /// @param file Source file where the error was raised.
+  /// @param line Source line where the error was raised.
+  /// @param function Function where the error was raised.
   NotInitializedException(const std::string &feature,
                           const char *file,
                           int line = 0,
@@ -215,8 +325,21 @@ class NotInitializedException : public FEException {
   }
 };
 
+/**
+ * @brief Iterative process failed to converge
+ *
+ * Records the iteration count and final residual so error reports can show
+ * how far the iteration progressed.
+ */
 class ConvergenceException : public FEException {
 public:
+    /// @brief Construct with a message and optional iteration context.
+    /// @param message Human-readable error description.
+    /// @param iteration Iteration at which the failure was detected; appended to the message when non-negative.
+    /// @param residual Final residual; appended to the message when positive.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     ConvergenceException(const std::string& message,
                          int iteration = -1,
                          double residual = 0.0,
@@ -233,7 +356,11 @@ class ConvergenceException : public FEException {
     {
     }
 
+    /// @brief Iteration at which the failure was detected.
+    /// @return Iteration count; -1 when not provided.
     int iteration() const noexcept { return iteration_; }
+    /// @brief Final residual value.
+    /// @return Residual; 0.0 when not provided.
     double residual() const noexcept { return residual_; }
 
 private:
@@ -257,8 +384,20 @@ class ConvergenceException : public FEException {
     double residual_ = 0.0;
 };
 
+/**
+ * @brief Element geometric mapping is singular or inverted
+ *
+ * Records the offending Jacobian determinant so error reports can show the
+ * degeneracy.
+ */
 class SingularMappingException : public FEException {
 public:
+    /// @brief Construct with a message and the offending Jacobian determinant.
+    /// @param message Human-readable error description.
+    /// @param jacobian_det Jacobian determinant at the failure point; appended to the message.
+    /// @param file Source file where the error was raised.
+    /// @param line Source line where the error was raised.
+    /// @param function Function where the error was raised.
     SingularMappingException(const std::string& message,
                              double jacobian_det = 0.0,
                              const char* file = "",
@@ -273,6 +412,8 @@ class SingularMappingException : public FEException {
     {
     }
 
+    /// @brief Jacobian determinant at the failure point.
+    /// @return The determinant recorded at construction.
     double jacobian_det() const noexcept { return jacobian_det_; }
 
 private:
@@ -285,12 +426,27 @@ class SingularMappingException : public FEException {
     double jacobian_det_ = 0.0;
 };
 
+/**
+ * @brief Throw an FE exception with source-location capture
+ * @tparam ExceptionT Exception type to throw.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT, class... Args>
 [[noreturn]] inline void raise(SourceLocation location, Args&&... args)
 {
     ::svmp::raise<ExceptionT>(location, std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Throw an FE exception when a condition holds
+ * @tparam ExceptionT Exception type to throw; defaults to FEException.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param condition Condition that triggers the throw when true.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = FEException, class... Args>
 inline void throw_if(bool condition, SourceLocation location, Args&&... args)
 {
@@ -299,6 +455,14 @@ inline void throw_if(bool condition, SourceLocation location, Args&&... args)
     }
 }
 
+/**
+ * @brief Validate an argument condition, throwing when it fails
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param condition Condition that must hold for the argument to be valid.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = InvalidArgumentException, class... Args>
 inline void check_arg(bool condition, SourceLocation location, Args&&... args)
 {
@@ -306,6 +470,15 @@ inline void check_arg(bool condition, SourceLocation location, Args&&... args)
                                   std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Validate that a pointer is non-null, throwing when it is null
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam PointerT Pointer-like type being checked.
+ * @tparam Args Constructor argument types forwarded to the exception.
+ * @param ptr Pointer to validate.
+ * @param location Source location to record in the exception.
+ * @param args Arguments forwarded to the exception constructor.
+ */
 template <class ExceptionT = InvalidArgumentException, class PointerT,
           class... Args>
 inline void check_not_null(PointerT ptr, SourceLocation location,
@@ -314,6 +487,15 @@ inline void check_not_null(PointerT ptr, SourceLocation location,
     ::svmp::check_not_null<ExceptionT>(ptr, location, std::forward<Args>(args)...);
 }
 
+/**
+ * @brief Validate that an index lies in [0, size), throwing when out of bounds
+ * @tparam ExceptionT Exception type to throw; defaults to InvalidArgumentException.
+ * @tparam IndexT Integral index type.
+ * @tparam SizeT Integral size type.
+ * @param index Index to validate.
+ * @param size Exclusive upper bound for the index.
+ * @param location Source location to record in the exception.
+ */
 template <class ExceptionT = InvalidArgumentException, class IndexT,
           class SizeT>
 inline void check_index(IndexT index, SizeT size, SourceLocation location)
@@ -329,12 +511,19 @@ inline void check_index(IndexT index, SizeT size, SourceLocation location)
             " out of bounds [0, " + std::to_string(fe_check_size_value) + ")");
 }
 
+/**
+ * @brief Throw NotImplementedException for a missing feature
+ * @param feature Description of the unimplemented feature.
+ * @param location Source location to record in the exception.
+ */
 [[noreturn]] inline void not_implemented(const std::string& feature,
                                          SourceLocation location)
 {
     ::svmp::FE::raise<NotImplementedException>(location, feature);
 }
 
+/// @}
+
 } // namespace FE
 } // namespace svmp
 
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index e3d5a46e9..1f57ffcc5 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -16,18 +16,26 @@
 
 #if defined(SVMP_FE_WITH_MESH) && SVMP_FE_WITH_MESH
 #  include "Mesh/Core/MeshTypes.h"
+/// Nonzero when FE shares scalar/index types with the Mesh library.
 #  define SVMP_FE_HAS_MESH_TYPES 1
 #else
 // Build FE without Mesh types unless explicitly enabled.
+/// Nonzero when FE shares scalar/index types with the Mesh library.
 #  define SVMP_FE_HAS_MESH_TYPES 0
 #endif
 
 #if !SVMP_FE_HAS_MESH_TYPES
 namespace svmp {
-// Minimal fallback when the Mesh library is not available.
-// Keeps FE compilation self-contained while preserving the same namespace.
 #ifndef SVMP_CELL_FAMILY_DEFINED
+/// Guard marking that svmp::CellFamily has been defined.
 #define SVMP_CELL_FAMILY_DEFINED 1
+/**
+ * @brief Minimal fallback for svmp::CellFamily when the Mesh library is unavailable
+ * @ingroup FE_CommonTypes
+ *
+ * Keeps FE compilation self-contained while preserving the same namespace
+ * and enumerator set as the Mesh library's cell-family classification.
+ */
 enum class CellFamily {
     Point,
     Line,
@@ -51,16 +59,40 @@ enum class CellFamily {
 #include <limits>
 
 #if defined(_MSC_VER)
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT __restrict
 #elif defined(__clang__) || defined(__GNUC__)
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT __restrict__
 #else
+/// Portable restrict qualifier for aliasing-free pointer parameters.
 #  define SVMP_RESTRICT
 #endif
 
+/// \defgroup FE_Common Common
+/// \ingroup FE
+/// \brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
+///
+/// \details The Common module collects the foundational definitions that the
+/// rest of the FE library builds on: index and scalar type aliases; element,
+/// basis, quadrature, and field enumerations; sentinel constants and strong
+/// type wrappers; and the FE exception hierarchy together with its
+/// argument-checking helpers.
+
 namespace svmp {
 namespace FE {
 
+/// \defgroup FE_CommonTypes Types
+/// \ingroup FE_Common
+/// \brief Core type aliases, enumerations, constants, geometric types, and compile-time traits.
+///
+/// \details This group documents the index and identifier types used for
+/// element-local and global numbering, the element/basis/quadrature/field
+/// enumerations shared across modules, sentinel constants, reference- and
+/// physical-space geometric aliases, and the strong-type utilities that
+/// prevent accidental mixing of conceptually distinct values.
+/// @{
+
 // ============================================================================
 // Index Types
 // ============================================================================
@@ -88,10 +120,16 @@ using GlobalIndex = std::int64_t;
  * Provides type safety at compile time.
  */
 struct DofIndex {
-    GlobalIndex value;
+    GlobalIndex value;  ///< Underlying global DOF index; negative values are invalid.
 
+    /// @brief Construct a DOF index, defaulting to the invalid sentinel.
+    /// @param v Global DOF index value.
     constexpr explicit DofIndex(GlobalIndex v = -1) noexcept : value(v) {}
+    /// @brief Convert to the underlying global index value.
+    /// @return The stored global index.
     constexpr operator GlobalIndex() const noexcept { return value; }
+    /// @brief Check whether this index refers to a valid DOF.
+    /// @return True when the stored value is non-negative.
     constexpr bool is_valid() const noexcept { return value >= 0; }
 };
 
@@ -109,28 +147,32 @@ using BlockId = std::uint16_t;
 
 // Import mesh library scalar/index types when available (optional dependency).
 #if SVMP_FE_HAS_MESH_TYPES
-using MeshIndex = svmp::index_t;
-using MeshOffset = svmp::offset_t;
-using MeshGlobalId = svmp::gid_t;
-using Real = svmp::real_t;  // Use same precision as Mesh library
+using MeshIndex = svmp::index_t;        ///< Local mesh entity index, shared with the Mesh library.
+using MeshOffset = svmp::offset_t;      ///< Offset type for mesh connectivity arrays.
+using MeshGlobalId = svmp::gid_t;       ///< Global mesh entity identifier.
+using Real = svmp::real_t;              ///< Floating-point scalar type; same precision as the Mesh library.
 #else
-using MeshIndex = std::int32_t;
-using MeshOffset = std::int64_t;
-using MeshGlobalId = std::int64_t;
-using Real = double;
+using MeshIndex = std::int32_t;         ///< Local mesh entity index (fallback used when Mesh library types are unavailable).
+using MeshOffset = std::int64_t;        ///< Offset type for mesh connectivity arrays.
+using MeshGlobalId = std::int64_t;      ///< Global mesh entity identifier.
+using Real = double;                    ///< Floating-point scalar type (double-precision fallback when Mesh library types are unavailable).
 #endif
 
 // ============================================================================
 // Constants
 // ============================================================================
 
+/// Sentinel for an unset or out-of-range local index.
 constexpr LocalIndex INVALID_LOCAL_INDEX = std::numeric_limits<LocalIndex>::max();
+/// Sentinel for an unset or out-of-range global index.
 constexpr GlobalIndex INVALID_GLOBAL_INDEX = -1;
+/// Sentinel FieldId meaning "uninitialized / no field".
 constexpr FieldId INVALID_FIELD_ID = std::numeric_limits<FieldId>::max();
 /// Sentinel FieldId for geometry-only quantities (no DOF dependence).
 /// Uses first registered field's space for quadrature, but logically decoupled
 /// from any specific field's DOFs.
 constexpr FieldId GEOMETRY_FIELD_ID = std::numeric_limits<FieldId>::max() - 1;
+/// Sentinel for an unset or out-of-range block identifier.
 constexpr BlockId INVALID_BLOCK_ID = std::numeric_limits<BlockId>::max();
 
 /**
@@ -169,9 +211,9 @@ constexpr int MAX_FIELD_VALUE_COMPONENTS = 9;
  * Node-scoped auxiliary models with Lagrange Kronecker delta).
  */
 struct FieldValueEntry {
-    FieldId field{INVALID_FIELD_ID};
-    int n_components{0};
-    Real components[MAX_FIELD_VALUE_COMPONENTS]{};
+    FieldId field{INVALID_FIELD_ID};                  ///< Field this value belongs to.
+    int n_components{0};                              ///< Number of valid entries in components.
+    Real components[MAX_FIELD_VALUE_COMPONENTS]{};    ///< Component values, row-major for tensors.
 };
 
 // ============================================================================
@@ -186,115 +228,115 @@ struct FieldValueEntry {
  */
 enum class ElementType : std::uint8_t {
     // Linear elements
-    Line2      = 0,   // 2-node line
-    Triangle3  = 1,   // 3-node triangle
-    Quad4      = 2,   // 4-node quadrilateral
-    Tetra4     = 3,   // 4-node tetrahedron
-    Hex8       = 4,   // 8-node hexahedron
-    Wedge6     = 5,   // 6-node wedge/prism
-    Pyramid5   = 6,   // 5-node pyramid
+    Line2      = 0,   ///< 2-node line
+    Triangle3  = 1,   ///< 3-node triangle
+    Quad4      = 2,   ///< 4-node quadrilateral
+    Tetra4     = 3,   ///< 4-node tetrahedron
+    Hex8       = 4,   ///< 8-node hexahedron
+    Wedge6     = 5,   ///< 6-node wedge/prism
+    Pyramid5   = 6,   ///< 5-node pyramid
 
     // Quadratic elements
-    Line3      = 10,  // 3-node line
-    Triangle6  = 11,  // 6-node triangle
-    Quad9      = 12,  // 9-node quadrilateral (bi-quadratic)
-    Quad8      = 13,  // 8-node quadrilateral (serendipity)
-    Tetra10    = 14,  // 10-node tetrahedron
-    Hex27      = 15,  // 27-node hexahedron (tri-quadratic)
-    Hex20      = 16,  // 20-node hexahedron (serendipity)
-    Wedge15    = 17,  // 15-node wedge
-    Wedge18    = 18,  // 18-node wedge (complete quadratic)
-    Pyramid13  = 19,  // 13-node pyramid
-    Pyramid14  = 20,  // 14-node pyramid
+    Line3      = 10,  ///< 3-node line
+    Triangle6  = 11,  ///< 6-node triangle
+    Quad9      = 12,  ///< 9-node quadrilateral (bi-quadratic)
+    Quad8      = 13,  ///< 8-node quadrilateral (serendipity)
+    Tetra10    = 14,  ///< 10-node tetrahedron
+    Hex27      = 15,  ///< 27-node hexahedron (tri-quadratic)
+    Hex20      = 16,  ///< 20-node hexahedron (serendipity)
+    Wedge15    = 17,  ///< 15-node wedge
+    Wedge18    = 18,  ///< 18-node wedge (complete quadratic)
+    Pyramid13  = 19,  ///< 13-node pyramid
+    Pyramid14  = 20,  ///< 14-node pyramid
 
     // Special elements
-    Point1     = 30,  // 1-node point element
+    Point1     = 30,  ///< 1-node point element
 
-    Unknown    = 255
+    Unknown    = 255  ///< Unrecognized or uninitialized element type
 };
 
 /**
  * @brief Quadrature rule types
  */
 enum class QuadratureType : std::uint8_t {
-    GaussLegendre,     // Standard Gaussian quadrature
-    GaussLobatto,      // Includes endpoints (for spectral elements)
-    Newton,            // Newton-Cotes rules
-    Reduced,           // Order-based reduced integration for locking
-    PositionBased,     // Position-based reduced integration (legacy compatible)
-    Composite,         // Composite rules for adaptivity
-    Custom             // User-defined quadrature points
+    GaussLegendre,     ///< Standard Gaussian quadrature
+    GaussLobatto,      ///< Includes endpoints (for spectral elements)
+    Newton,            ///< Newton-Cotes rules
+    Reduced,           ///< Order-based reduced integration to mitigate locking
+    PositionBased,     ///< Position-based reduced integration (legacy compatible)
+    Composite,         ///< Composite rules for adaptivity
+    Custom             ///< User-defined quadrature points
 };
 
 /**
  * @brief Basis function families
  */
 enum class BasisType : std::uint8_t {
-    Lagrange,          // Standard nodal Lagrange basis
-    Hierarchical,      // Hierarchical/modal basis
-    Bernstein,         // Bernstein polynomials
-    NURBS,             // Non-uniform rational B-splines
-    BSpline,           // Non-rational B-spline basis
-    Spectral,          // Spectral element basis
-    Serendipity,       // Serendipity elements
-    Hermite,           // Hermite C1 continuity basis
-    RaviartThomas,     // H(div) Raviart-Thomas family
-    Nedelec,           // H(curl) Nedelec edge elements
-    BDM,               // H(div) Brezzi-Douglas-Marini family
-    Bubble,            // Interior bubble functions for enrichment
-    Custom             // User-defined basis
+    Lagrange,          ///< Standard nodal Lagrange basis
+    Hierarchical,      ///< Hierarchical/modal basis
+    Bernstein,         ///< Bernstein polynomials
+    NURBS,             ///< Non-uniform rational B-splines
+    BSpline,           ///< Non-rational B-spline basis
+    Spectral,          ///< Spectral element basis
+    Serendipity,       ///< Serendipity elements
+    Hermite,           ///< Hermite C1 continuity basis
+    RaviartThomas,     ///< H(div) Raviart-Thomas family
+    Nedelec,           ///< H(curl) Nedelec edge elements
+    BDM,               ///< H(div) Brezzi-Douglas-Marini family
+    Bubble,            ///< Interior bubble functions for enrichment
+    Custom             ///< User-defined basis
 };
 
 /**
  * @brief Field types for function spaces
  */
 enum class FieldType : std::uint8_t {
-    Scalar,            // Scalar field (temperature, pressure)
-    Vector,            // Vector field (velocity, displacement)
-    Tensor,            // Tensor field (stress, strain)
-    SymmetricTensor,   // Symmetric tensor field
-    Mixed              // Mixed/composite field
+    Scalar,            ///< Scalar field (temperature, pressure)
+    Vector,            ///< Vector field (velocity, displacement)
+    Tensor,            ///< Tensor field (stress, strain)
+    SymmetricTensor,   ///< Symmetric tensor field
+    Mixed              ///< Mixed/composite field
 };
 
 /**
  * @brief Continuity requirements for function spaces
  */
 enum class Continuity : std::uint8_t {
-    C0,                // Continuous (standard FEM)
-    C1,                // C1 continuous (for plates/shells)
-    L2,                // L2 (discontinuous)
-    H_div,             // H(div) conforming
-    H_curl,            // H(curl) conforming
-    Custom
+    C0,                ///< Continuous (standard FEM)
+    C1,                ///< C1 continuous (for plates/shells)
+    L2,                ///< L2 (discontinuous)
+    H_div,             ///< H(div) conforming
+    H_curl,            ///< H(curl) conforming
+    Custom             ///< User-defined continuity requirement
 };
 
 /**
  * @brief Assembly strategies
  */
 enum class AssemblyStrategy : std::uint8_t {
-    ElementByElement,  // Traditional element loop
-    Vectorized,        // SIMD vectorized assembly
-    MatrixFree,        // Matrix-free operators
-    Hybrid             // Mixed strategy
+    ElementByElement,  ///< Traditional element loop
+    Vectorized,        ///< SIMD vectorized assembly
+    MatrixFree,        ///< Matrix-free operators
+    Hybrid             ///< Mixed strategy
 };
 
 /**
  * @brief Status codes for FE operations
  */
 enum class FEStatus : std::uint8_t {
-    Success           = 0,
-    InvalidArgument   = 1,
-    InvalidElement    = 2,
-    SingularMapping   = 3,
-    QuadratureError   = 4,
-    AssemblyError     = 5,
-    BackendError      = 6,
-    NotImplemented    = 7,
-    ConvergenceError  = 8,
-    AllocationError   = 9,
-    MPIError          = 10,
-    IOError           = 11,
-    Unknown           = 255
+    Success           = 0,    ///< Operation completed successfully
+    InvalidArgument   = 1,    ///< An argument failed validation
+    InvalidElement    = 2,    ///< Unsupported or malformed element
+    SingularMapping   = 3,    ///< Element mapping Jacobian is singular
+    QuadratureError   = 4,    ///< Quadrature rule construction or evaluation failed
+    AssemblyError     = 5,    ///< Global assembly failure
+    BackendError      = 6,    ///< Linear-algebra backend failure
+    NotImplemented    = 7,    ///< Requested feature is not implemented
+    ConvergenceError  = 8,    ///< Iterative process failed to converge
+    AllocationError   = 9,    ///< Memory allocation failure
+    MPIError          = 10,   ///< MPI communication failure
+    IOError           = 11,   ///< File or stream I/O failure
+    Unknown           = 255   ///< Unclassified error
 };
 
 // ============================================================================
@@ -303,6 +345,7 @@ enum class FEStatus : std::uint8_t {
 
 /**
  * @brief Point in reference element coordinates
+ * @tparam Dim Reference-space dimension
  */
 template<int Dim>
 using ReferencePoint = std::array<Real, static_cast<std::size_t>(Dim)>;
@@ -314,6 +357,8 @@ using PhysicalPoint = std::array<Real, 3>;
 
 /**
  * @brief Jacobian matrix type
+ * @tparam SpatialDim Physical-space dimension (rows)
+ * @tparam ReferenceDim Reference-space dimension (columns)
  */
 template<int SpatialDim, int ReferenceDim = SpatialDim>
 using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceDim)>, static_cast<std::size_t>(SpatialDim)>;
@@ -327,31 +372,51 @@ using Jacobian = std::array<std::array<Real, static_cast<std::size_t>(ReferenceD
  *
  * Prevents accidental mixing of conceptually different types that have
  * the same underlying representation.
+ *
+ * @tparam T Underlying value type
+ * @tparam Tag Empty tag type that distinguishes otherwise identical wrappers
  */
 template<typename T, typename Tag>
 class StrongType {
 public:
+    /// @brief Underlying value type.
     using ValueType = T;
 
+    /// @brief Value-initialize the wrapped value.
     constexpr StrongType() noexcept(std::is_nothrow_default_constructible_v<T>)
         : value_{} {}
 
+    /// @brief Wrap an explicit value.
+    /// @param value Value to store.
     constexpr explicit StrongType(T value) noexcept(std::is_nothrow_move_constructible_v<T>)
         : value_(std::move(value)) {}
 
+    /// @brief Access the wrapped value.
+    /// @return Reference to the wrapped value.
     constexpr T& get() noexcept { return value_; }
+    /// @brief Access the wrapped value without modifying it (const overload).
+    /// @return Const reference to the wrapped value.
     constexpr const T& get() const noexcept { return value_; }
 
-    // Explicit conversion
+    /// @brief Explicitly convert back to the underlying type.
+    /// @return Copy of the wrapped value.
     constexpr explicit operator T() const noexcept { return value_; }
 
-    // Comparison operators
+    /// @brief Compare wrapped values for equality.
+    /// @param other Wrapper to compare against.
+    /// @return True when the wrapped values are equal.
     constexpr bool operator==(const StrongType& other) const noexcept {
         return value_ == other.value_;
     }
+    /// @brief Compare wrapped values for inequality.
+    /// @param other Wrapper to compare against.
+    /// @return True when the wrapped values differ.
     constexpr bool operator!=(const StrongType& other) const noexcept {
         return value_ != other.value_;
     }
+    /// @brief Order by wrapped value.
+    /// @param other Wrapper to compare against.
+    /// @return True when this wrapped value orders before the other.
     constexpr bool operator<(const StrongType& other) const noexcept {
         return value_ < other.value_;
     }
@@ -361,12 +426,14 @@ class StrongType {
 };
 
 // Specific strong types for common use cases
-struct QuadraturePointTag {};
-struct QuadratureWeightTag {};
-struct BasisValueTag {};
-struct BasisGradientTag {};
+struct QuadraturePointTag {};   ///< Tag type for quadrature-point indices.
+struct QuadratureWeightTag {};  ///< Tag type for quadrature weights.
+struct BasisValueTag {};        ///< Tag type for basis-function values.
+struct BasisGradientTag {};     ///< Tag type for basis-function gradients.
 
+/// Type-safe index of a quadrature point within a rule.
 using QuadraturePointIndex = StrongType<LocalIndex, QuadraturePointTag>;
+/// Type-safe quadrature weight value.
 using QuadratureWeight = StrongType<Real, QuadratureWeightTag>;
 
 // ============================================================================
@@ -388,6 +455,7 @@ struct is_index_type<GlobalIndex> : std::true_type {};
 template<>
 struct is_index_type<DofIndex> : std::true_type {};
 
+/// Convenience variable template for is_index_type.
 template<typename T>
 inline constexpr bool is_index_type_v = is_index_type<T>::value;
 
@@ -400,6 +468,7 @@ struct is_field_type : std::false_type {};
 template<>
 struct is_field_type<FieldType> : std::true_type {};
 
+/// Convenience variable template for is_field_type.
 template<typename T>
 inline constexpr bool is_field_type_v = is_field_type<T>::value;
 
@@ -409,6 +478,8 @@ inline constexpr bool is_field_type_v = is_field_type<T>::value;
 
 /**
  * @brief Convert FE ElementType to Mesh CellFamily
+ * @param elem Element type to classify.
+ * @return Cell family of the element's linear topology; Point for unknown types.
  */
 constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
     switch(elem) {
@@ -454,6 +525,8 @@ constexpr svmp::CellFamily to_mesh_family(ElementType elem) noexcept {
 
 /**
  * @brief Get spatial dimension of element type
+ * @param elem Element type to query.
+ * @return Reference dimension from 0 (point) to 3 (volume); -1 for unknown types.
  */
 constexpr int element_dimension(ElementType elem) noexcept {
     switch(elem) {
@@ -487,6 +560,8 @@ constexpr int element_dimension(ElementType elem) noexcept {
 
 /**
  * @brief Convert status code to string for error reporting
+ * @param status Status code to describe.
+ * @return Static human-readable description of the status.
  */
 inline const char* status_to_string(FEStatus status) noexcept {
     switch(status) {
@@ -506,6 +581,8 @@ inline const char* status_to_string(FEStatus status) noexcept {
     }
 }
 
+/// @}
+
 } // namespace FE
 } // namespace svmp
 

From 917c638668e816199f23018cb6920d0670fafb0a Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 19:06:10 -0700
Subject: [PATCH 15/22] aligning exception throws and raises with the
 function-template calls and using `SVMP_HERE` for file, line, and function
 source location information

---
 Code/Source/solver/FE/Basis/BasisExceptions.h | 40 ----------
 Code/Source/solver/FE/Basis/BasisFactory.cpp  | 38 ++++------
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  8 +-
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 46 +++++-------
 .../FE/Basis/NodeOrderingConventions.cpp      | 28 +++----
 .../solver/FE/Basis/SerendipityBasis.cpp      | 73 ++++++++-----------
 6 files changed, 78 insertions(+), 155 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisExceptions.h b/Code/Source/solver/FE/Basis/BasisExceptions.h
index c1af17049..8f8fd3c3c 100644
--- a/Code/Source/solver/FE/Basis/BasisExceptions.h
+++ b/Code/Source/solver/FE/Basis/BasisExceptions.h
@@ -83,46 +83,6 @@ class BasisConstructionException : public BasisException {
         : BasisException(message, file, line, function, StatusCode::InternalError) {}
 };
 
-#define BASIS_CHECK_CONFIG(condition, message)                                                 \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisConfigurationException((message),                    \
-                                                                  __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_COMPAT(condition, message)                                                 \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisElementCompatibilityException((message),             \
-                                                                         __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_EVAL(condition, message)                                                   \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisEvaluationException((message),                       \
-                                                               __FILE__, __LINE__, __func__);  \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_NODE_ORDER(condition, message)                                             \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisNodeOrderingException((message),                     \
-                                                                 __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
-#define BASIS_CHECK_CONSTRUCTION(condition, message)                                           \
-    do {                                                                                       \
-        if (!(condition)) {                                                                    \
-            throw ::svmp::FE::basis::BasisConstructionException((message),                     \
-                                                                 __FILE__, __LINE__, __func__); \
-        }                                                                                      \
-    } while (false)
-
 } // namespace basis
 } // namespace FE
 } // namespace svmp
diff --git a/Code/Source/solver/FE/Basis/BasisFactory.cpp b/Code/Source/solver/FE/Basis/BasisFactory.cpp
index b48e25536..c3130d16f 100644
--- a/Code/Source/solver/FE/Basis/BasisFactory.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFactory.cpp
@@ -16,28 +16,20 @@ namespace {
 int require_basis_order(const BasisRequest& req,
                         const char* missing_message,
                         const char* negative_message) {
-    if (!req.order.has_value()) {
-        throw BasisConfigurationException(missing_message,
-                                          __FILE__, __LINE__, __func__);
-    }
-    if (*req.order < 0) {
-        throw BasisConfigurationException(negative_message,
-                                          __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(!req.order.has_value(), SVMP_HERE,
+                                              missing_message);
+    FE::throw_if<BasisConfigurationException>(*req.order < 0, SVMP_HERE,
+                                              negative_message);
     return *req.order;
 }
 
 void require_scalar_c0_request(const BasisRequest& req) {
-    if (req.field_type != FieldType::Scalar) {
-        throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases support scalar fields only",
-            __FILE__, __LINE__, __func__);
-    }
-    if (req.continuity != Continuity::C0) {
-        throw BasisConfigurationException(
-            "BasisFactory: Lagrange/Serendipity bases support C0 continuity only",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(
+        req.field_type != FieldType::Scalar, SVMP_HERE,
+        "BasisFactory: Lagrange/Serendipity bases support scalar fields only");
+    FE::throw_if<BasisConfigurationException>(
+        req.continuity != Continuity::C0, SVMP_HERE,
+        "BasisFactory: Lagrange/Serendipity bases support C0 continuity only");
 }
 
 std::shared_ptr<BasisFunction> create_lagrange(const BasisRequest& req) {
@@ -69,9 +61,8 @@ std::shared_ptr<BasisFunction> create(const BasisRequest& req) {
         case BasisType::Serendipity:
             return create_serendipity(req);
         default:
-            throw BasisConfigurationException(
-                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "BasisFactory: requested basis family is outside the scalar Lagrange/Serendipity scope");
     }
 }
 
@@ -90,9 +81,8 @@ BasisRequest default_basis_request(ElementType element_type) {
             if (order >= 0) {
                 return BasisRequest{element_type, BasisType::Lagrange, order};
             }
-            throw BasisElementCompatibilityException(
-                "BasisFactory: no default basis is defined for the requested element type",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "BasisFactory: no default basis is defined for the requested element type");
         }
     }
 }
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index b98a36292..591f6751a 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -28,16 +28,16 @@ void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                        std::vector<Gradient>& gradients) const {
     (void)xi;
     (void)gradients;
-    throw BasisEvaluationException("Analytic gradient evaluation is not implemented for this basis",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic gradient evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_hessians(const math::Vector<Real, 3>& xi,
                                       std::vector<Hessian>& hessians) const {
     (void)xi;
     (void)hessians;
-    throw BasisEvaluationException("Analytic Hessian evaluation is not implemented for this basis",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Analytic Hessian evaluation is not implemented for this basis");
 }
 
 void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 4f8c15bb1..b32199d03 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -44,10 +44,8 @@ struct NormalizedLagrangeRequest {
 // Validate and return the supported basis topology for a Lagrange element type.
 BasisTopology supported_lagrange_topology(ElementType type) {
     const BasisTopology top = topology(type);
-    if (top == BasisTopology::Unknown) {
-        throw BasisElementCompatibilityException("LagrangeBasis: unsupported element type",
-                                                __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisElementCompatibilityException>(top == BasisTopology::Unknown, SVMP_HERE,
+                                                     "LagrangeBasis: unsupported element type");
     return top;
 }
 
@@ -67,23 +65,19 @@ NormalizedLagrangeRequest normalize_lagrange_request(ElementType element_type, i
         case ElementType::Wedge18:
             return {ElementType::Wedge6, std::max(order, 2)};
         case ElementType::Quad8:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Quad8 is serendipity; use SerendipityBasis");
         case ElementType::Hex20:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Hex20 is serendipity; use SerendipityBasis");
         case ElementType::Wedge15:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: Wedge15 is serendipity; use SerendipityBasis");
         case ElementType::Pyramid5:
         case ElementType::Pyramid13:
         case ElementType::Pyramid14:
-            throw BasisElementCompatibilityException(
-                "LagrangeBasis: pyramid support is not within the current solver basis scope",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+                "LagrangeBasis: pyramid support is not within the current solver basis scope");
         default:
             return {element_type, order};
     }
@@ -315,10 +309,8 @@ LagrangeBasis::LagrangeBasis(ElementType type, int order)
     const auto normalized = normalize_lagrange_request(element_type_, order_);
     element_type_ = normalized.element_type;
     order_ = normalized.order;
-    if (order_ < 0) {
-        throw BasisConfigurationException("LagrangeBasis requires non-negative polynomial order",
-                                          __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConfigurationException>(order_ < 0, SVMP_HERE,
+                                              "LagrangeBasis requires non-negative polynomial order");
 
     topology_ = supported_lagrange_topology(element_type_);
     dimension_ = reference_dimension(element_type_);
@@ -366,8 +358,8 @@ void LagrangeBasis::init_nodes() {
             break;
     }
 
-    throw BasisElementCompatibilityException("Unsupported element type in LagrangeBasis::init_nodes",
-                                             __FILE__, __LINE__, __func__);
+    FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+        "Unsupported element type in LagrangeBasis::init_nodes");
 }
 
 // Build the single reference node for a point basis.
@@ -419,10 +411,8 @@ void LagrangeBasis::build_wedge_nodes() {
         const auto tri_exp =
             simplex_exponent_from_point(node, BasisTopology::Triangle, order_);
         auto it = std::find(simplex_exponents_.begin(), simplex_exponents_.end(), tri_exp);
-        if (it == simplex_exponents_.end()) {
-            throw BasisConstructionException("LagrangeBasis: wedge node triangle index lookup failed",
-                                             __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConstructionException>(it == simplex_exponents_.end(), SVMP_HERE,
+                                                 "LagrangeBasis: wedge node triangle index lookup failed");
         const std::size_t tri_index =
             static_cast<std::size_t>(std::distance(simplex_exponents_.begin(), it));
         wedge_indices_.push_back({tri_index, axis_index_pm_one(node[2], order_)});
@@ -555,8 +545,8 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
         return;
     }
 
-    throw BasisEvaluationException("Unsupported element in LagrangeBasis evaluation",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "Unsupported element in LagrangeBasis evaluation");
 }
 
 void LagrangeBasis::evaluate_values(const Vec3& xi,
diff --git a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
index 76662abe1..850f8cd0a 100644
--- a/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
+++ b/Code/Source/solver/FE/Basis/NodeOrderingConventions.cpp
@@ -318,10 +318,8 @@ std::vector<Point> generate_wedge_nodes(int order) {
 }
 
 std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order) {
-    if (order < 0) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout requires non-negative Lagrange order",
-                                         __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisNodeOrderingException>(order < 0, SVMP_HERE,
+                                             "ReferenceNodeLayout requires non-negative Lagrange order");
     const ElementType type = canonical_lagrange_type(canonical_type);
     switch (type) {
         case ElementType::Point1:
@@ -339,11 +337,11 @@ std::vector<Point> complete_lagrange_nodes(ElementType canonical_type, int order
         case ElementType::Wedge6:
             return generate_wedge_nodes(order);
         case ElementType::Pyramid5:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: unsupported Lagrange topology",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: unsupported Lagrange topology");
     }
 }
 
@@ -370,11 +368,11 @@ std::vector<Point> element_nodes(ElementType elem_type) {
             return nodes;
         }
         case ElementType::Pyramid13:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: pyramid node ordering is disabled",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: pyramid node ordering is disabled");
         default:
-            throw BasisNodeOrderingException("ReferenceNodeLayout: unknown element type",
-                                             __FILE__, __LINE__, __func__);
+            FE::raise<BasisNodeOrderingException>(SVMP_HERE,
+                "ReferenceNodeLayout: unknown element type");
     }
 }
 
@@ -383,10 +381,8 @@ std::vector<Point> element_nodes(ElementType elem_type) {
 math::Vector<Real, 3> ReferenceNodeLayout::get_node_coords(ElementType elem_type,
                                                            std::size_t local_node) {
     const auto nodes = element_nodes(elem_type);
-    if (local_node >= nodes.size()) {
-        throw BasisNodeOrderingException("ReferenceNodeLayout::get_node_coords: node index out of range",
-                                         __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisNodeOrderingException>(local_node >= nodes.size(), SVMP_HERE,
+                                             "ReferenceNodeLayout::get_node_coords: node index out of range");
     return nodes[local_node];
 }
 
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 30eac9c38..006d43fdc 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -115,11 +115,9 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
         nodes.push_back(Vec3{Real(-1), Real(1) - Real(2 * i) * inv_order, Real(0)});
     }
 
-    if (nodes.size() > total_size) {
-        throw BasisConstructionException(
-            "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        nodes.size() > total_size, SVMP_HERE,
+        "SerendipityBasis: quadrilateral serendipity boundary nodes exceed requested size");
 
     const std::size_t interior_count = total_size - nodes.size();
     if (interior_count == 0u) {
@@ -157,11 +155,9 @@ std::vector<Vec3> quad_serendipity_nodes(int order, std::size_t total_size) {
                   return a[0] < b[0];
               });
 
-    if (interior_count > interior_candidates.size()) {
-        throw BasisConstructionException(
-            "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        interior_count > interior_candidates.size(), SVMP_HERE,
+        "SerendipityBasis: insufficient quadrilateral interior nodes for requested serendipity order");
 
     nodes.insert(nodes.end(),
                  interior_candidates.begin(),
@@ -181,11 +177,9 @@ std::vector<Real> quad_serendipity_inverse_vandermonde(
     std::span<const std::array<int, 2>> exponents,
     int order) {
     const int n = static_cast<int>(nodes.size());
-    if (n == 0 || exponents.size() != nodes.size()) {
-        throw BasisConstructionException(
-            "SerendipityBasis: invalid quadrilateral serendipity interpolation setup",
-            __FILE__, __LINE__, __func__);
-    }
+    FE::throw_if<BasisConstructionException>(
+        n == 0 || exponents.size() != nodes.size(), SVMP_HERE,
+        "SerendipityBasis: invalid quadrilateral serendipity interpolation setup");
 
     std::vector<Real> vandermonde(static_cast<std::size_t>(n * n), Real(0));
     auto idx = [n](int row, int col) -> std::size_t {
@@ -499,19 +493,15 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         if (order_ < 1) {
             order_ = 1;
         }
-        if (type == ElementType::Quad8 && order_ != 2) {
-            throw BasisConfigurationException(
-                "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConfigurationException>(
+            type == ElementType::Quad8 && order_ != 2, SVMP_HERE,
+            "SerendipityBasis: Quad8 is only valid for quadratic order 2; use Quad4 for higher-order quadrilateral serendipity");
         quad_monomial_exponents_ = quad_serendipity_exponents(order_);
         size_ = quad_monomial_exponents_.size();
         nodes_ = quad_serendipity_nodes(order_, size_);
-        if (nodes_.size() != size_) {
-            throw BasisConstructionException(
-                "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisConstructionException>(
+            nodes_.size() != size_, SVMP_HERE,
+            "SerendipityBasis: quadrilateral serendipity setup produced inconsistent sizes");
         quad_inv_vandermonde_ = quad_serendipity_inverse_vandermonde(nodes_, quad_monomial_exponents_, order_);
     } else if (type == ElementType::Hex8 || type == ElementType::Hex20) {
         dimension_ = 3;
@@ -521,9 +511,8 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         } else if (order_ == 2) {
             size_ = 20;
         } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on hexahedra",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "SerendipityBasis supports up to quadratic on hexahedra");
         }
     } else if (type == ElementType::Wedge15) {
         dimension_ = 3;
@@ -533,13 +522,12 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
         if (order_ == 2) {
             size_ = 15;
         } else {
-            throw BasisConfigurationException(
-                "SerendipityBasis supports up to quadratic on wedge15",
-                __FILE__, __LINE__, __func__);
+            FE::raise<BasisConfigurationException>(SVMP_HERE,
+                "SerendipityBasis supports up to quadratic on wedge15");
         }
     } else {
-        throw BasisElementCompatibilityException("SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements",
-                                                 __FILE__, __LINE__, __func__);
+        FE::raise<BasisElementCompatibilityException>(SVMP_HERE,
+            "SerendipityBasis supports Quad4/Quad8, Hex8/Hex20, and Wedge15 elements");
     }
 
     if (nodes_.empty()) {
@@ -573,12 +561,11 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
     const Real z = xi[2];
 
     if (dimension_ == 2) {
-        if (quad_monomial_exponents_.size() != size_ ||
-            quad_inv_vandermonde_.size() != size_ * size_) {
-            throw BasisEvaluationException(
-                "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation",
-                __FILE__, __LINE__, __func__);
-        }
+        FE::throw_if<BasisEvaluationException>(
+            quad_monomial_exponents_.size() != size_ ||
+                quad_inv_vandermonde_.size() != size_ * size_,
+            SVMP_HERE,
+            "SerendipityBasis: quadrilateral interpolation tables are not initialized for value evaluation");
 
         for (std::size_t j = 0; j < size_; ++j) {
             const auto [ax, ay] = quad_monomial_exponents_[j];
@@ -632,8 +619,8 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 
     if (element_type_ == ElementType::Hex20) {
         const auto mesh_to_basis = ReferenceNodeLayout::mesh_to_basis_ordering(element_type_);
-        BASIS_CHECK_EVAL(mesh_to_basis.size() == size_,
-                         "Hex20 mesh-to-basis ordering is not registered");
+        FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
+                                               "Hex20 mesh-to-basis ordering is not registered");
 
         if (values_out) {
             Real internal_vals[20];
@@ -681,8 +668,8 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         return;
     }
 
-    throw BasisEvaluationException("SerendipityBasis::evaluate_all_to: unsupported serendipity configuration",
-                                   __FILE__, __LINE__, __func__);
+    FE::raise<BasisEvaluationException>(SVMP_HERE,
+        "SerendipityBasis::evaluate_all_to: unsupported serendipity configuration");
 }
 
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,

From 4819f595e920e95a88be0c5895be3e1d5dc055d0 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 19:35:11 -0700
Subject: [PATCH 16/22] fixing doxygen layout to allow for visible topic
 sections since modules in doxygen are now reserved for c++ modules

---
 Documentation/DoxygenLayout.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/DoxygenLayout.xml b/Documentation/DoxygenLayout.xml
index df0146828..f056df891 100644
--- a/Documentation/DoxygenLayout.xml
+++ b/Documentation/DoxygenLayout.xml
@@ -3,7 +3,11 @@
   <!-- Navigation index tabs for HTML output -->
   <navindex>
     <!-- <tab type="pages" visible="yes" title="" intro=""/> -->
+    <!-- Doxygen <= 1.9.7 renders grouping pages via the "modules" tab; 1.9.8+
+         renamed them to "topics" and reuses "modules" for C++20 modules.
+         Declare both so the tab appears regardless of the doxygen version. -->
     <tab type="modules" visible="yes" title="" intro=""/>
+    <tab type="topics" visible="yes" title="" intro=""/>
     <tab type="namespaces" visible="yes" title="">
       <tab type="namespacelist" visible="yes" title="" intro=""/>
       <tab type="namespacemembers" visible="yes" title="" intro=""/>

From dfd5aff359ee357771550c228e303f58d5f862b2 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Tue, 9 Jun 2026 20:46:58 -0700
Subject: [PATCH 17/22] added topology evaluation helpers and cleaned up static
 cast helpers

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |   4 +-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  14 +
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 361 +++++++++---------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  17 +-
 .../solver/FE/Basis/SerendipityBasis.cpp      |  14 +-
 5 files changed, 215 insertions(+), 195 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index 591f6751a..d847a9cca 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -63,9 +63,7 @@ void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
     for (std::size_t i = 0; i < tmp.size(); ++i) {
-        gradients_out[i * 3u + 0u] = tmp[i][0];
-        gradients_out[i * 3u + 1u] = tmp[i][1];
-        gradients_out[i * 3u + 2u] = tmp[i][2];
+        store_gradient(tmp[i], gradients_out + i * 3u);
     }
 }
 
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index e7de2bf01..832926199 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -49,6 +49,20 @@ using Hessian  = math::Matrix<Real, 3, 3>;
     return hessian;
 }
 
+inline void store_gradient(const Gradient& gradient, Real* dst) noexcept {
+    dst[0] = gradient[0];
+    dst[1] = gradient[1];
+    dst[2] = gradient[2];
+}
+
+[[nodiscard]] inline Gradient load_gradient(const Real* src) noexcept {
+    Gradient gradient;
+    gradient[0] = src[0];
+    gradient[1] = src[1];
+    gradient[2] = src[2];
+    return gradient;
+}
+
 inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
     dst[0] = hessian(0, 0);
     dst[1] = hessian(0, 1);
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index b32199d03..4ec970b86 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <array>
 #include <cmath>
+#include <limits>
 
 namespace svmp {
 namespace FE {
@@ -121,6 +122,9 @@ LagrangeBasis::SimplexExponent simplex_exponent_from_point(const Vec3& p,
     return e;
 }
 
+// Sentinel node index meaning "skip nothing" in product_excluding below.
+constexpr std::size_t kNoSkip = std::numeric_limits<std::size_t>::max();
+
 // Evaluate 1D Lagrange polynomials and derivatives at a point.
 void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out) {
     const std::size_t n = nodes.size();
@@ -134,6 +138,19 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
     }
 
     for (std::size_t i = 0; i < n; ++i) {
+        // Product of (x - nodes[j]) over all j except i and the listed skips.
+        // Each derivative order drops one additional factor from the product.
+        const auto product_excluding = [&](std::size_t skip1 = kNoSkip,
+                                           std::size_t skip2 = kNoSkip) {
+            Real product = Real(1);
+            for (std::size_t j = 0; j < n; ++j) {
+                if (j != i && j != skip1 && j != skip2) {
+                    product *= x - nodes[j];
+                }
+            }
+            return product;
+        };
+
         Real denom = Real(1);
         for (std::size_t j = 0; j < n; ++j) {
             if (j != i) {
@@ -141,26 +158,13 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
             }
         }
 
-        Real value = Real(1);
-        for (std::size_t j = 0; j < n; ++j) {
-            if (j != i) {
-                value *= x - nodes[j];
-            }
-        }
-        out.value[i] = value / denom;
+        out.value[i] = product_excluding() / denom;
 
         Real first = Real(0);
         for (std::size_t m = 0; m < n; ++m) {
-            if (m == i) {
-                continue;
+            if (m != i) {
+                first += product_excluding(m);
             }
-            Real product = Real(1);
-            for (std::size_t j = 0; j < n; ++j) {
-                if (j != i && j != m) {
-                    product *= x - nodes[j];
-                }
-            }
-            first += product;
         }
         out.first[i] = first / denom;
 
@@ -170,16 +174,9 @@ void evaluate_1d_lagrange(Real x, const std::vector<Real>& nodes, AxisEval& out)
                 continue;
             }
             for (std::size_t l = 0; l < n; ++l) {
-                if (l == i || l == m) {
-                    continue;
-                }
-                Real product = Real(1);
-                for (std::size_t j = 0; j < n; ++j) {
-                    if (j != i && j != m && j != l) {
-                        product *= x - nodes[j];
-                    }
+                if (l != i && l != m) {
+                    second += product_excluding(m, l);
                 }
-                second += product;
             }
         }
         out.second[i] = second / denom;
@@ -222,7 +219,7 @@ void evaluate_simplex(const Vec3& xi,
         return;
     }
 
-    const int bary_count = top == BasisTopology::Triangle ? 3 : 4;
+    const std::size_t bary_count = top == BasisTopology::Triangle ? 3u : 4u;
     std::array<Real, 4> lambda{Real(0), Real(0), Real(0), Real(0)};
     std::array<Gradient, 4> lambda_grad;
     lambda_grad.fill(Gradient::Zero());
@@ -246,48 +243,40 @@ void evaluate_simplex(const Vec3& xi,
 
     for (std::size_t i = 0; i < n; ++i) {
         std::array<std::array<Real, 3>, 4> f{};
-        for (int a = 0; a < bary_count; ++a) {
-            f[static_cast<std::size_t>(a)] =
-                simplex_factor(exponents[i][static_cast<std::size_t>(a)],
-                               lambda[static_cast<std::size_t>(a)],
-                               order);
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            f[a] = simplex_factor(exponents[i][a], lambda[a], order);
         }
 
         Real value = Real(1);
-        for (int a = 0; a < bary_count; ++a) {
-            value *= f[static_cast<std::size_t>(a)][0];
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            value *= f[a][0];
         }
         out.value[i] = value;
 
-        for (int a = 0; a < bary_count; ++a) {
-            Real product = f[static_cast<std::size_t>(a)][1];
-            for (int b = 0; b < bary_count; ++b) {
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            Real product = f[a][1];
+            for (std::size_t b = 0; b < bary_count; ++b) {
                 if (b != a) {
-                    product *= f[static_cast<std::size_t>(b)][0];
+                    product *= f[b][0];
                 }
             }
             for (std::size_t c = 0; c < 3u; ++c) {
-                out.gradient[i][c] += product * lambda_grad[static_cast<std::size_t>(a)][c];
+                out.gradient[i][c] += product * lambda_grad[a][c];
             }
         }
 
-        for (int a = 0; a < bary_count; ++a) {
-            for (int b = 0; b < bary_count; ++b) {
-                Real product = (a == b)
-                    ? f[static_cast<std::size_t>(a)][2]
-                    : f[static_cast<std::size_t>(a)][1] *
-                      f[static_cast<std::size_t>(b)][1];
-                for (int c = 0; c < bary_count; ++c) {
-                    if (c != a && c != b) {
-                        product *= f[static_cast<std::size_t>(c)][0];
+        for (std::size_t a = 0; a < bary_count; ++a) {
+            for (std::size_t b = 0; b < bary_count; ++b) {
+                Real product = (a == b) ? f[a][2] : f[a][1] * f[b][1];
+                for (std::size_t k = 0; k < bary_count; ++k) {
+                    if (k != a && k != b) {
+                        product *= f[k][0];
                     }
                 }
                 for (std::size_t r = 0; r < 3u; ++r) {
                     for (std::size_t c = 0; c < 3u; ++c) {
                         out.hessian[i](r, c) +=
-                            product *
-                            lambda_grad[static_cast<std::size_t>(a)][r] *
-                            lambda_grad[static_cast<std::size_t>(b)][c];
+                            product * lambda_grad[a][r] * lambda_grad[b][c];
                     }
                 }
             }
@@ -295,13 +284,6 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
-// Store a gradient in the flat buffer layout used by fast evaluators.
-void store_gradient(const Gradient& gradient, Real* dst) {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
 } // namespace
 
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
@@ -339,13 +321,9 @@ void LagrangeBasis::init_nodes() {
             build_point_nodes();
             return;
         case BasisTopology::Line:
-            build_tensor_product_nodes(1);
-            return;
         case BasisTopology::Quadrilateral:
-            build_tensor_product_nodes(2);
-            return;
         case BasisTopology::Hexahedron:
-            build_tensor_product_nodes(3);
+            build_tensor_product_nodes();
             return;
         case BasisTopology::Triangle:
         case BasisTopology::Tetrahedron:
@@ -368,17 +346,17 @@ void LagrangeBasis::build_point_nodes() {
 }
 
 // Build nodes and axis indices for tensor-product elements.
-void LagrangeBasis::build_tensor_product_nodes(int dimensions) {
+void LagrangeBasis::build_tensor_product_nodes() {
     init_equispaced_1d_nodes();
     nodes_ = ReferenceNodeLayout::get_lagrange_node_coords(element_type_, order_);
     tensor_indices_.reserve(nodes_.size());
     for (const auto& node : nodes_) {
         TensorNodeIndex idx{0u, 0u, 0u};
         idx[0] = axis_index_pm_one(node[0], order_);
-        if (dimensions >= 2) {
+        if (dimension_ >= 2) {
             idx[1] = axis_index_pm_one(node[1], order_);
         }
-        if (dimensions >= 3) {
+        if (dimension_ >= 3) {
             idx[2] = axis_index_pm_one(node[2], order_);
         }
         tensor_indices_.push_back(idx);
@@ -419,130 +397,159 @@ void LagrangeBasis::build_wedge_nodes() {
     }
 }
 
-// Evaluate requested basis quantities into caller-provided flat buffers.
-void LagrangeBasis::evaluate_all_to(const Vec3& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
-    if (topology_ == BasisTopology::Point) {
+// Evaluate the constant point basis.
+void LagrangeBasis::evaluate_point_to(Real* SVMP_RESTRICT values_out,
+                                      Real* SVMP_RESTRICT gradients_out,
+                                      Real* SVMP_RESTRICT hessians_out) const {
+    if (values_out) {
+        values_out[0] = Real(1);
+    }
+    if (gradients_out) {
+        gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+    }
+    if (hessians_out) {
+        std::fill_n(hessians_out, 9u, Real(0));
+    }
+}
+
+// Evaluate line, quadrilateral, and hexahedron bases as axis-polynomial products.
+void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
+                                               Real* SVMP_RESTRICT values_out,
+                                               Real* SVMP_RESTRICT gradients_out,
+                                               Real* SVMP_RESTRICT hessians_out) const {
+    AxisEval ax;
+    AxisEval ay;
+    AxisEval az;
+    evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
+    if (dimension_ >= 2) {
+        evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+    }
+    if (dimension_ >= 3) {
+        evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+    }
+
+    for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
+        const auto& idx = tensor_indices_[node];
+        const Real vx = ax.value[idx[0]];
+        const Real dx = ax.first[idx[0]];
+        const Real d2x = ax.second[idx[0]];
+        const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
+        const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
+        const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
+        const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
+        const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
+        const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
+
         if (values_out) {
-            values_out[0] = Real(1);
+            values_out[node] = vx * vy * vz;
         }
         if (gradients_out) {
-            gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+            Real* g = gradients_out + node * 3u;
+            g[0] = dx * vy * vz;
+            g[1] = vx * dy * vz;
+            g[2] = vx * vy * dz;
         }
         if (hessians_out) {
-            std::fill_n(hessians_out, 9u, Real(0));
+            Real* h = hessians_out + node * 9u;
+            h[0] = d2x * vy * vz;
+            h[1] = dx * dy * vz;
+            h[2] = dx * vy * dz;
+            h[3] = h[1];
+            h[4] = vx * d2y * vz;
+            h[5] = vx * dy * dz;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = vx * vy * d2z;
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Line ||
-        topology_ == BasisTopology::Quadrilateral ||
-        topology_ == BasisTopology::Hexahedron) {
-        AxisEval ax;
-        AxisEval ay;
-        AxisEval az;
-        evaluate_1d_lagrange(xi[0], nodes_1d_, ax);
-        if (dimension_ >= 2) {
-            evaluate_1d_lagrange(xi[1], nodes_1d_, ay);
+// Evaluate triangle and tetrahedron bases from barycentric factors.
+void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
+                                        Real* SVMP_RESTRICT values_out,
+                                        Real* SVMP_RESTRICT gradients_out,
+                                        Real* SVMP_RESTRICT hessians_out) const {
+    SimplexEval simplex;
+    evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
+    for (std::size_t i = 0; i < simplex.value.size(); ++i) {
+        if (values_out) {
+            values_out[i] = simplex.value[i];
         }
-        if (dimension_ >= 3) {
-            evaluate_1d_lagrange(xi[2], nodes_1d_, az);
+        if (gradients_out) {
+            store_gradient(simplex.gradient[i], gradients_out + i * 3u);
         }
-
-        for (std::size_t node = 0; node < tensor_indices_.size(); ++node) {
-            const auto& idx = tensor_indices_[node];
-            const Real vx = ax.value[idx[0]];
-            const Real dx = ax.first[idx[0]];
-            const Real d2x = ax.second[idx[0]];
-            const Real vy = dimension_ >= 2 ? ay.value[idx[1]] : Real(1);
-            const Real dy = dimension_ >= 2 ? ay.first[idx[1]] : Real(0);
-            const Real d2y = dimension_ >= 2 ? ay.second[idx[1]] : Real(0);
-            const Real vz = dimension_ >= 3 ? az.value[idx[2]] : Real(1);
-            const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
-            const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
-
-            if (values_out) {
-                values_out[node] = vx * vy * vz;
-            }
-            if (gradients_out) {
-                Real* g = gradients_out + node * 3u;
-                g[0] = dx * vy * vz;
-                g[1] = vx * dy * vz;
-                g[2] = vx * vy * dz;
-            }
-            if (hessians_out) {
-                Real* h = hessians_out + node * 9u;
-                h[0] = d2x * vy * vz;
-                h[1] = dx * dy * vz;
-                h[2] = dx * vy * dz;
-                h[3] = h[1];
-                h[4] = vx * d2y * vz;
-                h[5] = vx * dy * dz;
-                h[6] = h[2];
-                h[7] = h[5];
-                h[8] = vx * vy * d2z;
-            }
+        if (hessians_out) {
+            store_hessian(simplex.hessian[i], hessians_out + i * 9u);
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Triangle || topology_ == BasisTopology::Tetrahedron) {
-        SimplexEval simplex;
-        evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
-        for (std::size_t i = 0; i < simplex.value.size(); ++i) {
-            if (values_out) {
-                values_out[i] = simplex.value[i];
-            }
-            if (gradients_out) {
-                store_gradient(simplex.gradient[i], gradients_out + i * 3u);
-            }
-            if (hessians_out) {
-                store_hessian(simplex.hessian[i], hessians_out + i * 9u);
-            }
+// Evaluate wedge bases as triangle/through-axis products.
+void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
+                                      Real* SVMP_RESTRICT values_out,
+                                      Real* SVMP_RESTRICT gradients_out,
+                                      Real* SVMP_RESTRICT hessians_out) const {
+    SimplexEval tri;
+    AxisEval z_axis;
+    evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
+    evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
+
+    for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
+        const auto [tri_idx, z_idx] = wedge_indices_[node];
+        const Real tv = tri.value[tri_idx];
+        const Real zv = z_axis.value[z_idx];
+        const Real dz = z_axis.first[z_idx];
+        const Real d2z = z_axis.second[z_idx];
+
+        if (values_out) {
+            values_out[node] = tv * zv;
+        }
+        if (gradients_out) {
+            Real* g = gradients_out + node * 3u;
+            g[0] = tri.gradient[tri_idx][0] * zv;
+            g[1] = tri.gradient[tri_idx][1] * zv;
+            g[2] = tv * dz;
+        }
+        if (hessians_out) {
+            Real* h = hessians_out + node * 9u;
+            const Hessian& th = tri.hessian[tri_idx];
+            const Gradient& tg = tri.gradient[tri_idx];
+            h[0] = th(0, 0) * zv;
+            h[1] = th(0, 1) * zv;
+            h[2] = tg[0] * dz;
+            h[3] = h[1];
+            h[4] = th(1, 1) * zv;
+            h[5] = tg[1] * dz;
+            h[6] = h[2];
+            h[7] = h[5];
+            h[8] = tv * d2z;
         }
-        return;
     }
+}
 
-    if (topology_ == BasisTopology::Wedge) {
-        SimplexEval tri;
-        AxisEval z_axis;
-        evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
-        evaluate_1d_lagrange(xi[2], nodes_1d_, z_axis);
-
-        for (std::size_t node = 0; node < wedge_indices_.size(); ++node) {
-            const auto [tri_idx, z_idx] = wedge_indices_[node];
-            const Real tv = tri.value[tri_idx];
-            const Real zv = z_axis.value[z_idx];
-            const Real dz = z_axis.first[z_idx];
-            const Real d2z = z_axis.second[z_idx];
-
-            if (values_out) {
-                values_out[node] = tv * zv;
-            }
-            if (gradients_out) {
-                Real* g = gradients_out + node * 3u;
-                g[0] = tri.gradient[tri_idx][0] * zv;
-                g[1] = tri.gradient[tri_idx][1] * zv;
-                g[2] = tv * dz;
-            }
-            if (hessians_out) {
-                Real* h = hessians_out + node * 9u;
-                const Hessian& th = tri.hessian[tri_idx];
-                const Gradient& tg = tri.gradient[tri_idx];
-                h[0] = th(0, 0) * zv;
-                h[1] = th(0, 1) * zv;
-                h[2] = tg[0] * dz;
-                h[3] = h[1];
-                h[4] = th(1, 1) * zv;
-                h[5] = tg[1] * dz;
-                h[6] = h[2];
-                h[7] = h[5];
-                h[8] = tv * d2z;
-            }
-        }
-        return;
+// Evaluate requested basis quantities into caller-provided flat buffers.
+void LagrangeBasis::evaluate_all_to(const Vec3& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const {
+    switch (topology_) {
+        case BasisTopology::Point:
+            evaluate_point_to(values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Line:
+        case BasisTopology::Quadrilateral:
+        case BasisTopology::Hexahedron:
+            evaluate_tensor_product_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Triangle:
+        case BasisTopology::Tetrahedron:
+            evaluate_simplex_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        case BasisTopology::Wedge:
+            evaluate_wedge_to(xi, values_out, gradients_out, hessians_out);
+            return;
+        default:
+            break;
     }
 
     FE::raise<BasisEvaluationException>(SVMP_HERE,
@@ -561,9 +568,7 @@ void LagrangeBasis::evaluate_gradients(const Vec3& xi,
     std::vector<Real> flat(size() * 3u, Real(0));
     evaluate_gradients_to(xi, flat.data());
     for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i][0] = flat[i * 3u + 0u];
-        gradients[i][1] = flat[i * 3u + 1u];
-        gradients[i][2] = flat[i * 3u + 2u];
+        gradients[i] = load_gradient(flat.data() + i * 3u);
     }
 }
 
@@ -588,9 +593,7 @@ void LagrangeBasis::evaluate_all(const Vec3& xi,
     std::vector<Real> flat_h(size() * 9u, Real(0));
     evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());
     for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i][0] = flat_g[i * 3u + 0u];
-        gradients[i][1] = flat_g[i * 3u + 1u];
-        gradients[i][2] = flat_g[i * 3u + 2u];
+        gradients[i] = load_gradient(flat_g.data() + i * 3u);
         hessians[i] = load_hessian(flat_h.data() + i * 9u);
     }
 }
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index 3bb1a5e74..cd0ca6058 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -218,7 +218,7 @@ class LagrangeBasis : public BasisFunction {
 
     void init_nodes();
     void build_point_nodes();
-    void build_tensor_product_nodes(int dimensions);
+    void build_tensor_product_nodes();
     void build_simplex_nodes();
     void build_wedge_nodes();
     void init_equispaced_1d_nodes();
@@ -227,6 +227,21 @@ class LagrangeBasis : public BasisFunction {
                          Real* SVMP_RESTRICT values_out,
                          Real* SVMP_RESTRICT gradients_out,
                          Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_point_to(Real* SVMP_RESTRICT values_out,
+                           Real* SVMP_RESTRICT gradients_out,
+                           Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_tensor_product_to(const math::Vector<Real, 3>& xi,
+                                    Real* SVMP_RESTRICT values_out,
+                                    Real* SVMP_RESTRICT gradients_out,
+                                    Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_simplex_to(const math::Vector<Real, 3>& xi,
+                             Real* SVMP_RESTRICT values_out,
+                             Real* SVMP_RESTRICT gradients_out,
+                             Real* SVMP_RESTRICT hessians_out) const;
+    void evaluate_wedge_to(const math::Vector<Real, 3>& xi,
+                           Real* SVMP_RESTRICT values_out,
+                           Real* SVMP_RESTRICT gradients_out,
+                           Real* SVMP_RESTRICT hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index 006d43fdc..fd5f99cbc 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -18,12 +18,6 @@ namespace basis {
 namespace {
 using Vec3 = math::Vector<Real, 3>;
 
-void store_gradient(const Gradient& gradient, Real* dst) {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
 void evaluate_hex8_reference(Real r,
                              Real s,
                              Real t,
@@ -684,9 +678,7 @@ void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
     std::vector<Real> flat(size_ * 3u, Real(0));
     evaluate_gradients_to(xi, flat.data());
     for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i][0] = flat[i * 3u + 0u];
-        gradients[i][1] = flat[i * 3u + 1u];
-        gradients[i][2] = flat[i * 3u + 2u];
+        gradients[i] = load_gradient(flat.data() + i * 3u);
     }
 }
 
@@ -711,9 +703,7 @@ void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
     std::vector<Real> flat_hessians(size_ * 9u, Real(0));
     evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());
     for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i][0] = flat_gradients[i * 3u + 0u];
-        gradients[i][1] = flat_gradients[i * 3u + 1u];
-        gradients[i][2] = flat_gradients[i * 3u + 2u];
+        gradients[i] = load_gradient(flat_gradients.data() + i * 3u);
         hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
     }
 }

From ddb509ac88fc6b28147e464f2333e99b0c305b61 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Wed, 10 Jun 2026 11:50:24 -0700
Subject: [PATCH 18/22] aligning throw and raise to use function-template
 helpers for svmp

---
 Code/Source/solver/nn.cpp | 141 +++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 85 deletions(-)

diff --git a/Code/Source/solver/nn.cpp b/Code/Source/solver/nn.cpp
index 60fcddf81..547310703 100644
--- a/Code/Source/solver/nn.cpp
+++ b/Code/Source/solver/nn.cpp
@@ -133,9 +133,8 @@ const febasis::BasisFunction& basis_for_solver_element(consts::ElementType eType
 
   const auto fe_type = to_fe_element_type(eType);
   if (!fe_type) {
-    throw febasis::BasisElementCompatibilityException(
-        "No FE Basis selection for solver element " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+        "No FE Basis selection for solver element " + solver_element_name(eType));
   }
 
   const std::lock_guard<std::mutex> lock(cache_mutex);
@@ -177,10 +176,9 @@ std::span<const std::size_t> solver_to_basis_node_map(consts::ElementType eType)
 std::size_t basis_index_for_solver_node(consts::ElementType eType, const int solver_node)
 {
   if (solver_node < 0) {
-    throw febasis::BasisNodeOrderingException(
+    fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
         "Solver node " + std::to_string(solver_node) +
-            " is outside node map for " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+            " is outside node map for " + solver_element_name(eType));
   }
 
   const auto node = static_cast<std::size_t>(solver_node);
@@ -191,10 +189,9 @@ std::size_t basis_index_for_solver_node(consts::ElementType eType, const int sol
   if (node < map.size()) {
     return map[node];
   }
-  throw febasis::BasisNodeOrderingException(
+  fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
       "Solver node " + std::to_string(solver_node) +
-          " is outside node map for " + solver_element_name(eType),
-      __FILE__, __LINE__, __func__);
+          " is outside node map for " + solver_element_name(eType));
 }
 
 fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& basis,
@@ -202,11 +199,10 @@ fe::math::Vector<fe::Real, 3> make_basis_point(const febasis::BasisFunction& bas
                                                const Array<double>& xi)
 {
   if (xi.nrows() < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "xi has " + std::to_string(xi.nrows()) +
             " rows but FE Basis element requires " + std::to_string(basis.dimension()) +
-            " reference coordinates",
-        __FILE__, __LINE__, __func__);
+            " reference coordinates");
   }
 
   // Inactive trailing components must be zero for lower-dimensional elements;
@@ -227,26 +223,23 @@ void copy_basis_values_to_solver_arrays(consts::ElementType eType,
                                         Array3<double>& Nx)
 {
   if (values.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis value count " + std::to_string(values.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
   if (gradients.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis gradient count " + std::to_string(gradients.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   for (int a = 0; a < eNoN; ++a) {
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= values.size() || basis_index >= gradients.size()) {
-      throw febasis::BasisNodeOrderingException(
+      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis node " +
               std::to_string(basis_index) + " outside basis output for " +
-              solver_element_name(eType),
-          __FILE__, __LINE__, __func__);
+              solver_element_name(eType));
     }
 
     N(a, g) = values[basis_index];
@@ -271,10 +264,9 @@ void evaluate_basis_values_and_gradients(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
-        __FILE__, __LINE__, __func__);
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const auto point = make_basis_point(basis, g, xi);
@@ -309,9 +301,8 @@ int required_nxx_components_for_dimension(const int dimension)
     case 3:
       return 6;
     default:
-      throw febasis::BasisConfigurationException(
-          "Unsupported FE Basis reference dimension " + std::to_string(dimension),
-          __FILE__, __LINE__, __func__);
+      fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
+          "Unsupported FE Basis reference dimension " + std::to_string(dimension));
   }
 }
 
@@ -323,18 +314,16 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
                                        Array3<double>& Nxx)
 {
   if (hessians.size() != static_cast<std::size_t>(eNoN)) {
-    throw febasis::BasisEvaluationException(
+    fe::raise<febasis::BasisEvaluationException>(SVMP_HERE,
         "FE Basis Hessian count " + std::to_string(hessians.size()) +
-            " does not match solver eNoN " + std::to_string(eNoN),
-        __FILE__, __LINE__, __func__);
+            " does not match solver eNoN " + std::to_string(eNoN));
   }
 
   const int required_components = required_nxx_components_for_dimension(dimension);
   if (Nxx.nrows() < required_components) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver Nxx has " + std::to_string(Nxx.nrows()) +
-            " rows but FE Basis Hessian packing requires " + std::to_string(required_components),
-        __FILE__, __LINE__, __func__);
+            " rows but FE Basis Hessian packing requires " + std::to_string(required_components));
   }
 
   for (int a = 0; a < eNoN; ++a) {
@@ -344,11 +333,10 @@ void copy_basis_hessians_to_solver_nxx(consts::ElementType eType,
 
     const auto basis_index = basis_index_for_solver_node(eType, a);
     if (basis_index >= hessians.size()) {
-      throw febasis::BasisNodeOrderingException(
+      fe::raise<febasis::BasisNodeOrderingException>(SVMP_HERE,
           "Solver node " + std::to_string(a) + " maps to FE Basis Hessian node " +
               std::to_string(basis_index) + " outside basis output for " +
-              solver_element_name(eType),
-          __FILE__, __LINE__, __func__);
+              solver_element_name(eType));
     }
 
     const auto& hessian = hessians[basis_index];
@@ -376,18 +364,16 @@ void evaluate_basis_hessians(const int insd,
 {
   const auto& basis = basis_for_solver_element(eType);
   if (insd < basis.dimension()) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver insd " + std::to_string(insd) +
-            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()),
-        __FILE__, __LINE__, __func__);
+            " is smaller than FE Basis reference dimension " + std::to_string(basis.dimension()));
   }
 
   const int required_components = required_nxx_components_for_dimension(basis.dimension());
   if (ind2 < required_components) {
-    throw febasis::BasisConfigurationException(
+    fe::raise<febasis::BasisConfigurationException>(SVMP_HERE,
         "solver ind2 " + std::to_string(ind2) +
-            " is smaller than packed Hessian component count " + std::to_string(required_components),
-        __FILE__, __LINE__, __func__);
+            " is smaller than packed Hessian component count " + std::to_string(required_components));
   }
 
   const auto point = make_basis_point(basis, gaus_pt, xi);
@@ -415,9 +401,9 @@ void get_gip(const int insd, consts::ElementType eType, const int nG, Vector<dou
   try {
     get_element_gauss_int_data[eType](insd, nG, w, xi);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'get_element_gauss_int_data'",
-        solver_element_name(eType), __FILE__, __LINE__, __func__);
+        solver_element_name(eType));
   }
 }
 
@@ -430,9 +416,9 @@ void get_gip(mshType& mesh)
   try {
     set_element_gauss_int_data[mesh.eType](mesh);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_element_gauss_int_data'",
-        solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(mesh.eType));
   }
 }
 
@@ -441,9 +427,9 @@ void get_gip(Simulation* simulation, faceType& face)
   try {
     set_face_gauss_int_data[face.eType](face);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support in 'set_face_gauss_int_data'",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(face.eType));
   }
 }
 
@@ -453,9 +439,8 @@ void get_gnn(const int insd, consts::ElementType eType, const int eNoN, const in
     Array<double>& N, Array3<double>& Nx)
 {
   if (!use_basis_adapter_for(eType)) {
-    throw febasis::BasisElementCompatibilityException(
-        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+        "[get_gnn] FE Basis does not support solver element " + solver_element_name(eType));
   }
 
   evaluate_basis_values_and_gradients(insd, eType, eNoN, g, xi, N, Nx);
@@ -488,11 +473,8 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
 {
   using consts::ElementType;
 
-  if (face.eType == ElementType::NRB) {
-    throw fe::NotImplementedException(
-        "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis",
-        __FILE__, __LINE__, __func__);
-  }
+  fe::throw_if<fe::NotImplementedException>(face.eType == ElementType::NRB, SVMP_HERE,
+      "[get_gnn(face)] NRB face shape functions are unsupported by FE Basis");
 
   if (face.eType == ElementType::PNT) {
     set_point_face_shape_data(gaus_pt, face);
@@ -505,9 +487,8 @@ void get_gnn(Simulation* simulation, int gaus_pt, faceType& face)
     return;
   }
 
-  throw febasis::BasisElementCompatibilityException(
-      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType),
-      __FILE__, __LINE__, __func__);
+  fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
+      "[get_gnn(face)] FE Basis does not support face element " + solver_element_name(face.eType));
 }
 
 /// @brief Returns second order derivatives at given natural coords.
@@ -523,10 +504,9 @@ void get_gn_nxx(const int insd, const int ind2, consts::ElementType eType, const
   }
 
   if (!use_basis_adapter_for(eType)) {
-    throw febasis::BasisElementCompatibilityException(
+    fe::raise<febasis::BasisElementCompatibilityException>(SVMP_HERE,
         "[get_gn_nxx] FE Basis Hessian evaluation does not support solver element " +
-            solver_element_name(eType),
-        __FILE__, __LINE__, __func__);
+            solver_element_name(eType));
   }
 
   evaluate_basis_hessians(insd, ind2, eType, eNoN, gaus_pt, xi, Nxx);
@@ -713,11 +693,8 @@ void get_nnx(const int nsd, const consts::ElementType eType, const int eNoN, con
 
   l1 = (l1 && l2 && l3 && l4);
 
-  if (!l1) {
-    throw fe::InvalidArgumentException(
-        "Error in computing shape functions",
-        __FILE__, __LINE__, __func__);
-  }
+  fe::throw_if<fe::InvalidArgumentException>(!l1, SVMP_HERE,
+      "Error in computing shape functions");
 }
 
 /// @brief Inverse maps {xp} to {$\xi$} in an element with coordinates {xl} using Newton's method
@@ -965,11 +942,10 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
     }
 
     if (!found_node) {
-      throw fe::InvalidArgumentException(
+      fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
           "[svMultiPhysics::gnnb] ERROR: The '" + lFa.name + "' face node " +
               std::to_string(Ac) + " could not be matched to a node in the '" +
-              msh.name + "' volume mesh.",
-          __FILE__, __LINE__, __func__);
+              msh.name + "' volume mesh.");
     }
 
     ptr(a) = b;
@@ -1018,9 +994,8 @@ void gnnb(const ComMod& com_mod, const faceType& lFa, const int e, const int g,
           }
           break;
         default:
-          throw fe::InvalidArgumentException(
-              "gnnb: invalid MechanicalConfigurationType provided",
-              __FILE__, __LINE__, __func__);
+          fe::raise<fe::InvalidArgumentException>(SVMP_HERE,
+              "gnnb: invalid MechanicalConfigurationType provided");
       }
     }
   }
@@ -1208,10 +1183,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    if (INFO != 0) {
-      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
-          __FILE__, __LINE__, __func__);
-    }
+    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
 
@@ -1280,10 +1253,8 @@ void gn_nxx(const int l, const int eNoN, const int nsd, const int insd, Array<do
 
     dgesv_(&l, &eNoN, K.data(), &l, IPIV.data(), B.data(), &l, &INFO);
 
-    if (INFO != 0) {
-      throw fe::BackendException("[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO,
-          __FILE__, __LINE__, __func__);
-    }
+    fe::throw_if<fe::BackendException>(INFO != 0, SVMP_HERE,
+        "[gn_nxx] Error in Lapack", "LAPACK dgesv", INFO);
 
     Nxx = B;
   }
@@ -1330,10 +1301,10 @@ void select_ele(const ComMod& com_mod, mshType& mesh)
       set_1d_element_props[mesh.eNoN](insd, mesh);
     }
   } catch (const std::bad_function_call& exception) {
-      throw fe::InvalidElementException(
+      fe::raise<fe::InvalidElementException>(SVMP_HERE,
           "[select_ele] No support for " + std::to_string(mesh.eNoN) +
               " noded " + std::to_string(insd) + "D elements.",
-          solver_element_name(mesh.eType), __FILE__, __LINE__, __func__);
+          solver_element_name(mesh.eType));
   }
 
   // Set mesh 'w' and 'xi' arrays used for Gauss integration.
@@ -1389,10 +1360,10 @@ void select_eleb(Simulation* simulation, mshType& mesh, faceType& face)
   try {
     set_face_element_props[face.eNoN](insd, face);
   } catch (const std::bad_function_call& exception) {
-    throw fe::InvalidElementException(
+    fe::raise<fe::InvalidElementException>(SVMP_HERE,
         "No support for " + std::to_string(face.eNoN) + " noded " +
             std::to_string(insd) + "D elements in 'set_face_element_props'.",
-        solver_element_name(face.eType), __FILE__, __LINE__, __func__);
+        solver_element_name(face.eType));
   }
 
   // Set face 'w' and 'xi' arrays used for Gauss integration.

From 9d6266b0a268569fca104b82d38dcf1b0230e4f2 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Thu, 11 Jun 2026 09:41:23 -0700
Subject: [PATCH 19/22] improving doxygen documentation for the basis topic

---
 Code/Source/solver/FE/Basis/BasisFunction.h | 118 ++++++++++++++++++--
 Code/Source/solver/FE/FE.h                  |  22 ++++
 2 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100644 Code/Source/solver/FE/FE.h

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 832926199..9b8e29aaa 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -12,16 +12,115 @@
 #include <cstddef>
 #include <vector>
 
-/// \defgroup FE FE Library
-/// \brief Finite-element interfaces and utilities used by the solver.
-///
-/// The FE library groups basis functions, math utilities, assembly interfaces,
-/// and related support code that can be built and consumed as a coherent
-/// finite-element component.
-
 /// \defgroup FE_Basis Basis
 /// \ingroup FE
 /// \brief Basis-function interfaces, concrete basis families, and reference-node conventions.
+///
+/// \details
+/// ## Scope
+///
+/// The Basis module owns reference-element shape functions. It provides the
+/// number of basis functions and, at reference points, their values
+/// \f$N_i\f$, first derivatives \f$\partial N_i / \partial \xi_j\f$, and
+/// second derivatives \f$\partial^2 N_i / \partial \xi_j \partial \xi_k\f$.
+/// It does not own mesh storage, quadrature selection, field
+/// formulation policy, or transformation of derivatives to physical
+/// coordinates. Those decisions stay with the solver layer that has the mesh,
+/// material model, and equation context.
+///
+/// The main pieces are:
+/// - BasisFunction (BasisFunction.h): the abstract query and evaluation
+///   contract for code that does not need to know the concrete family.
+/// - \ref FE_LagrangeBasis "LagrangeBasis" and
+///   \ref FE_SerendipityBasis "SerendipityBasis": the implemented nodal
+///   families, including analytical first and second derivatives in reference
+///   coordinates.
+/// - basis_factory (BasisFactory.h): runtime construction from a BasisRequest.
+///   basis_factory::default_basis_request() centralizes the family/order that
+///   matches each supported element's public node layout.
+/// - ReferenceNodeLayout (NodeOrderingConventions.h): canonical reference-node
+///   coordinates and the output ordering used by every basis evaluator.
+/// - BasisTraits.h and BasisExceptions.h: topology classification,
+///   compile-time helpers, and module-specific exception types.
+///
+/// ## Object and evaluation contract
+///
+/// A basis object is immutable after construction. It represents one reference
+/// topology, basis family, and effective polynomial order, and can be shared
+/// safely across evaluations. Construction may build node lattices or invert
+/// interpolation matrices, so callers should construct through basis_factory
+/// and cache one instance for each distinct basis request instead of rebuilding
+/// inside element loops.
+///
+/// Every evaluator takes a three-component reference coordinate. For
+/// lower-dimensional elements, only the first dimension() components are
+/// active. Returned gradients always have three components and Hessians are
+/// always 3-by-3 matrices; inactive reference directions are expected to be
+/// zero for conforming lower-dimensional bases. The std::vector overloads are
+/// convenient for setup, tests, and adapter code. The *_to overloads write to
+/// caller-owned flat buffers and are the allocation-free path for assembly.
+///
+/// Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh's
+/// or solver's native node order. A caller that stores elements in another
+/// local ordering must apply the appropriate permutation at the boundary
+/// between the basis module and that storage format.
+///
+/// ## Inputs and ownership
+///
+/// Constructing and evaluating a basis combines several independent choices:
+///
+/// - **Element topology comes from the mesh.** The mesh cell type is translated
+///   to ElementType, which defines the reference topology and public node
+///   layout. This is structural information, not a complete discretization
+///   policy.
+/// - **Geometry interpolation follows the mesh nodes.** The basis used for the
+///   reference-to-physical map must be compatible with the element's node
+///   count and ordering. For that case, callers normally use
+///   basis_factory::create_default_for(element_type), which selects the
+///   Lagrange or serendipity space associated with that element layout. A
+///   Tetra10 mesh therefore implies a quadratic geometry map; a Hex20 mesh
+///   implies the supported Hex20 serendipity geometry basis.
+/// - **Field approximation is chosen by the formulation.** Field bases do not
+///   have to match the geometry map. Mixed formulations, stabilized methods,
+///   enrichment, and convergence studies may use different families or orders
+///   for different fields on the same mesh topology. Those bases should be
+///   requested explicitly with basis_factory::create() and a BasisRequest
+///   naming the desired family and order.
+/// - **Evaluation points come from the caller.** Quadrature rules, probe
+///   points, interpolation targets, and error-sampling locations are outside
+///   this module. The basis only evaluates at the reference coordinates it is
+///   given.
+///
+/// \dot "Basis inputs and responsibilities"
+/// digraph fe_basis_information_flow {
+///   rankdir=LR;
+///   node [shape=box, fontname=Helvetica, fontsize=10];
+///   mesh     [label="Mesh element type"];
+///   request  [label="BasisRequest\nfamily + order"];
+///   topology [label="Reference topology\nand node layout"];
+///   basis    [label="Basis object", style=filled, fillcolor=lightgray];
+///   points   [label="Reference points"];
+///   outputs  [label="Reference values\nand derivatives"];
+///   mesh -> topology;
+///   request -> basis;
+///   topology -> basis;
+///   basis -> outputs;
+///   points -> outputs;
+/// }
+/// \enddot
+///
+/// ## Reference scope and the solver adapter
+///
+/// The solver-facing adapter in nn.cpp is the boundary between this reference
+/// basis contract and legacy solver storage. It translates solver element
+/// enums to ElementType, obtains cached default bases for mesh/face shape
+/// tables, permutes from ReferenceNodeLayout order into solver node order, and
+/// stores N, Nx, and, where needed, packed Nxx at Gauss points. At that stage
+/// Nx and Nxx are still derivatives with respect to reference coordinates.
+/// Physical-coordinate derivatives are formed later, for a particular
+/// configuration and element geometry, by composing the cached reference data
+/// with the mapping Jacobian (nn::gnn for first derivatives and nn::gn_nxx for
+/// second derivatives).
 
 namespace svmp {
 namespace FE {
@@ -105,7 +204,10 @@ inline void add_scaled_hessian(Hessian& target,
 /// BasisFunction defines the common query and evaluation API used by solver
 /// code that does not need to know the concrete basis implementation. Derived
 /// classes provide values at minimum and can override analytical gradients,
-/// Hessians, combined evaluation, and flat-buffer output paths.
+/// Hessians, combined evaluation, and flat-buffer output paths. The interface
+/// is deliberately limited to reference-space quantities; callers own node
+/// ordering translation, physical mapping, and any field-level discretization
+/// policy.
 class BasisFunction {
 public:
     /// \brief Destroy a basis function through the abstract interface.
diff --git a/Code/Source/solver/FE/FE.h b/Code/Source/solver/FE/FE.h
new file mode 100644
index 000000000..1d3bba72b
--- /dev/null
+++ b/Code/Source/solver/FE/FE.h
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: Copyright (c) Stanford University, The Regents of the University of California, and others.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef SVMP_FE_FE_H
+#define SVMP_FE_FE_H
+
+/// \file FE.h
+/// \brief Library-level Doxygen group for the finite-element support code.
+///
+/// This header intentionally contains no declarations. It gives Doxygen a
+/// header-based home for the top-level FE group; submodule groups attach to it
+/// from their own headers, including FE_Basis (Basis/BasisFunction.h),
+/// FE_Common (Common/Types.h), and FE_Math (Math/Vector.h).
+
+/// \defgroup FE FE Library
+/// \brief Finite-element interfaces and utilities used by the solver.
+///
+/// The FE library groups basis functions, math utilities, assembly interfaces,
+/// and related support code that can be built and consumed as a coherent
+/// finite-element component.
+
+#endif // SVMP_FE_FE_H

From bd7c2ad86d687606319d768d9abd4ab85c997d63 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 10:10:14 -0700
Subject: [PATCH 20/22] removing chrono guard from Eigen

---
 .../eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor         | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
index 45b176fe7..0938bb554 100644
--- a/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/Code/ThirdParty/eigen/include/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -34,9 +34,7 @@
   */
 
 #include <atomic>
-#ifdef EIGEN_USE_GPU
 #include <chrono>
-#endif
 #include <cmath>
 #include <cstddef>
 #include <cstring>

From 282626996ec2332a78c3789207e8ba7fccb67d6d Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 11:06:34 -0700
Subject: [PATCH 21/22] reverting chrono replacement code changes

---
 Code/Source/solver/Timer.h      | 8 +++-----
 Code/Source/solver/load_msh.cpp | 1 +
 Code/Source/solver/utils.cpp    | 8 +++-----
 tests/unitTests/test_common.h   | 1 +
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/Code/Source/solver/Timer.h b/Code/Source/solver/Timer.h
index b8ffa29df..1a55d7516 100644
--- a/Code/Source/solver/Timer.h
+++ b/Code/Source/solver/Timer.h
@@ -4,7 +4,7 @@
 #ifndef TIMER_H 
 #define TIMER_H 
 
-#include <sys/time.h>
+#include <chrono>
 
 /// @brief Keep track of time
 class Timer 
@@ -18,10 +18,8 @@ class Timer
 
     double get_time() const
     {
-      timeval now{};
-      gettimeofday(&now, nullptr);
-      return static_cast<double>(now.tv_sec) +
-             static_cast<double>(now.tv_usec) * 1.0e-6;
+      const auto now = std::chrono::steady_clock::now();
+      return std::chrono::duration<double>(now.time_since_epoch()).count();
     }
 
     void set_time()
diff --git a/Code/Source/solver/load_msh.cpp b/Code/Source/solver/load_msh.cpp
index 50d0ca858..05648b52d 100644
--- a/Code/Source/solver/load_msh.cpp
+++ b/Code/Source/solver/load_msh.cpp
@@ -13,6 +13,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include <chrono>
 #include <unordered_map>
 #include <string>
 #include <iomanip>
diff --git a/Code/Source/solver/utils.cpp b/Code/Source/solver/utils.cpp
index 0fb062e8f..fb7874f95 100644
--- a/Code/Source/solver/utils.cpp
+++ b/Code/Source/solver/utils.cpp
@@ -4,6 +4,7 @@
 #include "utils.h"
 
 #include <bitset>
+#include <chrono>
 #include <cmath> 
 #include <limits>
 
@@ -12,7 +13,6 @@
 #include <iostream>
 #include <fstream>
 #include <sys/resource.h>
-#include <sys/time.h>
 
 #include "FE/Common/FEException.h"
 
@@ -37,10 +37,8 @@ int CountBits(int n)
 
 double cput()
 {
-  timeval now{};
-  gettimeofday(&now, nullptr);
-  return static_cast<double>(now.tv_sec) +
-         static_cast<double>(now.tv_usec) * 1.0e-6;
+  const auto now = std::chrono::system_clock::now();
+  return std::chrono::duration<double>(now.time_since_epoch()).count();
 }
 
 Vector<double> 
diff --git a/tests/unitTests/test_common.h b/tests/unitTests/test_common.h
index 7227b2beb..ce6ffed4b 100644
--- a/tests/unitTests/test_common.h
+++ b/tests/unitTests/test_common.h
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <iostream>
 #include <random>
+#include <chrono>
 #include "CepMod.h"
 #include "ComMod.h"
 #include "gtest/gtest.h"

From f734094a5d5a9f175904866f2c49e05fcde01f48 Mon Sep 17 00:00:00 2001
From: Zachary Sexton <zsexton@stanford.edu>
Date: Mon, 15 Jun 2026 15:18:46 -0700
Subject: [PATCH 22/22] swapping out raw pointers for span support in the
 non-owning buffer access

---
 Code/Source/solver/FE/Basis/BasisFunction.cpp |  27 ++-
 Code/Source/solver/FE/Basis/BasisFunction.h   |  63 ++----
 Code/Source/solver/FE/Basis/LagrangeBasis.cpp | 174 +++++++++-------
 Code/Source/solver/FE/Basis/LagrangeBasis.h   |  67 +++---
 .../solver/FE/Basis/SerendipityBasis.cpp      | 194 +++++++++---------
 .../Source/solver/FE/Basis/SerendipityBasis.h |  27 +--
 Code/Source/solver/FE/Common/Types.h          |  11 -
 .../solver/FE/Math/DenseTransformKernels.h    |  27 ++-
 .../FE/Basis/test_BasisErrorPaths.cpp         |  20 +-
 .../unitTests/FE/Basis/test_LagrangeBasis.cpp |  56 ++---
 10 files changed, 332 insertions(+), 334 deletions(-)

diff --git a/Code/Source/solver/FE/Basis/BasisFunction.cpp b/Code/Source/solver/FE/Basis/BasisFunction.cpp
index d847a9cca..1c8c31e5d 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.cpp
+++ b/Code/Source/solver/FE/Basis/BasisFunction.cpp
@@ -4,6 +4,7 @@
 #include "BasisFunction.h"
 
 #include <algorithm>
+#include <string>
 
 namespace svmp {
 namespace FE {
@@ -22,6 +23,13 @@ BasisFunctionScratch& scratch() {
     return data;
 }
 
+void require_span_size(std::size_t actual, std::size_t expected, const char* label) {
+    // Return before formatting the message: the *_to overloads must not allocate on success.
+    if (actual >= expected) { return; }
+    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+        std::string("BasisFunction::") + label + ": output span is smaller than basis size");
+}
+
 } // namespace
 
 void BasisFunction::evaluate_gradients(const math::Vector<Real, 3>& xi,
@@ -50,31 +58,30 @@ void BasisFunction::evaluate_all(const math::Vector<Real, 3>& xi,
 }
 
 void BasisFunction::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
+                                       std::span<Real> values_out) const {
+    require_span_size(values_out.size(), size(), "evaluate_values_to");
     auto& tmp = scratch().values;
     tmp.resize(size());
     evaluate_values(xi, tmp);
-    std::copy_n(tmp.data(), tmp.size(), values_out);
+    std::copy_n(tmp.begin(), tmp.size(), values_out.begin());
 }
 
 void BasisFunction::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
+                                          std::span<Gradient> gradients_out) const {
+    require_span_size(gradients_out.size(), size(), "evaluate_gradients_to");
     auto& tmp = scratch().gradients;
     tmp.resize(size());
     evaluate_gradients(xi, tmp);
-    for (std::size_t i = 0; i < tmp.size(); ++i) {
-        store_gradient(tmp[i], gradients_out + i * 3u);
-    }
+    std::copy_n(tmp.begin(), tmp.size(), gradients_out.begin());
 }
 
 void BasisFunction::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                         Real* SVMP_RESTRICT hessians_out) const {
+                                         std::span<Hessian> hessians_out) const {
+    require_span_size(hessians_out.size(), size(), "evaluate_hessians_to");
     auto& tmp = scratch().hessians;
     tmp.resize(size());
     evaluate_hessians(xi, tmp);
-    for (std::size_t i = 0; i < tmp.size(); ++i) {
-        store_hessian(tmp[i], hessians_out + i * 9u);
-    }
+    std::copy_n(tmp.begin(), tmp.size(), hessians_out.begin());
 }
 
 void BasisFunction::numerical_gradient(const math::Vector<Real, 3>& xi,
diff --git a/Code/Source/solver/FE/Basis/BasisFunction.h b/Code/Source/solver/FE/Basis/BasisFunction.h
index 9b8e29aaa..8327ffda9 100644
--- a/Code/Source/solver/FE/Basis/BasisFunction.h
+++ b/Code/Source/solver/FE/Basis/BasisFunction.h
@@ -10,6 +10,7 @@
 #include "Types.h"
 
 #include <cstddef>
+#include <span>
 #include <vector>
 
 /// \defgroup FE_Basis Basis
@@ -58,7 +59,7 @@
 /// always 3-by-3 matrices; inactive reference directions are expected to be
 /// zero for conforming lower-dimensional bases. The std::vector overloads are
 /// convenient for setup, tests, and adapter code. The *_to overloads write to
-/// caller-owned flat buffers and are the allocation-free path for assembly.
+/// caller-owned spans and are the allocation-free path for assembly.
 ///
 /// Outputs are in ReferenceNodeLayout basis order, not necessarily the mesh or
 /// solver's native node order. A caller that stores elements in another local
@@ -148,46 +149,6 @@ using Hessian  = math::Matrix<Real, 3, 3>;
     return hessian;
 }
 
-inline void store_gradient(const Gradient& gradient, Real* dst) noexcept {
-    dst[0] = gradient[0];
-    dst[1] = gradient[1];
-    dst[2] = gradient[2];
-}
-
-[[nodiscard]] inline Gradient load_gradient(const Real* src) noexcept {
-    Gradient gradient;
-    gradient[0] = src[0];
-    gradient[1] = src[1];
-    gradient[2] = src[2];
-    return gradient;
-}
-
-inline void store_hessian(const Hessian& hessian, Real* dst) noexcept {
-    dst[0] = hessian(0, 0);
-    dst[1] = hessian(0, 1);
-    dst[2] = hessian(0, 2);
-    dst[3] = hessian(1, 0);
-    dst[4] = hessian(1, 1);
-    dst[5] = hessian(1, 2);
-    dst[6] = hessian(2, 0);
-    dst[7] = hessian(2, 1);
-    dst[8] = hessian(2, 2);
-}
-
-[[nodiscard]] inline Hessian load_hessian(const Real* src) noexcept {
-    Hessian hessian = Hessian::Zero();
-    hessian(0, 0) = src[0];
-    hessian(0, 1) = src[1];
-    hessian(0, 2) = src[2];
-    hessian(1, 0) = src[3];
-    hessian(1, 1) = src[4];
-    hessian(1, 2) = src[5];
-    hessian(2, 0) = src[6];
-    hessian(2, 1) = src[7];
-    hessian(2, 2) = src[8];
-    return hessian;
-}
-
 inline void add_scaled_hessian(Hessian& target,
                                const Hessian& source,
                                Real scale) noexcept {
@@ -204,7 +165,7 @@ inline void add_scaled_hessian(Hessian& target,
 /// BasisFunction defines the common query and evaluation API used by solver
 /// code that does not need to know the concrete basis implementation. Derived
 /// classes provide values at minimum and can override analytical gradients,
-/// Hessians, combined evaluation, and flat-buffer output paths. The interface
+/// Hessians, combined evaluation, and span output paths. The interface
 /// is deliberately limited to reference-space quantities; callers own node
 /// ordering translation, physical mapping, and any field-level discretization
 /// policy.
@@ -263,23 +224,23 @@ class BasisFunction {
                               std::vector<Gradient>& gradients,
                               std::vector<Hessian>& hessians) const;
 
-    /// \brief Evaluate basis values into a flat caller-provided buffer.
+    /// \brief Evaluate basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries.
     virtual void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out) const;
+                                    std::span<Real> values_out) const;
 
-    /// \brief Evaluate basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() entries.
     virtual void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT gradients_out) const;
+                                       std::span<Gradient> gradients_out) const;
 
-    /// \brief Evaluate basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() entries.
     virtual void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                      Real* SVMP_RESTRICT hessians_out) const;
+                                      std::span<Hessian> hessians_out) const;
 
 protected:
     /// \brief Approximate gradients by centered finite differences of values.
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
index 4ec970b86..ab5e73ac7 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.cpp
@@ -8,6 +8,8 @@
 #include <array>
 #include <cmath>
 #include <limits>
+#include <span>
+#include <string>
 
 namespace svmp {
 namespace FE {
@@ -284,6 +286,22 @@ void evaluate_simplex(const Vec3& xi,
     }
 }
 
+void require_output_span_size(std::size_t actual, std::size_t expected, const char* label) {
+    // Return before formatting the message: building the std::string would allocate per call.
+    if (actual >= expected) { return; }
+    FE::throw_if<BasisEvaluationException>(actual < expected, SVMP_HERE,
+        std::string(label) + ": output span is smaller than basis size");
+}
+
+template<typename T>
+void require_requested_span_size(std::span<T> output,
+                                 std::size_t expected,
+                                 const char* label) {
+    if (!output.empty()) {
+        require_output_span_size(output.size(), expected, label);
+    }
+}
+
 } // namespace
 
 LagrangeBasis::LagrangeBasis(ElementType type, int order)
@@ -398,25 +416,25 @@ void LagrangeBasis::build_wedge_nodes() {
 }
 
 // Evaluate the constant point basis.
-void LagrangeBasis::evaluate_point_to(Real* SVMP_RESTRICT values_out,
-                                      Real* SVMP_RESTRICT gradients_out,
-                                      Real* SVMP_RESTRICT hessians_out) const {
-    if (values_out) {
+void LagrangeBasis::evaluate_point_to(std::span<Real> values_out,
+                                      std::span<Gradient> gradients_out,
+                                      std::span<Hessian> hessians_out) const {
+    if (!values_out.empty()) {
         values_out[0] = Real(1);
     }
-    if (gradients_out) {
-        gradients_out[0] = gradients_out[1] = gradients_out[2] = Real(0);
+    if (!gradients_out.empty()) {
+        gradients_out[0] = Gradient::Zero();
     }
-    if (hessians_out) {
-        std::fill_n(hessians_out, 9u, Real(0));
+    if (!hessians_out.empty()) {
+        hessians_out[0] = Hessian::Zero();
     }
 }
 
 // Evaluate line, quadrilateral, and hexahedron bases as axis-polynomial products.
 void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
-                                               Real* SVMP_RESTRICT values_out,
-                                               Real* SVMP_RESTRICT gradients_out,
-                                               Real* SVMP_RESTRICT hessians_out) const {
+                                               std::span<Real> values_out,
+                                               std::span<Gradient> gradients_out,
+                                               std::span<Hessian> hessians_out) const {
     AxisEval ax;
     AxisEval ay;
     AxisEval az;
@@ -440,55 +458,55 @@ void LagrangeBasis::evaluate_tensor_product_to(const Vec3& xi,
         const Real dz = dimension_ >= 3 ? az.first[idx[2]] : Real(0);
         const Real d2z = dimension_ >= 3 ? az.second[idx[2]] : Real(0);
 
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[node] = vx * vy * vz;
         }
-        if (gradients_out) {
-            Real* g = gradients_out + node * 3u;
+        if (!gradients_out.empty()) {
+            Gradient& g = gradients_out[node];
             g[0] = dx * vy * vz;
             g[1] = vx * dy * vz;
             g[2] = vx * vy * dz;
         }
-        if (hessians_out) {
-            Real* h = hessians_out + node * 9u;
-            h[0] = d2x * vy * vz;
-            h[1] = dx * dy * vz;
-            h[2] = dx * vy * dz;
-            h[3] = h[1];
-            h[4] = vx * d2y * vz;
-            h[5] = vx * dy * dz;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = vx * vy * d2z;
+        if (!hessians_out.empty()) {
+            Hessian& h = hessians_out[node];
+            h(0, 0) = d2x * vy * vz;
+            h(0, 1) = dx * dy * vz;
+            h(0, 2) = dx * vy * dz;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = vx * d2y * vz;
+            h(1, 2) = vx * dy * dz;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = vx * vy * d2z;
         }
     }
 }
 
 // Evaluate triangle and tetrahedron bases from barycentric factors.
 void LagrangeBasis::evaluate_simplex_to(const Vec3& xi,
-                                        Real* SVMP_RESTRICT values_out,
-                                        Real* SVMP_RESTRICT gradients_out,
-                                        Real* SVMP_RESTRICT hessians_out) const {
+                                        std::span<Real> values_out,
+                                        std::span<Gradient> gradients_out,
+                                        std::span<Hessian> hessians_out) const {
     SimplexEval simplex;
     evaluate_simplex(xi, topology_, order_, simplex_exponents_, simplex);
     for (std::size_t i = 0; i < simplex.value.size(); ++i) {
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[i] = simplex.value[i];
         }
-        if (gradients_out) {
-            store_gradient(simplex.gradient[i], gradients_out + i * 3u);
+        if (!gradients_out.empty()) {
+            gradients_out[i] = simplex.gradient[i];
         }
-        if (hessians_out) {
-            store_hessian(simplex.hessian[i], hessians_out + i * 9u);
+        if (!hessians_out.empty()) {
+            hessians_out[i] = simplex.hessian[i];
         }
     }
 }
 
 // Evaluate wedge bases as triangle/through-axis products.
 void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
-                                      Real* SVMP_RESTRICT values_out,
-                                      Real* SVMP_RESTRICT gradients_out,
-                                      Real* SVMP_RESTRICT hessians_out) const {
+                                      std::span<Real> values_out,
+                                      std::span<Gradient> gradients_out,
+                                      std::span<Hessian> hessians_out) const {
     SimplexEval tri;
     AxisEval z_axis;
     evaluate_simplex(xi, BasisTopology::Triangle, order_, simplex_exponents_, tri);
@@ -501,37 +519,45 @@ void LagrangeBasis::evaluate_wedge_to(const Vec3& xi,
         const Real dz = z_axis.first[z_idx];
         const Real d2z = z_axis.second[z_idx];
 
-        if (values_out) {
+        if (!values_out.empty()) {
             values_out[node] = tv * zv;
         }
-        if (gradients_out) {
-            Real* g = gradients_out + node * 3u;
+        if (!gradients_out.empty()) {
+            Gradient& g = gradients_out[node];
             g[0] = tri.gradient[tri_idx][0] * zv;
             g[1] = tri.gradient[tri_idx][1] * zv;
             g[2] = tv * dz;
         }
-        if (hessians_out) {
-            Real* h = hessians_out + node * 9u;
+        if (!hessians_out.empty()) {
+            Hessian& h = hessians_out[node];
             const Hessian& th = tri.hessian[tri_idx];
             const Gradient& tg = tri.gradient[tri_idx];
-            h[0] = th(0, 0) * zv;
-            h[1] = th(0, 1) * zv;
-            h[2] = tg[0] * dz;
-            h[3] = h[1];
-            h[4] = th(1, 1) * zv;
-            h[5] = tg[1] * dz;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = tv * d2z;
+            h(0, 0) = th(0, 0) * zv;
+            h(0, 1) = th(0, 1) * zv;
+            h(0, 2) = tg[0] * dz;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = th(1, 1) * zv;
+            h(1, 2) = tg[1] * dz;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = tv * d2z;
         }
     }
 }
 
-// Evaluate requested basis quantities into caller-provided flat buffers.
+// Evaluate requested basis quantities into caller-provided spans.
 void LagrangeBasis::evaluate_all_to(const Vec3& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const {
+                                    std::span<Real> values_out,
+                                    std::span<Gradient> gradients_out,
+                                    std::span<Hessian> hessians_out) const {
+    require_requested_span_size(values_out, size(), "LagrangeBasis::evaluate_all_to values");
+    require_requested_span_size(gradients_out, size(), "LagrangeBasis::evaluate_all_to gradients");
+    require_requested_span_size(hessians_out, size(), "LagrangeBasis::evaluate_all_to hessians");
+
+    if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
+        return;
+    }
+
     switch (topology_) {
         case BasisTopology::Point:
             evaluate_point_to(values_out, gradients_out, hessians_out);
@@ -559,27 +585,19 @@ void LagrangeBasis::evaluate_all_to(const Vec3& xi,
 void LagrangeBasis::evaluate_values(const Vec3& xi,
                                     std::vector<Real>& values) const {
     values.resize(size());
-    evaluate_values_to(xi, values.data());
+    evaluate_values_to(xi, std::span<Real>(values.data(), values.size()));
 }
 
 void LagrangeBasis::evaluate_gradients(const Vec3& xi,
                                        std::vector<Gradient>& gradients) const {
     gradients.resize(size());
-    std::vector<Real> flat(size() * 3u, Real(0));
-    evaluate_gradients_to(xi, flat.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i] = load_gradient(flat.data() + i * 3u);
-    }
+    evaluate_gradients_to(xi, std::span<Gradient>(gradients.data(), gradients.size()));
 }
 
 void LagrangeBasis::evaluate_hessians(const Vec3& xi,
                                       std::vector<Hessian>& hessians) const {
     hessians.resize(size());
-    std::vector<Real> flat(size() * 9u, Real(0));
-    evaluate_hessians_to(xi, flat.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        hessians[i] = load_hessian(flat.data() + i * 9u);
-    }
+    evaluate_hessians_to(xi, std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void LagrangeBasis::evaluate_all(const Vec3& xi,
@@ -589,28 +607,28 @@ void LagrangeBasis::evaluate_all(const Vec3& xi,
     values.resize(size());
     gradients.resize(size());
     hessians.resize(size());
-    std::vector<Real> flat_g(size() * 3u, Real(0));
-    std::vector<Real> flat_h(size() * 9u, Real(0));
-    evaluate_all_to(xi, values.data(), flat_g.data(), flat_h.data());
-    for (std::size_t i = 0; i < size(); ++i) {
-        gradients[i] = load_gradient(flat_g.data() + i * 3u);
-        hessians[i] = load_hessian(flat_h.data() + i * 9u);
-    }
+    evaluate_all_to(xi,
+                    std::span<Real>(values.data(), values.size()),
+                    std::span<Gradient>(gradients.data(), gradients.size()),
+                    std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void LagrangeBasis::evaluate_values_to(const Vec3& xi,
-                                       Real* SVMP_RESTRICT values_out) const {
-    evaluate_all_to(xi, values_out, nullptr, nullptr);
+                                       std::span<Real> values_out) const {
+    require_output_span_size(values_out.size(), size(), "LagrangeBasis::evaluate_values_to");
+    evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_gradients_to(const Vec3& xi,
-                                          Real* SVMP_RESTRICT gradients_out) const {
-    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+                                          std::span<Gradient> gradients_out) const {
+    require_output_span_size(gradients_out.size(), size(), "LagrangeBasis::evaluate_gradients_to");
+    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});
 }
 
 void LagrangeBasis::evaluate_hessians_to(const Vec3& xi,
-                                         Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
+                                         std::span<Hessian> hessians_out) const {
+    require_output_span_size(hessians_out.size(), size(), "LagrangeBasis::evaluate_hessians_to");
+    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/LagrangeBasis.h b/Code/Source/solver/FE/Basis/LagrangeBasis.h
index cd0ca6058..6137a557a 100644
--- a/Code/Source/solver/FE/Basis/LagrangeBasis.h
+++ b/Code/Source/solver/FE/Basis/LagrangeBasis.h
@@ -9,6 +9,7 @@
 
 #include <array>
 #include <cstddef>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -58,8 +59,8 @@ namespace basis {
 /// \f$N_{a k}(r,s,t) = T_a(r,s)l_k(t)\f$.
 ///
 /// The vector-returning evaluators are convenient API wrappers. The `*_to`
-/// methods write to caller-provided flat buffers and are intended for assembly
-/// paths that avoid temporary allocations.
+/// methods write to caller-provided spans and are intended for assembly paths
+/// that avoid temporary allocations.
 class LagrangeBasis : public BasisFunction {
 public:
     /// \brief Axis-index tuple for tensor-product reference nodes.
@@ -171,38 +172,36 @@ class LagrangeBasis : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate Lagrange basis values into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis values into caller-provided storage.
     ///
     /// \details This is the low-allocation API intended for element assembly
-    /// loops. The buffer is filled in basis-node order and no vector resizing
-    /// is performed.
+    /// loops. The span is filled in basis-node order and no vector resizing is
+    /// performed.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            Real* SVMP_RESTRICT values_out) const final;
+                            std::span<Real> values_out) const final;
 
-    /// \brief Evaluate Lagrange basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis gradients into caller-provided storage.
     ///
-    /// \details Gradients are written in node-major order with three
-    /// reference-coordinate components per node. For node \f$i\f$ and component
-    /// \f$c\f$, the entry is `gradients_out[i * 3 + c]`.
+    /// \details Gradients are written in basis-node order with one
+    /// three-component gradient per node.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() entries.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                               Real* SVMP_RESTRICT gradients_out) const final;
+                               std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate Lagrange basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate Lagrange basis Hessians into caller-provided storage.
     ///
-    /// \details Hessians are written in node-major row-major order. For node
-    /// \f$i\f$ and Hessian component \f$(r,c)\f$, the entry is
-    /// `hessians_out[i * 9 + r * 3 + c]`.
+    /// \details Hessians are written in basis-node order with one 3-by-3
+    /// Hessian per node.
     ///
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() entries.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT hessians_out) const final;
+                              std::span<Hessian> hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -224,24 +223,24 @@ class LagrangeBasis : public BasisFunction {
     void init_equispaced_1d_nodes();
 
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         Real* SVMP_RESTRICT values_out,
-                         Real* SVMP_RESTRICT gradients_out,
-                         Real* SVMP_RESTRICT hessians_out) const;
-    void evaluate_point_to(Real* SVMP_RESTRICT values_out,
-                           Real* SVMP_RESTRICT gradients_out,
-                           Real* SVMP_RESTRICT hessians_out) const;
+                         std::span<Real> values_out,
+                         std::span<Gradient> gradients_out,
+                         std::span<Hessian> hessians_out) const;
+    void evaluate_point_to(std::span<Real> values_out,
+                           std::span<Gradient> gradients_out,
+                           std::span<Hessian> hessians_out) const;
     void evaluate_tensor_product_to(const math::Vector<Real, 3>& xi,
-                                    Real* SVMP_RESTRICT values_out,
-                                    Real* SVMP_RESTRICT gradients_out,
-                                    Real* SVMP_RESTRICT hessians_out) const;
+                                    std::span<Real> values_out,
+                                    std::span<Gradient> gradients_out,
+                                    std::span<Hessian> hessians_out) const;
     void evaluate_simplex_to(const math::Vector<Real, 3>& xi,
-                             Real* SVMP_RESTRICT values_out,
-                             Real* SVMP_RESTRICT gradients_out,
-                             Real* SVMP_RESTRICT hessians_out) const;
+                             std::span<Real> values_out,
+                             std::span<Gradient> gradients_out,
+                             std::span<Hessian> hessians_out) const;
     void evaluate_wedge_to(const math::Vector<Real, 3>& xi,
-                           Real* SVMP_RESTRICT values_out,
-                           Real* SVMP_RESTRICT gradients_out,
-                           Real* SVMP_RESTRICT hessians_out) const;
+                           std::span<Real> values_out,
+                           std::span<Gradient> gradients_out,
+                           std::span<Hessian> hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
index fd5f99cbc..ae505c2cf 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.cpp
@@ -21,9 +21,9 @@ using Vec3 = math::Vector<Real, 3>;
 void evaluate_hex8_reference(Real r,
                              Real s,
                              Real t,
-                             Real* values,
-                             Real* gradients,
-                             Real* hessians) {
+                             std::span<Real> values,
+                             std::span<Gradient> gradients,
+                             std::span<Hessian> hessians) {
     static constexpr int signs[8][3] = {
         {-1, -1, -1},
         { 1, -1, -1},
@@ -43,26 +43,26 @@ void evaluate_hex8_reference(Real r,
         const Real bs = Real(1) + b * s;
         const Real ct = Real(1) + c * t;
 
-        if (values) {
+        if (!values.empty()) {
             values[i] = Real(0.125) * ar * bs * ct;
         }
-        if (gradients) {
-            Real* g = gradients + i * 3u;
+        if (!gradients.empty()) {
+            Gradient& g = gradients[i];
             g[0] = Real(0.125) * a * bs * ct;
             g[1] = Real(0.125) * b * ar * ct;
             g[2] = Real(0.125) * c * ar * bs;
         }
-        if (hessians) {
-            Real* h = hessians + i * 9u;
-            h[0] = Real(0);
-            h[1] = Real(0.125) * a * b * ct;
-            h[2] = Real(0.125) * a * c * bs;
-            h[3] = h[1];
-            h[4] = Real(0);
-            h[5] = Real(0.125) * b * c * ar;
-            h[6] = h[2];
-            h[7] = h[5];
-            h[8] = Real(0);
+        if (!hessians.empty()) {
+            Hessian& h = hessians[i];
+            h(0, 0) = Real(0);
+            h(0, 1) = Real(0.125) * a * b * ct;
+            h(0, 2) = Real(0.125) * a * c * bs;
+            h(1, 0) = h(0, 1);
+            h(1, 1) = Real(0);
+            h(1, 2) = Real(0.125) * b * c * ar;
+            h(2, 0) = h(0, 2);
+            h(2, 1) = h(1, 2);
+            h(2, 2) = Real(0);
         }
     }
 }
@@ -262,7 +262,7 @@ inline std::array<Real, 3> quadratic_powers(Real x) {
     return {Real(1), x, x * x};
 }
 
-void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
+void eval_hex20_internal(Real r, Real s, Real t, std::span<Real> internal_vals) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -284,7 +284,7 @@ void eval_hex20_internal(Real r, Real s, Real t, Real* internal_vals) {
     }
 }
 
-void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads) {
+void eval_hex20_grad_internal(Real r, Real s, Real t, std::span<Gradient> internal_grads) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -321,7 +321,7 @@ void eval_hex20_grad_internal(Real r, Real s, Real t, Gradient* internal_grads)
     }
 }
 
-void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians) {
+void eval_hex20_hess_internal(Real r, Real s, Real t, std::span<Hessian> internal_hessians) {
     const auto rp = quadratic_powers(r);
     const auto sp = quadratic_powers(s);
     const auto tp = quadratic_powers(t);
@@ -384,9 +384,9 @@ void eval_hex20_hess_internal(Real r, Real s, Real t, Hessian* internal_hessians
 void eval_wedge15_polynomial(Real r,
                              Real s,
                              Real t,
-                             Real* values,
-                             Gradient* gradients,
-                             Hessian* hessians) {
+                             std::span<Real> values,
+                             std::span<Gradient> gradients,
+                             std::span<Hessian> hessians) {
     Real phi[15]{};
     Real dr[15]{};
     Real ds[15]{};
@@ -415,15 +415,15 @@ void eval_wedge15_polynomial(Real r,
         const Real sb = sp[bs];
         const Real tc = tp[ct];
 
-        if (values) {
+        if (!values.empty()) {
             phi[j] = ra * sb * tc;
         }
-        if (gradients) {
+        if (!gradients.empty()) {
             dr[j] = (a > 0) ? Real(a) * rp[ar - 1u] * sb * tc : Real(0);
             ds[j] = (b > 0) ? ra * Real(b) * sp[bs - 1u] * tc : Real(0);
             dt[j] = (c > 0) ? ra * sb * Real(c) * tp[ct - 1u] : Real(0);
         }
-        if (hessians) {
+        if (!hessians.empty()) {
             drr[j] = (a > 1) ? Real(a * (a - 1)) * rp[ar - 2u] * sb * tc : Real(0);
             dss[j] = (b > 1) ? ra * Real(b * (b - 1)) * sp[bs - 2u] * tc : Real(0);
             dtt[j] = (c > 1) ? ra * sb * Real(c * (c - 1)) * tp[ct - 2u] : Real(0);
@@ -442,15 +442,15 @@ void eval_wedge15_polynomial(Real r,
         for (int j = 0; j < 15; ++j) {
             const Real coefficient =
                 kWedge15Coefficients[static_cast<std::size_t>(j)][static_cast<std::size_t>(i)];
-            if (values) {
+            if (!values.empty()) {
                 value += coefficient * phi[j];
             }
-            if (gradients) {
+            if (!gradients.empty()) {
                 gr += coefficient * dr[j];
                 gs += coefficient * ds[j];
                 gt += coefficient * dt[j];
             }
-            if (hessians) {
+            if (!hessians.empty()) {
                 H(0, 0) += coefficient * drr[j];
                 H(1, 1) += coefficient * dss[j];
                 H(2, 2) += coefficient * dtt[j];
@@ -461,15 +461,15 @@ void eval_wedge15_polynomial(Real r,
         }
 
         const std::size_t index = static_cast<std::size_t>(i);
-        if (values) {
+        if (!values.empty()) {
             values[index] = value;
         }
-        if (gradients) {
+        if (!gradients.empty()) {
             gradients[index][0] = gr;
             gradients[index][1] = gs;
             gradients[index][2] = gt;
         }
-        if (hessians) {
+        if (!hessians.empty()) {
             H(1, 0) = H(0, 1);
             H(2, 0) = H(0, 2);
             H(2, 1) = H(1, 2);
@@ -478,6 +478,22 @@ void eval_wedge15_polynomial(Real r,
     }
 }
 
+/// Throw BasisEvaluationException when `actual` output entries are fewer than `expected`; `label` prefixes the message.
+void require_output_span_size(std::size_t actual, std::size_t expected, const char* label) {
+    if (actual >= expected) { return; }  // fast path: do not allocate the message on success
+    FE::throw_if<BasisEvaluationException>(true, SVMP_HERE,
+        std::string(label) + ": output span is smaller than basis size");
+}
+
+template<typename T>
+void require_requested_span_size(std::span<T> output,
+                                 std::size_t expected,
+                                 const char* label) {
+    // An empty span is accepted as-is; only non-empty spans are size-checked.
+    if (output.empty()) { return; }
+    require_output_span_size(output.size(), expected, label);
+}
+
 } // namespace
 
 SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mode)
@@ -533,21 +549,25 @@ SerendipityBasis::SerendipityBasis(ElementType type, int order, bool geometry_mo
 }
 
 void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
-                                       Real* SVMP_RESTRICT values_out,
-                                       Real* SVMP_RESTRICT gradients_out,
-                                       Real* SVMP_RESTRICT hessians_out) const {
-    if (!values_out && !gradients_out && !hessians_out) {
+                                       std::span<Real> values_out,
+                                       std::span<Gradient> gradients_out,
+                                       std::span<Hessian> hessians_out) const {
+    require_requested_span_size(values_out, size_, "SerendipityBasis::evaluate_all_to values");
+    require_requested_span_size(gradients_out, size_, "SerendipityBasis::evaluate_all_to gradients");
+    require_requested_span_size(hessians_out, size_, "SerendipityBasis::evaluate_all_to hessians");
+
+    if (values_out.empty() && gradients_out.empty() && hessians_out.empty()) {
         return;
     }
 
-    if (values_out) {
-        std::fill_n(values_out, size_, Real(0));
+    if (!values_out.empty()) {
+        std::fill_n(values_out.begin(), size_, Real(0));  // zero only the size_ entries owned by this basis
     }
-    if (gradients_out) {
-        std::fill_n(gradients_out, size_ * 3u, Real(0));
+    if (!gradients_out.empty()) {
+        std::fill_n(gradients_out.begin(), size_, Gradient::Zero());  // entries past size_ stay untouched
     }
-    if (hessians_out) {
-        std::fill_n(hessians_out, size_ * 9u, Real(0));
+    if (!hessians_out.empty()) {
+        std::fill_n(hessians_out.begin(), size_, Hessian::Zero());  // span size >= size_ was checked above
     }
 
     const Real x = xi[0];
@@ -581,20 +601,20 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 
             for (std::size_t i = 0; i < size_; ++i) {
                 const Real coeff = quad_inv_vandermonde_[j * size_ + i];
-                if (values_out) {
+                if (!values_out.empty()) {
                     values_out[i] += value * coeff;
                 }
-                if (gradients_out) {
-                    Real* g = gradients_out + i * 3u;
+                if (!gradients_out.empty()) {
+                    Gradient& g = gradients_out[i];
                     g[0] += dx * coeff;
                     g[1] += dy * coeff;
                 }
-                if (hessians_out) {
-                    Real* h = hessians_out + i * 9u;
-                    h[0] += dxx * coeff;
-                    h[1] += dxy * coeff;
-                    h[3] += dxy * coeff;
-                    h[4] += dyy * coeff;
+                if (!hessians_out.empty()) {
+                    Hessian& h = hessians_out[i];
+                    h(0, 0) += dxx * coeff;
+                    h(0, 1) += dxy * coeff;
+                    h(1, 0) += dxy * coeff;
+                    h(1, 1) += dyy * coeff;
                 }
             }
         }
@@ -616,49 +636,37 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
         FE::throw_if<BasisEvaluationException>(mesh_to_basis.size() != size_, SVMP_HERE,
                                                "Hex20 mesh-to-basis ordering is not registered");
 
-        if (values_out) {
-            Real internal_vals[20];
+        if (!values_out.empty()) {
+            std::array<Real, 20u> internal_vals{};
             eval_hex20_internal(x, y, z, internal_vals);
             for (std::size_t i = 0; i < 20u; ++i) {
                 values_out[i] = internal_vals[mesh_to_basis[i]];
             }
         }
-        if (gradients_out) {
-            Gradient internal_grads[20];
+        if (!gradients_out.empty()) {
+            std::array<Gradient, 20u> internal_grads{};
             eval_hex20_grad_internal(x, y, z, internal_grads);
             for (std::size_t i = 0; i < 20u; ++i) {
-                store_gradient(internal_grads[mesh_to_basis[i]], gradients_out + i * 3u);
+                gradients_out[i] = internal_grads[mesh_to_basis[i]];
             }
         }
-        if (hessians_out) {
-            Hessian internal_hessians[20];
+        if (!hessians_out.empty()) {
+            std::array<Hessian, 20u> internal_hessians{};
             eval_hex20_hess_internal(x, y, z, internal_hessians);
             for (std::size_t i = 0; i < 20u; ++i) {
-                store_hessian(internal_hessians[mesh_to_basis[i]], hessians_out + i * 9u);
+                hessians_out[i] = internal_hessians[mesh_to_basis[i]];
             }
         }
         return;
     }
 
     if (element_type_ == ElementType::Wedge15) {
-        std::array<Gradient, 15u> wedge_gradients{};
-        std::array<Hessian, 15u> wedge_hessians{};
         eval_wedge15_polynomial(x,
                                  y,
                                  z,
                                  values_out,
-                                 gradients_out ? wedge_gradients.data() : nullptr,
-                                 hessians_out ? wedge_hessians.data() : nullptr);
-        if (gradients_out) {
-            for (std::size_t i = 0; i < 15u; ++i) {
-                store_gradient(wedge_gradients[i], gradients_out + i * 3u);
-            }
-        }
-        if (hessians_out) {
-            for (std::size_t i = 0; i < 15u; ++i) {
-                store_hessian(wedge_hessians[i], hessians_out + i * 9u);
-            }
-        }
+                                 gradients_out,
+                                 hessians_out);
         return;
     }
 
@@ -669,27 +677,19 @@ void SerendipityBasis::evaluate_all_to(const math::Vector<Real, 3>& xi,
 void SerendipityBasis::evaluate_values(const math::Vector<Real, 3>& xi,
                                        std::vector<Real>& values) const {
     values.resize(size_);
-    evaluate_values_to(xi, values.data());
+    evaluate_values_to(xi, std::span<Real>(values));  // view over the freshly resized vector
 }
 
 void SerendipityBasis::evaluate_gradients(const math::Vector<Real, 3>& xi,
                                           std::vector<Gradient>& gradients) const {
     gradients.resize(size_);
-    std::vector<Real> flat(size_ * 3u, Real(0));
-    evaluate_gradients_to(xi, flat.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i] = load_gradient(flat.data() + i * 3u);
-    }
+    evaluate_gradients_to(xi, std::span<Gradient>(gradients));  // written in place, no flat staging buffer
 }
 
 void SerendipityBasis::evaluate_hessians(const math::Vector<Real, 3>& xi,
                                          std::vector<Hessian>& hessians) const {
     hessians.resize(size_);
-    std::vector<Real> flat(size_ * 9u, Real(0));
-    evaluate_hessians_to(xi, flat.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        hessians[i] = load_hessian(flat.data() + i * 9u);
-    }
+    evaluate_hessians_to(xi, std::span<Hessian>(hessians));  // written in place, no flat staging buffer
 }
 
 void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
@@ -699,28 +699,28 @@ void SerendipityBasis::evaluate_all(const math::Vector<Real, 3>& xi,
     values.resize(size_);
     gradients.resize(size_);
     hessians.resize(size_);
-    std::vector<Real> flat_gradients(size_ * 3u, Real(0));
-    std::vector<Real> flat_hessians(size_ * 9u, Real(0));
-    evaluate_all_to(xi, values.data(), flat_gradients.data(), flat_hessians.data());
-    for (std::size_t i = 0; i < size_; ++i) {
-        gradients[i] = load_gradient(flat_gradients.data() + i * 3u);
-        hessians[i] = load_hessian(flat_hessians.data() + i * 9u);
-    }
+    evaluate_all_to(xi,
+                    std::span<Real>(values.data(), values.size()),
+                    std::span<Gradient>(gradients.data(), gradients.size()),
+                    std::span<Hessian>(hessians.data(), hessians.size()));
 }
 
 void SerendipityBasis::evaluate_values_to(const math::Vector<Real, 3>& xi,
-                                          Real* SVMP_RESTRICT values_out) const {
-    evaluate_all_to(xi, values_out, nullptr, nullptr);
+                                          std::span<Real> values_out) const {
+    require_output_span_size(values_out.size(), size_, "SerendipityBasis::evaluate_values_to");  // checked here because evaluate_all_to skips the size check for an empty span
+    evaluate_all_to(xi, values_out, std::span<Gradient>{}, std::span<Hessian>{});  // empty spans: gradients and Hessians are not computed
 }
 
 void SerendipityBasis::evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                                             Real* SVMP_RESTRICT gradients_out) const {
-    evaluate_all_to(xi, nullptr, gradients_out, nullptr);
+                                             std::span<Gradient> gradients_out) const {
+    require_output_span_size(gradients_out.size(), size_, "SerendipityBasis::evaluate_gradients_to");  // checked here because evaluate_all_to skips the size check for an empty span
+    evaluate_all_to(xi, std::span<Real>{}, gradients_out, std::span<Hessian>{});  // empty spans: values and Hessians are not computed
 }
 
 void SerendipityBasis::evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                                            Real* SVMP_RESTRICT hessians_out) const {
-    evaluate_all_to(xi, nullptr, nullptr, hessians_out);
+                                            std::span<Hessian> hessians_out) const {
+    require_output_span_size(hessians_out.size(), size_, "SerendipityBasis::evaluate_hessians_to");  // checked here because evaluate_all_to skips the size check for an empty span
+    evaluate_all_to(xi, std::span<Real>{}, std::span<Gradient>{}, hessians_out);  // empty spans: values and gradients are not computed
 }
 
 } // namespace basis
diff --git a/Code/Source/solver/FE/Basis/SerendipityBasis.h b/Code/Source/solver/FE/Basis/SerendipityBasis.h
index 9c55c8eec..e231ed833 100644
--- a/Code/Source/solver/FE/Basis/SerendipityBasis.h
+++ b/Code/Source/solver/FE/Basis/SerendipityBasis.h
@@ -12,6 +12,7 @@
 #include "BasisFunction.h"
 
 #include <array>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -153,7 +154,7 @@ class SerendipityBasis final : public BasisFunction {
 
     /// \brief Evaluate serendipity values, gradients, and Hessians together.
     ///
-    /// \details This vector API is backed by the same flat-buffer evaluator as
+    /// \details This vector API is backed by the same span-based evaluator as
     /// the assembly-oriented `*_to` methods, so topology-specific polynomial
     /// setup can be shared for a quadrature point.
     ///
@@ -166,23 +167,23 @@ class SerendipityBasis final : public BasisFunction {
                       std::vector<Gradient>& gradients,
                       std::vector<Hessian>& hessians) const final;
 
-    /// \brief Evaluate serendipity basis values into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis values into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param values_out Output buffer with at least size() entries.
+    /// \param values_out Output span with at least size() entries; a shorter span throws BasisEvaluationException.
     void evaluate_values_to(const math::Vector<Real, 3>& xi,
-                            Real* SVMP_RESTRICT values_out) const final;
+                            std::span<Real> values_out) const final;
 
-    /// \brief Evaluate serendipity basis gradients into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis gradients into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param gradients_out Output buffer with node-major layout: node * 3 + component.
+    /// \param gradients_out Output span with at least size() Gradient entries; a shorter span throws BasisEvaluationException.
     void evaluate_gradients_to(const math::Vector<Real, 3>& xi,
-                               Real* SVMP_RESTRICT gradients_out) const final;
+                               std::span<Gradient> gradients_out) const final;
 
-    /// \brief Evaluate serendipity basis Hessians into a flat caller-provided buffer.
+    /// \brief Evaluate serendipity basis Hessians into caller-provided storage.
     /// \param xi Reference coordinate. Lower-dimensional elements use the active prefix components.
-    /// \param hessians_out Output buffer with node-major row-major layout: node * 9 + row * 3 + col.
+    /// \param hessians_out Output span with at least size() Hessian entries; a shorter span throws BasisEvaluationException.
     void evaluate_hessians_to(const math::Vector<Real, 3>& xi,
-                              Real* SVMP_RESTRICT hessians_out) const final;
+                              std::span<Hessian> hessians_out) const final;
 
 private:
     ElementType element_type_;
@@ -199,9 +200,9 @@ class SerendipityBasis final : public BasisFunction {
     bool geometry_mode_;
 
     void evaluate_all_to(const math::Vector<Real, 3>& xi,
-                         Real* SVMP_RESTRICT values_out,
-                         Real* SVMP_RESTRICT gradients_out,
-                         Real* SVMP_RESTRICT hessians_out) const;
+                         std::span<Real> values_out,
+                         std::span<Gradient> gradients_out,
+                         std::span<Hessian> hessians_out) const;
 };
 
 /// @}
diff --git a/Code/Source/solver/FE/Common/Types.h b/Code/Source/solver/FE/Common/Types.h
index 1f57ffcc5..462b7ca76 100644
--- a/Code/Source/solver/FE/Common/Types.h
+++ b/Code/Source/solver/FE/Common/Types.h
@@ -58,17 +58,6 @@ enum class CellFamily {
 #include <type_traits>
 #include <limits>
 
-#if defined(_MSC_VER)
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT __restrict
-#elif defined(__clang__) || defined(__GNUC__)
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT __restrict__
-#else
-/// Portable restrict qualifier for aliasing-free pointer parameters.
-#  define SVMP_RESTRICT
-#endif
-
 /// \defgroup FE_Common Common
 /// \ingroup FE
 /// \brief Shared vocabulary types, constants, and exception infrastructure used by every FE module.
diff --git a/Code/Source/solver/FE/Math/DenseTransformKernels.h b/Code/Source/solver/FE/Math/DenseTransformKernels.h
index 2ddb9cefa..f6639dcd3 100644
--- a/Code/Source/solver/FE/Math/DenseTransformKernels.h
+++ b/Code/Source/solver/FE/Math/DenseTransformKernels.h
@@ -4,11 +4,13 @@
 #ifndef SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 #define SVMP_FE_MATH_DENSETRANSFORMKERNELS_H
 
+#include "FEException.h"
 #include "Types.h"
 
 #include <Eigen/Core>
 
 #include <cstddef>
+#include <span>
 
 namespace svmp {
 namespace FE {
@@ -22,18 +24,31 @@ namespace math {
 /// (row stride output_row_stride). Strides may exceed rhs_count for padded
 /// layouts; padding entries are left untouched.
 inline void dense_transform_batched_row_major(
-    const Real* SVMP_RESTRICT matrix,
+    std::span<const Real> matrix,
     std::size_t rows,
     std::size_t cols,
-    const Real* SVMP_RESTRICT input,
+    std::span<const Real> input,
     std::size_t input_row_stride,
-    Real* SVMP_RESTRICT output,
+    std::span<Real> output,
     std::size_t output_row_stride,
     std::size_t rhs_count) {
     if (rows == 0u || cols == 0u || rhs_count == 0u) {
         return;
     }
 
+    FE::throw_if<FEException>(matrix.size() < rows * cols, SVMP_HERE,
+                              "dense_transform_batched_row_major: matrix span is too small");
+    FE::throw_if<FEException>(input_row_stride < rhs_count, SVMP_HERE,
+                              "dense_transform_batched_row_major: input stride is smaller than RHS count");
+    FE::throw_if<FEException>(output_row_stride < rhs_count, SVMP_HERE,
+                              "dense_transform_batched_row_major: output stride is smaller than RHS count");
+    FE::throw_if<FEException>(
+        input.size() < (cols - 1u) * input_row_stride + rhs_count, SVMP_HERE,
+        "dense_transform_batched_row_major: input span is too small");
+    FE::throw_if<FEException>(
+        output.size() < (rows - 1u) * output_row_stride + rhs_count, SVMP_HERE,
+        "dense_transform_batched_row_major: output span is too small");
+
     using RowMajorMatrix =
         Eigen::Matrix<Real, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     using ConstMap = Eigen::Map<const RowMajorMatrix>;
@@ -42,16 +57,16 @@ inline void dense_transform_batched_row_major(
     using StridedMap =
         Eigen::Map<RowMajorMatrix, Eigen::Unaligned, Eigen::OuterStride<>>;
 
-    const ConstMap matrix_map(matrix,
+    const ConstMap matrix_map(matrix.data(),
                               static_cast<Eigen::Index>(rows),
                               static_cast<Eigen::Index>(cols));
     const ConstStridedMap input_map(
-        input,
+        input.data(),
         static_cast<Eigen::Index>(cols),
         static_cast<Eigen::Index>(rhs_count),
         Eigen::OuterStride<>(static_cast<Eigen::Index>(input_row_stride)));
     StridedMap output_map(
-        output,
+        output.data(),
         static_cast<Eigen::Index>(rows),
         static_cast<Eigen::Index>(rhs_count),
         Eigen::OuterStride<>(static_cast<Eigen::Index>(output_row_stride)));
diff --git a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
index 60ca72114..edeca5ac5 100644
--- a/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
+++ b/tests/unitTests/FE/Basis/test_BasisErrorPaths.cpp
@@ -255,29 +255,29 @@ TEST(BasisErrorPaths, NumericalDerivativeHelpersMatchAnalyticDerivatives) {
     }
 }
 
-TEST(BasisErrorPaths, BasisFunctionFallbackWritesRawLayouts) {
+TEST(BasisErrorPaths, BasisFunctionFallbackWritesSpanOutputs) {
     CompleteFallbackBasis basis;
     const math::Vector<Real, 3> point{Real(0.25), Real(0.5), Real(-0.25)};
 
-    std::vector<Real> flat_values(basis.size());
-    std::vector<Real> flat_gradients(basis.size() * 3u);
-    std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(point, flat_values.data());
-    basis.evaluate_gradients_to(point, flat_gradients.data());
-    basis.evaluate_hessians_to(point, flat_hessians.data());
+    std::vector<Real> span_values(basis.size());
+    std::vector<Gradient> span_gradients(basis.size());
+    std::vector<Hessian> span_hessians(basis.size());
+    basis.evaluate_values_to(point, span_values);
+    basis.evaluate_gradients_to(point, span_gradients);
+    basis.evaluate_hessians_to(point, span_hessians);
 
     std::vector<Real> expected_values;
     std::vector<Gradient> expected_gradients;
     std::vector<Hessian> expected_hessians;
     basis.evaluate_all(point, expected_values, expected_gradients, expected_hessians);
     for (std::size_t d = 0; d < basis.size(); ++d) {
-        EXPECT_EQ(flat_values[d], expected_values[d]);
+        EXPECT_EQ(span_values[d], expected_values[d]);
         for (std::size_t c = 0; c < 3u; ++c) {
-            EXPECT_EQ(flat_gradients[d * 3u + c], expected_gradients[d][c]);
+            EXPECT_EQ(span_gradients[d][c], expected_gradients[d][c]);
         }
         for (std::size_t r = 0; r < 3u; ++r) {
             for (std::size_t c = 0; c < 3u; ++c) {
-                EXPECT_EQ(flat_hessians[d * 9u + r * 3u + c], expected_hessians[d](r, c));
+                EXPECT_EQ(span_hessians[d](r, c), expected_hessians[d](r, c));
             }
         }
     }
diff --git a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
index 8a1f43c58..68232d216 100644
--- a/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
+++ b/tests/unitTests/FE/Basis/test_LagrangeBasis.cpp
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <array>
+#include <span>
 #include <tuple>
 #include <vector>
 
@@ -116,27 +117,27 @@ void expect_partition_gradient_hessian_sums(const LagrangeBasis& basis,
     }
 }
 
-void expect_raw_sinks_match_vector_evaluation(const LagrangeBasis& basis,
-                                              const Point& xi)
+void expect_span_sinks_match_vector_evaluation(const LagrangeBasis& basis,
+                                               const Point& xi)
 {
     std::vector<Real> values;
     std::vector<Gradient> gradients;
     std::vector<Hessian> hessians;
     basis.evaluate_all(xi, values, gradients, hessians);
 
-    std::vector<Real> flat_values(basis.size());
-    std::vector<Real> flat_gradients(basis.size() * 3u);
-    std::vector<Real> flat_hessians(basis.size() * 9u);
-    basis.evaluate_values_to(xi, flat_values.data());
-    basis.evaluate_gradients_to(xi, flat_gradients.data());
-    basis.evaluate_hessians_to(xi, flat_hessians.data());
+    std::vector<Real> span_values(basis.size());
+    std::vector<Gradient> span_gradients(basis.size());
+    std::vector<Hessian> span_hessians(basis.size());
+    basis.evaluate_values_to(xi, span_values);
+    basis.evaluate_gradients_to(xi, span_gradients);
+    basis.evaluate_hessians_to(xi, span_hessians);
 
     for (std::size_t i = 0; i < basis.size(); ++i) {
-        EXPECT_NEAR(flat_values[i], values[i], Real(1e-14));
+        EXPECT_NEAR(span_values[i], values[i], Real(1e-14));
         for (std::size_t d = 0; d < 3u; ++d) {
-            EXPECT_NEAR(flat_gradients[i * 3u + d], gradients[i][d], Real(1e-14));
+            EXPECT_NEAR(span_gradients[i][d], gradients[i][d], Real(1e-14));
             for (std::size_t e = 0; e < 3u; ++e) {
-                EXPECT_NEAR(flat_hessians[i * 9u + d * 3u + e],
+                EXPECT_NEAR(span_hessians[i](d, e),
                             hessians[i](d, e),
                             Real(1e-14));
             }
@@ -251,10 +252,10 @@ TEST(LagrangeBasis, CanonicalTopologiesAreNodalAndPartitionUnity) {
     }
 }
 
-TEST(LagrangeBasis, RawOutputSinksMatchVectorEvaluationAcrossTopologies) {
+TEST(LagrangeBasis, SpanOutputSinksMatchVectorEvaluationAcrossTopologies) {
     for (const auto& c : canonical_cases()) {
         LagrangeBasis basis(c.type, c.order);
-        expect_raw_sinks_match_vector_evaluation(basis, c.points.front());
+        expect_span_sinks_match_vector_evaluation(basis, c.points.front());  // only the first sample point of each case is checked
     }
 }
 }
 
@@ -461,19 +462,26 @@ TEST(LagrangeBasis, PointTopologyEvaluatesConstantUnity) {
         }
     }
 
-    Real flat_value = Real(-1);
-    Real flat_gradient[3] = {Real(-1), Real(-1), Real(-1)};
-    Real flat_hessian[9];
-    std::fill_n(flat_hessian, 9u, Real(-1));
-    basis.evaluate_values_to(xi, &flat_value);
-    basis.evaluate_gradients_to(xi, flat_gradient);
-    basis.evaluate_hessians_to(xi, flat_hessian);
-    EXPECT_EQ(flat_value, Real(1));
+    Real span_value = Real(-1);
+    Gradient span_gradient;
+    span_gradient[0] = span_gradient[1] = span_gradient[2] = Real(-1);
+    Hessian span_hessian;
     for (std::size_t d = 0; d < 3u; ++d) {
-        EXPECT_EQ(flat_gradient[d], Real(0));
+        for (std::size_t e = 0; e < 3u; ++e) {
+            span_hessian(d, e) = Real(-1);
+        }
+    }
+    basis.evaluate_values_to(xi, std::span<Real>(&span_value, 1u));
+    basis.evaluate_gradients_to(xi, std::span<Gradient>(&span_gradient, 1u));
+    basis.evaluate_hessians_to(xi, std::span<Hessian>(&span_hessian, 1u));
+    EXPECT_EQ(span_value, Real(1));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        EXPECT_EQ(span_gradient[d], Real(0));
     }
-    for (std::size_t e = 0; e < 9u; ++e) {
-        EXPECT_EQ(flat_hessian[e], Real(0));
+    for (std::size_t d = 0; d < 3u; ++d) {
+        for (std::size_t e = 0; e < 3u; ++e) {
+            EXPECT_EQ(span_hessian(d, e), Real(0));
+        }
     }
 }