diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands
new file mode 100644
index 000000000000..d9fcf6cb6527
--- /dev/null
+++ b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands
@@ -0,0 +1 @@
+./xmlchange --append SCREAM_CMAKE_OPTIONS='HOMMEXX_ENABLE_CAAR_OPT ON'
diff --git a/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake
index 9d7a9dd21f2c..be8d509cfa55 100644
--- a/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake
+++ b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake
@@ -125,6 +125,7 @@ SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "")
 SET(Kokkos_ARCH_AMPERE80 OFF CACHE BOOL "")
 SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "")
+SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "")
 
 SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "")
 
diff --git a/components/homme/cmake/machineFiles/pm-cpu.cmake b/components/homme/cmake/machineFiles/pm-cpu.cmake
index 84cb0875e7b9..3af1f09b0cdc 100644
--- a/components/homme/cmake/machineFiles/pm-cpu.cmake
+++ b/components/homme/cmake/machineFiles/pm-cpu.cmake
@@ -71,6 +71,7 @@ SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "")
 SET(Kokkos_ARCH_AMPERE80 OFF CACHE BOOL "")
 SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "")
+SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "")
 
 SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "")
 
diff --git a/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake b/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake
index e19eedaf496d..48d17e7ab091 100644
--- a/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake
+++ b/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake
@@ -127,6 +127,7 @@ SET(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_UVM ON CACHE BOOL "")
 SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "")
+SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "")
 
 SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "")
 
@@ -139,5 +140,5 @@ SET(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "")
 SET(USE_NUM_PROCS 4 CACHE STRING "") # only default
 
 SET(USE_MPIEXEC "srun" CACHE STRING "")
-SET(USE_MPI_OPTIONS "-K --cpu-bind=cores" CACHE STRING "")
+SET(USE_MPI_OPTIONS "-C gpu -K --cpu-bind=cores" CACHE STRING "")
 SET(HOMME_TESTING_TIMELIMIT 1800 CACHE STRING "")
diff --git a/components/homme/cmake/machineFiles/pm-gpu.cmake b/components/homme/cmake/machineFiles/pm-gpu.cmake
index 669857e03e75..716f3d1df2f0 100644
--- a/components/homme/cmake/machineFiles/pm-gpu.cmake
+++ b/components/homme/cmake/machineFiles/pm-gpu.cmake
@@ -76,6 +76,7 @@ SET(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_UVM ON CACHE BOOL "")
 SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "")
 #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "")
+SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "")
 
 SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "")
 
@@ -88,5 +89,5 @@ SET(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "")
 SET(USE_NUM_PROCS 4 CACHE STRING "") # only default
 
 SET(USE_MPIEXEC "srun" CACHE STRING "")
-SET(USE_MPI_OPTIONS "-K --cpu-bind=cores" CACHE STRING "")
+SET(USE_MPI_OPTIONS "-C gpu -K --cpu-bind=cores" CACHE STRING "")
 SET(HOMME_TESTING_TIMELIMIT 1800 CACHE STRING "")
diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp
new file mode 100644
index 000000000000..aa28c6a7e37e
--- /dev/null
+++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp
@@ -0,0 +1,3 @@
+// For preqx_kokkos, the caar-opt path is not implemented.
+// Fall back to the standard CaarFunctorImpl.
+#include "CaarFunctorImpl.hpp"
diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
index 688007e27f24..923d38d4a770 100644
--- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
+++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
@@ -297,7 +297,7 @@ class HyperviscosityFunctorImpl
   SphereOperators       m_sphere_ops;
 
   // Policies
-#ifndef NDEBUG
+#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG)
   template<typename Tag>
   using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace,Kokkos::LaunchBounds<512,1>,Tag>;
 #else
diff --git a/components/homme/src/share/cxx/CaarFunctor.cpp b/components/homme/src/share/cxx/CaarFunctor.cpp
index a0b27efd3725..bd9c60085957 100644
--- a/components/homme/src/share/cxx/CaarFunctor.cpp
+++ b/components/homme/src/share/cxx/CaarFunctor.cpp
@@ -5,7 +5,12 @@
  *******************************************************************************/
 
 #include "CaarFunctor.hpp"
+#include "Hommexx_config.h"
+#ifdef HOMMEXX_ENABLE_CAAR_OPT
+#include "CaarFunctorImpl-caar-opt.hpp"
+#else
 #include "CaarFunctorImpl.hpp"
+#endif
 #include "Context.hpp"
 #include "Elements.hpp"
 #include "ErrorDefs.hpp"
diff --git a/components/homme/src/share/cxx/Hommexx_config.h.in b/components/homme/src/share/cxx/Hommexx_config.h.in
index d563d2bcde70..7e0c4e81dc42 100644
--- a/components/homme/src/share/cxx/Hommexx_config.h.in
+++ b/components/homme/src/share/cxx/Hommexx_config.h.in
@@ -24,6 +24,9 @@
 // Whether the debug parts of cxx code should be compiled or not
 #cmakedefine HOMMEXX_DEBUG
 
+// Enable CAAR Optimization
+#cmakedefine HOMMEXX_ENABLE_CAAR_OPT
+
 // Whether the MPI operations have to be performed directly on the device
 #cmakedefine01 HOMMEXX_MPI_ON_DEVICE
 
diff --git a/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp b/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp
new file mode 100644
index 000000000000..124d756adde5
--- /dev/null
+++ b/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp
@@ -0,0 +1,1720 @@
+/********************************************************************************
+ * HOMMEXX 1.0: Copyright of Sandia Corporation
+ * This software is released under the BSD license
+ * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory
+ *******************************************************************************/
+
+#ifndef HOMMEXX_SPHERE_OPERATORS_HPP
+#define HOMMEXX_SPHERE_OPERATORS_HPP
+
+#include "Types.hpp"
+#include "ElementsGeometry.hpp"
+#include "CombineOps.hpp"
+#include "ReferenceElement.hpp"
+#include "Dimensions.hpp"
+#include "KernelVariables.hpp"
+#include "utilities/SubviewUtils.hpp"
+#include "utilities/ViewUtils.hpp"
+
+#include <Kokkos_Core.hpp>
+
+namespace Homme {
+
+class SphereOperators
+{
+  static constexpr int MAX_NUM_LEV = NUM_LEV_P;
+  static constexpr int NUM_2D_VECTOR_BUFFERS = 2;
+  static constexpr int NUM_3D_SCALAR_BUFFERS = 3;
+  static constexpr int NUM_3D_VECTOR_BUFFERS = 3;
+
+  // These two short names will be used to extract subviews from the 3d buffers,
+  // since we can no longer use the auto keyword. Before we used to do
+  //   const auto& gv = Homme::subview(vector_buf_ml,kv.team_idx,0);
+  // However, now vector_buf_ml has last timension NUM_LEV_P, but we may
+  // need gv to be smaller. Hence, we simply grab the pointer from the subview,
+  // but then we need to explicitly tell the compiler the type of the result,
+  // which can no longer be deduced. Like this:
+  //   vector_buf<NUM_LEV gv(Homme::subview(vector_buf_ml,kv.team_idx,0).data());
+
+  template<int NL>
+  using scalar_buf = ExecViewUnmanaged<Scalar[NP][NP][NL]>;
+
+  template<int NL>
+  using vector_buf = ExecViewUnmanaged<Scalar[2][NP][NP][NL]>;
+
+  // std::min is constexpr only from c++14 on.
+  template<int M, int N>
+  struct Min {
+    static constexpr int value = M > N ? N : M;
+  };
+
+  template<int NUM_LEVELS>
+  using DefaultProvider = ExecViewUnmanaged<const Scalar [NP][NP][NUM_LEVELS]>;
+public:
+
+
+  SphereOperators () = default;
+
+  SphereOperators (const ElementsGeometry& geometry,
+                   const ReferenceElement& ref_FE)
+  {
+    setup (geometry, ref_FE);
+  }
+
+  //only for unit tests
+  SphereOperators (const Real scale_factor, const Real laplacian_rigid_factor)
+  {
+    m_scale_factor_inv = 1/scale_factor;
+    m_laplacian_rigid_factor = laplacian_rigid_factor;
+  }
+
+  void setup (const ElementsGeometry& geometry,
+              const ReferenceElement& ref_FE) {
+    // Get reference element stuff
+    dvv = ref_FE.get_deriv();
+    m_mp = ref_FE.get_mass();
+
+    // Get all needed 2d fields from elements
+    m_d        = geometry.m_d;
+    m_dinv     = geometry.m_dinv;
+    m_metdet   = geometry.m_metdet;
+    m_metinv   = geometry.m_metinv;
+    m_spheremp = geometry.m_spheremp;
+    m_scale_factor_inv = 1/geometry.m_scale_factor;
+    m_laplacian_rigid_factor = geometry.m_laplacian_rigid_factor;
+  }
+
+  template<typename... Tags>
+  void allocate_buffers (const Kokkos::TeamPolicy<ExecSpace,Tags...>& team_policy)
+  {
+    const int num_parallel_iterations = team_policy.league_size();
+    const int alloc_dim = OnGpu<ExecSpace>::value ?
+                          num_parallel_iterations : std::min(get_num_concurrent_teams(team_policy),num_parallel_iterations);
+
+    if (vector_buf_ml.extent_int(0)<alloc_dim) {
+      vector_buf_sl = decltype(vector_buf_sl)("",alloc_dim);
+      scalar_buf_ml = decltype(scalar_buf_ml)("",alloc_dim);
+      vector_buf_ml = decltype(vector_buf_ml)("",alloc_dim);
+    }
+  }
+
+  void allocate_buffers (const TeamUtils<ExecSpace>& tu)
+  {
+    const int alloc_dim = tu.get_num_ws_slots();
+
+    if (vector_buf_ml.extent_int(0)<alloc_dim) {
+      vector_buf_sl = decltype(vector_buf_sl)("",alloc_dim);
+      scalar_buf_ml = decltype(scalar_buf_ml)("",alloc_dim);
+      vector_buf_ml = decltype(vector_buf_ml)("",alloc_dim);
+    }
+  }
+
+  // This one is used in the unit tests
+  void set_views (const ExecViewManaged<const Real         [NP][NP]>  dvv_in,
+                  const ExecViewManaged<const Real * [2][2][NP][NP]>  d,
+                  const ExecViewManaged<const Real * [2][2][NP][NP]>  dinv,
+                  const ExecViewManaged<const Real * [2][2][NP][NP]>  metinv,
+                  const ExecViewManaged<const Real *       [NP][NP]>  metdet,
+                  const ExecViewManaged<const Real *       [NP][NP]>  spheremp,
+                  const ExecViewManaged<const Real         [NP][NP]>  mp)
+  {
+    dvv = dvv_in;
+    m_d = d;
+    m_dinv = dinv;
+    m_metinv = metinv;
+    m_metdet = metdet;
+    m_spheremp = spheremp;
+    m_mp = mp;
+  }
+
+  // ================ SINGLE-LEVEL IMPLEMENTATION =========================== //
+
+  KOKKOS_INLINE_FUNCTION void
+  gradient_sphere_sl (const KernelVariables &kv,
+                      const ExecViewUnmanaged<const Real    [NP][NP]>& scalar,
+                      const ExecViewUnmanaged<      Real [2][NP][NP]>& grad_s) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv,kv.ie);
+    const auto& temp_v_buf = Homme::subview(vector_buf_sl,kv.team_idx,0);
+    constexpr int np_squared = NP * NP;
+    // TODO: Use scratch space for this
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int j = loop_idx / NP;
+      const int l = loop_idx % NP;
+      Real dsdx(0), dsdy(0);
+      for (int i = 0; i < NP; ++i) {
+        dsdx += dvv(l, i) * scalar(j, i);
+        dsdy += dvv(l, i) * scalar(i, j);
+      }
+      temp_v_buf(0, j, l) = dsdx * m_scale_factor_inv;
+      temp_v_buf(1, l, j) = dsdy * m_scale_factor_inv;
+    });
+    kv.team_barrier();
+
+    constexpr int grad_iters = 2 * NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, grad_iters),
+                         [&](const int loop_idx) {
+      const int h = (loop_idx / NP) / NP;
+      const int i = (loop_idx / NP) % NP;
+      const int j = loop_idx % NP;
+      grad_s(h, j, i) = D_inv(h, 0, j, i) * temp_v_buf(0, j, i) +
+                        D_inv(h, 1, j, i) * temp_v_buf(1, j, i);
+    });
+    kv.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  gradient_sphere_update_sl (const KernelVariables &kv,
+                             const ExecViewUnmanaged<const Real    [NP][NP]>& scalar,
+                             const ExecViewUnmanaged<      Real [2][NP][NP]>& grad_s) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    constexpr int np_squared = NP * NP;
+    const auto& D_inv = Homme::subview(m_dinv,kv.ie);
+    const auto& temp_v_buf = Homme::subview(vector_buf_sl,kv.team_idx,0);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int j = loop_idx / NP;
+      const int l = loop_idx % NP;
+      Real dsdx(0), dsdy(0);
+      for (int i = 0; i < NP; ++i) {
+        dsdx += dvv(l, i) * scalar(j, i);
+        dsdy += dvv(l, i) * scalar(i, j);
+      }
+      temp_v_buf(0, j, l) = dsdx * m_scale_factor_inv;
+      temp_v_buf(1, l, j) = dsdy * m_scale_factor_inv;
+    });
+    kv.team_barrier();
+
+    constexpr int grad_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, grad_iters),
+                         [&](const int loop_idx) {
+      const int i = loop_idx / NP;
+      const int j = loop_idx % NP;
+      const auto& tmp0 = temp_v_buf(0,i,j);
+      const auto& tmp1 = temp_v_buf(1,i,j);
+      grad_s(0,i,j) += D_inv(0,0,i,j) * tmp0 + D_inv(0,1,i,j) * tmp1;
+      grad_s(1,i,j) += D_inv(1,0,i,j) * tmp0 + D_inv(1,1,i,j) * tmp1;
+    });
+    kv.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_sl (const KernelVariables &kv,
+                        const ExecViewUnmanaged<const Real [2][NP][NP]>& v,
+                        const ExecViewUnmanaged<      Real    [NP][NP]>& div_v) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    const auto& metdet = Homme::subview(m_metdet,kv.ie);
+    const auto& D_inv = Homme::subview(m_dinv,kv.ie);
+    const auto& gv_buf = Homme::subview(vector_buf_sl,kv.team_idx,0);
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      const auto& v0 = v(0,igp,jgp);
+      const auto& v1 = v(1,igp,jgp);
+      gv_buf(0,igp,jgp) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp);
+      gv_buf(1,igp,jgp) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp);
+    });
+    kv.team_barrier();
+
+    constexpr int div_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Real dudx = 0.0, dvdy = 0.0;
+      for (int kgp = 0; kgp < NP; ++kgp) {
+        dudx += dvv(jgp, kgp) * gv_buf(0, igp, kgp);
+        dvdy += dvv(igp, kgp) * gv_buf(1, kgp, jgp);
+      }
+      div_v(igp,jgp) = (dudx + dvdy) * ((1.0 / metdet(igp,jgp)) *
+                                         m_scale_factor_inv);
+    });
+    kv.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_wk_sl (const KernelVariables &kv,
+                           const ExecViewUnmanaged<const Real [2][NP][NP]>& v,
+                           const ExecViewUnmanaged<      Real    [NP][NP]>& div_v) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv,kv.ie);
+    const auto& spheremp = Homme::subview(m_spheremp,kv.ie);
+    const auto& gv_buf = Homme::subview(vector_buf_sl,kv.team_idx,0);
+
+    // copied from strong divergence as is but without metdet
+    // conversion to contravariant
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      const auto& v0 = v(0,igp,jgp);
+      const auto& v1 = v(1,igp,jgp);
+      gv_buf(0,igp,jgp) = D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1;
+      gv_buf(1,igp,jgp) = D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1;
+    });
+    kv.team_barrier();
+
+    // in strong div
+    // kgp = i in strong code, jgp=j, igp=l
+    // in weak div, n is like j in strong div,
+    // n(weak)=j(strong)=jgp
+    // m(weak)=l(strong)=igp
+    // j(weak)=i(strong)=kgp
+    constexpr int div_iters = NP * NP;
+    // keeping indices' names as in F
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters),
+                         [&](const int loop_idx) {
+      // Note: for this one time, it is better if m strides faster, due to
+      //       the way the views are accessed.
+      const int mgp = loop_idx % NP;
+      const int ngp = loop_idx / NP;
+      Real dd = 0.0;
+      for (int jgp = 0; jgp < NP; ++jgp) {
+        dd -= (spheremp(ngp, jgp) * gv_buf(0, ngp, jgp) * dvv(jgp, mgp) +
+               spheremp(jgp, mgp) * gv_buf(1, jgp, mgp) * dvv(jgp, ngp)) *
+              m_scale_factor_inv;
+      }
+      div_v(ngp, mgp) = dd;
+    });
+    kv.team_barrier();
+
+  } // end of divergence_sphere_wk_sl
+
+  // Note that divergence_sphere requires scratch space of 3 x NP x NP Reals
+  // This must be called from the device space
+  KOKKOS_INLINE_FUNCTION void
+  vorticity_sphere_sl (const KernelVariables &kv,
+                       const ExecViewUnmanaged<const Real [NP][NP]>& u,
+                       const ExecViewUnmanaged<const Real [NP][NP]>& v,
+                       const ExecViewUnmanaged<      Real [NP][NP]>& vort) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    const auto& D = Homme::subview(m_d,kv.ie);
+    const auto& metdet = Homme::subview(m_metdet,kv.ie);
+    const auto& vcov_buf = Homme::subview(vector_buf_sl,kv.team_idx,0);
+
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      const auto& u_ij = u(igp,jgp);
+      const auto& v_ij = v(igp,jgp);
+      vcov_buf(0,igp,jgp) = D(0,0,igp,jgp) * u_ij + D(0,1,igp,jgp) * v_ij;
+      vcov_buf(1,igp,jgp) = D(1,0,igp,jgp) * u_ij + D(1,1,igp,jgp) * v_ij;
+    });
+    kv.team_barrier();
+
+    constexpr int vort_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Real dudy = 0.0;
+      Real dvdx = 0.0;
+      for (int kgp = 0; kgp < NP; ++kgp) {
+        dvdx += dvv(jgp, kgp) * vcov_buf(1, igp, kgp);
+        dudy += dvv(igp, kgp) * vcov_buf(0, kgp, jgp);
+      }
+
+      vort(igp, jgp) = (dvdx - dudy) * ((1.0 / metdet(igp, jgp)) *
+                                        m_scale_factor_inv);
+    });
+    kv.team_barrier();
+  }
+
+  // analog of fortran's laplace_wk_sphere
+  // Single level implementation
+  KOKKOS_INLINE_FUNCTION void
+  laplace_wk_sl (const KernelVariables &kv,
+                 const ExecViewUnmanaged<const Real [NP][NP]>& field,
+                 const ExecViewUnmanaged<      Real [NP][NP]>& laplace) const
+  {
+    // Make sure the buffers have been created
+    assert (vector_buf_sl.size()>0);
+
+    const auto& grad_s = Homme::subview(vector_buf_sl, kv.team_idx, 1);
+    gradient_sphere_sl(kv, field, grad_s);
+    divergence_sphere_wk_sl(kv, grad_s, laplace);
+  } // end of laplace_wk_sl
+
+  // ================ MULTI-LEVEL IMPLEMENTATION =========================== //
+
+  // Note: if you are puzzled by the use/need of ViewConst, don't worry, you're not alone.
+  //       To be clear, using `const typename ExecViewUnmanaged<Scalar[...]>::const_type`
+  //       would also work. I prefer ViewConst cause it does not change the 'template
+  //       signature of the type', while Kokkos::View<...>::const_type has a pre-defined
+  //       templated signature (with data type, layout and memory space).
+  //       Anyhow, back to the point, the key fact is that (quoting cppreference),
+  //
+  //        "The nested-name-specifier (everything to the left of the scope resolution
+  //         operator ::) of a type that was specified using a qualified-id" [do not
+  //         participate in template argument deduction].
+  //
+  //       So, without the `typename Blah::type` trick, the type of scalar is used to deduce
+  //       the template arguments, but if the input view had data type non-const, this
+  //       makes the template deduction fail. On the other hand, *with* the `typename Blah::type`
+  //       trick, only the type of the output view is used to deduce the template args. This
+  //       succeeds, and then the copy constructor of View is used to produce a View<const T>
+  //       from a View<T>. Magic.
+
+  template<int NUM_LEV_OUT, typename InputProvider>
+  KOKKOS_INLINE_FUNCTION void
+  gradient_sphere (const KernelVariables &kv,
+                   const InputProvider& scalar,
+                   const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& grad_s,
+                   const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv, kv.ie);
+
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar v0, v1;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          v0 += dvv(jgp, kgp) * scalar(igp, kgp, ilev);
+          v1 += dvv(igp, kgp) * scalar(kgp, jgp, ilev);
+        }
+        v0 *= m_scale_factor_inv;
+        v1 *= m_scale_factor_inv;
+        grad_s(0,igp,jgp,ilev) = D_inv(0,0,igp,jgp) * v0 + D_inv(0,1,igp,jgp) * v1;
+        grad_s(1,igp,jgp,ilev) = D_inv(1,0,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1;
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, typename InputProvider, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  gradient_sphere (const KernelVariables &kv,
+                   const InputProvider& scalar,
+                   const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& grad_s) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    gradient_sphere<NUM_LEV_OUT, InputProvider>(kv, scalar, grad_s, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  gradient_sphere_update (const KernelVariables &kv,
+                          const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                          const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& grad_s) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv, kv.ie);
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dsdx, dsdy;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          dsdx += dvv(jgp, kgp) * scalar(igp, kgp, ilev);
+          dsdy += dvv(igp, kgp) * scalar(kgp, jgp, ilev);
+        }
+        dsdx *= m_scale_factor_inv;
+        dsdy *= m_scale_factor_inv;
+        grad_s(0,igp,jgp,ilev) += D_inv(0,0,igp,jgp) * dsdx + D_inv(0,1,igp,jgp) * dsdy;
+        grad_s(1,igp,jgp,ilev) += D_inv(1,0,igp,jgp) * dsdx + D_inv(1,1,igp,jgp) * dsdy;
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, typename InputProvider, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere (const KernelVariables &kv,
+                     const InputProvider& v,
+                     const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v,
+                     const Real alpha = 1.0, const Real beta = 0.0) const
+  {
+    divergence_sphere_cm<CombineMode::Replace,
+                         InputProvider,
+                         NUM_LEV_OUT,
+                         NUM_LEV_REQUEST>(kv,v,div_v,alpha,beta);
+
+  }
+
+  template<int NUM_LEV_OUT, typename InputProvider>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_nlev (const KernelVariables &kv,
+                          const InputProvider& v,
+                          const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v,
+                          const int NUM_LEV_REQUEST,
+                          const Real alpha = 1.0, const Real beta = 0.0) const
+  {
+    divergence_sphere_cm<CombineMode::Replace,
+                         InputProvider,
+                         NUM_LEV_OUT>(kv,v,div_v,alpha,beta,NUM_LEV_REQUEST);
+
+  }
+
+  template<CombineMode CM, typename InputProvider, int NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_cm (const KernelVariables &kv,
+                        const InputProvider& v,
+                        const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v,
+                        const Real alpha, const Real beta,
+                        const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv, kv.ie);
+    const auto& metdet = Homme::subview(m_metdet, kv.ie);
+    vector_buf<NUM_LEV_OUT> gv_buf(Homme::subview(vector_buf_ml,kv.team_idx, 0).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& v0 = v(0, igp, jgp, ilev);
+        const auto& v1 = v(1, igp, jgp, ilev);
+        gv_buf(0,igp,jgp,ilev) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp);
+        gv_buf(1,igp,jgp,ilev) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp);
+      });
+    });
+    kv.team_barrier();
+
+    // j, l, i -> i, j, k
+    constexpr int div_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dudx, dvdy;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          dudx += dvv(jgp, kgp) * gv_buf(0, igp, kgp, ilev);
+          dvdy += dvv(igp, kgp) * gv_buf(1, kgp, jgp, ilev);
+        }
+        combine<CM>((dudx + dvdy) * (1.0 / metdet(igp, jgp) * m_scale_factor_inv),
+                     div_v(igp, jgp, ilev), alpha, beta);
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<CombineMode CM, typename InputProvider,
+           int NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_cm (const KernelVariables &kv,
+                        const InputProvider& v,
+                        const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v,
+                        const Real alpha = 1.0, const Real beta = 0.0) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    divergence_sphere_cm<CM, InputProvider, NUM_LEV_OUT>(kv, v, div_v, alpha, beta, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_update (const KernelVariables &kv,
+                            const Real alpha, const bool add_hyperviscosity,
+                            const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& vstar,
+                            const typename ViewConst<ExecViewUnmanaged<Scalar    [NP][NP][NUM_LEV_IN]>>::type& qdp,
+                            // On input, qtens_biharmonic if add_hyperviscosity, undefined
+                            // if not; on output, qtens.
+                            const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& qtens) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv, kv.ie);
+    const auto& metdet = Homme::subview(m_metdet, kv.ie);
+    vector_buf<NUM_LEV_REQUEST> gv(Homme::subview(vector_buf_ml,kv.team_idx,0).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& qdpijk = qdp(igp, jgp, ilev);
+        const auto v0 = vstar(0, igp, jgp, ilev) * qdpijk;
+        const auto v1 = vstar(1, igp, jgp, ilev) * qdpijk;
+        gv(0,igp,jgp,ilev) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp);
+        gv(1,igp,jgp,ilev) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp);
+      });
+    });
+    kv.team_barrier();
+
+    constexpr int div_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dudx, dvdy;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          dudx += dvv(jgp, kgp) * gv(0, igp, kgp, ilev);
+          dvdy += dvv(igp, kgp) * gv(1, kgp, jgp, ilev);
+        }
+        const Scalar qtensijk0 = add_hyperviscosity ? qtens(igp,jgp,ilev) : 0;
+        qtens(igp,jgp,ilev) = (qdp(igp,jgp,ilev) +
+                               alpha*((dudx + dvdy) * (1.0 / metdet(igp,jgp) * m_scale_factor_inv)) +
+                               qtensijk0);
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vorticity_sphere (const KernelVariables &kv,
+                    const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& u,
+                    const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& v,
+                    const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& vort) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D = Homme::subview(m_d, kv.ie);
+    const auto& metdet = Homme::subview(m_metdet, kv.ie);
+    vector_buf<NUM_LEV_REQUEST> vcov_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& u_ijk = u(igp, jgp, ilev);
+        const auto& v_ijk = v(igp, jgp, ilev);
+        vcov_buf(0,igp,jgp,ilev) = D(0,0,igp,jgp) * u_ijk + D(0,1,igp,jgp) * v_ijk;
+        vcov_buf(1,igp,jgp,ilev) = D(1,0,igp,jgp) * u_ijk + D(1,1,igp,jgp) * v_ijk;
+      });
+    });
+    kv.team_barrier();
+
+    constexpr int vort_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dudy, dvdx;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          dvdx += dvv(jgp, kgp) * vcov_buf(1, igp, kgp, ilev);
+          dudy += dvv(igp, kgp) * vcov_buf(0, kgp, jgp, ilev);
+        }
+        vort(igp, jgp, ilev) = (dvdx - dudy) * (1.0 / metdet(igp, jgp) *
+                                                m_scale_factor_inv);
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vorticity_sphere (const KernelVariables &kv,
+                    const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& v,
+                    const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& vort,
+                    const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D = Homme::subview(m_d, kv.ie);
+    const auto& metdet = Homme::subview(m_metdet, kv.ie);
+    vector_buf<NUM_LEV_OUT> sphere_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& v0 = v(0,igp,jgp,ilev);
+        const auto& v1 = v(1,igp,jgp,ilev);
+        sphere_buf(0,igp,jgp,ilev) = D(0,0,igp,jgp) * v0 + D(0,1,igp,jgp) * v1;
+        sphere_buf(1,igp,jgp,ilev) = D(1,0,igp,jgp) * v0 + D(1,1,igp,jgp) * v1;
+      });
+    });
+    kv.team_barrier();
+
+    constexpr int vort_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dudy, dvdx;
+        for (int kgp = 0; kgp < NP; ++kgp) {
+          dvdx += dvv(jgp, kgp) * sphere_buf(1, igp, kgp, ilev);
+          dudy += dvv(igp, kgp) * sphere_buf(0, kgp, jgp, ilev);
+        }
+        vort(igp, jgp, ilev) = (dvdx - dudy) * (1.0 / metdet(igp, jgp) *
+                                                m_scale_factor_inv);
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vorticity_sphere (const KernelVariables &kv,
+                    const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& v,
+                    const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& vort) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    vorticity_sphere<NUM_LEV_OUT,NUM_LEV_IN>(kv, v, vort, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_wk (const KernelVariables &kv,
+                        // On input, a field whose divergence is sought; on
+                        // output, the view's data are invalid.
+                        const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>& v,
+                        const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v,
+                        const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D_inv = Homme::subview(m_dinv, kv.ie);
+    const auto& spheremp = Homme::subview(m_spheremp, kv.ie);
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto v0 = v(0,igp,jgp,ilev);
+        const auto v1 = v(1,igp,jgp,ilev);
+        v(0,igp,jgp,ilev) = D_inv(0, 0, igp, jgp) * v0 + D_inv(1, 0, igp, jgp) * v1;
+        v(1,igp,jgp,ilev) = D_inv(0, 1, igp, jgp) * v0 + D_inv(1, 1, igp, jgp) * v1;
+      });
+    });
+    kv.team_barrier();
+
+    constexpr int div_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters),
+                         [&](const int loop_idx) {
+      // Note: for this one time, it is better if m strides faster, due to
+      //       the way the views are accessed.
+      const int mgp = loop_idx % NP;
+      const int ngp = loop_idx / NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar dd;
+        // TODO: move multiplication by scale_factor_inv outside the loop
+        for (int jgp = 0; jgp < NP; ++jgp) {
+          // Here, v is the temporary buffer, aliased on the input v.
+          dd -= (spheremp(ngp, jgp) * v(0, ngp, jgp, ilev) * dvv(jgp, mgp) +
+                 spheremp(jgp, mgp) * v(1, jgp, mgp, ilev) * dvv(jgp, ngp)) *
+                m_scale_factor_inv;
+        }
+        div_v(ngp, mgp, ilev) = dd;
+      });
+    });
+    kv.team_barrier();
+
+  }//end of divergence_sphere_wk
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  divergence_sphere_wk (const KernelVariables &kv,
+                        // On input, a field whose divergence is sought; on
+                        // output, the view's data are invalid.
+                        const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>& v,
+                        const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& div_v) const
+  {
+    divergence_sphere_wk<NUM_LEV_OUT, NUM_LEV_IN>(kv, v, div_v, NUM_LEV_REQUEST);
+  }//end of divergence_sphere_wk
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  laplace_simple (const KernelVariables &kv,
+                  const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& field,
+                  const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& laplace,
+                  const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    vector_buf<NUM_LEV_OUT> grad_s(Homme::subview(vector_buf_ml, kv.team_idx, 0).data());
+    gradient_sphere<NUM_LEV_OUT,decltype(field)>(kv, field, grad_s, NUM_LEV_REQUEST);
+    divergence_sphere_wk<NUM_LEV_OUT,NUM_LEV_OUT>(kv, grad_s, laplace, NUM_LEV_REQUEST);
+  }//end of laplace_simple
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  laplace_simple (const KernelVariables &kv,
+                  const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& field,
+                  const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& laplace) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    laplace_simple<NUM_LEV_OUT,NUM_LEV_IN>(kv, field, laplace, NUM_LEV_REQUEST);
+  }//end of laplace_simple
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  laplace_tensor(const KernelVariables &kv,
+                 const ExecViewUnmanaged<const Real   [2][2][NP][NP]>&              tensorVisc,
+                 const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type&  field,         // input
+                 const ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_OUT]>& laplace) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    vector_buf<NUM_LEV_REQUEST> grad_s(Homme::subview(vector_buf_ml, kv.team_idx, 1).data());
+    vector_buf<NUM_LEV_REQUEST> sphere_buf(Homme::subview(vector_buf_ml, kv.team_idx, 2).data());
+
+    gradient_sphere<NUM_LEV_REQUEST,decltype(field),NUM_LEV_REQUEST>(kv, field, grad_s);
+    //now multiply tensorVisc(:,:,i,j)*grad_s(i,j) (matrix*vector, independent of i,j )
+    //but it requires a temp var to store a result. the result is then placed to grad_s,
+    //or should it be an extra temp var instead of an extra loop?
+    constexpr int num_iters = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, num_iters),
+                       [&](const int loop_idx) {
+      const int igp = loop_idx / NP;
+      const int jgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& grad_s0 = grad_s(0,igp,jgp,ilev);
+        const auto& grad_s1 = grad_s(1,igp,jgp,ilev);
+        sphere_buf(0,igp,jgp,ilev) = tensorVisc(0,0,igp,jgp) * grad_s0 + tensorVisc(1,0,igp,jgp) * grad_s1;
+        sphere_buf(1,igp,jgp,ilev) = tensorVisc(0,1,igp,jgp) * grad_s0 + tensorVisc(1,1,igp,jgp) * grad_s1;
+      });
+    });
+    kv.team_barrier();
+
+    divergence_sphere_wk<NUM_LEV_OUT,NUM_LEV_REQUEST,NUM_LEV_REQUEST>(kv, sphere_buf, laplace);
+  }//end of laplace_tensor
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  curl_sphere_wk_testcov (const KernelVariables &kv,
+                          const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                          const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& curls,
+                          const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D = Homme::subview(m_d, kv.ie);
+    vector_buf<NUM_LEV_OUT> sphere_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) {
+      const int ngp = loop_idx / NP;
+      const int mgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        auto& sb0 = sphere_buf(0, ngp, mgp, ilev);
+        auto& sb1 = sphere_buf(1, ngp, mgp, ilev);
+        sb0 = 0;
+        sb1 = 0;
+        for (int jgp = 0; jgp < NP; ++jgp) {
+          sb0 -= m_mp(jgp,mgp)*scalar(jgp,mgp,ilev)*dvv(jgp,ngp);
+          sb1 += m_mp(ngp,jgp)*scalar(ngp,jgp,ilev)*dvv(jgp,mgp);
+        }
+      });
+    });
+    kv.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP; //slowest
+      const int jgp = loop_idx % NP; //fastest
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& sb0 = sphere_buf(0, igp, jgp, ilev);
+        const auto& sb1 = sphere_buf(1, igp, jgp, ilev);
+        curls(0,igp,jgp,ilev) = (D(0,0,igp,jgp) * sb0 + D(1,0,igp,jgp) * sb1) * m_scale_factor_inv;
+        curls(1,igp,jgp,ilev) = (D(0,1,igp,jgp) * sb0 + D(1,1,igp,jgp) * sb1) * m_scale_factor_inv;
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  curl_sphere_wk_testcov (const KernelVariables &kv,
+                          const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                          const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& curls) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    curl_sphere_wk_testcov<NUM_LEV_OUT,NUM_LEV_IN>(kv, scalar, curls, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  curl_sphere_wk_testcov_update (const KernelVariables &kv, const Real alpha, const Real beta,
+                                 const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                                 const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& curls,
+                                 const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D = Homme::subview(m_d, kv.ie);
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) {
+      const int ngp = loop_idx / NP;
+      const int mgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar sb0, sb1;
+        for (int jgp = 0; jgp < NP; ++jgp) {
+          sb0 -= m_mp(jgp,mgp)*scalar(jgp,mgp,ilev)*dvv(jgp,ngp);
+          sb1 += m_mp(ngp,jgp)*scalar(ngp,jgp,ilev)*dvv(jgp,mgp);
+        }
+        curls(0,ngp,mgp,ilev) = beta*curls(0,ngp,mgp,ilev) + alpha *
+                                ( D(0,0,ngp,mgp)*sb0 + D(1,0,ngp,mgp)*sb1 )
+                                * m_scale_factor_inv;
+        curls(1,ngp,mgp,ilev) = beta*curls(1,ngp,mgp,ilev) + alpha *
+                                ( D(0,1,ngp,mgp)*sb0 + D(1,1,ngp,mgp)*sb1 )
+                              * m_scale_factor_inv;
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  curl_sphere_wk_testcov_update (const KernelVariables &kv, const Real alpha, const Real beta,
+                                 const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                                 const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& curls) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    curl_sphere_wk_testcov_update<NUM_LEV_OUT,NUM_LEV_IN>(kv, alpha, beta, scalar, curls, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  grad_sphere_wk_testcov (const KernelVariables &kv,
+                          const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                          const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& grads,
+                          const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& D = Homme::subview(m_d, kv.ie);
+    const auto& metinv = Homme::subview(m_metinv, kv.ie);
+    const auto& metdet = Homme::subview(m_metdet, kv.ie);
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) {
+      const int ngp = loop_idx / NP;
+      const int mgp = loop_idx % NP;
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        Scalar b0, b1;
+        for (int jgp = 0; jgp < NP; ++jgp) {
+          const auto& mpnj = m_mp(ngp,jgp);
+          const auto& mpjm = m_mp(jgp,mgp);
+          const auto& md = metdet(ngp,mgp);
+          const auto& snj = scalar(ngp,jgp,ilev);
+          const auto& sjm = scalar(jgp,mgp,ilev);
+          const auto& djm = dvv(jgp,mgp);
+          const auto& djn = dvv(jgp,ngp);
+          b0 -= (mpnj * metinv(0,0,ngp,mgp) * md * snj * djm +
+                 mpjm * metinv(0,1,ngp,mgp) * md * sjm * djn);
+          b1 -= (mpnj * metinv(1,0,ngp,mgp) * md * snj * djm +
+                 mpjm * metinv(1,1,ngp,mgp) * md * sjm * djn);
+        }
+        grads(0,ngp,mgp,ilev) = (D(0,0,ngp,mgp) * b0 + D(1,0,ngp,mgp) * b1) * m_scale_factor_inv;
+        grads(1,ngp,mgp,ilev) = (D(0,1,ngp,mgp) * b0 + D(1,1,ngp,mgp) * b1) * m_scale_factor_inv;
+      });
+    });
+    kv.team_barrier();
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  grad_sphere_wk_testcov (const KernelVariables &kv,
+                          const typename ViewConst<ExecViewUnmanaged<Scalar [NP][NP][NUM_LEV_IN]>>::type& scalar,
+                          const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& grads) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    grad_sphere_wk_testcov<NUM_LEV_OUT,NUM_LEV_IN>(kv, scalar, grads, NUM_LEV_REQUEST);
+  }
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vlaplace_sphere_wk_cartesian (const KernelVariables &kv,
+                                const ExecViewUnmanaged<const Real [2][2][NP][NP]>&          tensorVisc,
+                                const ExecViewUnmanaged<const Real [2][3][NP][NP]>&          vec_sph2cart,
+                                const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& vector,
+                                const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& laplace) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& spheremp = Homme::subview(m_spheremp, kv.ie);
+    scalar_buf<NUM_LEV_REQUEST> laplace0(Homme::subview(scalar_buf_ml,kv.team_idx,0).data());
+    scalar_buf<NUM_LEV_REQUEST> laplace1(Homme::subview(scalar_buf_ml,kv.team_idx,1).data());
+    scalar_buf<NUM_LEV_REQUEST> laplace2(Homme::subview(scalar_buf_ml,kv.team_idx,2).data());
+    constexpr int np_squared = NP * NP;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP; //slowest
+      const int jgp = loop_idx % NP; //fastest
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        const auto& v0 = vector(0,igp,jgp,ilev);
+        const auto& v1 = vector(1,igp,jgp,ilev);
+        laplace0(igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*v0 + vec_sph2cart(1,0,igp,jgp)*v1 ;
+        laplace1(igp,jgp,ilev) = vec_sph2cart(0,1,igp,jgp)*v0 + vec_sph2cart(1,1,igp,jgp)*v1 ;
+        laplace2(igp,jgp,ilev) = vec_sph2cart(0,2,igp,jgp)*v0 + vec_sph2cart(1,2,igp,jgp)*v1 ;
+      });
+    });
+    kv.team_barrier();
+
+    // Use laplace* as input, and then overwrite it with the output (saves temporaries)
+    laplace_tensor<NUM_LEV_REQUEST,NUM_LEV_REQUEST,NUM_LEV_REQUEST>(kv,tensorVisc,laplace0,laplace0);
+    laplace_tensor<NUM_LEV_REQUEST,NUM_LEV_REQUEST,NUM_LEV_REQUEST>(kv,tensorVisc,laplace1,laplace1);
+    laplace_tensor<NUM_LEV_REQUEST,NUM_LEV_REQUEST,NUM_LEV_REQUEST>(kv,tensorVisc,laplace2,laplace2);
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                         [&](const int loop_idx) {
+      const int igp = loop_idx / NP; //slowest
+      const int jgp = loop_idx % NP; //fastest
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+#define UNDAMPRRCART
+#ifdef UNDAMPRRCART
+        laplace(0,igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*laplace0(igp,jgp,ilev)
+                                + vec_sph2cart(0,1,igp,jgp)*laplace1(igp,jgp,ilev)
+                                + vec_sph2cart(0,2,igp,jgp)*laplace2(igp,jgp,ilev)
+                                + 2.0*spheremp(igp,jgp)*vector(0,igp,jgp,ilev)
+                                        *(m_laplacian_rigid_factor)*(m_laplacian_rigid_factor);
+
+        laplace(1,igp,jgp,ilev) = vec_sph2cart(1,0,igp,jgp)*laplace0(igp,jgp,ilev)
+                                + vec_sph2cart(1,1,igp,jgp)*laplace1(igp,jgp,ilev)
+                                + vec_sph2cart(1,2,igp,jgp)*laplace2(igp,jgp,ilev)
+                                + 2.0*spheremp(igp,jgp)*vector(1,igp,jgp,ilev)
+                                        *(m_laplacian_rigid_factor)*(m_laplacian_rigid_factor);
+#else
+        laplace(0,igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*laplace0(igp,jgp,ilev)
+                                + vec_sph2cart(0,1,igp,jgp)*laplace1(igp,jgp,ilev)
+                                + vec_sph2cart(0,2,igp,jgp)*laplace2(igp,jgp,ilev);
+        laplace(1,igp,jgp,ilev) = vec_sph2cart(1,0,igp,jgp)*laplace0(igp,jgp,ilev)
+                                + vec_sph2cart(1,1,igp,jgp)*laplace1(igp,jgp,ilev)
+                                + vec_sph2cart(1,2,igp,jgp)*laplace2(igp,jgp,ilev);
+#endif
+      });
+    });
+    kv.team_barrier();
+  } // end of vlaplace_sphere_wk_cartesian
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vlaplace_sphere_wk_contra (const KernelVariables &kv, const Real nu_ratio,
+                             const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& vector,
+                             const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& laplace,
+                             const int NUM_LEV_REQUEST) const
+  {
+    assert(NUM_LEV_REQUEST>=0);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_IN);
+    assert(NUM_LEV_REQUEST<=NUM_LEV_OUT);
+
+    // Make sure the buffers have been created
+    assert (vector_buf_ml.size()>0);
+
+    const auto& spheremp = Homme::subview(m_spheremp, kv.ie);
+    scalar_buf<NUM_LEV_OUT> div (Homme::subview(scalar_buf_ml,kv.team_idx,0).data());
+    scalar_buf<NUM_LEV_OUT> vort(Homme::subview(scalar_buf_ml,kv.team_idx,0).data());
+    vector_buf<NUM_LEV_OUT> grad_curl_cov(Homme::subview(vector_buf_ml,kv.team_idx,1).data());
+    constexpr int np_squared = NP * NP;
+
+    // grad(div(v))
+    divergence_sphere_nlev(kv,vector,div,NUM_LEV_REQUEST);
+    if (nu_ratio>0 && nu_ratio!=1.0) {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                          [&](const int loop_idx) {
+        const int igp = loop_idx / NP; //slow
+        const int jgp = loop_idx % NP; //fast
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+          div(igp,jgp,ilev) *= nu_ratio;
+        });
+      });
+      kv.team_barrier();
+    }
+    grad_sphere_wk_testcov<NUM_LEV_OUT,NUM_LEV_OUT>(kv,div,grad_curl_cov,NUM_LEV_REQUEST);
+
+    // curl(curl(v))
+    vorticity_sphere<NUM_LEV_OUT,NUM_LEV_IN>(kv,vector,vort,NUM_LEV_REQUEST);
+    curl_sphere_wk_testcov_update<NUM_LEV_OUT,NUM_LEV_OUT>(kv,-1.0,1.0,vort,grad_curl_cov,NUM_LEV_REQUEST);
+
+    const auto re2 = m_laplacian_rigid_factor*m_laplacian_rigid_factor;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared),
+                        [&](const int loop_idx) {
+      const int igp = loop_idx / NP; //slow
+      const int jgp = loop_idx % NP; //fast
+      const auto f = 2.0*spheremp(igp,jgp);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) {
+        laplace(0,igp,jgp,ilev) = f*vector(0,igp,jgp,ilev)*re2 + grad_curl_cov(0,igp,jgp,ilev);
+        laplace(1,igp,jgp,ilev) = f*vector(1,igp,jgp,ilev)*re2 + grad_curl_cov(1,igp,jgp,ilev);
+      });
+     });
+     kv.team_barrier();
+  }//end of vlaplace_sphere_wk_contra
+
+  template<int NUM_LEV_OUT, int NUM_LEV_IN = NUM_LEV_OUT, int NUM_LEV_REQUEST = NUM_LEV_OUT>
+  KOKKOS_INLINE_FUNCTION void
+  vlaplace_sphere_wk_contra (const KernelVariables &kv, const Real nu_ratio,
+                             const typename ViewConst<ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_IN]>>::type& vector,
+                             const ExecViewUnmanaged<Scalar [2][NP][NP][NUM_LEV_OUT]>& laplace) const
+  {
+    static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n");
+    static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n");
+
+    vlaplace_sphere_wk_contra<NUM_LEV_OUT,NUM_LEV_IN>(kv, nu_ratio, vector, laplace, NUM_LEV_REQUEST);
+  }//end of vlaplace_sphere_wk_contra
+
+  // The buffers should be enough to handle any single call to any
+  // single sphere operator.
+  // One might prefer them to be private, but they are handy for
+  // users of SphereOperators for other things, and using the same
+  // memory buffers for multiple things gives performance in mem
+  // b/w-limited computations.
+
+  ExecViewManaged<Real   * [NUM_2D_VECTOR_BUFFERS][2][NP][NP]>                vector_buf_sl;
+  ExecViewManaged<Scalar * [NUM_3D_SCALAR_BUFFERS][NP][NP][MAX_NUM_LEV]>      scalar_buf_ml;
+  ExecViewManaged<Scalar * [NUM_3D_VECTOR_BUFFERS][2][NP][NP][MAX_NUM_LEV]>   vector_buf_ml;
+
+
+  ExecViewManaged<const Real [NP][NP]>          dvv;
+  ExecViewManaged<const Real [NP][NP]>          m_mp;
+
+  ExecViewManaged<const Real * [NP][NP]>        m_spheremp;
+  ExecViewManaged<const Real * [NP][NP]>        m_rspheremp;
+  ExecViewManaged<const Real * [2][2][NP][NP]>  m_metinv;
+  ExecViewManaged<const Real * [NP][NP]>        m_metdet;
+  ExecViewManaged<const Real * [2][2][NP][NP]>  m_d;
+  ExecViewManaged<const Real * [2][2][NP][NP]>  m_dinv;
+
+  Real m_scale_factor_inv, m_laplacian_rigid_factor;
+};
+
+static constexpr int NPNP = NP * NP;
+
+#ifdef KOKKOS_ENABLE_CUDA
+using TeamPolicy = Kokkos::TeamPolicy<ExecSpace,Kokkos::LaunchBounds<1024,1> >;
+#else
+using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
+#endif
+
+#ifndef WARP_SIZE
+
+#if defined(KOKKOS_ENABLE_HIP)
+#define WARP_SIZE warpSize
+#elif defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL)
+#define WARP_SIZE 32
+#else
+#define WARP_SIZE 1
+#endif
+
+#endif
+
+#if (WARP_SIZE == 1)
+
+#define SPHERE_BLOCK_START3(B,X,Y,Z) \
+  Scalar X; Scalar sbo##X[NP][NP][NUM_LEV]; \
+  Scalar Y; Scalar sbo##Y[NP][NP][NUM_LEV]; \
+  Scalar Z; Scalar sbo##Z[NP][NP][NUM_LEV]; \
+  for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \
+    B.update(ix, iy); \
+    for (int iz = 0; iz < NUM_LEV; iz++) { \
+      B.z = iz;
+
+#define SPHERE_BLOCK_START0(B) \
+  for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \
+    B.update(ix,iy); \
+    for (int iz = 0; iz < NUM_LEV; iz++) { \
+      B.z = iz;
+
+#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) \
+      sbo##X[B.x][B.y][B.z] = X; \
+      sbo##Y[B.x][B.y][B.z] = Y; \
+      sbo##Z[B.x][B.y][B.z] = Z; \
+    }\
+  } \
+  for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \
+    B.update(ix,iy); \
+    for (int iz = 0; iz < NUM_LEV; iz++) { \
+      B.z = iz; \
+      X = sbo##X[B.x][B.y][B.z]; \
+      Y = sbo##Y[B.x][B.y][B.z]; \
+      Z = sbo##Z[B.x][B.y][B.z];
+
+#define SPHERE_BLOCK_MIDDLE0(B) \
+    } \
+  } \
+  for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \
+    B.update(ix,iy); \
+    for (int iz = 0; iz < NUM_LEV; iz++) { \
+      B.z = iz;
+
+#define SPHERE_BLOCK_END() } }
+
+#else
+
+static_assert(VECTOR_SIZE == 1, "VECTOR_SIZE != 1");
+
+// SPHERE_BLOCK macros for GPU (WARP_SIZE > 1).
+//
+// When NUM_PHYSICAL_LEV is not a multiple of WARP_SIZE (e.g. nlev=30, warp=32),
+// threads with B.z >= NUM_LEV are padding threads.  The original formulation
+// used an early `return` before the team_barrier(), which violates the CUDA
+// requirement that all threads in a CTA must reach __syncthreads().  This
+// caused non-deterministic results between otherwise-identical runs.
+//
+// Fix: guard the body with `if (B.z < NUM_LEV)` blocks so that all 512
+// threads always reach the barrier.  Local variables X/Y/Z are declared in
+// outer scope (default-initialized) so they remain accessible after the
+// barrier for the active (z < NUM_LEV) threads.
+
+#define SPHERE_BLOCK_START3(B,X,Y,Z) \
+      Scalar X = {}, Y = {}, Z = {}; \
+      if (B.z < NUM_LEV) {
+#define SPHERE_BLOCK_START0(B) if (B.z < NUM_LEV) {
+
+#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) } B.t.team_barrier(); if (B.z < NUM_LEV) {
+#define SPHERE_BLOCK_MIDDLE0(B)       } B.t.team_barrier(); if (B.z < NUM_LEV) {
+
+#define SPHERE_BLOCK_END() }
+
+static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE;
+static constexpr int SPHERE_BLOCK = NPNP * SPHERE_BLOCK_LEV;
+static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1;
+
+#endif
+
+namespace SphereOuter {
+#if WARP_SIZE == 1
+  using Team = TeamPolicy::member_type;
+  template <typename F>
+  void parallel_for(const int num_elems, F f) {
+    Kokkos::parallel_for(
+      __PRETTY_FUNCTION__,
+      TeamPolicy(num_elems, 1),
+      f);
+  }
+#else
+  using Team = int;
+  template <typename F>
+  void parallel_for(const int num_elems, F f) {
+    f(num_elems);
+  }
+#endif
+}
+
+struct SphereGlobal {
+
+  const ExecViewManaged<const Real *[2][2][NP][NP]> d;
+  const ExecViewManaged<const Real *[2][2][NP][NP]> dinv;
+  const ExecViewManaged<const Real [NP][NP]> dvv;
+  const ExecViewManaged<const Real *[NP][NP]> metdet;
+  const Real scale_factor_inv;
+
+  SphereGlobal(const SphereOperators &op):
+    d(op.m_d),
+    dinv(op.m_dinv),
+    dvv(op.dvv),
+    metdet(op.m_metdet),
+    scale_factor_inv(op.m_scale_factor_inv)
+  {}
+};
+
+struct SphereBlockOps;
+
+#if (WARP_SIZE == 1)
+
+struct SphereBlockScratch {
+  const SphereBlockOps &b;
+  Scalar v[NP][NP][NUM_LEV];
+  KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &);
+  KOKKOS_INLINE_FUNCTION const Scalar &sv(int x, int y) const;
+  KOKKOS_INLINE_FUNCTION Scalar &sv(int x, int y);
+};
+
+#else
+
+using SphereBlockScratchView = Kokkos::View<
+  Scalar[SPHERE_BLOCK_LEV][NP][NP],
+  ExecSpace::scratch_memory_space,
+  Kokkos::MemoryTraits<Kokkos::Unmanaged>
+  >;
+
+using SphereBlockScratchSubview = Kokkos::Subview<
+  SphereBlockScratchView, int,
+  std::remove_const_t<decltype(Kokkos::ALL)>,
+  std::remove_const_t<decltype(Kokkos::ALL)>
+  >;
+
+struct SphereBlockScratch {
+  SphereBlockScratchView v;
+  SphereBlockScratchSubview sv;
+  KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &b);
+};
+
+#endif
+
+struct SphereBlockOps {
+  using Team = TeamPolicy::member_type;
+
+  const SphereGlobal &g;
+  const Team &t;
+  Real scale_factor_inv;
+  Real metdet;
+  Real rrdmd;
+  Real dinv[2][2];
+  Real dvvx[NP];
+  Real dvvy[NP];
+  int e,x,y,z;
+
+  KOKKOS_INLINE_FUNCTION SphereBlockOps(const SphereGlobal &sg, const Team &team):
+    g(sg),
+    t(team),
+    scale_factor_inv(g.scale_factor_inv)
+  {
+#if (WARP_SIZE == 1)
+    e = t.league_rank();
+    x = y = z = 0;
+#else
+    const int lr = t.league_rank();
+    e = lr / SPHERE_BLOCKS_PER_COL;
+    const int iw = lr % SPHERE_BLOCKS_PER_COL;
+    const int tr = t.team_rank();
+    const int ixy = tr / SPHERE_BLOCK_LEV;
+    const int dz = tr % SPHERE_BLOCK_LEV;
+    z = dz + iw * SPHERE_BLOCK_LEV;
+    update(ixy / NP, ixy % NP);
+#endif
+  }
+
+  KOKKOS_INLINE_FUNCTION void update(const int ix, const int iy)
+  {
+    rrdmd = 0;
+    x = ix;
+    y = iy;
+    metdet = g.metdet(e,x,y);
+    for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = g.dinv(e,i,j,x,y);
+    for (int j = 0; j < NP; j++) {
+      dvvx[j] = g.dvv(x,j);
+      dvvy[j] = g.dvv(y,j);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar div(const SphereBlockScratch &t0, const SphereBlockScratch &t1)
+  {
+    // Separately compute du and dv to match Fortran bfb
+    Scalar du = 0;
+    Scalar dv = 0;
+    for (int j = 0; j < NP; j++) {
+      du += dvvy[j] * t0.sv(x,j);
+      dv += dvvx[j] * t1.sv(j,y);
+    }
+    if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv;
+    return (du + dv) * rrdmd;
+  }
+
+  KOKKOS_INLINE_FUNCTION void divInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const
+  {
+    t0.sv(x,y) = (dinv[0][0] * v0 + dinv[1][0] * v1) * metdet;
+    t1.sv(x,y) = (dinv[0][1] * v0 + dinv[1][1] * v1) * metdet;
+  }
+
+  KOKKOS_INLINE_FUNCTION void grad(Scalar &g0, Scalar &g1, const SphereBlockScratch &t) const
+  {
+    Scalar s0 = 0;
+    Scalar s1 = 0;
+    for (int j = 0; j < NP; j++) {
+      s0 += dvvy[j] * t.sv(x,j);
+      s1 += dvvx[j] * t.sv(j,y);
+    }
+    s0 *= scale_factor_inv;
+    s1 *= scale_factor_inv;
+    g0 = dinv[0][0] * s0 + dinv[0][1] * s1;
+    g1 = dinv[1][0] * s0 + dinv[1][1] * s1;
+  }
+
+  KOKKOS_INLINE_FUNCTION void gradInit(SphereBlockScratch &t0, const Scalar v0)
+  {
+    t0.sv(x,y) = v0;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar vort(const SphereBlockScratch &t0, const SphereBlockScratch &t1)
+  {
+    Scalar du = 0;
+    Scalar dv = 0;
+    for (int j = 0; j < NP; j++) {
+      du += dvvx[j] * t0.sv(j,y);
+      dv += dvvy[j] * t1.sv(x,j);
+    }
+    if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv;
+    return (dv - du) * rrdmd;
+  }
+
+  KOKKOS_INLINE_FUNCTION void vortInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const
+  {
+    t0.sv(x,y) = g.d(e,0,0,x,y) * v0 + g.d(e,0,1,x,y) * v1;
+    t1.sv(x,y) = g.d(e,1,0,x,y) * v0 + g.d(e,1,1,x,y) * v1;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar *const here, const Scalar *const above) const
+  {
+    if (VECTOR_SIZE == 1) return above[0];
+    Scalar out;
+    for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1];
+    if (z < NUM_LEV_P-1) out[VECTOR_END] = above[0][0];
+    return out;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const here, const Scalar *const above) const
+  {
+    if (VECTOR_SIZE == 1) return (z < NUM_LEV-1) ? above[0] : top;
+    Scalar out;
+    for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1];
+    if (z < NUM_LEV-1) out[VECTOR_END] = above[0][0];
+    else out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE];
+    return out;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar zbelow(const Scalar bottom, const Scalar *const below, const Scalar *const here) const
+  {
+    if (VECTOR_SIZE == 1) return (z > 0) ? below[0] : bottom;
+    Scalar out;
+    for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1];
+    if (z > 0) out[0] = below[0][VECTOR_END];
+    else out[0] = bottom[0];
+    return out;
+  }
+
+#if (WARP_SIZE == 1)
+  template <typename F>
+  static void parallel_for(const Team &team, int /*num_scratch*/, F f)
+  {
+    f(team);
+  }
+#else
+  template <typename F>
+  static void parallel_for(const int num_elems, const int num_scratch, F f)
+  {
+    Kokkos::parallel_for(
+      __PRETTY_FUNCTION__,
+      TeamPolicy(num_elems * SPHERE_BLOCKS_PER_COL, SPHERE_BLOCK).
+      set_scratch_size(0, Kokkos::PerTeam(num_scratch * SphereBlockScratchView::shmem_size())),
+      f);
+  }
+#endif
+
+};
+
+#if (WARP_SIZE == 1)
+
+KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b):
+  b(b)
+{}
+
+KOKKOS_INLINE_FUNCTION const Scalar &SphereBlockScratch::sv(const int x, const int y) const
+{ return v[x][y][b.z]; }
+
+KOKKOS_INLINE_FUNCTION Scalar &SphereBlockScratch::sv(const int x, const int y)
+{ return v[x][y][b.z]; }
+
+#else
+
+KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b):
+  v(b.t.team_scratch(0)),
+  sv(Kokkos::subview(v, b.z % SPHERE_BLOCK_LEV, Kokkos::ALL, Kokkos::ALL))
+{}
+
+#endif
+
+struct SphereCol {
+  int e,x,y,z;
+
+#if (WARP_SIZE == 1)
+  struct Team { int e,x,y,z; };
+  KOKKOS_INLINE_FUNCTION SphereCol(const Team &team):
+    e(team.e),
+    x(team.x),
+    y(team.y),
+    z(team.z)
+  {}
+
+  template <typename F>
+  static void parallel_for(const SphereOuter::Team &outer, const int num_lev, F f)
+  {
+    const int e = outer.league_rank();
+    for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) {
+      Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(outer, num_lev),
+        [=] (const int z) { f(Team{e,x,y,z}); });
+    }
+  }
+#else
+  using Team = TeamPolicy::member_type;
+
+  KOKKOS_INLINE_FUNCTION SphereCol(const Team &team)
+  {
+    const int lr = team.league_rank();
+    e = lr / NPNP;
+    const int xy = lr % NPNP;
+    x = xy / NP;
+    y = xy % NP;
+    z = team.team_rank();
+  }
+
+  template <typename F>
+  static void parallel_for(const int num_elems, const int num_lev, F f)
+  {
+    Kokkos::parallel_for(
+      __PRETTY_FUNCTION__,
+      TeamPolicy(num_elems * NPNP, num_lev),
+      f);
+  }
+#endif
+
+  KOKKOS_INLINE_FUNCTION Scalar zplus(const Scalar *const here, const Scalar *const above) const
+  {
+    Scalar out;
+    for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1];
+    out[VECTOR_END] = above[0][0];
+    return out;
+  }
+};
+
+struct SphereColOps: SphereCol {
+  Real scale_factor_inv;
+  Real dinv[2][2];
+  Real dvvx[NP];
+  Real dvvy[NP];
+
+  KOKKOS_INLINE_FUNCTION SphereColOps(const SphereGlobal &sg, const Team &team):
+    SphereCol(team)
+  {
+    scale_factor_inv = sg.scale_factor_inv;
+    for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = sg.dinv(e,i,j,x,y);
+    for (int j = 0; j < NP; j++) {
+      dvvx[j] = sg.dvv(x,j);
+      dvvy[j] = sg.dvv(y,j);
+    }
+  }
+
+  template <typename OutView, typename InView>
+  KOKKOS_INLINE_FUNCTION void grad(OutView &out, const InView &in, const int n) const
+  {
+    Scalar s0 = 0;
+    Scalar s1 = 0;
+    for (int j = 0; j < NP; j++) {
+      s0 += dvvy[j] * in(e,n,x,j,z);
+      s1 += dvvx[j] * in(e,n,j,y,z);
+    }
+    s0 *= scale_factor_inv;
+    s1 *= scale_factor_inv;
+    out(e,0,x,y,z) = dinv[0][0] * s0 + dinv[0][1] * s1;
+    out(e,1,x,y,z) = dinv[1][0] * s0 + dinv[1][1] * s1;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const in) const
+  {
+    Scalar out = *in;
+    if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE];
+    return out;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar zbelow(const Real bottom, const Scalar *const below, const Scalar *const here) const
+  {
+    Scalar out;
+    out[0] = (z == 0) ? bottom : below[0][VECTOR_END];
+    for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1];
+    return out;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar ztop(const Real top, const Real in) const
+  {
+    Scalar out = in;
+    if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top;
+    return out;
+  }
+
+  KOKKOS_INLINE_FUNCTION Scalar ztrim(const Scalar bottom, const Scalar top, const Scalar in) const
+  {
+    Scalar out = in;
+    if (z == 0) out[0] = bottom[0];
+    if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE];
+    return out;
+  }
+};
+
+struct SphereScanOps {
+  TeamPolicy::member_type t;
+  int e,x,y;
+
+#if (WARP_SIZE == 1)
+  struct Team {
+    TeamPolicy::member_type t;
+    int e,x,y;
+  };
+
+  KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team):
+    t(team.t),
+    e(team.e),
+    x(team.x),
+    y(team.y)
+  {}
+
+  template <typename F>
+  static void parallel_for(const SphereOuter::Team &outer, F f)
+  {
+    const int e = outer.league_rank();
+    for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) {
+      f(Team{outer,e,x,y});
+    }
+  }
+#else
+  using Team = TeamPolicy::member_type;
+
+  KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team):
+    t(team)
+  {
+    const int lr = t.league_rank();
+    const int tr = t.team_rank();
+    e = lr;
+    x = tr / NP;
+    y = tr % NP;
+  }
+
+  template <typename F>
+  static void parallel_for(const int num_elems, F f)
+  {
+    Kokkos::parallel_for(
+      __PRETTY_FUNCTION__,
+      TeamPolicy(num_elems, NPNP, WARP_SIZE),
+      f);
+  }
+#endif
+
+  template <typename OutView, typename InView, typename EndView>
+  KOKKOS_INLINE_FUNCTION void nacs(OutView &out, const int n, const InView &in, const EndView &end) const
+  {
+    Dispatch<>::parallel_scan(
+      t, NUM_PHYSICAL_LEV,
+      [&](const int a, Real &sum, const bool last) {
+        if (a == 0) out(e,n,x,y,NUM_PHYSICAL_LEV) = sum = end(e,x,y);
+        const int z = NUM_PHYSICAL_LEV-a-1;
+        sum += in(e,x,y,z);
+        if (last) out(e,n,x,y,z) = sum;
+      });
+  }
+
+  template <typename OutView, typename InView>
+  KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const Real zero) const
+  {
+    Dispatch<>::parallel_scan(
+      t, NUM_PHYSICAL_LEV,
+      [&](const int z, Real &sum, const bool last) {
+        if (z == 0) out(e,x,y,0) = sum = zero;
+        sum += in(e,x,y,z);
+        if (last) out(e,x,y,z+1) = sum;
+      });
+  }
+
+  template <typename OutView, typename InView>
+  KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const int n, const Real zero) const
+  {
+    Dispatch<>::parallel_scan(
+      t, NUM_PHYSICAL_LEV,
+      [&](const int z, Real &sum, const bool last) {
+        if (z == 0) out(e,x,y,0) = sum = zero;
+        sum += in(e,n,x,y,z);
+        if (last) out(e,x,y,z+1) = sum;
+      });
+  }
+};
+
+} // namespace Homme
+
+#endif // HOMMEXX_SPHERE_OPERATORS_HPP
diff --git a/components/homme/src/share/cxx/utilities/ViewUtils.hpp b/components/homme/src/share/cxx/utilities/ViewUtils.hpp
index 1b3efcd91020..9ccf3d70fae1 100644
--- a/components/homme/src/share/cxx/utilities/ViewUtils.hpp
+++ b/components/homme/src/share/cxx/utilities/ViewUtils.hpp
@@ -67,6 +67,18 @@ viewAsReal(ViewType<ScalarType [DIM1][DIM2][DIM3], Properties...> v_in) {
   return ReturnView(reinterpret_cast<ReturnST*>(v_in.data()));
 }
 
+template <typename ScalarType, int DIM1, int DIM2, int DIM3, typename... Properties>
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if<std::is_same<typename std::remove_const<ScalarType>::type,Scalar>::value,
+               Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>
+              >::type
+viewAsReal(ViewType<ScalarType *[DIM1][DIM2][DIM3], Properties...> v_in) {
+  using ReturnST = RealType<ScalarType>;
+  using ReturnView = Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>;
+  return ReturnView(reinterpret_cast<ReturnST*>(v_in.data()), v_in.extent(0));
+}
+
 template <typename ScalarType, int DIM1, int DIM2, int DIM3, int DIM4, typename... Properties>
 KOKKOS_INLINE_FUNCTION
 typename
@@ -79,6 +91,30 @@ viewAsReal(ViewType<ScalarType [DIM1][DIM2][DIM3][DIM4], Properties...> v_in) {
   return ReturnView(reinterpret_cast<ReturnST*>(v_in.data()));
 }
 
+template <typename ScalarType, int DIM1, int DIM2, int DIM3, int DIM4, typename... Properties>
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if<std::is_same<typename std::remove_const<ScalarType>::type,Scalar>::value,
+               Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>
+              >::type
+viewAsReal(ViewType<ScalarType*[DIM1][DIM2][DIM3][DIM4], Properties...> v_in) {
+  using ReturnST = RealType<ScalarType>;
+  using ReturnView = Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>;
+  return ReturnView(reinterpret_cast<ReturnST*>(v_in.data()), v_in.extent(0));
+}
+
+template <typename ScalarType, int DIM1, int DIM2, int DIM3, int DIM4, int DIM5, typename... Properties>
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if<std::is_same<typename std::remove_const<ScalarType>::type,Scalar>::value,
+               Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>
+              >::type
+viewAsReal(ViewType<ScalarType*[DIM1][DIM2][DIM3][DIM4][DIM5], Properties...> v_in) {
+  using ReturnST = RealType<ScalarType>;
+  using ReturnView = Unmanaged<ViewType<RealType<ScalarType>*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>;
+  return ReturnView(reinterpret_cast<ReturnST*>(v_in.data()), v_in.extent(0));
+}
+
 // Structure to define the type of a view that has a const data type,
 // given the type of an input view
 template<typename ViewT>
diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt
index 1909cc39cd4e..fd1ea51cc301 100644
--- a/components/homme/src/theta-l_kokkos/CMakeLists.txt
+++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt
@@ -184,6 +184,10 @@ MACRO(THETAL_KOKKOS_SETUP)
     ${SRC_SHARE_DIR}/cxx/utilities/Hash.cpp
   )
 
+  IF (HOMMEXX_ENABLE_CAAR_OPT)
+    LIST(APPEND THETAL_DEPS_CXX ${TARGET_DIR}/cxx/CaarFunctorImpl.cpp)
+  ENDIF ()
+
   IF (HOMME_USE_ARKODE)
     SET(THETAL_DEPS_F90
       ${THETAL_DEPS_F90}
diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp
new file mode 100644
index 000000000000..e38a12215cfb
--- /dev/null
+++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp
@@ -0,0 +1,1412 @@
+/********************************************************************************
+ * HOMMEXX 1.0: Copyright of Sandia Corporation
+ * This software is released under the BSD license
+ * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory
+ *******************************************************************************/
+
+#ifndef HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP
+#define HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP
+
+#include "Types.hpp"
+#include "Elements.hpp"
+#include "ColumnOps.hpp"
+#include "Context.hpp"
+#include "EquationOfState.hpp"
+#include "FunctorsBuffersManager.hpp"
+#include "HybridVCoord.hpp"
+#include "KernelVariables.hpp"
+#include "LimiterFunctor.hpp"
+#include "ReferenceElement.hpp"
+#include "RKStageData.hpp"
+#include "SimulationParams.hpp"
+#include "SphereOperators-caar-opt.hpp"
+#include "kokkos_utils.hpp"
+
+#include "mpi/BoundaryExchange.hpp"
+#include "mpi/MpiBuffersManager.hpp"
+#include "utilities/SubviewUtils.hpp"
+#include "utilities/ViewUtils.hpp"
+
+#include "profiling.hpp"
+#include "ErrorDefs.hpp"
+
+#include <algorithm>
+#include <assert.h>
+
+namespace Homme {
+
+// Theta does not use tracers in caar. A fwd decl is enough here
+struct Tracers;
+
+struct CaarFunctorImpl {
+
+  struct Buffers {
+    static constexpr int num_3d_scalar_mid_buf = 10;
+    static constexpr int num_3d_vector_mid_buf =  5;
+    static constexpr int num_3d_scalar_int_buf =  6;
+    static constexpr int num_3d_vector_int_buf =  3;
+
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   temp;
+
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   pnh;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   pi;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   exner;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   div_vdp;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   phi;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   omega_p;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   vort;
+
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV]  >   grad_exner;
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV]  >   mgrad;
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV]  >   grad_tmp;
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV]  >   vdp;
+
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   dp_i;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   vtheta_i;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   dpnh_dp_i;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   eta_dot_dpdn;
+
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV_P]>   v_i;
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV_P]>   grad_phinh_i;
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV_P]>   grad_w_i;
+
+    ExecViewUnmanaged<Scalar* [2][NP][NP][NUM_LEV]  >   v_tens;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   theta_tens;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV]  >   dp_tens;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   w_tens;
+    ExecViewUnmanaged<Scalar*    [NP][NP][NUM_LEV_P]>   phi_tens;
+  };
+
+  using deriv_type = ReferenceElement::deriv_type;
+
+  RKStageData                 m_data;
+
+  // The 'm_data.scale2' coeff used for the calculation of w_tens and phi_tens must be replaced
+  // with 'm_data.scale1' on the surface (k=NUM_INTERFACE_LEV). To allow pack-level operations
+  // in the code, we store a pack containing [[scale2 scale2 ...]  scale1 [garbage] ], to be used
+  // at pack NUM_LEV_P
+
+  Scalar                      m_scale2g_last_int_pack;
+
+  const int           m_num_elems;
+  const int           m_rsplit;
+  const bool          m_theta_hydrostatic_mode;
+  const AdvectionForm m_theta_advection_form;
+  const bool          m_pgrad_correction;
+
+  HybridVCoord          m_hvcoord;
+  ElementsState         m_state;
+  ElementsDerivedState  m_derived;
+  ElementsGeometry      m_geometry;
+  EquationOfState       m_eos;
+  Buffers               m_buffers;
+  deriv_type            m_deriv;
+
+  SphereOperators       m_sphere_ops;
+
+  struct TagPreExchange {};
+  struct TagPostExchange {};
+
+  // Policies
+#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG)
+  template<typename Tag>
+  using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace,Kokkos::LaunchBounds<512,1>,Tag>;
+#else
+  template<typename Tag>
+  using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace,Tag>;
+#endif
+
+  TeamPolicyType<TagPreExchange>   m_policy_pre;
+
+  Kokkos::RangePolicy<ExecSpace, TagPostExchange> m_policy_post;
+
+  TeamUtils<ExecSpace> m_tu;
+
+  Kokkos::Array<std::shared_ptr<BoundaryExchange>, NUM_TIME_LEVELS> m_bes;
+
+  template <bool HYDROSTATIC, bool CONSERVATIVE> void epoch1_blockOps(const SphereOuter::Team &);
+  template <bool RSPLIT_ZERO> void epoch2_scanOps(const SphereOuter::Team &);
+  template <bool HYDROSTATIC, bool RSPLIT_ZERO> void epoch3_blockOps(const SphereOuter::Team &);
+  void epoch4_scanOps(const SphereOuter::Team &);
+  template <bool HYDROSTATIC, bool RSPLIT_ZERO> void epoch5_colOps(const SphereOuter::Team &);
+  template <bool HYDROSTATIC, bool RSPLIT_ZERO, bool PGRAD_CORRECTION> void epoch6_blockOps(const SphereOuter::Team &);
+  template <bool HYDROSTATIC, bool RSPLIT_ZERO> void epoch7_col(const SphereOuter::Team &);
+  void caar_compute();
+
+  public:
+  CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */,
+                  const ReferenceElement &ref_FE, const HybridVCoord &hvcoord,
+                  const SphereOperators &sphere_ops, const SimulationParams& params)
+      : m_num_elems(elements.num_elems())
+      , m_rsplit(params.rsplit)
+      , m_theta_hydrostatic_mode(params.theta_hydrostatic_mode)
+      , m_theta_advection_form(params.theta_adv_form)
+      , m_pgrad_correction(params.pgrad_correction)
+      , m_hvcoord(hvcoord)
+      , m_state(elements.m_state)
+      , m_derived(elements.m_derived)
+      , m_geometry(elements.m_geometry)
+      , m_deriv(ref_FE.get_deriv())
+      , m_sphere_ops(sphere_ops)
+      , m_policy_pre (Homme::get_default_team_policy<ExecSpace,TagPreExchange>(m_num_elems))
+      , m_policy_post (0,m_num_elems*NP*NP)
+      , m_tu(m_policy_pre)
+  {
+    // Initialize equation of state
+    m_eos.init(params.theta_hydrostatic_mode,m_hvcoord);
+
+    // Make sure the buffers in sph op are large enough for this functor's needs
+    m_sphere_ops.allocate_buffers(m_tu);
+  }
+
+  CaarFunctorImpl(const int num_elems, const SimulationParams& params)
+      : m_num_elems(num_elems)
+      , m_rsplit(params.rsplit)
+      , m_theta_hydrostatic_mode(params.theta_hydrostatic_mode)
+      , m_theta_advection_form(params.theta_adv_form)
+      , m_pgrad_correction(params.pgrad_correction)
+      , m_policy_pre (Homme::get_default_team_policy<ExecSpace,TagPreExchange>(m_num_elems))
+      , m_policy_post (0,num_elems*NP*NP)
+      , m_tu(m_policy_pre)
+  {}
+
+  void setup (const Elements &elements, const Tracers &/*tracers*/,
+              const ReferenceElement &ref_FE, const HybridVCoord &hvcoord,
+              const SphereOperators &sphere_ops)
+  {
+    assert(m_num_elems == elements.num_elems()); // Sanity check
+    m_hvcoord = hvcoord;
+    m_state = elements.m_state;
+    m_derived = elements.m_derived;
+    m_geometry = elements.m_geometry;
+    m_deriv = ref_FE.get_deriv();
+    m_sphere_ops = sphere_ops;
+
+    // Initialize equation of state
+    m_eos.init(m_theta_hydrostatic_mode,m_hvcoord);
+
+    // Make sure the buffers in sph op are large enough for this functor's needs
+    m_sphere_ops.allocate_buffers(m_tu);
+  }
+
+  int requested_buffer_size () const {
+    // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs.
+    //
+    // IMPORTANT: Why we use m_num_elems instead of m_tu.get_num_ws_slots() here.
+    //
+    // The operator()(TagPreExchange) path (used when HOMMEXX_ENABLE_CAAR_OPT is
+    // NOT defined, or in the #else branch) allocates buffers sized by
+    // m_tu.get_num_ws_slots() (= nslots = number of concurrent Kokkos teams).
+    // In that path, each team processes one element and uses its team workspace
+    // slot index (kv.team_idx, which is in [0, nslots)) as the first dimension
+    // into the buffer arrays.  At most nslots teams run simultaneously, so nslots
+    // entries suffice.
+    //
+    // The epoch path in CaarFunctorImpl.cpp (the caar_compute() function, which
+    // is selected by the `#if 1` block in run()) uses a DIFFERENT indexing
+    // strategy: it iterates over ALL elements and directly uses the element index
+    // (b.e / c.e, in [0, m_num_elems)) as the first buffer dimension.  For
+    // example: buffers_pnh(b.e, b.x, b.y, b.z).
+    //
+    // On serial CPU execution, m_tu.get_num_ws_slots() returns 1 (only one team
+    // can run at a time), but m_num_elems can be much larger (e.g. 7 for a
+    // 160-element mesh distributed over 24 MPI ranks).  If we allocate only
+    // nslots=1 entries, then any access with b.e > 0 is out-of-bounds into
+    // uninitialised (or zeroed) memory, causing NaN values in the buffers and
+    // ultimately NaN in the DIRK Newton loop's dphi/phi diagnostics.
+    //
+    // The fix is to size the first buffer dimension by m_num_elems so that the
+    // epoch path's direct element-index access is always in bounds.  On GPU,
+    // nslots == m_num_elems (all elements processed concurrently), so this
+    // change has no effect there.
+    const int nslots = m_num_elems;
+
+    int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf;
+    int num_scalar_int_buf = Buffers::num_3d_scalar_int_buf;
+    int num_vector_mid_buf = Buffers::num_3d_vector_mid_buf;
+    int num_vector_int_buf = Buffers::num_3d_vector_int_buf;
+
+    // Depending on rsplit/hydro-mode, we may remove some
+    // buffers that are not needed from the counters above.
+    if (m_theta_hydrostatic_mode) {
+      // pi=pnh, and no dpnh_dp_i/phitens
+      // NOTE: w_tens is still allocated in hydrostatic mode because epoch2_scanOps
+      // repurposes it as a scratch buffer for the omega integral scan (computing
+      // the prefix sum of div_vdp to get omega_i).  Without this buffer allocated,
+      // epoch2 and epoch3 would access an uninitialized view -> SIGSEGV.
+      num_scalar_mid_buf -= 1;
+      num_scalar_int_buf -= 2;
+
+      // No grad_w_i/v_i
+      num_vector_int_buf -= 2;
+    }
+    if (m_rsplit>0) {
+      // No theta_i/eta_dot_dpdn
+      num_scalar_int_buf -=2;
+    }
+
+    return num_scalar_mid_buf  *NP*NP*NUM_LEV  *VECTOR_SIZE*nslots
+         + num_scalar_int_buf  *NP*NP*NUM_LEV_P*VECTOR_SIZE*nslots
+         + num_vector_mid_buf*2*NP*NP*NUM_LEV  *VECTOR_SIZE*nslots
+         + num_vector_int_buf*2*NP*NP*NUM_LEV_P*VECTOR_SIZE*nslots;
+  }
+
+  void init_buffers (const FunctorsBuffersManager& fbm) {
+    Errors::runtime_check(fbm.allocated_size()>=requested_buffer_size(), "Error! Buffers size not sufficient.\n");
+
+    Scalar* mem = reinterpret_cast<Scalar*>(fbm.get_memory());
+    // Use m_num_elems (not m_tu.get_num_ws_slots()) as the first buffer
+    // dimension.  See the comment in requested_buffer_size() for the full
+    // explanation.  In short: the epoch path in CaarFunctorImpl.cpp accesses
+    // buffers with the element index (b.e / c.e) as the first dimension, so we
+    // must allocate m_num_elems slots regardless of how many Kokkos teams can
+    // run simultaneously.
+    const int nslots = m_num_elems;
+
+    // Midpoints scalars
+    m_buffers.pnh        = decltype(m_buffers.pnh       )(mem,nslots);
+    mem += m_buffers.pnh.size();
+    if (m_theta_hydrostatic_mode) {
+      // pi = pnh
+      m_buffers.pi = m_buffers.pnh;
+    } else {
+      m_buffers.pi       = decltype(m_buffers.pi        )(mem,nslots);
+      mem += m_buffers.pi.size();
+    }
+
+    m_buffers.temp       = decltype(m_buffers.temp      )(mem,nslots);
+    mem += m_buffers.temp.size();
+    m_buffers.exner      = decltype(m_buffers.exner     )(mem,nslots);
+    mem += m_buffers.exner.size();
+    m_buffers.phi        = decltype(m_buffers.phi       )(mem,nslots);
+    mem += m_buffers.phi.size();
+    m_buffers.div_vdp    = decltype(m_buffers.div_vdp   )(mem,nslots);
+    mem += m_buffers.div_vdp.size();
+    m_buffers.omega_p    = decltype(m_buffers.omega_p   )(mem,nslots);
+    mem += m_buffers.omega_p.size();
+    m_buffers.vort       = decltype(m_buffers.vort)(mem,nslots);
+    mem += m_buffers.vort.size();
+    m_buffers.theta_tens = decltype(m_buffers.theta_tens)(mem,nslots);
+    mem += m_buffers.theta_tens.size();
+    m_buffers.dp_tens    = decltype(m_buffers.dp_tens   )(mem,nslots);
+    mem += m_buffers.dp_tens.size();
+
+    // Midpoints vectors
+    m_buffers.grad_exner = decltype(m_buffers.grad_exner)(mem,nslots);
+    mem += m_buffers.grad_exner.size();
+    m_buffers.mgrad = decltype(m_buffers.mgrad)(mem,nslots);
+    mem += m_buffers.mgrad.size();
+    m_buffers.grad_tmp = decltype(m_buffers.grad_tmp)(mem,nslots);
+    mem += m_buffers.grad_tmp.size();
+
+    m_buffers.vdp      = decltype(m_buffers.vdp     )(mem,nslots);
+    mem += m_buffers.vdp.size();
+    m_buffers.v_tens   = decltype(m_buffers.v_tens  )(mem,nslots);
+    mem += m_buffers.v_tens.size();
+
+    // Interface scalars
+    // ALWAYS allocate dp_i because we use it to store pi_i
+    m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots);
+    mem += m_buffers.dp_i.size();
+
+    if (!m_theta_hydrostatic_mode) {
+      m_buffers.dpnh_dp_i = decltype(m_buffers.dpnh_dp_i)(mem,nslots);
+      mem += m_buffers.dpnh_dp_i.size();
+    }
+
+    if (m_rsplit==0) {
+      m_buffers.eta_dot_dpdn = decltype(m_buffers.eta_dot_dpdn)(mem,nslots);
+      mem += m_buffers.eta_dot_dpdn.size();
+      m_buffers.vtheta_i     = decltype(m_buffers.vtheta_i    )(mem,nslots);
+      mem += m_buffers.vtheta_i.size();
+    }
+
+    if (!m_theta_hydrostatic_mode) {
+      m_buffers.phi_tens     = decltype(m_buffers.phi_tens    )(mem,nslots);
+      mem += m_buffers.phi_tens.size();
+    }
+    // ALWAYS allocate w_tens: in non-hydrostatic mode it stores the w-tendency;
+    // in hydrostatic mode it is repurposed by epoch2_scanOps as a scratch buffer
+    // for the omega integral scan (prefix sum of div_vdp -> omega_i).
+    // Failing to allocate it in hydrostatic mode causes epoch2 and epoch3 to
+    // access an uninitialized view, leading to SIGSEGV.
+    m_buffers.w_tens       = decltype(m_buffers.w_tens      )(mem,nslots);
+    mem += m_buffers.w_tens.size();
+
+    // Interface vectors
+    if (!m_theta_hydrostatic_mode) {
+      m_buffers.v_i          = decltype(m_buffers.v_i         )(mem,nslots);
+      mem += m_buffers.v_i.size();
+      m_buffers.grad_w_i     = decltype(m_buffers.grad_w_i    )(mem,nslots);
+      mem += m_buffers.grad_w_i.size();
+    }
+    m_buffers.grad_phinh_i = decltype(m_buffers.grad_phinh_i)(mem,nslots);
+    mem += m_buffers.grad_phinh_i.size();
+
+    assert ((reinterpret_cast<Real*>(mem) - fbm.get_memory())==requested_buffer_size());
+  }
+
+  void init_boundary_exchanges (const std::shared_ptr<MpiBuffersManager>& bm_exchange) {
+    const auto& sp = Context::singleton().get<SimulationParams>();
+    for (int tl=0; tl<NUM_TIME_LEVELS; ++tl) {
+      m_bes[tl] = std::make_shared<BoundaryExchange>();
+      auto& be = *m_bes[tl];
+      be.set_label(std::string("CAAR-") + std::to_string(tl));
+      be.set_diagnostics_level(sp.internal_diagnostics_level);
+      be.set_buffers_manager(bm_exchange);
+      if (m_theta_hydrostatic_mode) {
+        be.set_num_fields(0,0,4);
+      } else {
+        be.set_num_fields(0,0,4,2);
+      }
+      be.register_field(m_state.m_v,tl,2,0);
+      be.register_field(m_state.m_vtheta_dp,1,tl);
+      be.register_field(m_state.m_dp3d,1,tl);
+      if (!m_theta_hydrostatic_mode) {
+        // Note: phinh_i at the surface (last level) is constant, so it doesn't *need* bex.
+        //       If bex(constant)=constant, we might just do it. This would not eliminate
+        //       the need for halo-exchange of interface-based quantities though, since
+        //       we would still need to exchange w_i.
+        be.register_field(m_state.m_w_i,1,tl);
+        be.register_field(m_state.m_phinh_i,1,tl);
+      }
+      be.registration_completed();
+    }
+  }
+
+  void set_rk_stage_data (const RKStageData& data) {
+    m_data = data;
+
+    // Set m_scale2 to m_data.scale2 everywhere, except on last interface,
+    // where we set it to m_data.scale1. While at it, we already multiply by g.
+    constexpr auto g = PhysicalConstants::g;
+
+    m_scale2g_last_int_pack = m_data.scale2*g;
+    m_scale2g_last_int_pack[ColInfo<NUM_INTERFACE_LEV>::LastPackEnd] = m_data.scale1*g;
+  }
+
+  void run (const RKStageData& data)
+  {
+
+    auto& limiter  = Context::singleton().get<LimiterFunctor>();
+
+    set_rk_stage_data(data);
+
+    GPTLstart("caar compute");
+
+#if 1
+
+    caar_compute();
+    Kokkos::fence();
+    GPTLstop("caar compute");
+
+#else
+
+    int nerr;
+    Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr);
+    Kokkos::fence();
+    GPTLstop("caar compute");
+    if (nerr > 0)
+      check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0);
+
+#endif
+
+    GPTLstart("caar_bexchV");
+    m_bes[data.np1]->exchange(m_geometry.m_rspheremp);
+    Kokkos::fence();
+    GPTLstop("caar_bexchV");
+
+    if (!m_theta_hydrostatic_mode) {
+      GPTLstart("caar compute");
+      Kokkos::parallel_for("caar loop post-boundary exchange", m_policy_post, *this);
+      Kokkos::fence();
+      GPTLstop("caar compute");
+    }
+
+    limiter.run(data.np1);
+
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagPreExchange&, const TeamMember &team, int& nerr) const {
+    // In this body, we use '====' to separate sync epochs (delimited by barriers)
+    // Note: make sure the same temp is not used within each epoch!
+
+    KernelVariables kv(team, m_tu);
+
+    // =========== EPOCH 1 =========== //
+    compute_div_vdp(kv);
+
+    // =========== EPOCH 2 =========== //
+    kv.team_barrier();
+
+    // Computes pi, omega, and phi.
+    const bool ok = compute_scan_quantities(kv);
+    if ( ! ok) nerr = 1;
+
+    if (m_rsplit==0 || !m_theta_hydrostatic_mode) {
+      // ============ EPOCH 2.1 =========== //
+      kv.team_barrier();
+      compute_interface_quantities(kv);
+    }
+
+    if (m_rsplit==0) {
+      // ============= EPOCH 2.2 ============ //
+      kv.team_barrier();
+      compute_vertical_advection(kv);
+    }
+
+    // ============= EPOCH 3 ============== //
+    kv.team_barrier();
+    compute_accumulated_quantities(kv);
+
+    // Compute update quantities
+    if (!m_theta_hydrostatic_mode) {
+      compute_w_and_phi_tens (kv);
+    }
+
+    compute_dp_and_theta_tens (kv);
+
+    // ============= EPOCH 4 =========== //
+    // compute_v_tens reuses some buffers used by compute_dp_and_theta_tens
+    kv.team_barrier();
+    compute_v_tens (kv);
+
+    // Update states
+    if (!m_theta_hydrostatic_mode) {
+      compute_w_and_phi_np1(kv);
+    }
+    compute_dp3d_and_theta_np1(kv);
+
+    // ============= EPOCH 5 =========== //
+    // v_tens has been computed after last barrier. Need to make sure it's done
+    kv.team_barrier();
+    compute_v_np1(kv);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagPostExchange&, const int idx) const {
+    // For g
+    using namespace PhysicalConstants;
+
+    using InfoM = ColInfo<NUM_PHYSICAL_LEV>;
+    using InfoI = ColInfo<NUM_INTERFACE_LEV>;
+    constexpr int LAST_MID_PACK     = InfoM::LastPack;
+    constexpr int LAST_MID_PACK_END = InfoM::LastPackEnd;
+    constexpr int LAST_INT_PACK     = InfoI::LastPack;
+    constexpr int LAST_INT_PACK_END = InfoI::LastPackEnd;
+
+    // Note: make sure you run this only in non-hydro mode
+    // KernelVariables kv(team);
+    const int ie  = idx / (NP*NP);
+    const int igp = (idx / NP) % NP;
+    const int jgp =  idx % NP;
+
+    auto& u = m_state.m_v(ie,m_data.np1,0,igp,jgp,LAST_MID_PACK)[LAST_MID_PACK_END];
+    auto& v = m_state.m_v(ie,m_data.np1,1,igp,jgp,LAST_MID_PACK)[LAST_MID_PACK_END];
+    auto& w = m_state.m_w_i(ie,m_data.np1,igp,jgp,LAST_INT_PACK)[LAST_INT_PACK_END];
+    const auto& phis_x = m_geometry.m_gradphis(ie,0,igp,jgp);
+    const auto& phis_y = m_geometry.m_gradphis(ie,1,igp,jgp);
+
+    // Compute dpnh_dp_i on surface
+    auto dpnh_dp_i = 1 + ( ( (u*phis_x + v*phis_y)/g - w) /
+                             (g + (phis_x*phis_x+phis_y*phis_y)/(2*g) ) ) / m_data.dt;
+
+    // Update w_i on bottom interface
+    // Update v on bottom level
+    w += m_data.scale1*m_data.dt*g*(dpnh_dp_i-1.0);
+    u -= m_data.scale1*m_data.dt*(dpnh_dp_i-1.0)*phis_x/2.0;
+    v -= m_data.scale1*m_data.dt*(dpnh_dp_i-1.0)*phis_y/2.0;
+
+    // TODO: you need to modify the BoundaryExchange class a bit, cause as of today
+    //       it exchanges *all* vertical levels. For phi, we don't want/need to
+    //       exchange the last level, since phi=phis at surface.
+    //       So to make sure we're not messing up, set phi back to phis on last interface
+    // Note: this is *independent* of whether NUM_LEV==NUM_LEV_P or not.
+    auto& phi_surf = m_state.m_phinh_i(ie,m_data.np1,igp,jgp,LAST_INT_PACK)[LAST_INT_PACK_END];
+    phi_surf = m_geometry.m_phis(ie,igp,jgp);
+
+#if defined(ENERGY_DIAGNOSTICS) && !defined(NDEBUG)
+    // Check w bc
+    if (fabs( (u*phis_x+v*phis_y)/g - w ) > 1e-10) {
+      printf("[CAAR] WARNING! w b.c. not satisfied at (ie,igp,jgp) = (%d,%d,%d):\n"
+             "         w:              %3.15f\n"
+             "         v*grad(phis)/g: %3.15f\n"
+             "         diff:           %3.15f\n",
+             ie,igp,jgp,w,(u*phis_x+v*phis_y)/g,fabs( (u*phis_x+v*phis_y)/g - w ));
+    }
+
+    auto phi = Homme::viewAsReal(Homme::subview(m_state.m_phinh_i,ie,m_data.np1,igp,jgp));
+    for (int k=0; k<NUM_PHYSICAL_LEV; ++k) {
+      if ( (phi(k)-phi(k+1)) < g ) {
+        printf("[CAAR] WARNING! delta z < 1m, at (ie,igp,jgp,k) = (%d,%d,%d,%d):\n"
+               "         phi(k):   %3.15f\n"
+               "         phi(k+1): %3.15f\n",
+               ie,igp,jgp,k,phi(k),phi(k+1));
+      }
+    }
+#endif
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_div_vdp(KernelVariables &kv) const {
+    // Compute vdp
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, NP * NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto u = Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp);
+      auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp);
+      auto dp3d = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp);
+      auto udp = Homme::subview(m_buffers.vdp,kv.team_idx,0,igp,jgp);
+      auto vdp = Homme::subview(m_buffers.vdp,kv.team_idx,1,igp,jgp);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV),
+                           [&] (const int& ilev) {
+        udp(ilev) = u(ilev)*dp3d(ilev);
+        vdp(ilev) = v(ilev)*dp3d(ilev);
+      });
+    });
+    kv.team_barrier();
+
+    // Compute div(vdp)
+    m_sphere_ops.divergence_sphere(kv,
+        Homme::subview(m_buffers.vdp, kv.team_idx),
+        Homme::subview(m_buffers.div_vdp, kv.team_idx));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool compute_scan_quantities (KernelVariables &kv) const {
+    bool ok = true;
+
+    kv.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      // At interfaces, pi_i(k+1) = pi_i(k)+dp(k), with pi_i(0)=hyai(0)*ps0;
+      // then, pi(k) = (pi_i(k) + pi_i(k+1))/2. Hence, accumulate midpoints to interfaces,
+      // then do a interface->midpoints average.
+      // For omega, we have omega_i(k+1) = omega_i(k)+div_vdp(k), with omega_i(0)=0;
+      // then, omega(k) = v*grad(pi) - average(omega_i). You can see how we could
+      // avoid computing omega_i, and set the accumulation step to be
+      // omega(k+1) = omega(k)+(div_vdp(k)+div_vdp(k-1))/2, with omega(0)=div_vdp(0)/2.
+      // However, avoiding computing omega_i would lose BFB agreement
+      // with the original F90 implementation. So for now, keep
+      // the two step calculation for omega (interface, then midpoints)
+      // TODO; skip calculation of omega_i
+      // Note: pi_i and omega_i are not needed after computing pi and omega,
+      //       so simply grab unused buffers
+      auto dp      = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp);
+      auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp);
+      auto pi      = Homme::subview(m_buffers.pi,kv.team_idx,igp,jgp);
+      auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp);
+      auto pi_i    = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp);
+
+      Kokkos::single(Kokkos::PerThread(kv.team),[&]() {
+        pi_i(0)[0] = m_hvcoord.ps0*m_hvcoord.hybrid_ai0;
+      });
+      kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int
+
+      ColumnOps::column_scan_mid_to_int<true>(kv,dp,pi_i);
+
+      ColumnOps::compute_midpoint_values(kv,pi_i,pi);
+
+      Kokkos::single(Kokkos::PerThread(kv.team),[&]() {
+        omega_i(0)[0] = 0.0;
+      });
+      kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int
+
+      ColumnOps::column_scan_mid_to_int<true>(kv,div_vdp,omega_i);
+      kv.team_barrier();
+      // Average omega_i to midpoints, and change sign, since later
+      //   omega=v*grad(pi)-average(omega_i)
+      auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp);
+      ColumnOps::compute_midpoint_values<CombineMode::Scale>(kv,omega_i,omega,-1.0);
+    });
+
+    kv.team_barrier();
+
+    // Compute grad(pi)
+    m_sphere_ops.gradient_sphere(kv,Homme::subview(m_buffers.pi,kv.team_idx),
+                                    Homme::subview(m_buffers.grad_tmp,kv.team_idx));
+    kv.team_barrier();
+
+    // Update omega with v*grad(p)
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp);
+      auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0);
+      auto grad_pi = Homme::subview(m_buffers.grad_tmp,kv.team_idx);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+        omega(ilev) += (v(0,igp,jgp,ilev)*grad_pi(0,igp,jgp,ilev) +
+                        v(1,igp,jgp,ilev)*grad_pi(1,igp,jgp,ilev));
+      });
+
+      // Use EOS to compute pnh/exner or exner/phi (depending on whether it's hydro mode).
+      if (m_theta_hydrostatic_mode) {
+        // Recall that, in this case, pnh aliases pi, and pi was already computed
+        m_eos.compute_exner(kv,Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp),
+                               Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp));
+
+        m_eos.compute_phi_i(kv, m_geometry.m_phis(kv.ie,igp,jgp),
+                            Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0,igp,jgp),
+                            Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp),
+                            Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp),
+                            Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp));
+      } else {
+        const bool ok1 =
+        m_eos.compute_pnh_and_exner(kv,
+                                    Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0,igp,jgp),
+                                    Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp),
+                                    Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp),
+                                    Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp));
+        if ( ! ok1) ok = false;
+      }
+
+      // Compute phi at midpoints.
+      // IMPORTANT: The original PR #8192 optimization guards this with (m_rsplit == 0)
+      // because compute_interface_quantities (the vertical advection remap step) only
+      // consumes m_buffers.phi when rsplit==0.  However, compute_phi_vadv in
+      // compute_vertical_advection is called whenever (!m_theta_hydrostatic_mode),
+      // regardless of rsplit, and it reads m_buffers.phi.  So when rsplit>0 and
+      // !m_theta_hydrostatic_mode, m_buffers.phi is read uninitialized.
+      // TODO: revisit whether compute_phi_vadv should also be guarded by (m_rsplit==0),
+      // or whether compute_interface_quantities should re-compute phi midpoints rather
+      // than relying on this buffer.  For now, widen the guard to cover the
+      // non-hydrostatic path so the buffer is always valid when it is read.
+      if (m_rsplit == 0 || !m_theta_hydrostatic_mode) {
+        ColumnOps::compute_midpoint_values(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp),
+                                              Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp));
+      }
+    });
+    kv.team_barrier();
+    return ok;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_interface_quantities(KernelVariables &kv) const {
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto dp   = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp);
+      auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp);
+
+      // Compute interface dp
+      ColumnOps::compute_interface_values(kv.team,dp,dp_i);
+
+      if (!m_theta_hydrostatic_mode) {
+        auto u    = Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp);
+        auto v    = Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp);
+
+        // Compute interface horiz velocity
+        auto u_i  = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp);
+        auto v_i  = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp);
+
+        ColumnOps::compute_interface_values(kv.team,dp,dp_i,u,u_i);
+        ColumnOps::compute_interface_values(kv.team,dp,dp_i,v,v_i);
+
+        // grad_phinh_i is yet to be computed, so the buffer is available
+        auto dpnh_dp_i = Homme::subview(m_buffers.dpnh_dp_i,kv.team_idx,igp,jgp);
+
+        m_eos.compute_dpnh_dp_i(kv,Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp),
+                                   dp_i,
+                                   dpnh_dp_i);
+      }
+
+      if (m_rsplit==0) {
+        // Shorter names, to keep a call to ColumnOps a bit shorter
+        using CM = CombineMode;
+
+        // Compute interface vtheta_i, with an energy preserving scheme
+        auto vtheta_i = Homme::subview(m_buffers.vtheta_i,kv.team_idx,igp,jgp);
+        auto dexner_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp);
+        auto exner = Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp);
+        auto phi = Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp);
+
+        // vtheta_i(k) = -dpnh_dp_i(k)*(dphi(k) / dexner(k)) / Cp
+        // with dX = X(k+1)-X(k)
+        ColumnOps::compute_interface_delta(kv,phi,vtheta_i);
+
+        // Since bcVal is the last input, if you need bcVal != 0.0,
+        // you also need to pass alpha and beta as well (1.0 and 0.0 resp.).
+        // Here, bcVal is 1.0 (to avoid nan/inf with division by 0)
+        ColumnOps::compute_interface_delta<CM::Divide>(kv,exner,vtheta_i,1.0,0.0,1.0);
+
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                             [&](const int ilev) {
+          vtheta_i(ilev) /= -PhysicalConstants::cp;
+          if (!m_theta_hydrostatic_mode) {
+            vtheta_i(ilev) *= m_buffers.dpnh_dp_i(kv.team_idx,igp,jgp,ilev);
+          }
+        });
+      }
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_vertical_advection(KernelVariables &kv) const {
+    // Compute vertical advection terms:
+    //  - eta_dot_dpdn
+    //  - v_vadv
+    //  - theta_vadv
+    //  - phi_vadv_i
+    //  - w_vadv_i
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      compute_eta_dot_dpn (kv,igp,jgp);
+      compute_v_vadv      (kv,igp,jgp);
+      compute_vtheta_vadv (kv,igp,jgp);
+      if (!m_theta_hydrostatic_mode) {
+        compute_w_vadv      (kv,igp,jgp);
+        compute_phi_vadv    (kv,igp,jgp);
+      }
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_eta_dot_dpn (KernelVariables& kv, const int& igp, const int& jgp) const {
+    auto div_vdp = viewAsReal(Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp));
+    auto eta_dot_dpdn = viewAsReal(Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp));
+
+    // Integrate -vdp
+    Dispatch<ExecSpace>::parallel_scan(kv.team,NUM_PHYSICAL_LEV,
+                          [&](const int ilev, Real& accumulator, const bool last) {
+      accumulator += div_vdp(ilev);
+      if (last) {
+        eta_dot_dpdn(ilev+1) = -accumulator;
+      }
+    });
+
+    // Get the last entry, which is the sum over the whole column
+    const Real eta_dot_dpdn_last = -eta_dot_dpdn(NUM_INTERFACE_LEV-1);
+
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,1,NUM_PHYSICAL_LEV),
+                         [&](const int ilev) {
+      eta_dot_dpdn(ilev) += m_hvcoord.hybrid_bi(ilev)*eta_dot_dpdn_last;
+    });
+
+    // Set the boundary conditions for eta_dot_dpdn
+    Kokkos::single(Kokkos::PerThread(kv.team),[&](){
+      eta_dot_dpdn(0) = eta_dot_dpdn(NUM_INTERFACE_LEV-1) = 0;
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_v_vadv (KernelVariables& kv, const int& igp, const int& jgp) const {
+    // Note: save v_vadv temp by stuffing directly in vtens
+
+    auto u  = viewAsReal(Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp));
+    auto v  = viewAsReal(Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp));
+    auto dp = viewAsReal(Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp));
+    auto eta_dot_dpdn = viewAsReal(Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp));
+    auto u_vadv = viewAsReal(Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp));
+    auto v_vadv = viewAsReal(Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp));
+
+    // TODO: vectorize this code.
+    const auto facp_1 = 0.5*eta_dot_dpdn(1)/dp(0);
+    u_vadv(0) = facp_1*(u(1)-u(0));
+    v_vadv(0) = facp_1*(v(1)-v(0));
+
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,1,NUM_PHYSICAL_LEV-1),
+                         [&](const int k) {
+      const auto facp = 0.5*eta_dot_dpdn(k+1)/dp(k);
+      const auto facm = 0.5*eta_dot_dpdn(k)/dp(k);
+      u_vadv(k) = facp*(u(k+1)-u(k)) + facm*(u(k)-u(k-1));
+      v_vadv(k) = facp*(v(k+1)-v(k)) + facm*(v(k)-v(k-1));
+    });
+
+    constexpr int last = NUM_PHYSICAL_LEV-1;
+    const auto facm_N = 0.5*eta_dot_dpdn(last)/dp(last);
+    u_vadv(last) = facm_N*(u(last)-u(last-1));
+    v_vadv(last) = facm_N*(v(last)-v(last-1));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_vtheta_vadv (KernelVariables& kv, const int& igp, const int& jgp) const {
+    auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp);
+    auto vtheta_i = Homme::subview(m_buffers.vtheta_i,kv.team_idx,igp,jgp);
+    // No point in going till NUM_LEV_P, since vtheta_i=0 at the bottom
+    // Also, this is the last time we need vtheta_i, so we can overwrite it.
+
+    auto provider = [&](const int ilev)->Scalar{
+      return eta_dot_dpdn(ilev)*vtheta_i(ilev);
+    };
+
+    // Compute theta_vadv, store directly in theta_tens
+    auto theta_vadv = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp);
+    ColumnOps::compute_midpoint_delta(kv,provider,theta_vadv);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_w_vadv (KernelVariables& kv, const int& igp, const int& jgp) const {
+    // w_vadv = average(temp)/dp3d_i.
+    // temp = average(eta_dot)*delta(w_i)
+    // Note: store directly in w_tens
+    auto temp = Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp);
+    auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp);
+    auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp);
+    auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp);
+    auto w_vadv = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp);
+
+    ColumnOps::compute_midpoint_delta(kv,w_i,temp);
+    ColumnOps::compute_midpoint_values<CombineMode::Multiply>(kv,eta_dot_dpdn,temp);
+
+    ColumnOps::compute_interface_values(kv,temp,w_vadv);
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV_P),
+                         [&](const int ilev) {
+      w_vadv(ilev) /= dp_i(ilev);
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_phi_vadv (KernelVariables& kv, const int& igp, const int& jgp) const {
+    // phi_vadv = eta_dot*delta(phi)/dp3d_i.
+    // Note: store directly into phi_tens
+    auto phi = Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp);
+    auto phi_vadv = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp);
+    auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp);
+    auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp);
+
+    ColumnOps::compute_interface_delta(kv,phi,phi_vadv);
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                         [&](const int ilev) {
+      phi_vadv(ilev) *= eta_dot_dpdn(ilev);
+      phi_vadv(ilev) /= dp_i(ilev);
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_accumulated_quantities(KernelVariables &kv) const {
+    // Compute omega = v*grad(p) + average(omega_i)
+    // Accmuulate: dereived.omega_p += eta_ave_w*omega
+    // Accmuulate: dereived.eta_dot_dpdn += eta_ave_w*eta_dot_dpdn
+    // Accumulate: vn0 += eta_ave_w*v*dp
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      // If rsplit>0, m_buffers.eta_dot_dpdn is identically 0, so skip this step
+      if (m_rsplit==0) {
+        // Accumulate eta_dot_dpdn
+        // Note: m_buffers.eta_dot_dpdn is 0 at top/bottom, so we can just accumulate NUM_LEV packs
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                             [&](const int ilev) {
+          m_derived.m_eta_dot_dpdn(kv.ie,igp,jgp,ilev) +=
+                m_data.eta_ave_w*m_buffers.eta_dot_dpdn(kv.team_idx,igp,jgp,ilev);
+        });
+      }
+
+      // Compute omega = v*grad(p)+average(omega_i) and accumulate
+      // Accumulate v*dp in vn0
+      // Note: so far omega already contains average(omega_i), since we didn't even
+      //       bother storing omega_i, and computed directly the average
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+        m_derived.m_omega_p(kv.ie,igp,jgp,ilev) +=
+              m_data.eta_ave_w*m_buffers.omega_p(kv.team_idx,igp,jgp,ilev);
+
+        m_derived.m_vn0(kv.ie,0,igp,jgp,ilev) += m_data.eta_ave_w*m_buffers.vdp(kv.team_idx,0,igp,jgp,ilev);
+        m_derived.m_vn0(kv.ie,1,igp,jgp,ilev) += m_data.eta_ave_w*m_buffers.vdp(kv.team_idx,1,igp,jgp,ilev);
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_w_and_phi_tens (KernelVariables& kv) const {
+    using namespace PhysicalConstants;
+    // Compute grad(phinh_i)
+    // Compute v*grad(w_i)
+    // Compute w_tens = scale1*(-w_vadv_i - v*grad(w_i)) - scale2*g*(1-dpnh_dp_i)
+    // Compute phi_tens = scale1*(-phi_vadv_i - v*grad(phinh_i)) + scale2*g*w_i
+    auto grad_w_i = Homme::subview(m_buffers.grad_w_i,kv.team_idx);
+    auto grad_phinh_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx);
+    m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0),
+                                    grad_phinh_i);
+    kv.team_barrier();
+    m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_w_i,kv.ie,m_data.n0),
+                                    grad_w_i);
+    kv.team_barrier();
+
+    auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto w_tens = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp);
+      auto phi_tens = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp);
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV_P),
+                           [&](const int ilev) {
+
+        // Note: if rsplit=0, phi_tens/w_tens already contains phi_vadv/w_vadv,
+        //       otherwise, just garbage from previous team
+
+        // Compute w_tens
+        Scalar v_grad = v_i(0,igp,jgp,ilev)*grad_w_i(0,igp,jgp,ilev)
+                      + v_i(1,igp,jgp,ilev)*grad_w_i(1,igp,jgp,ilev);
+        if (m_rsplit==0) {
+          w_tens(ilev) += v_grad;
+        } else {
+          w_tens(ilev)  = v_grad;
+        }
+        w_tens(ilev) *= -m_data.scale1;
+        w_tens(ilev) += (m_buffers.dpnh_dp_i(kv.team_idx,igp,jgp,ilev)-1) *
+                        (ilev==(NUM_LEV_P-1) ? m_scale2g_last_int_pack : m_data.scale2*g);
+
+        // Compute phi_tens.
+        v_grad = v_i(0,igp,jgp,ilev)*grad_phinh_i(0,igp,jgp,ilev)
+               + v_i(1,igp,jgp,ilev)*grad_phinh_i(1,igp,jgp,ilev);
+        if (m_rsplit==0) {
+          phi_tens(ilev) += v_grad;
+        } else {
+          phi_tens(ilev) =  v_grad;
+        }
+        phi_tens(ilev) *= -m_data.scale1;
+        phi_tens(ilev) += m_state.m_w_i(kv.ie,m_data.n0,igp,jgp,ilev) *
+                          (ilev==NUM_LEV_P ? m_scale2g_last_int_pack : m_data.scale2*g);
+
+        if (m_data.scale1!=m_data.scale2) {
+           // add imex phi_h splitting
+           // use approximate phi_h = hybi*phis
+           // could also use true hydrostatic pressure, but this requires extra DSS in dirk()
+           phi_tens(ilev) +=  (m_data.scale1-m_data.scale2) *
+                (v_i(0,igp,jgp,ilev)*m_geometry.m_gradphis(kv.ie,0,igp,jgp) +
+                 v_i(1,igp,jgp,ilev)*m_geometry.m_gradphis(kv.ie,1,igp,jgp) ) * m_hvcoord.hybrid_bi_packed(ilev);
+        }
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_w_and_phi_np1(KernelVariables &kv) const {
+    // Update w_i(np1) = spheremp*(scale3*w_i(nm1) + dt*w_tens)
+    // Update phi_i(np1) = spheremp*(scale3*phi_i(nm1) + dt*phi_tens)
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp);
+
+      auto w_tens = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp);
+      auto phi_tens = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp);
+
+      auto w_np1 = Homme::subview(m_state.m_w_i,kv.ie,m_data.np1,igp,jgp);
+      auto phi_np1 = Homme::subview(m_state.m_phinh_i,kv.ie,m_data.np1,igp,jgp);
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+
+        // Add w_tens to w_nm1 and multiply by spheremp
+        w_tens(ilev) *= m_data.dt*spheremp;
+        w_np1(ilev) = m_state.m_w_i(kv.ie,m_data.nm1,igp,jgp,ilev);
+        w_np1(ilev) *= m_data.scale3*spheremp;
+        w_np1(ilev) += w_tens(ilev);
+
+        // Add phi_tens to phi_nm1 and multiply by spheremp
+        // temp *= m_data.dt*spheremp;
+        phi_tens(ilev) *= m_data.dt*spheremp;
+        phi_np1(ilev) = m_state.m_phinh_i(kv.ie,m_data.nm1,igp,jgp,ilev);
+        phi_np1(ilev) *= m_data.scale3*spheremp;
+        phi_np1(ilev) += phi_tens(ilev);
+      });
+
+      // Last interface only for w, not phi (since phi=phis there)
+      Kokkos::single(Kokkos::PerThread(kv.team),[&](){
+        using Info = ColInfo<NUM_INTERFACE_LEV>;
+        constexpr int ilev = Info::LastPack;
+        constexpr int ivec = Info::LastPackEnd;
+
+        // Note: we only have 1 physical entry in the pack,
+        // so we may as well operate on the Real level
+        if (NUM_LEV==NUM_LEV_P) {
+          // We processed last interface too, so fix phi.
+          phi_np1(ilev)[ivec] = m_geometry.m_phis(kv.ie,igp,jgp);
+        } else {
+          // We didn't do anything on last interface, so update w
+          w_tens(ilev)[ivec] *= m_data.dt*spheremp;
+          w_np1(ilev)[ivec]  = m_state.m_w_i(kv.ie,m_data.nm1,igp,jgp,ilev)[ivec];
+          w_np1(ilev)[ivec] *= m_data.scale3*spheremp;
+          w_np1(ilev)[ivec] += w_tens(ilev)[ivec];
+        }
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_dp_and_theta_tens (KernelVariables &kv) const {
+    // Compute dp_tens=scale1*(div(vdp) + delta(eta_dot_dpdn))
+    // Compute theta_tens=scale1*(-theta_vadv-div(v*vtheta_dp)
+    // Note: div(v*vtheta_dp) can be computed in conservative or
+    //       non concervative form (div(ab) vs a*div(b)+grad(a)*b)
+
+    // Use a couple of lambdas to compute input of diff operators on the fly
+    auto v         = Homme::subview(m_state.m_v,kv.ie,m_data.n0);
+    auto vtheta_dp = Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0);
+    auto dp        = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0);
+
+    auto vtheta = [&](const int igp,const int jgp,const int ilev)->Scalar {
+      return vtheta_dp(igp,jgp,ilev) / dp(igp,jgp,ilev);
+    };
+
+    auto v_vtheta_dp = [&](const int icomp, const int igp, const int jgp, const int ilev)->Scalar {
+      return v(icomp,igp,jgp,ilev) * vtheta_dp(igp,jgp,ilev);
+    };
+
+    if (m_theta_advection_form==AdvectionForm::Conservative) {
+      if (m_rsplit==0) {
+        using CM = CombineMode;
+        // If you want a CombineMode different than Replace, unfortunately you have to specify
+        // all the template args, since the CombineMode is the last one...
+        m_sphere_ops.divergence_sphere_cm<CM::Add>(kv,v_vtheta_dp,
+                                          Homme::subview(m_buffers.theta_tens,kv.team_idx));
+      } else {
+        m_sphere_ops.divergence_sphere(kv,v_vtheta_dp,
+                                          Homme::subview(m_buffers.theta_tens,kv.team_idx));
+      }
+    } else {
+      m_sphere_ops.gradient_sphere(kv,vtheta,
+                                      Homme::subview(m_buffers.grad_tmp,kv.team_idx));
+    }
+    kv.team_barrier();
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto dp_tens = Homme::subview(m_buffers.dp_tens,kv.team_idx,igp,jgp);
+      auto theta_tens = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp);
+
+      auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp);
+
+      if (m_rsplit==0) {
+        auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp);
+        ColumnOps::compute_midpoint_delta(kv,eta_dot_dpdn,dp_tens);
+      }
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+        // Compute dp_tens
+        if (m_rsplit>0) {
+          dp_tens(ilev)  = div_vdp(ilev);
+        } else {
+          dp_tens(ilev) += div_vdp(ilev);
+        }
+
+        // Compute theta_tens
+        // NOTE: if the condition is false, then theta_tens already contains div(v*theta*dp) already
+        if (m_theta_advection_form==AdvectionForm::NonConservative) {
+          // We need a temp, since, if rsplit=0, theta_tens is already storing theta_vadv
+
+          Scalar temp = div_vdp(ilev)*vtheta(igp,jgp,ilev);
+          temp += m_buffers.grad_tmp(kv.team_idx,0,igp,jgp,ilev)*m_buffers.vdp(kv.team_idx,0,igp,jgp,ilev);
+          temp += m_buffers.grad_tmp(kv.team_idx,1,igp,jgp,ilev)*m_buffers.vdp(kv.team_idx,1,igp,jgp,ilev);
+          if (m_rsplit>0) {
+            theta_tens(ilev) = temp;
+          } else {
+            theta_tens(ilev) += temp;
+          }
+        }
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_dp3d_and_theta_np1(KernelVariables &kv) const {
+    // Update dp3d(np1) = spheremp*(scale3*dp3d(nm1) + dt*dp_tens)
+    // Update vtheta_dp(np1) = spheremp*(scale3*vtheta_dp(nm1) + dt*theta_tens)
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      const auto& spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp);
+
+      auto dp_tens = Homme::subview(m_buffers.dp_tens,kv.team_idx,igp,jgp);
+      auto theta_tens = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp);
+      auto dp_np1 = Homme::subview(m_state.m_dp3d,kv.ie,m_data.np1,igp,jgp);
+      auto vtheta_np1 = Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.np1,igp,jgp);
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+
+        // Add dp_tens to dp_nm1 and multiply by spheremp
+        dp_tens(ilev) *= m_data.scale1*m_data.dt*spheremp;
+        dp_np1(ilev)   = m_data.scale3*spheremp*m_state.m_dp3d(kv.ie,m_data.nm1,igp,jgp,ilev);
+        dp_np1(ilev)  -= dp_tens(ilev);
+
+        // Add theta_tens to vtheta_nm1 and multiply by spheremp
+        theta_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp;
+        vtheta_np1(ilev)  = m_state.m_vtheta_dp(kv.ie,m_data.nm1,igp,jgp,ilev);
+        vtheta_np1(ilev) *= m_data.scale3*spheremp;
+        vtheta_np1(ilev) += theta_tens(ilev);
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_v_tens (KernelVariables &kv) const {
+    // Not necessarily in this order,
+    //  - Compute vort=vorticity(v)
+    //  - Compute wvor = grad[average(w_i^2/2)] - average[w_i*grad(w_i)]
+    //  - Compute gradKE = grad(v*v/2)
+    //  - Compute gradExner = grad(exner)
+    //  - Compute mgrad = average[dpnh_dp_i*gradphinh_i]
+    //              + cp*T0*(grad(log(exner))-grad(exner)/exner) (pgrad_correction, if applicable)
+    //  - Compute v_tens =
+    //           scale1*(-v_vadv + v2*(fcor+vort)-gradKE -mgrad  -cp*vtheta*gradExner - (mgrad + wvor))
+    //           scale1*(-v_vadv - v1*(fcor+vort)-gradKE -mgrad  -cp*vtheta*gradExner - (mgrad + wvor))
+
+    auto vort  = Homme::subview(m_buffers.vort,kv.team_idx);
+    auto wvor = Homme::subview(m_buffers.vdp,kv.team_idx);
+    auto grad_exner = Homme::subview(m_buffers.grad_exner,kv.team_idx);
+    auto mgrad = Homme::subview(m_buffers.mgrad,kv.team_idx);
+    auto grad_tmp = Homme::subview(m_buffers.grad_tmp,kv.team_idx);
+
+    // Compute vorticity(v)
+    m_sphere_ops.vorticity_sphere(kv, Homme::subview(m_state.m_v,kv.ie,m_data.n0),
+                                      vort);
+
+    if (m_theta_hydrostatic_mode) {
+      // In nh mode, gradphinh has already been computed, but in hydro mode
+      // we skip the whole compute_w_and_phi_tens call
+      m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0),
+                                      Homme::subview(m_buffers.grad_phinh_i,kv.team_idx));
+      kv.team_barrier();
+    } else {
+      // Compute average(w^2/2)
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                           [&](const int idx) {
+        const int igp = idx / NP;
+        const int jgp = idx % NP;
+        auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp);
+
+        // Use lambda as a provider for w_i*w_i
+        const auto w_sq = [&w_i] (const int ilev) -> Scalar{
+          return w_i(ilev)*w_i(ilev);
+        };
+
+        // Rescale by 2 (we are averaging kinetic energy w_i*w_i/2, but the provider has no '/2')
+        ColumnOps::compute_midpoint_values<CombineMode::Scale>(kv, w_sq, Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp),0.5);
+      });
+      kv.team_barrier();
+      // Compute grad(average(w^2/2)). Store in wvor.
+      m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx),
+                                       wvor);
+      kv.team_barrier();
+    }
+
+    // Compute grad(exner)
+    // Note: exner = (pi/p0)^k, therefore grad(exner) = k*(exner/pi)*grad(pi).
+    //       So you *could* avoid computing this grad, at the price of some arithmetic op.
+    m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.exner,kv.team_idx),
+                                     grad_exner);
+
+    // If pgrad_correction=1, we need the gradient sphere
+    // for log(exner) below. Store into m_buffers.grad_tmp.
+    if (m_pgrad_correction) {
+      const auto exner = Homme::subview(m_buffers.exner,kv.team_idx);
+      const auto log_exner = [&exner](const int igp, const int jgp, const int ilev)->Scalar {
+        return log(exner(igp, jgp, ilev));
+      };
+      m_sphere_ops.gradient_sphere(kv, log_exner,
+                                       grad_tmp);
+    }
+    kv.team_barrier();
+
+    // Scalar w_vor,mgrad,w_gradw,gradw2;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      auto wvor_x = Homme::subview(wvor,0,igp,jgp);
+      auto wvor_y = Homme::subview(wvor,1,igp,jgp);
+
+      if (!m_theta_hydrostatic_mode) {
+        // Compute wvor = grad(average(w^2/2)) - average(w*grad(w))
+        // Note: vtens is already storing grad(avg(w^2/2))
+        auto gradw_x = Homme::subview(m_buffers.grad_w_i,kv.team_idx,0,igp,jgp);
+        auto gradw_y = Homme::subview(m_buffers.grad_w_i,kv.team_idx,1,igp,jgp);
+        auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp);
+
+        const auto w_gradw_x = [&gradw_x,&w_i] (const int ilev) -> Scalar {
+          return gradw_x(ilev)*w_i(ilev);
+        };
+        const auto w_gradw_y = [&gradw_y,&w_i] (const int ilev) -> Scalar {
+          return gradw_y(ilev)*w_i(ilev);
+        };
+
+        ColumnOps::compute_midpoint_values<CombineMode::ScaleAdd>(kv,
+                          w_gradw_x, wvor_x, -1.0);
+        ColumnOps::compute_midpoint_values<CombineMode::ScaleAdd>(kv,
+                          w_gradw_y, wvor_y, -1.0);
+      } else {
+        // wvor is not used if theta_hydrostatic_mode=1. Set to zero
+        // here to avoid adding in uninitialized values into v_tens.
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                             [&](const int ilev) {
+          wvor_x(ilev) = 0;
+          wvor_y(ilev) = 0;
+        });
+      }
+
+      auto mgrad_x = Homme::subview(mgrad,0,igp,jgp);
+      auto mgrad_y = Homme::subview(mgrad,1,igp,jgp);
+
+      // Compute mgrad = average(dpnh_dp_i*grad(phinh_i))
+      const auto phinh_i_x = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp);
+      const auto phinh_i_y = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp);
+      if (m_theta_hydrostatic_mode) {
+        ColumnOps::compute_midpoint_values(kv,phinh_i_x,mgrad_x);
+        ColumnOps::compute_midpoint_values(kv,phinh_i_y,mgrad_y);
+      } else {
+        const auto dpnh_dp_i = Homme::subview(m_buffers.dpnh_dp_i,kv.team_idx,igp,jgp);
+        const auto prod_x = [&phinh_i_x,&dpnh_dp_i](const int ilev)->Scalar {
+          return phinh_i_x(ilev)*dpnh_dp_i(ilev);
+        };
+        const auto prod_y = [&phinh_i_y,&dpnh_dp_i](const int ilev)->Scalar {
+          return phinh_i_y(ilev)*dpnh_dp_i(ilev);
+        };
+
+        ColumnOps::compute_midpoint_values(kv,prod_x,mgrad_x);
+        ColumnOps::compute_midpoint_values(kv,prod_y,mgrad_y);
+      }
+
+      // Apply pgrad_correction: mgrad += cp*T0*(grad(log(exner))-grad(exner)/exner) (if applicable)
+      if (m_pgrad_correction) {
+        using namespace PhysicalConstants;
+        constexpr Real T0 = Tref - Tref_lapse_rate*Tref*cp/g;
+
+        const auto grad_tmp_i_x   = Homme::subview(grad_tmp,0,igp,jgp);
+        const auto grad_tmp_i_y   = Homme::subview(grad_tmp,1,igp,jgp);
+        const auto grad_exner_i_x = Homme::subview(grad_exner,0,igp,jgp);
+        const auto grad_exner_i_y = Homme::subview(grad_exner,1,igp,jgp);
+        const auto exner_i        = Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp);
+
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                             [&](const int ilev) {
+          mgrad_x(ilev) += cp*T0*(grad_tmp_i_x(ilev) - grad_exner_i_x(ilev)/exner_i(ilev));
+          mgrad_y(ilev) += cp*T0*(grad_tmp_i_y(ilev) - grad_exner_i_y(ilev)/exner_i(ilev));
+        });
+      }
+
+      // Compute KE. Also, add fcor to vort
+      auto u  = Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp);
+      auto v  = Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp);
+      auto KE = Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp);
+      const auto& fcor = m_geometry.m_fcor(kv.ie,igp,jgp);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+        KE(ilev)  = u(ilev)*u(ilev);
+        KE(ilev) += v(ilev)*v(ilev);
+        KE(ilev) /= 2.0;
+
+        vort(igp,jgp,ilev) += fcor;
+      });
+    });
+    kv.team_barrier();
+
+    // Compute grad(KE), and put it directly in v_tens
+    if (m_rsplit==0) {
+      // v_tens already contains v_vadv. Need to sum into it.
+      m_sphere_ops.gradient_sphere_update(kv, Homme::subview(m_buffers.temp,kv.team_idx),
+                                              Homme::subview(m_buffers.v_tens,kv.team_idx));
+    } else {
+      // v_tens contains garbage from previos team. Overwrite it.
+      m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx),
+                                       Homme::subview(m_buffers.v_tens,kv.team_idx));
+    }
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      // Assemble vtens (both components)
+      // Note: right now, vtens already contains gradKE+v_vadv
+      //       (or just gradKE if rsplit>0)
+      auto u_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp);
+      auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp);
+      auto vort = Homme::subview(m_buffers.vort,kv.team_idx,igp,jgp);
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+        // grad(exner)*vtheta*cp
+        Scalar cp_vtheta = PhysicalConstants::cp *
+                           (m_state.m_vtheta_dp(kv.ie,m_data.n0,igp,jgp,ilev) /
+                            m_state.m_dp3d(kv.ie,m_data.n0,igp,jgp,ilev));
+
+        u_tens(ilev) += cp_vtheta*grad_exner(0,igp,jgp,ilev);
+        v_tens(ilev) += cp_vtheta*grad_exner(1,igp,jgp,ilev);
+
+        u_tens(ilev) += (mgrad(0,igp,jgp,ilev) + wvor(0,igp,jgp,ilev));
+        v_tens(ilev) += (mgrad(1,igp,jgp,ilev) + wvor(1,igp,jgp,ilev));
+
+        // Add +/- v_j*(vort+fcor), where v_j=v for u_tens and v_j=u for v_tens
+        u_tens(ilev) -= m_state.m_v(kv.ie,m_data.n0,1,igp,jgp,ilev)*vort(ilev);
+        v_tens(ilev) += m_state.m_v(kv.ie,m_data.n0,0,igp,jgp,ilev)*vort(ilev);
+      });
+    });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_v_np1(KernelVariables &kv) const {
+    // Update v(np1) = spheremp*(scale3*v(nm1) + dt*v_tens)
+    // NOTE: quite a few buffers no longer needed in this call of operator() are going to be reused here.
+
+    auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx);
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP),
+                         [&](const int idx) {
+      const int igp = idx / NP;
+      const int jgp = idx % NP;
+
+      const auto& spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp);
+
+      // Add vtens to v_nm1 and multiply by spheremp
+      auto u_np1 = Homme::subview(m_state.m_v,kv.ie,m_data.np1,0,igp,jgp);
+      auto v_np1 = Homme::subview(m_state.m_v,kv.ie,m_data.np1,1,igp,jgp);
+      auto u_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp);
+      auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp);
+
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV),
+                           [&](const int ilev) {
+
+        // Add v_tens to v_nm1 and multiply by spheremp
+        u_np1(ilev) = m_state.m_v(kv.ie,m_data.nm1,0,igp,jgp,ilev);
+        v_np1(ilev) = m_state.m_v(kv.ie,m_data.nm1,1,igp,jgp,ilev);
+
+        u_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp;
+        v_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp;
+
+        u_np1(ilev) *= m_data.scale3*spheremp;
+        v_np1(ilev) *= m_data.scale3*spheremp;
+
+        u_np1(ilev) += u_tens(ilev);
+        v_np1(ilev) += v_tens(ilev);
+      });
+    });
+  }
+
+};
+
+} // Namespace Homme
+
+#endif // HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP
diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp
new file mode 100644
index 000000000000..10aad89f8154
--- /dev/null
+++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp
@@ -0,0 +1,647 @@
+#include "CaarFunctorImpl-caar-opt.hpp"
+
+namespace Homme {
+
+template <bool HYDROSTATIC, bool CONSERVATIVE>
+void CaarFunctorImpl::epoch1_blockOps(const SphereOuter::Team &team)
+{
+  auto buffers_dp_tens = m_buffers.dp_tens;
+  auto buffers_exner = m_buffers.exner;
+  auto buffers_phi = m_buffers.phi;
+  auto buffers_pnh = m_buffers.pnh;
+  auto buffers_theta_tens = m_buffers.theta_tens;
+  auto buffers_vdp = m_buffers.vdp;
+
+  const Real data_eta_ave_w = m_data.eta_ave_w;
+  const int data_n0 = m_data.n0;
+
+  auto derived_vn0 = m_derived.m_vn0;
+
+  const SphereGlobal sg(m_sphere_ops);
+
+  auto state_dp3d = m_state.m_dp3d;
+  auto state_phinh_i = m_state.m_phinh_i;
+  auto state_v = m_state.m_v;
+  auto state_vtheta_dp= m_state.m_vtheta_dp;
+
+  SphereBlockOps::parallel_for(
+    team, 4,
+    KOKKOS_LAMBDA(const SphereBlockOps::Team &team) {
+      SphereBlockOps b(sg, team);
+      SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b);
+
+      SPHERE_BLOCK_START3(b, v0, v1, vtheta);
+
+      if (!HYDROSTATIC) {
+
+        const Scalar dphi = b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_phinh_i(b.e,data_n0,b.x,b.y,b.z);
+        const Scalar vtheta_dp = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+        for (int i = 0; i < VECTOR_SIZE; i++) {
+          if ((vtheta_dp[i] < 0) || (dphi[i] > 0)) {
+            if (b.z * VECTOR_SIZE + i < NUM_PHYSICAL_LEV) abort();
+          }
+        }
+
+        EquationOfState::compute_pnh_and_exner(vtheta_dp, dphi, buffers_pnh(b.e,b.x,b.y,b.z), buffers_exner(b.e,b.x,b.y,b.z));
+
+        // Zero out padding slots in the last pack to prevent NaN propagation.
+        // When NUM_PHYSICAL_LEV % VECTOR_SIZE != 0, the last midpoint pack (z=NUM_LEV-1)
+        // has slots NUM_PHYSICAL_LEV%VECTOR_SIZE..VECTOR_SIZE-1 that are padding (vtheta_dp=0,
+        // dphi=0 or -phis), causing 0/0=NaN in compute_pnh_and_exner.
+        if (NUM_PHYSICAL_LEV % VECTOR_SIZE != 0 && b.z == NUM_LEV-1) {
+          for (int i = NUM_PHYSICAL_LEV % VECTOR_SIZE; i < VECTOR_SIZE; i++) {
+            buffers_pnh(b.e,b.x,b.y,b.z)[i] = 1.0;
+            buffers_exner(b.e,b.x,b.y,b.z)[i] = 1.0;
+          }
+        }
+
+        buffers_phi(b.e,b.x,b.y,b.z) = 0.5 * (state_phinh_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1)));
+      }
+
+      v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+      v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+      b.divInit(ttmp0, ttmp1, v0, v1);
+      buffers_vdp(b.e,0,b.x,b.y,b.z) = v0;
+      buffers_vdp(b.e,1,b.x,b.y,b.z) = v1;
+      derived_vn0(b.e,0,b.x,b.y,b.z) += data_eta_ave_w * v0;
+      derived_vn0(b.e,1,b.x,b.y,b.z) += data_eta_ave_w * v1;
+
+      vtheta = 0;
+
+      if (CONSERVATIVE) {
+        const Scalar sv0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+        const Scalar sv1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+        b.divInit(ttmp2, ttmp3, sv0, sv1);
+      } else {
+        vtheta = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+        b.gradInit(ttmp2, vtheta);
+      }
+
+      SPHERE_BLOCK_MIDDLE3(b, v0, v1, vtheta);
+
+      const Scalar dvdp = b.div(ttmp0, ttmp1);
+      buffers_dp_tens(b.e,b.x,b.y,b.z) = dvdp;
+
+      if (CONSERVATIVE) {
+        buffers_theta_tens(b.e,b.x,b.y,b.z) = b.div(ttmp2, ttmp3);
+      } else {
+        Scalar grad0, grad1;
+        b.grad(grad0, grad1, ttmp2);
+        Scalar theta_tens = dvdp * vtheta;
+        theta_tens += grad0 * v0;
+        theta_tens += grad1 * v1;
+        buffers_theta_tens(b.e,b.x,b.y,b.z) = theta_tens;
+      }
+
+      SPHERE_BLOCK_END();
+    });
+}
+
+template <bool RSPLIT_ZERO>
+void CaarFunctorImpl::epoch2_scanOps(const SphereOuter::Team &team)
+{
+  auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens);
+  auto buffers_dp_i = viewAsReal(m_buffers.dp_i);
+  auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn);
+  auto buffers_w_tens = viewAsReal(m_buffers.w_tens);
+
+  const int data_n0 = m_data.n0;
+  auto &hvcoord_hybrid_bi = m_hvcoord.hybrid_bi;
+  const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0;
+  auto state_dp3d = viewAsReal(m_state.m_dp3d);
+
+  SphereScanOps::parallel_for(
+    team,
+    KOKKOS_LAMBDA(const SphereScanOps::Team &team) {
+      SphereScanOps s(team);
+
+      s.scan(buffers_dp_i, state_dp3d, data_n0, pi_i00);
+      s.scan(buffers_w_tens, buffers_dp_tens, 0);
+
+      if (RSPLIT_ZERO) {
+
+        s.scan(buffers_eta_dot_dpdn, buffers_dp_tens, 0);
+
+        const Real last = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV);
+
+        Kokkos::parallel_for(
+          Kokkos::ThreadVectorRange(s.t, 1, NUM_PHYSICAL_LEV),
+          [&](const int z) {
+            Real eta_dot_dpdn = -buffers_eta_dot_dpdn(s.e,s.x,s.y,z);
+            eta_dot_dpdn += hvcoord_hybrid_bi(z) * last;
+            buffers_eta_dot_dpdn(s.e,s.x,s.y,z) = eta_dot_dpdn;
+          });
+
+        Kokkos::single(
+          Kokkos::PerThread(s.t),
+          [&]() {
+            buffers_eta_dot_dpdn(s.e,s.x,s.y,0) = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV) = 0;
+          });
+      }
+    });
+}
+
+template <bool HYDROSTATIC, bool RSPLIT_ZERO>
+void CaarFunctorImpl::epoch3_blockOps(const SphereOuter::Team &team)
+{
+  auto buffers_dp_i = m_buffers.dp_i;
+  auto buffers_dp_tens = m_buffers.dp_tens;
+  auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn;
+  auto buffers_exner = m_buffers.exner;
+  auto buffers_omega_p = m_buffers.omega_p;
+  auto buffers_pnh = m_buffers.pnh;
+  auto buffers_temp = m_buffers.temp;
+  auto buffers_v_tens = m_buffers.v_tens;
+  auto buffers_w_tens = m_buffers.w_tens;
+
+  const Real data_eta_ave_w = m_data.eta_ave_w;
+  const int data_n0 = m_data.n0;
+
+  auto derived_eta_dot_dpdn = m_derived.m_eta_dot_dpdn;
+  auto derived_omega_p = m_derived.m_omega_p;
+
+  const SphereGlobal sg(m_sphere_ops);
+
+  auto state_dp3d = m_state.m_dp3d;
+  auto state_v = m_state.m_v;
+  auto state_vtheta_dp= m_state.m_vtheta_dp;
+  auto state_w_i = m_state.m_w_i;
+
+  SphereBlockOps::parallel_for(
+    team, 1,
+    KOKKOS_LAMBDA(const SphereBlockOps::Team &team) {
+      SphereBlockOps b(sg, team);
+      SphereBlockScratch ttmp0(b);
+
+      SPHERE_BLOCK_START0(b);
+
+      const Scalar pi = 0.5 * (buffers_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_dp_i(b.e,b.x,b.y,b.z), &buffers_dp_i(b.e,b.x,b.y,b.z+1)));
+      b.gradInit(ttmp0, pi);
+
+      if (HYDROSTATIC) {
+        Scalar exner = pi;
+        EquationOfState::pressure_to_exner(exner);
+        buffers_exner(b.e,b.x,b.y,b.z) = exner;
+        buffers_pnh(b.e,b.x,b.y,b.z) = EquationOfState::compute_dphi(state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z), exner, pi);
+      }
+
+      SPHERE_BLOCK_MIDDLE0(b);
+
+      Scalar grad0, grad1;
+      b.grad(grad0, grad1, ttmp0);
+
+      Scalar omega = -0.5 * (buffers_w_tens(b.e,b.x,b.y,b.z) + b.zabove(&buffers_w_tens(b.e,b.x,b.y,b.z), &buffers_w_tens(b.e,b.x,b.y,b.z+1)));
+      const Scalar uz = state_v(b.e,data_n0,0,b.x,b.y,b.z);
+      const Scalar vz = state_v(b.e,data_n0,1,b.x,b.y,b.z);
+      omega += uz * grad0 + vz * grad1;
+      buffers_omega_p(b.e,b.x,b.y,b.z) = omega;
+
+      derived_omega_p(b.e,b.x,b.y,b.z) += data_eta_ave_w * omega;
+
+      if (RSPLIT_ZERO) {
+
+        const Scalar dp = state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+        const Scalar etap = b.zabove(&buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z), &buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z+1));
+        const Scalar etaz = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z);
+
+        buffers_dp_tens(b.e,b.x,b.y,b.z) += etap - etaz;
+
+        derived_eta_dot_dpdn(b.e,b.x,b.y,b.z) += data_eta_ave_w * etaz;
+
+        if (!HYDROSTATIC) {
+          const Scalar dw = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_w_i(b.e,data_n0,b.x,b.y,b.z);
+          const Scalar eta = 0.5 * (etaz + etap);
+          buffers_temp(b.e,b.x,b.y,b.z) = dw * eta;
+        }
+
+        const Scalar facp = 0.5 * etap / dp;
+        Scalar u = facp * (b.zabove(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z), &state_v(b.e,data_n0,0,b.x,b.y,b.z+1)) - uz);
+        Scalar v = facp * (b.zabove(vz, &state_v(b.e,data_n0,1,b.x,b.y,b.z), &state_v(b.e,data_n0,1,b.x,b.y,b.z+1)) - vz);
+
+        const Scalar facm = 0.5 * etaz / dp;
+        u += facm * (uz - b.zbelow(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z-1), &state_v(b.e,data_n0,0,b.x,b.y,b.z)));
+        v += facm * (vz - b.zbelow(uz, &state_v(b.e,data_n0,1,b.x,b.y,b.z-1), &state_v(b.e,data_n0,1,b.x,b.y,b.z)));
+        buffers_v_tens(b.e,0,b.x,b.y,b.z) = u;
+        buffers_v_tens(b.e,1,b.x,b.y,b.z) = v;
+      }
+      SPHERE_BLOCK_END();
+    });
+}
+
+void CaarFunctorImpl::epoch4_scanOps(const SphereOuter::Team &team)
+{
+  auto buffers_phi = viewAsReal(m_buffers.phi);
+  auto buffers_pnh = viewAsReal(m_buffers.pnh);
+
+  const int data_n0 = m_data.n0;
+
+  auto &geometry_phis = m_geometry.m_phis;
+
+  auto state_phinh_i = viewAsReal(m_state.m_phinh_i);
+
+  SphereScanOps::parallel_for(
+    team,
+    KOKKOS_LAMBDA(const SphereScanOps::Team &team) {
+      SphereScanOps s(team);
+      s.nacs(state_phinh_i, data_n0, buffers_pnh, geometry_phis);
+      Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(s.t, NUM_PHYSICAL_LEV),
+        [&](const int z) {
+          buffers_phi(s.e,s.x,s.y,z) = 0.5 * (state_phinh_i(s.e,data_n0,s.x,s.y,z) + state_phinh_i(s.e,data_n0,s.x,s.y,z+1));
+        });
+    });
+}
+
+template <bool HYDROSTATIC, bool RSPLIT_ZERO>
+void CaarFunctorImpl::epoch5_colOps(const SphereOuter::Team &team)
+{
+  auto buffers_dp_i = m_buffers.dp_i;
+  auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i;
+  auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn;
+  auto buffers_exner = m_buffers.exner;
+  auto buffers_grad_phinh_i = m_buffers.grad_phinh_i;
+  auto buffers_grad_w_i = m_buffers.grad_w_i;
+  auto buffers_phi = m_buffers.phi;
+  auto buffers_phi_tens = m_buffers.phi_tens;
+  auto buffers_pnh = m_buffers.pnh;
+  auto buffers_temp = m_buffers.temp;
+  auto buffers_v_i = m_buffers.v_i;
+  auto buffers_vtheta_i = m_buffers.vtheta_i;
+  auto buffers_w_tens = m_buffers.w_tens;
+
+  const int data_n0 = m_data.n0;
+  const Real dscale = m_data.scale1 - m_data.scale2;
+
+  auto &geometry_gradphis = m_geometry.m_gradphis;
+
+  const Real gscale1 = m_data.scale1 * PhysicalConstants::g;
+  const Real gscale2 = m_data.scale2 * PhysicalConstants::g;
+
+  auto hvcoord_hybrid_bi_packed = m_hvcoord.hybrid_bi_packed;
+
+  const Real ndata_scale1 = -m_data.scale1;
+  const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0;
+
+  const SphereGlobal sg(m_sphere_ops);
+
+  auto state_dp3d = m_state.m_dp3d;
+  auto state_phinh_i = m_state.m_phinh_i;
+  auto state_v = m_state.m_v;
+  auto state_w_i = m_state.m_w_i;
+
+  SphereColOps::parallel_for(
+    team, NUM_LEV_P,
+    KOKKOS_LAMBDA(const SphereColOps::Team &team) {
+      SphereColOps c(sg, team);
+
+      c.grad(buffers_grad_phinh_i, state_phinh_i, data_n0);
+      if (!HYDROSTATIC) c.grad(buffers_grad_w_i, state_w_i, data_n0);
+
+      const Scalar dm = c.zbelow(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z-1), &state_dp3d(c.e,data_n0,c.x,c.y,c.z));;
+      const Scalar dz = c.zabove(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z));
+      const Scalar dp_i = c.ztrim(dz, dm, 0.5 * (dz + dm));
+      buffers_dp_i(c.e,c.x,c.y,c.z) = dp_i;
+
+      if (!HYDROSTATIC) {
+
+        const Scalar v0m = c.zbelow(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z-1), &state_v(c.e,data_n0,0,c.x,c.y,c.z));;
+        const Scalar v0z = c.zabove(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z));
+        const Scalar v_i0 = c.ztrim(v0z, v0m, (dz * v0z + dm * v0m) / (dm + dz));
+        buffers_v_i(c.e,0,c.x,c.y,c.z) = v_i0;
+
+        const Scalar v1m = c.zbelow(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z-1), &state_v(c.e,data_n0,1,c.x,c.y,c.z));;
+        const Scalar v1z = c.zabove(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z));
+        const Scalar v_i1 = c.ztrim(v1z, v1m, (dz * v1z + dm * v1m) / (dm + dz));
+        buffers_v_i(c.e,1,c.x,c.y,c.z) = v_i1;
+
+        const Scalar pm = c.zbelow(pi_i00, &buffers_pnh(c.e,c.x,c.y,c.z-1), &buffers_pnh(c.e,c.x,c.y,c.z));;
+        const Scalar pz = c.zabove(pm + 0.5 * dm, &buffers_pnh(c.e,c.x,c.y,c.z));
+        buffers_dpnh_dp_i(c.e,c.x,c.y,c.z) = 2.0 * (pz - pm) / (dm + dz);
+      }
+
+      if (RSPLIT_ZERO) {
+
+        const Scalar phim = c.zbelow(0, &buffers_phi(c.e,c.x,c.y,c.z-1), &buffers_phi(c.e,c.x,c.y,c.z));;
+        const Scalar phiz = c.zabove(0, &buffers_phi(c.e,c.x,c.y,c.z));
+
+        if (!HYDROSTATIC) {
+          const Scalar phi_vadv = c.ztrim(0, 0, (phiz - phim) * buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z) / dp_i);
+          buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_vadv;
+        }
+
+        Scalar vtheta_i = phiz - phim;
+
+        const Scalar exnerm = c.zbelow(0, &buffers_exner(c.e,c.x,c.y,c.z-1), &buffers_exner(c.e,c.x,c.y,c.z));;
+        const Scalar exnerz = c.zabove(0, &buffers_exner(c.e,c.x,c.y,c.z));
+        const Scalar dexner = c.ztrim(1, 1, exnerz - exnerm);
+        vtheta_i /= dexner;
+
+        vtheta_i /= -PhysicalConstants::cp; // separate for BFB equality with original Fortran
+
+        if (!HYDROSTATIC) vtheta_i *= buffers_dpnh_dp_i(c.e,c.x,c.y,c.z);
+        buffers_vtheta_i(c.e,c.x,c.y,c.z) = vtheta_i;
+      }
+
+      if (!HYDROSTATIC) {
+
+        Scalar w_tens = 0;
+        if (RSPLIT_ZERO) {
+          const Scalar tempm = c.zbelow(0, &buffers_temp(c.e,c.x,c.y,c.z-1), &buffers_temp(c.e,c.x,c.y,c.z));;
+          const Scalar tempz = c.zabove(0, &buffers_temp(c.e,c.x,c.y,c.z));
+          const Scalar dw = c.ztrim(tempz, tempm, 0.5 * (tempz + tempm));
+          w_tens = dw / dp_i;
+        }
+        w_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_w_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_w_i(c.e,1,c.x,c.y,c.z);
+        w_tens *= ndata_scale1;
+        const Scalar gscale = c.ztop(gscale1, gscale2);
+        w_tens += (buffers_dpnh_dp_i(c.e,c.x,c.y,c.z)-Scalar(1)) * gscale;
+        buffers_w_tens(c.e,c.x,c.y,c.z) = w_tens;
+
+        Scalar phi_tens = (RSPLIT_ZERO) ? buffers_phi_tens(c.e,c.x,c.y,c.z) : 0;
+        phi_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,1,c.x,c.y,c.z);
+        phi_tens *= ndata_scale1;
+        phi_tens += state_w_i(c.e,data_n0,c.x,c.y,c.z) * gscale;
+
+        if (dscale) phi_tens += dscale * (buffers_v_i(c.e,0,c.x,c.y,c.z) * geometry_gradphis(c.e,0,c.x,c.y) + buffers_v_i(c.e,1,c.x,c.y,c.z) * geometry_gradphis(c.e,1,c.x,c.y)) * hvcoord_hybrid_bi_packed(c.z);
+
+        buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_tens;
+      }
+    });
+}
+
+template <bool HYDROSTATIC, bool RSPLIT_ZERO, bool PGRAD_CORRECTION>
+void CaarFunctorImpl::epoch6_blockOps(const SphereOuter::Team &team)
+{
+  auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i;
+  auto buffers_exner = m_buffers.exner;
+  auto buffers_grad_phinh_i = m_buffers.grad_phinh_i;
+  auto buffers_grad_w_i = m_buffers.grad_w_i;
+  auto buffers_v_tens = m_buffers.v_tens;
+
+  const int data_n0 = m_data.n0;
+
+  auto &geometry_fcor = m_geometry.m_fcor;
+
+  const SphereGlobal sg(m_sphere_ops);
+
+  auto state_dp3d = m_state.m_dp3d;
+  auto state_v = m_state.m_v;
+  auto state_vtheta_dp= m_state.m_vtheta_dp;
+  auto state_w_i = m_state.m_w_i;
+
+  SphereBlockOps::parallel_for(
+    team, 6,
+    KOKKOS_LAMBDA(const SphereBlockOps::Team &team) {
+      SphereBlockOps b(sg, team);
+      SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b), ttmp4(b), ttmp5(b);
+
+      SPHERE_BLOCK_START3(b, exneriz, v0, v1);
+
+      if (!HYDROSTATIC) {
+        const Scalar wz = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1));
+        const Scalar w2 = 0.25 * (state_w_i(b.e,data_n0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + wz * wz);
+        b.gradInit(ttmp0, w2);
+      }
+
+      exneriz = buffers_exner(b.e,b.x,b.y,b.z);
+      b.gradInit(ttmp1, exneriz);
+
+      const Scalar log_exneriz = (PGRAD_CORRECTION) ? log(exneriz) : 0;
+      b.gradInit(ttmp2, log_exneriz);
+
+      v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z);
+      v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z);
+
+      b.vortInit(ttmp3, ttmp4, v0, v1);
+      b.gradInit(ttmp5, 0.5 * (v0 * v0 + v1 * v1));
+
+      SPHERE_BLOCK_MIDDLE3(b, exneriz, v0, v1);
+
+      Scalar grad_v0, grad_v1;
+      b.grad(grad_v0, grad_v1, ttmp5);
+
+      Scalar u_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,0,b.x,b.y,b.z) : 0;
+      Scalar v_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,1,b.x,b.y,b.z) : 0;
+      u_tens += grad_v0;
+      v_tens += grad_v1;
+
+      const Scalar cp_vtheta = PhysicalConstants::cp * (state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z));
+
+      Scalar grad_exner0, grad_exner1;
+      b.grad(grad_exner0, grad_exner1, ttmp1);
+
+      u_tens += cp_vtheta * grad_exner0;
+      v_tens += cp_vtheta * grad_exner1;
+
+      Scalar mgrad_x, mgrad_y;
+      if (HYDROSTATIC) {
+
+        mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1)));
+        mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1)));
+
+      } else {
+
+        mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1)));
+        mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1)));
+
+      }
+
+      if (PGRAD_CORRECTION) {
+
+        Scalar grad_lexner0, grad_lexner1;
+        b.grad(grad_lexner0, grad_lexner1, ttmp2);
+
+        namespace PC = PhysicalConstants;
+        constexpr Real cpt0 = PC::cp * (PC::Tref - PC::Tref_lapse_rate * PC::Tref * PC::cp / PC::g);
+        mgrad_x += cpt0 * (grad_lexner0 - grad_exner0 / exneriz);
+        mgrad_y += cpt0 * (grad_lexner1 - grad_exner1 / exneriz);
+      }
+
+      Scalar wvor_x = 0;
+      Scalar wvor_y = 0;
+      if (!HYDROSTATIC) {
+        b.grad(wvor_x, wvor_y, ttmp0);
+        wvor_x -= 0.5 * (buffers_grad_w_i(b.e,0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,0,b.x,b.y,b.z), &buffers_grad_w_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)));
+        wvor_y -= 0.5 * (buffers_grad_w_i(b.e,1,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,1,b.x,b.y,b.z), &buffers_grad_w_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)));
+      }
+
+      u_tens += mgrad_x + wvor_x;
+      v_tens += mgrad_y + wvor_y;
+
+      const Scalar vort = b.vort(ttmp3, ttmp4) + geometry_fcor(b.e,b.x,b.y);
+      u_tens -= v1 * vort;
+      v_tens += v0 * vort;
+
+      buffers_v_tens(b.e,0,b.x,b.y,b.z) = u_tens;
+      buffers_v_tens(b.e,1,b.x,b.y,b.z) = v_tens;
+
+      SPHERE_BLOCK_END();
+    });
+}
+
+template <bool HYDROSTATIC, bool RSPLIT_ZERO>
+void CaarFunctorImpl::epoch7_col(const SphereOuter::Team &team)
+{
+  auto buffers_dp_tens = m_buffers.dp_tens;
+  auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn;
+  auto buffers_phi_tens = m_buffers.phi_tens;
+  auto buffers_theta_tens = m_buffers.theta_tens;
+  auto buffers_v_tens = m_buffers.v_tens;
+  auto buffers_vtheta_i = m_buffers.vtheta_i;
+  auto buffers_w_tens = m_buffers.w_tens;
+
+  const Real data_dt = m_data.dt;
+  const int data_nm1 = m_data.nm1;
+  const int data_np1 = m_data.np1;
+  const Real data_scale3 = m_data.scale3;
+
+  auto &geometry_spheremp = m_geometry.m_spheremp;
+
+  const Real scale1_dt = m_data.scale1 * m_data.dt;
+
+  auto state_dp3d = m_state.m_dp3d;
+  auto state_phinh_i = m_state.m_phinh_i;
+  auto state_v = m_state.m_v;
+  auto state_vtheta_dp = m_state.m_vtheta_dp;
+  auto state_w_i = m_state.m_w_i;
+
+  SphereCol::parallel_for(
+    team, NUM_LEV,
+    KOKKOS_LAMBDA(const SphereCol::Team &team) {
+      const SphereCol c(team);
+
+      const Real spheremp = geometry_spheremp(c.e,c.x,c.y);
+      const Real scale1_dt_spheremp = scale1_dt * spheremp;
+      const Real scale3_spheremp = data_scale3 * spheremp;
+
+      Scalar dp_tens = buffers_dp_tens(c.e,c.x,c.y,c.z);
+      dp_tens *= scale1_dt_spheremp;
+      Scalar dp_np1 = scale3_spheremp * state_dp3d(c.e,data_nm1,c.x,c.y,c.z);
+      dp_np1 -= dp_tens;
+      state_dp3d(c.e,data_np1,c.x,c.y,c.z) = dp_np1;
+
+      Scalar theta_tens = buffers_theta_tens(c.e,c.x,c.y,c.z);
+      if (RSPLIT_ZERO) {
+        const Scalar etap = c.zplus(&buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z), &buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z+1));
+        const Scalar etaz = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z);
+        const Scalar thetap = etap * c.zplus(&buffers_vtheta_i(c.e,c.x,c.y,c.z), &buffers_vtheta_i(c.e,c.x,c.y,c.z+1));
+        const Scalar thetaz = etaz * buffers_vtheta_i(c.e,c.x,c.y,c.z);
+        theta_tens += thetap - thetaz;
+      }
+      theta_tens *= -scale1_dt_spheremp;
+      Scalar vtheta_np1 = state_vtheta_dp(c.e,data_nm1,c.x,c.y,c.z);
+      vtheta_np1 *= scale3_spheremp;
+      vtheta_np1 += theta_tens;
+      state_vtheta_dp(c.e,data_np1,c.x,c.y,c.z) = vtheta_np1;
+
+      Scalar u_tens = buffers_v_tens(c.e,0,c.x,c.y,c.z);
+      u_tens *= -scale1_dt_spheremp;
+      Scalar u_np1 = state_v(c.e,data_nm1,0,c.x,c.y,c.z);
+      u_np1 *= scale3_spheremp;
+      u_np1 += u_tens;
+      state_v(c.e,data_np1,0,c.x,c.y,c.z) = u_np1;
+
+      Scalar v_tens = buffers_v_tens(c.e,1,c.x,c.y,c.z);
+      v_tens *= -scale1_dt_spheremp;
+      Scalar v_np1 = state_v(c.e,data_nm1,1,c.x,c.y,c.z);
+      v_np1 *= scale3_spheremp;
+      v_np1 += v_tens;
+      state_v(c.e,data_np1,1,c.x,c.y,c.z) = v_np1;
+
+      if (!HYDROSTATIC) {
+
+        const Real dt_spheremp = data_dt * spheremp;
+
+        Scalar phi_tens = buffers_phi_tens(c.e,c.x,c.y,c.z);
+        phi_tens *= dt_spheremp;
+        Scalar phi_np1 = state_phinh_i(c.e,data_nm1,c.x,c.y,c.z);
+        phi_np1 *= scale3_spheremp;
+        phi_np1 += phi_tens;
+        state_phinh_i(c.e,data_np1,c.x,c.y,c.z) = phi_np1;
+
+        Scalar w_tens = buffers_w_tens(c.e,c.x,c.y,c.z);
+        w_tens *= dt_spheremp;
+        Scalar w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,c.z);
+        w_np1 *= scale3_spheremp;
+        w_np1 += w_tens;
+        state_w_i(c.e,data_np1,c.x,c.y,c.z) = w_np1;
+
+        if (c.z == NUM_LEV-1) {
+          constexpr int z = NUM_LEV_P-1;
+          constexpr int dz = NUM_PHYSICAL_LEV%VECTOR_SIZE;
+          Real w_tens = buffers_w_tens(c.e,c.x,c.y,z)[dz];
+          w_tens *= dt_spheremp;
+          buffers_w_tens(c.e,c.x,c.y,z)[dz] = w_tens;
+          Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,z)[dz];
+          w_np1 *= scale3_spheremp;
+          w_np1 += w_tens;
+          state_w_i(c.e,data_np1,c.x,c.y,z)[dz] = w_np1;
+        }
+      }
+    });
+}
+
+void CaarFunctorImpl::caar_compute()
+{
+  SphereOuter::parallel_for(
+    m_num_elems,
+    KOKKOS_LAMBDA(const SphereOuter::Team &team) {
+
+      if (m_theta_hydrostatic_mode) {
+        if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps<true,true>(team);
+        else epoch1_blockOps<true,false>(team);
+      } else {
+        if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps<false,true>(team);
+        else epoch1_blockOps<false,false>(team);
+      }
+
+      if (m_rsplit == 0) epoch2_scanOps<true>(team);
+      else epoch2_scanOps<false>(team);
+
+      if (m_theta_hydrostatic_mode) {
+        if (m_rsplit == 0) epoch3_blockOps<true,true>(team);
+        else epoch3_blockOps<true,false>(team);
+      } else {
+        if (m_rsplit == 0) epoch3_blockOps<false,true>(team);
+        else epoch3_blockOps<false,false>(team);
+      }
+
+      if (m_theta_hydrostatic_mode) epoch4_scanOps(team);
+
+      if (m_theta_hydrostatic_mode) {
+        if (m_rsplit == 0) epoch5_colOps<true,true>(team);
+        else epoch5_colOps<true,false>(team);
+      } else {
+        if (m_rsplit == 0) epoch5_colOps<false,true>(team);
+        else epoch5_colOps<false,false>(team);
+      }
+
+      if (m_theta_hydrostatic_mode) {
+        if (m_rsplit == 0) {
+          if (m_pgrad_correction) epoch6_blockOps<true,true,true>(team);
+          else epoch6_blockOps<true,true,false>(team);
+        } else {
+          if (m_pgrad_correction) epoch6_blockOps<true,false,true>(team);
+          else epoch6_blockOps<true,false,false>(team);
+        }
+      } else {
+        if (m_rsplit == 0) {
+          if (m_pgrad_correction) epoch6_blockOps<false,true,true>(team);
+          else epoch6_blockOps<false,true,false>(team);
+        } else {
+          if (m_pgrad_correction) epoch6_blockOps<false,false,true>(team);
+          else epoch6_blockOps<false,false,false>(team);
+        }
+      }
+
+      if (m_theta_hydrostatic_mode) {
+        if (m_rsplit == 0) epoch7_col<true,true>(team);
+        else epoch7_col<true,false>(team);
+      } else {
+        if (m_rsplit == 0) epoch7_col<false,true>(team);
+        else epoch7_col<false,false>(team);
+      }
+    });
+}
+
+}
diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
index 25cdc133fa6f..1655229bd04c 100644
--- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
@@ -30,6 +30,8 @@
 #include "profiling.hpp"
 #include "ErrorDefs.hpp"
 
+#include "Hommexx_config.h"
+
 #include <assert.h>
 
 namespace Homme {
@@ -107,7 +109,7 @@ struct CaarFunctorImpl {
   struct TagPostExchange {};
 
   // Policies
-#ifndef NDEBUG
+#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG)
   template<typename Tag>
   using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace,Kokkos::LaunchBounds<512,1>,Tag>;
 #else
@@ -123,6 +125,7 @@ struct CaarFunctorImpl {
 
   Kokkos::Array<std::shared_ptr<BoundaryExchange>, NUM_TIME_LEVELS> m_bes;
 
+  public:
   CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */,
                   const ReferenceElement &ref_FE, const HybridVCoord &hvcoord,
                   const SphereOperators &sphere_ops, const SimulationParams& params)
@@ -179,7 +182,7 @@ struct CaarFunctorImpl {
   }
 
   int requested_buffer_size () const {
-    // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs
+    // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs.
     const int nslots = m_tu.get_num_ws_slots();
 
     int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf;
@@ -190,8 +193,9 @@ struct CaarFunctorImpl {
     // Depending on rsplit/hydro-mode, we may remove some
     // buffers that are not needed from the counters above.
     if (m_theta_hydrostatic_mode) {
-      // pi=pnh, and no wtens/phitens
+      // pi=pnh, and no dpnh_dp_i/phitens
       num_scalar_mid_buf -= 1;
+      // No dpnh_dp_i/phitens/wtens
       num_scalar_int_buf -= 3;
 
       // No grad_w_i/v_i
@@ -347,9 +351,9 @@ struct CaarFunctorImpl {
     int nerr;
     Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr);
     Kokkos::fence();
-    GPTLstop("caar compute");
     if (nerr > 0)
       check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0);
+    GPTLstop("caar compute");
 
     GPTLstart("caar_bexchV");
     m_bes[data.np1]->exchange(m_geometry.m_rspheremp);
@@ -559,11 +563,13 @@ struct CaarFunctorImpl {
       kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int
 
       ColumnOps::column_scan_mid_to_int<true>(kv,div_vdp,omega_i);
+      kv.team_barrier();
       // Average omega_i to midpoints, and change sign, since later
       //   omega=v*grad(pi)-average(omega_i)
       auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp);
       ColumnOps::compute_midpoint_values<CombineMode::Scale>(kv,omega_i,omega,-1.0);
     });
+
     kv.team_barrier();
 
     // Compute grad(pi)
diff --git a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
index dd97720f1be2..037041d10315 100644
--- a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
@@ -248,6 +248,12 @@ class EquationOfState {
     ColumnOps::column_scan_mid_to_int<false>(kv,integrand_provider,phi_i);
   }
 
+  KOKKOS_INLINE_FUNCTION static
+  Scalar compute_dphi (const Scalar vtheta_dp, const Scalar exner, const Scalar p) {
+    return PhysicalConstants::Rgas*vtheta_dp*exner/p;
+  }
+
+
 public:
 
   bool            m_theta_hydrostatic_mode;
diff --git a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
index f36966a4bca5..ca8d4f6ac7ab 100644
--- a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
@@ -49,7 +49,7 @@ struct LimiterFunctor {
   struct TagDp3dLimiter {};
 
   // Policies
-#ifndef NDEBUG
+#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG)
   template<typename Tag>
   using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace,Kokkos::LaunchBounds<512,1>,Tag>;
 #else
diff --git a/components/homme/test/reg_test/run_tests/test-list.cmake b/components/homme/test/reg_test/run_tests/test-list.cmake
index 09607d387dcb..5eb238b90071 100644
--- a/components/homme/test/reg_test/run_tests/test-list.cmake
+++ b/components/homme/test/reg_test/run_tests/test-list.cmake
@@ -106,6 +106,13 @@ IF (BUILD_HOMME_THETA_KOKKOS)
         thetah-sl-test11conv-r0t1-cdr30-rrm
         thetanh-moist-bubble-sl
         thetanh-moist-bubble-sl-pg2)
+      # QLT vertical_levels not yet supported on GPU (compose_cedr_qlt.cpp:182)
+      IF (NOT (Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP))
+        LIST(APPEND HOMME_TESTS
+          thetah-sl-dcmip16_test1pg2-kokkos.cmake)
+        LIST(APPEND HOMME_ONEOFF_CVF_TESTS
+          thetah-sl-dcmip16_test1pg2)
+      ENDIF()
     ENDIF()
   ENDIF()
   IF (HOMMEXX_BFB_TESTING)
diff --git a/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake b/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake
new file mode 100644
index 000000000000..73142474416e
--- /dev/null
+++ b/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake
@@ -0,0 +1,13 @@
+# The name of this test (should be the basename of this file)
+SET(TEST_NAME thetah-sl-dcmip16_test1pg2-kokkos)
+# The specifically compiled executable that this test uses
+SET(EXEC_NAME theta-l-nlev30-kokkos)
+
+SET(NUM_CPUS 16)
+
+SET(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/thetah-sl-dcmip16_test1pg2.nl)
+SET(VCOORD_FILES ${HOMME_ROOT}/test/vcoord/*30*)
+
+# compare all of these files against baselines:
+SET(NC_OUTPUT_FILES 
+  dcmip2016_test1_pg21.nc)