diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands new file mode 100644 index 000000000000..d9fcf6cb6527 --- /dev/null +++ b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/caar/opt/shell_commands @@ -0,0 +1 @@ +./xmlchange --append SCREAM_CMAKE_OPTIONS='HOMMEXX_ENABLE_CAAR_OPT ON' diff --git a/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake index 9d7a9dd21f2c..be8d509cfa55 100644 --- a/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake +++ b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake @@ -125,6 +125,7 @@ SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") SET(Kokkos_ARCH_AMPERE80 OFF CACHE BOOL "") SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "") +SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "") SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/pm-cpu.cmake b/components/homme/cmake/machineFiles/pm-cpu.cmake index 84cb0875e7b9..3af1f09b0cdc 100644 --- a/components/homme/cmake/machineFiles/pm-cpu.cmake +++ b/components/homme/cmake/machineFiles/pm-cpu.cmake @@ -71,6 +71,7 @@ SET(Kokkos_ENABLE_CUDA_LAMBDA OFF CACHE BOOL "") SET(Kokkos_ARCH_AMPERE80 OFF CACHE BOOL "") SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "") +SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "") SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "") diff --git a/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake b/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake index e19eedaf496d..48d17e7ab091 100644 --- a/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake +++ b/components/homme/cmake/machineFiles/pm-gpu-bfb.cmake @@ -127,6 +127,7 @@ SET(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_UVM ON CACHE BOOL "") SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "") +SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "") SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "") @@ -139,5 +140,5 @@ SET(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "") SET(USE_NUM_PROCS 4 CACHE STRING "") # only default SET(USE_MPIEXEC "srun" CACHE STRING "") -SET(USE_MPI_OPTIONS "-K --cpu-bind=cores" CACHE STRING "") +SET(USE_MPI_OPTIONS "-C gpu -K --cpu-bind=cores" CACHE STRING "") SET(HOMME_TESTING_TIMELIMIT 1800 CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/pm-gpu.cmake b/components/homme/cmake/machineFiles/pm-gpu.cmake index 669857e03e75..716f3d1df2f0 100644 --- a/components/homme/cmake/machineFiles/pm-gpu.cmake +++ b/components/homme/cmake/machineFiles/pm-gpu.cmake @@ -76,6 +76,7 @@ SET(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_UVM ON CACHE BOOL "") SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") #SET(Kokkos_ENABLE_CUDA_ARCH_LINKING OFF CACHE BOOL "") +SET(HOMMEXX_ENABLE_CAAR_OPT OFF CACHE BOOL "") SET(CXXLIB_SUPPORTED_CACHE FALSE CACHE BOOL "") @@ -88,5 +89,5 @@ SET(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "") SET(USE_NUM_PROCS 4 CACHE STRING "") # only default SET(USE_MPIEXEC "srun" CACHE STRING "") -SET(USE_MPI_OPTIONS "-K --cpu-bind=cores" CACHE STRING "") +SET(USE_MPI_OPTIONS "-C gpu -K --cpu-bind=cores" CACHE STRING "") SET(HOMME_TESTING_TIMELIMIT 1800 CACHE STRING "") diff --git a/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp new file mode 100644 index 000000000000..aa28c6a7e37e --- /dev/null +++ b/components/homme/src/preqx_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp @@ -0,0 +1,3 @@ +// For preqx_kokkos, the caar-opt path is not implemented. +// Fall back to the standard CaarFunctorImpl. +#include "CaarFunctorImpl.hpp" diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp index 688007e27f24..923d38d4a770 100644 --- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp @@ -297,7 +297,7 @@ class HyperviscosityFunctorImpl SphereOperators m_sphere_ops; // Policies -#ifndef NDEBUG +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else diff --git a/components/homme/src/share/cxx/CaarFunctor.cpp b/components/homme/src/share/cxx/CaarFunctor.cpp index a0b27efd3725..bd9c60085957 100644 --- a/components/homme/src/share/cxx/CaarFunctor.cpp +++ b/components/homme/src/share/cxx/CaarFunctor.cpp @@ -5,7 +5,12 @@ *******************************************************************************/ #include "CaarFunctor.hpp" +#include "Hommexx_config.h" +#ifdef HOMMEXX_ENABLE_CAAR_OPT +#include "CaarFunctorImpl-caar-opt.hpp" +#else #include "CaarFunctorImpl.hpp" +#endif #include "Context.hpp" #include "Elements.hpp" #include "ErrorDefs.hpp" diff --git a/components/homme/src/share/cxx/Hommexx_config.h.in b/components/homme/src/share/cxx/Hommexx_config.h.in index d563d2bcde70..7e0c4e81dc42 100644 --- a/components/homme/src/share/cxx/Hommexx_config.h.in +++ b/components/homme/src/share/cxx/Hommexx_config.h.in @@ -24,6 +24,9 @@ // Whether the debug parts of cxx code should be compiled or not #cmakedefine HOMMEXX_DEBUG +// Enable CAAR Optimization +#cmakedefine HOMMEXX_ENABLE_CAAR_OPT + // Whether the MPI operations have to be performed directly on the device #cmakedefine01 HOMMEXX_MPI_ON_DEVICE diff --git a/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp b/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp new file mode 100644 index 000000000000..124d756adde5 --- /dev/null +++ b/components/homme/src/share/cxx/SphereOperators-caar-opt.hpp @@ -0,0 +1,1720 @@ +/******************************************************************************** + * HOMMEXX 1.0: Copyright of Sandia Corporation + * This software is released under the BSD license + * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory + *******************************************************************************/ + +#ifndef HOMMEXX_SPHERE_OPERATORS_HPP +#define HOMMEXX_SPHERE_OPERATORS_HPP + +#include "Types.hpp" +#include "ElementsGeometry.hpp" +#include "CombineOps.hpp" +#include "ReferenceElement.hpp" +#include "Dimensions.hpp" +#include "KernelVariables.hpp" +#include "utilities/SubviewUtils.hpp" +#include "utilities/ViewUtils.hpp" + +#include + +namespace Homme { + +class SphereOperators +{ + static constexpr int MAX_NUM_LEV = NUM_LEV_P; + static constexpr int NUM_2D_VECTOR_BUFFERS = 2; + static constexpr int NUM_3D_SCALAR_BUFFERS = 3; + static constexpr int NUM_3D_VECTOR_BUFFERS = 3; + + // These two short names will be used to extract subviews from the 3d buffers, + // since we can no longer use the auto keyword. Before we used to do + // const auto& gv = Homme::subview(vector_buf_ml,kv.team_idx,0); + // However, now vector_buf_ml has last timension NUM_LEV_P, but we may + // need gv to be smaller. Hence, we simply grab the pointer from the subview, + // but then we need to explicitly tell the compiler the type of the result, + // which can no longer be deduced. Like this: + // vector_buf + using scalar_buf = ExecViewUnmanaged; + + template + using vector_buf = ExecViewUnmanaged; + + // std::min is constexpr only from c++14 on. + template + struct Min { + static constexpr int value = M > N ? N : M; + }; + + template + using DefaultProvider = ExecViewUnmanaged; +public: + + + SphereOperators () = default; + + SphereOperators (const ElementsGeometry& geometry, + const ReferenceElement& ref_FE) + { + setup (geometry, ref_FE); + } + + //only for unit tests + SphereOperators (const Real scale_factor, const Real laplacian_rigid_factor) + { + m_scale_factor_inv = 1/scale_factor; + m_laplacian_rigid_factor = laplacian_rigid_factor; + } + + void setup (const ElementsGeometry& geometry, + const ReferenceElement& ref_FE) { + // Get reference element stuff + dvv = ref_FE.get_deriv(); + m_mp = ref_FE.get_mass(); + + // Get all needed 2d fields from elements + m_d = geometry.m_d; + m_dinv = geometry.m_dinv; + m_metdet = geometry.m_metdet; + m_metinv = geometry.m_metinv; + m_spheremp = geometry.m_spheremp; + m_scale_factor_inv = 1/geometry.m_scale_factor; + m_laplacian_rigid_factor = geometry.m_laplacian_rigid_factor; + } + + template + void allocate_buffers (const Kokkos::TeamPolicy& team_policy) + { + const int num_parallel_iterations = team_policy.league_size(); + const int alloc_dim = OnGpu::value ? + num_parallel_iterations : std::min(get_num_concurrent_teams(team_policy),num_parallel_iterations); + + if (vector_buf_ml.extent_int(0)& tu) + { + const int alloc_dim = tu.get_num_ws_slots(); + + if (vector_buf_ml.extent_int(0) dvv_in, + const ExecViewManaged d, + const ExecViewManaged dinv, + const ExecViewManaged metinv, + const ExecViewManaged metdet, + const ExecViewManaged spheremp, + const ExecViewManaged mp) + { + dvv = dvv_in; + m_d = d; + m_dinv = dinv; + m_metinv = metinv; + m_metdet = metdet; + m_spheremp = spheremp; + m_mp = mp; + } + + // ================ SINGLE-LEVEL IMPLEMENTATION =========================== // + + KOKKOS_INLINE_FUNCTION void + gradient_sphere_sl (const KernelVariables &kv, + const ExecViewUnmanaged& scalar, + const ExecViewUnmanaged< Real [2][NP][NP]>& grad_s) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + const auto& D_inv = Homme::subview(m_dinv,kv.ie); + const auto& temp_v_buf = Homme::subview(vector_buf_sl,kv.team_idx,0); + constexpr int np_squared = NP * NP; + // TODO: Use scratch space for this + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int j = loop_idx / NP; + const int l = loop_idx % NP; + Real dsdx(0), dsdy(0); + for (int i = 0; i < NP; ++i) { + dsdx += dvv(l, i) * scalar(j, i); + dsdy += dvv(l, i) * scalar(i, j); + } + temp_v_buf(0, j, l) = dsdx * m_scale_factor_inv; + temp_v_buf(1, l, j) = dsdy * m_scale_factor_inv; + }); + kv.team_barrier(); + + constexpr int grad_iters = 2 * NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, grad_iters), + [&](const int loop_idx) { + const int h = (loop_idx / NP) / NP; + const int i = (loop_idx / NP) % NP; + const int j = loop_idx % NP; + grad_s(h, j, i) = D_inv(h, 0, j, i) * temp_v_buf(0, j, i) + + D_inv(h, 1, j, i) * temp_v_buf(1, j, i); + }); + kv.team_barrier(); + } + + KOKKOS_INLINE_FUNCTION void + gradient_sphere_update_sl (const KernelVariables &kv, + const ExecViewUnmanaged& scalar, + const ExecViewUnmanaged< Real [2][NP][NP]>& grad_s) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + constexpr int np_squared = NP * NP; + const auto& D_inv = Homme::subview(m_dinv,kv.ie); + const auto& temp_v_buf = Homme::subview(vector_buf_sl,kv.team_idx,0); + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int j = loop_idx / NP; + const int l = loop_idx % NP; + Real dsdx(0), dsdy(0); + for (int i = 0; i < NP; ++i) { + dsdx += dvv(l, i) * scalar(j, i); + dsdy += dvv(l, i) * scalar(i, j); + } + temp_v_buf(0, j, l) = dsdx * m_scale_factor_inv; + temp_v_buf(1, l, j) = dsdy * m_scale_factor_inv; + }); + kv.team_barrier(); + + constexpr int grad_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, grad_iters), + [&](const int loop_idx) { + const int i = loop_idx / NP; + const int j = loop_idx % NP; + const auto& tmp0 = temp_v_buf(0,i,j); + const auto& tmp1 = temp_v_buf(1,i,j); + grad_s(0,i,j) += D_inv(0,0,i,j) * tmp0 + D_inv(0,1,i,j) * tmp1; + grad_s(1,i,j) += D_inv(1,0,i,j) * tmp0 + D_inv(1,1,i,j) * tmp1; + }); + kv.team_barrier(); + } + + KOKKOS_INLINE_FUNCTION void + divergence_sphere_sl (const KernelVariables &kv, + const ExecViewUnmanaged& v, + const ExecViewUnmanaged< Real [NP][NP]>& div_v) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + const auto& metdet = Homme::subview(m_metdet,kv.ie); + const auto& D_inv = Homme::subview(m_dinv,kv.ie); + const auto& gv_buf = Homme::subview(vector_buf_sl,kv.team_idx,0); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + const auto& v0 = v(0,igp,jgp); + const auto& v1 = v(1,igp,jgp); + gv_buf(0,igp,jgp) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp); + gv_buf(1,igp,jgp) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp); + }); + kv.team_barrier(); + + constexpr int div_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Real dudx = 0.0, dvdy = 0.0; + for (int kgp = 0; kgp < NP; ++kgp) { + dudx += dvv(jgp, kgp) * gv_buf(0, igp, kgp); + dvdy += dvv(igp, kgp) * gv_buf(1, kgp, jgp); + } + div_v(igp,jgp) = (dudx + dvdy) * ((1.0 / metdet(igp,jgp)) * + m_scale_factor_inv); + }); + kv.team_barrier(); + } + + KOKKOS_INLINE_FUNCTION void + divergence_sphere_wk_sl (const KernelVariables &kv, + const ExecViewUnmanaged& v, + const ExecViewUnmanaged< Real [NP][NP]>& div_v) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + const auto& D_inv = Homme::subview(m_dinv,kv.ie); + const auto& spheremp = Homme::subview(m_spheremp,kv.ie); + const auto& gv_buf = Homme::subview(vector_buf_sl,kv.team_idx,0); + + // copied from strong divergence as is but without metdet + // conversion to contravariant + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + const auto& v0 = v(0,igp,jgp); + const auto& v1 = v(1,igp,jgp); + gv_buf(0,igp,jgp) = D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1; + gv_buf(1,igp,jgp) = D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1; + }); + kv.team_barrier(); + + // in strong div + // kgp = i in strong code, jgp=j, igp=l + // in weak div, n is like j in strong div, + // n(weak)=j(strong)=jgp + // m(weak)=l(strong)=igp + // j(weak)=i(strong)=kgp + constexpr int div_iters = NP * NP; + // keeping indices' names as in F + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters), + [&](const int loop_idx) { + // Note: for this one time, it is better if m strides faster, due to + // the way the views are accessed. + const int mgp = loop_idx % NP; + const int ngp = loop_idx / NP; + Real dd = 0.0; + for (int jgp = 0; jgp < NP; ++jgp) { + dd -= (spheremp(ngp, jgp) * gv_buf(0, ngp, jgp) * dvv(jgp, mgp) + + spheremp(jgp, mgp) * gv_buf(1, jgp, mgp) * dvv(jgp, ngp)) * + m_scale_factor_inv; + } + div_v(ngp, mgp) = dd; + }); + kv.team_barrier(); + + } // end of divergence_sphere_wk_sl + + // Note that divergence_sphere requires scratch space of 3 x NP x NP Reals + // This must be called from the device space + KOKKOS_INLINE_FUNCTION void + vorticity_sphere_sl (const KernelVariables &kv, + const ExecViewUnmanaged& u, + const ExecViewUnmanaged& v, + const ExecViewUnmanaged< Real [NP][NP]>& vort) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + const auto& D = Homme::subview(m_d,kv.ie); + const auto& metdet = Homme::subview(m_metdet,kv.ie); + const auto& vcov_buf = Homme::subview(vector_buf_sl,kv.team_idx,0); + + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + const auto& u_ij = u(igp,jgp); + const auto& v_ij = v(igp,jgp); + vcov_buf(0,igp,jgp) = D(0,0,igp,jgp) * u_ij + D(0,1,igp,jgp) * v_ij; + vcov_buf(1,igp,jgp) = D(1,0,igp,jgp) * u_ij + D(1,1,igp,jgp) * v_ij; + }); + kv.team_barrier(); + + constexpr int vort_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Real dudy = 0.0; + Real dvdx = 0.0; + for (int kgp = 0; kgp < NP; ++kgp) { + dvdx += dvv(jgp, kgp) * vcov_buf(1, igp, kgp); + dudy += dvv(igp, kgp) * vcov_buf(0, kgp, jgp); + } + + vort(igp, jgp) = (dvdx - dudy) * ((1.0 / metdet(igp, jgp)) * + m_scale_factor_inv); + }); + kv.team_barrier(); + } + + // analog of fortran's laplace_wk_sphere + // Single level implementation + KOKKOS_INLINE_FUNCTION void + laplace_wk_sl (const KernelVariables &kv, + const ExecViewUnmanaged& field, + const ExecViewUnmanaged< Real [NP][NP]>& laplace) const + { + // Make sure the buffers have been created + assert (vector_buf_sl.size()>0); + + const auto& grad_s = Homme::subview(vector_buf_sl, kv.team_idx, 1); + gradient_sphere_sl(kv, field, grad_s); + divergence_sphere_wk_sl(kv, grad_s, laplace); + } // end of laplace_wk_sl + + // ================ MULTI-LEVEL IMPLEMENTATION =========================== // + + // Note: if you are puzzled by the use/need of ViewConst, don't worry, you're not alone. + // To be clear, using `const typename ExecViewUnmanaged::const_type` + // would also work. I prefer ViewConst cause it does not change the 'template + // signature of the type', while Kokkos::View<...>::const_type has a pre-defined + // templated signature (with data type, layout and memory space). + // Anyhow, back to the point, the key fact is that (quoting cppreference), + // + // "The nested-name-specifier (everything to the left of the scope resolution + // operator ::) of a type that was specified using a qualified-id" [do not + // participate in template argument deduction]. + // + // So, without the `typename Blah::type` trick, the type of scalar is used to deduce + // the template arguments, but if the input view had data type non-const, this + // makes the template deduction fail. On the other hand, *with* the `typename Blah::type` + // trick, only the type of the output view is used to deduce the template args. This + // succeeds, and then the copy constructor of View is used to produce a View + // from a View. Magic. + + template + KOKKOS_INLINE_FUNCTION void + gradient_sphere (const KernelVariables &kv, + const InputProvider& scalar, + const ExecViewUnmanaged& grad_s, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D_inv = Homme::subview(m_dinv, kv.ie); + + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar v0, v1; + for (int kgp = 0; kgp < NP; ++kgp) { + v0 += dvv(jgp, kgp) * scalar(igp, kgp, ilev); + v1 += dvv(igp, kgp) * scalar(kgp, jgp, ilev); + } + v0 *= m_scale_factor_inv; + v1 *= m_scale_factor_inv; + grad_s(0,igp,jgp,ilev) = D_inv(0,0,igp,jgp) * v0 + D_inv(0,1,igp,jgp) * v1; + grad_s(1,igp,jgp,ilev) = D_inv(1,0,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1; + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + gradient_sphere (const KernelVariables &kv, + const InputProvider& scalar, + const ExecViewUnmanaged& grad_s) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + gradient_sphere(kv, scalar, grad_s, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + gradient_sphere_update (const KernelVariables &kv, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& grad_s) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D_inv = Homme::subview(m_dinv, kv.ie); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dsdx, dsdy; + for (int kgp = 0; kgp < NP; ++kgp) { + dsdx += dvv(jgp, kgp) * scalar(igp, kgp, ilev); + dsdy += dvv(igp, kgp) * scalar(kgp, jgp, ilev); + } + dsdx *= m_scale_factor_inv; + dsdy *= m_scale_factor_inv; + grad_s(0,igp,jgp,ilev) += D_inv(0,0,igp,jgp) * dsdx + D_inv(0,1,igp,jgp) * dsdy; + grad_s(1,igp,jgp,ilev) += D_inv(1,0,igp,jgp) * dsdx + D_inv(1,1,igp,jgp) * dsdy; + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere (const KernelVariables &kv, + const InputProvider& v, + const ExecViewUnmanaged& div_v, + const Real alpha = 1.0, const Real beta = 0.0) const + { + divergence_sphere_cm(kv,v,div_v,alpha,beta); + + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_nlev (const KernelVariables &kv, + const InputProvider& v, + const ExecViewUnmanaged& div_v, + const int NUM_LEV_REQUEST, + const Real alpha = 1.0, const Real beta = 0.0) const + { + divergence_sphere_cm(kv,v,div_v,alpha,beta,NUM_LEV_REQUEST); + + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_cm (const KernelVariables &kv, + const InputProvider& v, + const ExecViewUnmanaged& div_v, + const Real alpha, const Real beta, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D_inv = Homme::subview(m_dinv, kv.ie); + const auto& metdet = Homme::subview(m_metdet, kv.ie); + vector_buf gv_buf(Homme::subview(vector_buf_ml,kv.team_idx, 0).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& v0 = v(0, igp, jgp, ilev); + const auto& v1 = v(1, igp, jgp, ilev); + gv_buf(0,igp,jgp,ilev) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp); + gv_buf(1,igp,jgp,ilev) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp); + }); + }); + kv.team_barrier(); + + // j, l, i -> i, j, k + constexpr int div_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dudx, dvdy; + for (int kgp = 0; kgp < NP; ++kgp) { + dudx += dvv(jgp, kgp) * gv_buf(0, igp, kgp, ilev); + dvdy += dvv(igp, kgp) * gv_buf(1, kgp, jgp, ilev); + } + combine((dudx + dvdy) * (1.0 / metdet(igp, jgp) * m_scale_factor_inv), + div_v(igp, jgp, ilev), alpha, beta); + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_cm (const KernelVariables &kv, + const InputProvider& v, + const ExecViewUnmanaged& div_v, + const Real alpha = 1.0, const Real beta = 0.0) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + divergence_sphere_cm(kv, v, div_v, alpha, beta, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_update (const KernelVariables &kv, + const Real alpha, const bool add_hyperviscosity, + const typename ViewConst>::type& vstar, + const typename ViewConst>::type& qdp, + // On input, qtens_biharmonic if add_hyperviscosity, undefined + // if not; on output, qtens. + const ExecViewUnmanaged& qtens) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D_inv = Homme::subview(m_dinv, kv.ie); + const auto& metdet = Homme::subview(m_metdet, kv.ie); + vector_buf gv(Homme::subview(vector_buf_ml,kv.team_idx,0).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& qdpijk = qdp(igp, jgp, ilev); + const auto v0 = vstar(0, igp, jgp, ilev) * qdpijk; + const auto v1 = vstar(1, igp, jgp, ilev) * qdpijk; + gv(0,igp,jgp,ilev) = (D_inv(0,0,igp,jgp) * v0 + D_inv(1,0,igp,jgp) * v1) * metdet(igp,jgp); + gv(1,igp,jgp,ilev) = (D_inv(0,1,igp,jgp) * v0 + D_inv(1,1,igp,jgp) * v1) * metdet(igp,jgp); + }); + }); + kv.team_barrier(); + + constexpr int div_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dudx, dvdy; + for (int kgp = 0; kgp < NP; ++kgp) { + dudx += dvv(jgp, kgp) * gv(0, igp, kgp, ilev); + dvdy += dvv(igp, kgp) * gv(1, kgp, jgp, ilev); + } + const Scalar qtensijk0 = add_hyperviscosity ? qtens(igp,jgp,ilev) : 0; + qtens(igp,jgp,ilev) = (qdp(igp,jgp,ilev) + + alpha*((dudx + dvdy) * (1.0 / metdet(igp,jgp) * m_scale_factor_inv)) + + qtensijk0); + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + vorticity_sphere (const KernelVariables &kv, + const typename ViewConst>::type& u, + const typename ViewConst>::type& v, + const ExecViewUnmanaged& vort) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D = Homme::subview(m_d, kv.ie); + const auto& metdet = Homme::subview(m_metdet, kv.ie); + vector_buf vcov_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& u_ijk = u(igp, jgp, ilev); + const auto& v_ijk = v(igp, jgp, ilev); + vcov_buf(0,igp,jgp,ilev) = D(0,0,igp,jgp) * u_ijk + D(0,1,igp,jgp) * v_ijk; + vcov_buf(1,igp,jgp,ilev) = D(1,0,igp,jgp) * u_ijk + D(1,1,igp,jgp) * v_ijk; + }); + }); + kv.team_barrier(); + + constexpr int vort_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dudy, dvdx; + for (int kgp = 0; kgp < NP; ++kgp) { + dvdx += dvv(jgp, kgp) * vcov_buf(1, igp, kgp, ilev); + dudy += dvv(igp, kgp) * vcov_buf(0, kgp, jgp, ilev); + } + vort(igp, jgp, ilev) = (dvdx - dudy) * (1.0 / metdet(igp, jgp) * + m_scale_factor_inv); + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + vorticity_sphere (const KernelVariables &kv, + const typename ViewConst>::type& v, + const ExecViewUnmanaged& vort, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D = Homme::subview(m_d, kv.ie); + const auto& metdet = Homme::subview(m_metdet, kv.ie); + vector_buf sphere_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& v0 = v(0,igp,jgp,ilev); + const auto& v1 = v(1,igp,jgp,ilev); + sphere_buf(0,igp,jgp,ilev) = D(0,0,igp,jgp) * v0 + D(0,1,igp,jgp) * v1; + sphere_buf(1,igp,jgp,ilev) = D(1,0,igp,jgp) * v0 + D(1,1,igp,jgp) * v1; + }); + }); + kv.team_barrier(); + + constexpr int vort_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, vort_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dudy, dvdx; + for (int kgp = 0; kgp < NP; ++kgp) { + dvdx += dvv(jgp, kgp) * sphere_buf(1, igp, kgp, ilev); + dudy += dvv(igp, kgp) * sphere_buf(0, kgp, jgp, ilev); + } + vort(igp, jgp, ilev) = (dvdx - dudy) * (1.0 / metdet(igp, jgp) * + m_scale_factor_inv); + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + vorticity_sphere (const KernelVariables &kv, + const typename ViewConst>::type& v, + const ExecViewUnmanaged& vort) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + vorticity_sphere(kv, v, vort, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_wk (const KernelVariables &kv, + // On input, a field whose divergence is sought; on + // output, the view's data are invalid. + const ExecViewUnmanaged& v, + const ExecViewUnmanaged& div_v, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D_inv = Homme::subview(m_dinv, kv.ie); + const auto& spheremp = Homme::subview(m_spheremp, kv.ie); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto v0 = v(0,igp,jgp,ilev); + const auto v1 = v(1,igp,jgp,ilev); + v(0,igp,jgp,ilev) = D_inv(0, 0, igp, jgp) * v0 + D_inv(1, 0, igp, jgp) * v1; + v(1,igp,jgp,ilev) = D_inv(0, 1, igp, jgp) * v0 + D_inv(1, 1, igp, jgp) * v1; + }); + }); + kv.team_barrier(); + + constexpr int div_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, div_iters), + [&](const int loop_idx) { + // Note: for this one time, it is better if m strides faster, due to + // the way the views are accessed. + const int mgp = loop_idx % NP; + const int ngp = loop_idx / NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar dd; + // TODO: move multiplication by scale_factor_inv outside the loop + for (int jgp = 0; jgp < NP; ++jgp) { + // Here, v is the temporary buffer, aliased on the input v. + dd -= (spheremp(ngp, jgp) * v(0, ngp, jgp, ilev) * dvv(jgp, mgp) + + spheremp(jgp, mgp) * v(1, jgp, mgp, ilev) * dvv(jgp, ngp)) * + m_scale_factor_inv; + } + div_v(ngp, mgp, ilev) = dd; + }); + }); + kv.team_barrier(); + + }//end of divergence_sphere_wk + + template + KOKKOS_INLINE_FUNCTION void + divergence_sphere_wk (const KernelVariables &kv, + // On input, a field whose divergence is sought; on + // output, the view's data are invalid. + const ExecViewUnmanaged& v, + const ExecViewUnmanaged& div_v) const + { + divergence_sphere_wk(kv, v, div_v, NUM_LEV_REQUEST); + }//end of divergence_sphere_wk + + template + KOKKOS_INLINE_FUNCTION void + laplace_simple (const KernelVariables &kv, + const typename ViewConst>::type& field, + const ExecViewUnmanaged& laplace, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + vector_buf grad_s(Homme::subview(vector_buf_ml, kv.team_idx, 0).data()); + gradient_sphere(kv, field, grad_s, NUM_LEV_REQUEST); + divergence_sphere_wk(kv, grad_s, laplace, NUM_LEV_REQUEST); + }//end of laplace_simple + + template + KOKKOS_INLINE_FUNCTION void + laplace_simple (const KernelVariables &kv, + const typename ViewConst>::type& field, + const ExecViewUnmanaged& laplace) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + laplace_simple(kv, field, laplace, NUM_LEV_REQUEST); + }//end of laplace_simple + + template + KOKKOS_INLINE_FUNCTION void + laplace_tensor(const KernelVariables &kv, + const ExecViewUnmanaged& tensorVisc, + const typename ViewConst>::type& field, // input + const ExecViewUnmanaged& laplace) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + vector_buf grad_s(Homme::subview(vector_buf_ml, kv.team_idx, 1).data()); + vector_buf sphere_buf(Homme::subview(vector_buf_ml, kv.team_idx, 2).data()); + + gradient_sphere(kv, field, grad_s); + //now multiply tensorVisc(:,:,i,j)*grad_s(i,j) (matrix*vector, independent of i,j ) + //but it requires a temp var to store a result. the result is then placed to grad_s, + //or should it be an extra temp var instead of an extra loop? + constexpr int num_iters = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, num_iters), + [&](const int loop_idx) { + const int igp = loop_idx / NP; + const int jgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& grad_s0 = grad_s(0,igp,jgp,ilev); + const auto& grad_s1 = grad_s(1,igp,jgp,ilev); + sphere_buf(0,igp,jgp,ilev) = tensorVisc(0,0,igp,jgp) * grad_s0 + tensorVisc(1,0,igp,jgp) * grad_s1; + sphere_buf(1,igp,jgp,ilev) = tensorVisc(0,1,igp,jgp) * grad_s0 + tensorVisc(1,1,igp,jgp) * grad_s1; + }); + }); + kv.team_barrier(); + + divergence_sphere_wk(kv, sphere_buf, laplace); + }//end of laplace_tensor + + template + KOKKOS_INLINE_FUNCTION void + curl_sphere_wk_testcov (const KernelVariables &kv, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& curls, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D = Homme::subview(m_d, kv.ie); + vector_buf sphere_buf(Homme::subview(vector_buf_ml,kv.team_idx,0).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) { + const int ngp = loop_idx / NP; + const int mgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + auto& sb0 = sphere_buf(0, ngp, mgp, ilev); + auto& sb1 = sphere_buf(1, ngp, mgp, ilev); + sb0 = 0; + sb1 = 0; + for (int jgp = 0; jgp < NP; ++jgp) { + sb0 -= m_mp(jgp,mgp)*scalar(jgp,mgp,ilev)*dvv(jgp,ngp); + sb1 += m_mp(ngp,jgp)*scalar(ngp,jgp,ilev)*dvv(jgp,mgp); + } + }); + }); + kv.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; //slowest + const int jgp = loop_idx % NP; //fastest + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& sb0 = sphere_buf(0, igp, jgp, ilev); + const auto& sb1 = sphere_buf(1, igp, jgp, ilev); + curls(0,igp,jgp,ilev) = (D(0,0,igp,jgp) * sb0 + D(1,0,igp,jgp) * sb1) * m_scale_factor_inv; + curls(1,igp,jgp,ilev) = (D(0,1,igp,jgp) * sb0 + D(1,1,igp,jgp) * sb1) * m_scale_factor_inv; + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + curl_sphere_wk_testcov (const KernelVariables &kv, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& curls) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + curl_sphere_wk_testcov(kv, scalar, curls, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + curl_sphere_wk_testcov_update (const KernelVariables &kv, const Real alpha, const Real beta, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& curls, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D = Homme::subview(m_d, kv.ie); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) { + const int ngp = loop_idx / NP; + const int mgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar sb0, sb1; + for (int jgp = 0; jgp < NP; ++jgp) { + sb0 -= m_mp(jgp,mgp)*scalar(jgp,mgp,ilev)*dvv(jgp,ngp); + sb1 += m_mp(ngp,jgp)*scalar(ngp,jgp,ilev)*dvv(jgp,mgp); + } + curls(0,ngp,mgp,ilev) = beta*curls(0,ngp,mgp,ilev) + alpha * + ( D(0,0,ngp,mgp)*sb0 + D(1,0,ngp,mgp)*sb1 ) + * m_scale_factor_inv; + curls(1,ngp,mgp,ilev) = beta*curls(1,ngp,mgp,ilev) + alpha * + ( D(0,1,ngp,mgp)*sb0 + D(1,1,ngp,mgp)*sb1 ) + * m_scale_factor_inv; + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + curl_sphere_wk_testcov_update (const KernelVariables &kv, const Real alpha, const Real beta, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& curls) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + curl_sphere_wk_testcov_update(kv, alpha, beta, scalar, curls, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + grad_sphere_wk_testcov (const KernelVariables &kv, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& grads, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& D = Homme::subview(m_d, kv.ie); + const auto& metinv = Homme::subview(m_metinv, kv.ie); + const auto& metdet = Homme::subview(m_metdet, kv.ie); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), [&](const int loop_idx) { + const int ngp = loop_idx / NP; + const int mgp = loop_idx % NP; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + Scalar b0, b1; + for (int jgp = 0; jgp < NP; ++jgp) { + const auto& mpnj = m_mp(ngp,jgp); + const auto& mpjm = m_mp(jgp,mgp); + const auto& md = metdet(ngp,mgp); + const auto& snj = scalar(ngp,jgp,ilev); + const auto& sjm = scalar(jgp,mgp,ilev); + const auto& djm = dvv(jgp,mgp); + const auto& djn = dvv(jgp,ngp); + b0 -= (mpnj * metinv(0,0,ngp,mgp) * md * snj * djm + + mpjm * metinv(0,1,ngp,mgp) * md * sjm * djn); + b1 -= (mpnj * metinv(1,0,ngp,mgp) * md * snj * djm + + mpjm * metinv(1,1,ngp,mgp) * md * sjm * djn); + } + grads(0,ngp,mgp,ilev) = (D(0,0,ngp,mgp) * b0 + D(1,0,ngp,mgp) * b1) * m_scale_factor_inv; + grads(1,ngp,mgp,ilev) = (D(0,1,ngp,mgp) * b0 + D(1,1,ngp,mgp) * b1) * m_scale_factor_inv; + }); + }); + kv.team_barrier(); + } + + template + KOKKOS_INLINE_FUNCTION void + grad_sphere_wk_testcov (const KernelVariables &kv, + const typename ViewConst>::type& scalar, + const ExecViewUnmanaged& grads) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + grad_sphere_wk_testcov(kv, scalar, grads, NUM_LEV_REQUEST); + } + + template + KOKKOS_INLINE_FUNCTION void + vlaplace_sphere_wk_cartesian (const KernelVariables &kv, + const ExecViewUnmanaged& tensorVisc, + const ExecViewUnmanaged& vec_sph2cart, + const typename ViewConst>::type& vector, + const ExecViewUnmanaged& laplace) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& spheremp = Homme::subview(m_spheremp, kv.ie); + scalar_buf laplace0(Homme::subview(scalar_buf_ml,kv.team_idx,0).data()); + scalar_buf laplace1(Homme::subview(scalar_buf_ml,kv.team_idx,1).data()); + scalar_buf laplace2(Homme::subview(scalar_buf_ml,kv.team_idx,2).data()); + constexpr int np_squared = NP * NP; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; //slowest + const int jgp = loop_idx % NP; //fastest + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + const auto& v0 = vector(0,igp,jgp,ilev); + const auto& v1 = vector(1,igp,jgp,ilev); + laplace0(igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*v0 + vec_sph2cart(1,0,igp,jgp)*v1 ; + laplace1(igp,jgp,ilev) = vec_sph2cart(0,1,igp,jgp)*v0 + vec_sph2cart(1,1,igp,jgp)*v1 ; + laplace2(igp,jgp,ilev) = vec_sph2cart(0,2,igp,jgp)*v0 + vec_sph2cart(1,2,igp,jgp)*v1 ; + }); + }); + kv.team_barrier(); + + // Use laplace* as input, and then overwrite it with the output (saves temporaries) + laplace_tensor(kv,tensorVisc,laplace0,laplace0); + laplace_tensor(kv,tensorVisc,laplace1,laplace1); + laplace_tensor(kv,tensorVisc,laplace2,laplace2); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; //slowest + const int jgp = loop_idx % NP; //fastest + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { +#define UNDAMPRRCART +#ifdef UNDAMPRRCART + laplace(0,igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*laplace0(igp,jgp,ilev) + + vec_sph2cart(0,1,igp,jgp)*laplace1(igp,jgp,ilev) + + vec_sph2cart(0,2,igp,jgp)*laplace2(igp,jgp,ilev) + + 2.0*spheremp(igp,jgp)*vector(0,igp,jgp,ilev) + *(m_laplacian_rigid_factor)*(m_laplacian_rigid_factor); + + laplace(1,igp,jgp,ilev) = vec_sph2cart(1,0,igp,jgp)*laplace0(igp,jgp,ilev) + + vec_sph2cart(1,1,igp,jgp)*laplace1(igp,jgp,ilev) + + vec_sph2cart(1,2,igp,jgp)*laplace2(igp,jgp,ilev) + + 2.0*spheremp(igp,jgp)*vector(1,igp,jgp,ilev) + *(m_laplacian_rigid_factor)*(m_laplacian_rigid_factor); +#else + laplace(0,igp,jgp,ilev) = vec_sph2cart(0,0,igp,jgp)*laplace0(igp,jgp,ilev) + + vec_sph2cart(0,1,igp,jgp)*laplace1(igp,jgp,ilev) + + vec_sph2cart(0,2,igp,jgp)*laplace2(igp,jgp,ilev); + laplace(1,igp,jgp,ilev) = vec_sph2cart(1,0,igp,jgp)*laplace0(igp,jgp,ilev) + + vec_sph2cart(1,1,igp,jgp)*laplace1(igp,jgp,ilev) + + vec_sph2cart(1,2,igp,jgp)*laplace2(igp,jgp,ilev); +#endif + }); + }); + kv.team_barrier(); + } // end of vlaplace_sphere_wk_cartesian + + template + KOKKOS_INLINE_FUNCTION void + vlaplace_sphere_wk_contra (const KernelVariables &kv, const Real nu_ratio, + const typename ViewConst>::type& vector, + const ExecViewUnmanaged& laplace, + const int NUM_LEV_REQUEST) const + { + assert(NUM_LEV_REQUEST>=0); + assert(NUM_LEV_REQUEST<=NUM_LEV_IN); + assert(NUM_LEV_REQUEST<=NUM_LEV_OUT); + + // Make sure the buffers have been created + assert (vector_buf_ml.size()>0); + + const auto& spheremp = Homme::subview(m_spheremp, kv.ie); + scalar_buf div (Homme::subview(scalar_buf_ml,kv.team_idx,0).data()); + scalar_buf vort(Homme::subview(scalar_buf_ml,kv.team_idx,0).data()); + vector_buf grad_curl_cov(Homme::subview(vector_buf_ml,kv.team_idx,1).data()); + constexpr int np_squared = NP * NP; + + // grad(div(v)) + divergence_sphere_nlev(kv,vector,div,NUM_LEV_REQUEST); + if (nu_ratio>0 && nu_ratio!=1.0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; //slow + const int jgp = loop_idx % NP; //fast + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + div(igp,jgp,ilev) *= nu_ratio; + }); + }); + kv.team_barrier(); + } + grad_sphere_wk_testcov(kv,div,grad_curl_cov,NUM_LEV_REQUEST); + + // curl(curl(v)) + vorticity_sphere(kv,vector,vort,NUM_LEV_REQUEST); + curl_sphere_wk_testcov_update(kv,-1.0,1.0,vort,grad_curl_cov,NUM_LEV_REQUEST); + + const auto re2 = m_laplacian_rigid_factor*m_laplacian_rigid_factor; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team, np_squared), + [&](const int loop_idx) { + const int igp = loop_idx / NP; //slow + const int jgp = loop_idx % NP; //fast + const auto f = 2.0*spheremp(igp,jgp); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team, NUM_LEV_REQUEST), [&] (const int& ilev) { + laplace(0,igp,jgp,ilev) = f*vector(0,igp,jgp,ilev)*re2 + grad_curl_cov(0,igp,jgp,ilev); + laplace(1,igp,jgp,ilev) = f*vector(1,igp,jgp,ilev)*re2 + grad_curl_cov(1,igp,jgp,ilev); + }); + }); + kv.team_barrier(); + }//end of vlaplace_sphere_wk_contra + + template + KOKKOS_INLINE_FUNCTION void + vlaplace_sphere_wk_contra (const KernelVariables &kv, const Real nu_ratio, + const typename ViewConst>::type& vector, + const ExecViewUnmanaged& laplace) const + { + static_assert(NUM_LEV_REQUEST>=0, "Error! Invalid value for NUM_LEV_REQUEST.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_IN, "Error! Input view does not have enough levels.\n"); + static_assert(NUM_LEV_REQUEST<=NUM_LEV_OUT, "Error! Output view does not have enough levels.\n"); + + vlaplace_sphere_wk_contra(kv, nu_ratio, vector, laplace, NUM_LEV_REQUEST); + }//end of vlaplace_sphere_wk_contra + + // The buffers should be enough to handle any single call to any + // single sphere operator. + // One might prefer them to be private, but they are handy for + // users of SphereOperators for other things, and using the same + // memory buffers for multiple things gives performance in mem + // b/w-limited computations. + + ExecViewManaged vector_buf_sl; + ExecViewManaged scalar_buf_ml; + ExecViewManaged vector_buf_ml; + + + ExecViewManaged dvv; + ExecViewManaged m_mp; + + ExecViewManaged m_spheremp; + ExecViewManaged m_rspheremp; + ExecViewManaged m_metinv; + ExecViewManaged m_metdet; + ExecViewManaged m_d; + ExecViewManaged m_dinv; + + Real m_scale_factor_inv, m_laplacian_rigid_factor; +}; + +static constexpr int NPNP = NP * NP; + +#ifdef KOKKOS_ENABLE_CUDA +using TeamPolicy = Kokkos::TeamPolicy >; +#else +using TeamPolicy = Kokkos::TeamPolicy; +#endif + +#ifndef WARP_SIZE + +#if defined(KOKKOS_ENABLE_HIP) +#define WARP_SIZE warpSize +#elif defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) +#define WARP_SIZE 32 +#else +#define WARP_SIZE 1 +#endif + +#endif + +#if (WARP_SIZE == 1) + +#define SPHERE_BLOCK_START3(B,X,Y,Z) \ + Scalar X; Scalar sbo##X[NP][NP][NUM_LEV]; \ + Scalar Y; Scalar sbo##Y[NP][NP][NUM_LEV]; \ + Scalar Z; Scalar sbo##Z[NP][NP][NUM_LEV]; \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix, iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_START0(B) \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) \ + sbo##X[B.x][B.y][B.z] = X; \ + sbo##Y[B.x][B.y][B.z] = Y; \ + sbo##Z[B.x][B.y][B.z] = Z; \ + }\ + } \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; \ + X = sbo##X[B.x][B.y][B.z]; \ + Y = sbo##Y[B.x][B.y][B.z]; \ + Z = sbo##Z[B.x][B.y][B.z]; + +#define SPHERE_BLOCK_MIDDLE0(B) \ + } \ + } \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_END() } } + +#else + +static_assert(VECTOR_SIZE == 1, "VECTOR_SIZE != 1"); + +// SPHERE_BLOCK macros for GPU (WARP_SIZE > 1). +// +// When NUM_PHYSICAL_LEV is not a multiple of WARP_SIZE (e.g. nlev=30, warp=32), +// threads with B.z >= NUM_LEV are padding threads. The original formulation +// used an early `return` before the team_barrier(), which violates the CUDA +// requirement that all threads in a CTA must reach __syncthreads(). This +// caused non-deterministic results between otherwise-identical runs. +// +// Fix: guard the body with `if (B.z < NUM_LEV)` blocks so that all 512 +// threads always reach the barrier. Local variables X/Y/Z are declared in +// outer scope (default-initialized) so they remain accessible after the +// barrier for the active (z < NUM_LEV) threads. + +#define SPHERE_BLOCK_START3(B,X,Y,Z) \ + Scalar X = {}, Y = {}, Z = {}; \ + if (B.z < NUM_LEV) { +#define SPHERE_BLOCK_START0(B) if (B.z < NUM_LEV) { + +#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) } B.t.team_barrier(); if (B.z < NUM_LEV) { +#define SPHERE_BLOCK_MIDDLE0(B) } B.t.team_barrier(); if (B.z < NUM_LEV) { + +#define SPHERE_BLOCK_END() } + +static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE; +static constexpr int SPHERE_BLOCK = NPNP * SPHERE_BLOCK_LEV; +static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1; + +#endif + +namespace SphereOuter { +#if WARP_SIZE == 1 + using Team = TeamPolicy::member_type; + template + void parallel_for(const int num_elems, F f) { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems, 1), + f); + } +#else + using Team = int; + template + void parallel_for(const int num_elems, F f) { + f(num_elems); + } +#endif +} + +struct SphereGlobal { + + const ExecViewManaged d; + const ExecViewManaged dinv; + const ExecViewManaged dvv; + const ExecViewManaged metdet; + const Real scale_factor_inv; + + SphereGlobal(const SphereOperators &op): + d(op.m_d), + dinv(op.m_dinv), + dvv(op.dvv), + metdet(op.m_metdet), + scale_factor_inv(op.m_scale_factor_inv) + {} +}; + +struct SphereBlockOps; + +#if (WARP_SIZE == 1) + +struct SphereBlockScratch { + const SphereBlockOps &b; + Scalar v[NP][NP][NUM_LEV]; + KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &); + KOKKOS_INLINE_FUNCTION const Scalar &sv(int x, int y) const; + KOKKOS_INLINE_FUNCTION Scalar &sv(int x, int y); +}; + +#else + +using SphereBlockScratchView = Kokkos::View< + Scalar[SPHERE_BLOCK_LEV][NP][NP], + ExecSpace::scratch_memory_space, + Kokkos::MemoryTraits + >; + +using SphereBlockScratchSubview = Kokkos::Subview< + SphereBlockScratchView, int, + std::remove_const_t, + std::remove_const_t + >; + +struct SphereBlockScratch { + SphereBlockScratchView v; + SphereBlockScratchSubview sv; + KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &b); +}; + +#endif + +struct SphereBlockOps { + using Team = TeamPolicy::member_type; + + const SphereGlobal &g; + const Team &t; + Real scale_factor_inv; + Real metdet; + Real rrdmd; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + int e,x,y,z; + + KOKKOS_INLINE_FUNCTION SphereBlockOps(const SphereGlobal &sg, const Team &team): + g(sg), + t(team), + scale_factor_inv(g.scale_factor_inv) + { +#if (WARP_SIZE == 1) + e = t.league_rank(); + x = y = z = 0; +#else + const int lr = t.league_rank(); + e = lr / SPHERE_BLOCKS_PER_COL; + const int iw = lr % SPHERE_BLOCKS_PER_COL; + const int tr = t.team_rank(); + const int ixy = tr / SPHERE_BLOCK_LEV; + const int dz = tr % SPHERE_BLOCK_LEV; + z = dz + iw * SPHERE_BLOCK_LEV; + update(ixy / NP, ixy % NP); +#endif + } + + KOKKOS_INLINE_FUNCTION void update(const int ix, const int iy) + { + rrdmd = 0; + x = ix; + y = iy; + metdet = g.metdet(e,x,y); + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = g.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = g.dvv(x,j); + dvvy[j] = g.dvv(y,j); + } + } + + KOKKOS_INLINE_FUNCTION Scalar div(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + // Separately compute du and dv to match Fortran bfb + Scalar du = 0; + Scalar dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvy[j] * t0.sv(x,j); + dv += dvvx[j] * t1.sv(j,y); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (du + dv) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void divInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const + { + t0.sv(x,y) = (dinv[0][0] * v0 + dinv[1][0] * v1) * metdet; + t1.sv(x,y) = (dinv[0][1] * v0 + dinv[1][1] * v1) * metdet; + } + + KOKKOS_INLINE_FUNCTION void grad(Scalar &g0, Scalar &g1, const SphereBlockScratch &t) const + { + Scalar s0 = 0; + Scalar s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * t.sv(x,j); + s1 += dvvx[j] * t.sv(j,y); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + g0 = dinv[0][0] * s0 + dinv[0][1] * s1; + g1 = dinv[1][0] * s0 + dinv[1][1] * s1; + } + + KOKKOS_INLINE_FUNCTION void gradInit(SphereBlockScratch &t0, const Scalar v0) + { + t0.sv(x,y) = v0; + } + + KOKKOS_INLINE_FUNCTION Scalar vort(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + Scalar du = 0; + Scalar dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvx[j] * t0.sv(j,y); + dv += dvvy[j] * t1.sv(x,j); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (dv - du) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void vortInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const + { + t0.sv(x,y) = g.d(e,0,0,x,y) * v0 + g.d(e,0,1,x,y) * v1; + t1.sv(x,y) = g.d(e,1,0,x,y) * v0 + g.d(e,1,1,x,y) * v1; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar *const here, const Scalar *const above) const + { + if (VECTOR_SIZE == 1) return above[0]; + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + if (z < NUM_LEV_P-1) out[VECTOR_END] = above[0][0]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const here, const Scalar *const above) const + { + if (VECTOR_SIZE == 1) return (z < NUM_LEV-1) ? above[0] : top; + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + if (z < NUM_LEV-1) out[VECTOR_END] = above[0][0]; + else out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zbelow(const Scalar bottom, const Scalar *const below, const Scalar *const here) const + { + if (VECTOR_SIZE == 1) return (z > 0) ? below[0] : bottom; + Scalar out; + for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1]; + if (z > 0) out[0] = below[0][VECTOR_END]; + else out[0] = bottom[0]; + return out; + } + +#if (WARP_SIZE == 1) + template + static void parallel_for(const Team &team, int /*num_scratch*/, F f) + { + f(team); + } +#else + template + static void parallel_for(const int num_elems, const int num_scratch, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems * SPHERE_BLOCKS_PER_COL, SPHERE_BLOCK). + set_scratch_size(0, Kokkos::PerTeam(num_scratch * SphereBlockScratchView::shmem_size())), + f); + } +#endif + +}; + +#if (WARP_SIZE == 1) + +KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b): + b(b) +{} + +KOKKOS_INLINE_FUNCTION const Scalar &SphereBlockScratch::sv(const int x, const int y) const +{ return v[x][y][b.z]; } + +KOKKOS_INLINE_FUNCTION Scalar &SphereBlockScratch::sv(const int x, const int y) +{ return v[x][y][b.z]; } + +#else + +KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b): + v(b.t.team_scratch(0)), + sv(Kokkos::subview(v, b.z % SPHERE_BLOCK_LEV, Kokkos::ALL, Kokkos::ALL)) +{} + +#endif + +struct SphereCol { + int e,x,y,z; + +#if (WARP_SIZE == 1) + struct Team { int e,x,y,z; }; + KOKKOS_INLINE_FUNCTION SphereCol(const Team &team): + e(team.e), + x(team.x), + y(team.y), + z(team.z) + {} + + template + static void parallel_for(const SphereOuter::Team &outer, const int num_lev, F f) + { + const int e = outer.league_rank(); + for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(outer, num_lev), + [=] (const int z) { f(Team{e,x,y,z}); }); + } + } +#else + using Team = TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION SphereCol(const Team &team) + { + const int lr = team.league_rank(); + e = lr / NPNP; + const int xy = lr % NPNP; + x = xy / NP; + y = xy % NP; + z = team.team_rank(); + } + + template + static void parallel_for(const int num_elems, const int num_lev, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems * NPNP, num_lev), + f); + } +#endif + + KOKKOS_INLINE_FUNCTION Scalar zplus(const Scalar *const here, const Scalar *const above) const + { + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + out[VECTOR_END] = above[0][0]; + return out; + } +}; + +struct SphereColOps: SphereCol { + Real scale_factor_inv; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + + KOKKOS_INLINE_FUNCTION SphereColOps(const SphereGlobal &sg, const Team &team): + SphereCol(team) + { + scale_factor_inv = sg.scale_factor_inv; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = sg.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = sg.dvv(x,j); + dvvy[j] = sg.dvv(y,j); + } + } + + template + KOKKOS_INLINE_FUNCTION void grad(OutView &out, const InView &in, const int n) const + { + Scalar s0 = 0; + Scalar s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * in(e,n,x,j,z); + s1 += dvvx[j] * in(e,n,j,y,z); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + out(e,0,x,y,z) = dinv[0][0] * s0 + dinv[0][1] * s1; + out(e,1,x,y,z) = dinv[1][0] * s0 + dinv[1][1] * s1; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const in) const + { + Scalar out = *in; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zbelow(const Real bottom, const Scalar *const below, const Scalar *const here) const + { + Scalar out; + out[0] = (z == 0) ? bottom : below[0][VECTOR_END]; + for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar ztop(const Real top, const Real in) const + { + Scalar out = in; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar ztrim(const Scalar bottom, const Scalar top, const Scalar in) const + { + Scalar out = in; + if (z == 0) out[0] = bottom[0]; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } +}; + +struct SphereScanOps { + TeamPolicy::member_type t; + int e,x,y; + +#if (WARP_SIZE == 1) + struct Team { + TeamPolicy::member_type t; + int e,x,y; + }; + + KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team): + t(team.t), + e(team.e), + x(team.x), + y(team.y) + {} + + template + static void parallel_for(const SphereOuter::Team &outer, F f) + { + const int e = outer.league_rank(); + for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) { + f(Team{outer,e,x,y}); + } + } +#else + using Team = TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team): + t(team) + { + const int lr = t.league_rank(); + const int tr = t.team_rank(); + e = lr; + x = tr / NP; + y = tr % NP; + } + + template + static void parallel_for(const int num_elems, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems, NPNP, WARP_SIZE), + f); + } +#endif + + template + KOKKOS_INLINE_FUNCTION void nacs(OutView &out, const int n, const InView &in, const EndView &end) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int a, Real &sum, const bool last) { + if (a == 0) out(e,n,x,y,NUM_PHYSICAL_LEV) = sum = end(e,x,y); + const int z = NUM_PHYSICAL_LEV-a-1; + sum += in(e,x,y,z); + if (last) out(e,n,x,y,z) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const int n, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,n,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } +}; + +} // namespace Homme + +#endif // HOMMEXX_SPHERE_OPERATORS_HPP diff --git a/components/homme/src/share/cxx/utilities/ViewUtils.hpp b/components/homme/src/share/cxx/utilities/ViewUtils.hpp index 1b3efcd91020..9ccf3d70fae1 100644 --- a/components/homme/src/share/cxx/utilities/ViewUtils.hpp +++ b/components/homme/src/share/cxx/utilities/ViewUtils.hpp @@ -67,6 +67,18 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data()), v_in.extent(0)); +} + template KOKKOS_INLINE_FUNCTION typename @@ -79,6 +91,30 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data()), v_in.extent(0)); +} + +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data()), v_in.extent(0)); +} + // Structure to define the type of a view that has a const data type, // given the type of an input view template diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt index 1909cc39cd4e..fd1ea51cc301 100644 --- a/components/homme/src/theta-l_kokkos/CMakeLists.txt +++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt @@ -184,6 +184,10 @@ MACRO(THETAL_KOKKOS_SETUP) ${SRC_SHARE_DIR}/cxx/utilities/Hash.cpp ) + IF (HOMMEXX_ENABLE_CAAR_OPT) + LIST(APPEND THETAL_DEPS_CXX ${TARGET_DIR}/cxx/CaarFunctorImpl.cpp) + ENDIF () + IF (HOMME_USE_ARKODE) SET(THETAL_DEPS_F90 ${THETAL_DEPS_F90} diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp new file mode 100644 index 000000000000..e38a12215cfb --- /dev/null +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl-caar-opt.hpp @@ -0,0 +1,1412 @@ +/******************************************************************************** + * HOMMEXX 1.0: Copyright of Sandia Corporation + * This software is released under the BSD license + * See the file 'COPYRIGHT' in the HOMMEXX/src/share/cxx directory + *******************************************************************************/ + +#ifndef HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP +#define HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP + +#include "Types.hpp" +#include "Elements.hpp" +#include "ColumnOps.hpp" +#include "Context.hpp" +#include "EquationOfState.hpp" +#include "FunctorsBuffersManager.hpp" +#include "HybridVCoord.hpp" +#include "KernelVariables.hpp" +#include "LimiterFunctor.hpp" +#include "ReferenceElement.hpp" +#include "RKStageData.hpp" +#include "SimulationParams.hpp" +#include "SphereOperators-caar-opt.hpp" +#include "kokkos_utils.hpp" + +#include "mpi/BoundaryExchange.hpp" +#include "mpi/MpiBuffersManager.hpp" +#include "utilities/SubviewUtils.hpp" +#include "utilities/ViewUtils.hpp" + +#include "profiling.hpp" +#include "ErrorDefs.hpp" + +#include +#include + +namespace Homme { + +// Theta does not use tracers in caar. A fwd decl is enough here +struct Tracers; + +struct CaarFunctorImpl { + + struct Buffers { + static constexpr int num_3d_scalar_mid_buf = 10; + static constexpr int num_3d_vector_mid_buf = 5; + static constexpr int num_3d_scalar_int_buf = 6; + static constexpr int num_3d_vector_int_buf = 3; + + ExecViewUnmanaged temp; + + ExecViewUnmanaged pnh; + ExecViewUnmanaged pi; + ExecViewUnmanaged exner; + ExecViewUnmanaged div_vdp; + ExecViewUnmanaged phi; + ExecViewUnmanaged omega_p; + ExecViewUnmanaged vort; + + ExecViewUnmanaged grad_exner; + ExecViewUnmanaged mgrad; + ExecViewUnmanaged grad_tmp; + ExecViewUnmanaged vdp; + + ExecViewUnmanaged dp_i; + ExecViewUnmanaged vtheta_i; + ExecViewUnmanaged dpnh_dp_i; + ExecViewUnmanaged eta_dot_dpdn; + + ExecViewUnmanaged v_i; + ExecViewUnmanaged grad_phinh_i; + ExecViewUnmanaged grad_w_i; + + ExecViewUnmanaged v_tens; + ExecViewUnmanaged theta_tens; + ExecViewUnmanaged dp_tens; + ExecViewUnmanaged w_tens; + ExecViewUnmanaged phi_tens; + }; + + using deriv_type = ReferenceElement::deriv_type; + + RKStageData m_data; + + // The 'm_data.scale2' coeff used for the calculation of w_tens and phi_tens must be replaced + // with 'm_data.scale1' on the surface (k=NUM_INTERFACE_LEV). To allow pack-level operations + // in the code, we store a pack containing [[scale2 scale2 ...] scale1 [garbage] ], to be used + // at pack NUM_LEV_P + + Scalar m_scale2g_last_int_pack; + + const int m_num_elems; + const int m_rsplit; + const bool m_theta_hydrostatic_mode; + const AdvectionForm m_theta_advection_form; + const bool m_pgrad_correction; + + HybridVCoord m_hvcoord; + ElementsState m_state; + ElementsDerivedState m_derived; + ElementsGeometry m_geometry; + EquationOfState m_eos; + Buffers m_buffers; + deriv_type m_deriv; + + SphereOperators m_sphere_ops; + + struct TagPreExchange {}; + struct TagPostExchange {}; + + // Policies +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) + template + using TeamPolicyType = Kokkos::TeamPolicy,Tag>; +#else + template + using TeamPolicyType = Kokkos::TeamPolicy; +#endif + + TeamPolicyType m_policy_pre; + + Kokkos::RangePolicy m_policy_post; + + TeamUtils m_tu; + + Kokkos::Array, NUM_TIME_LEVELS> m_bes; + + template void epoch1_blockOps(const SphereOuter::Team &); + template void epoch2_scanOps(const SphereOuter::Team &); + template void epoch3_blockOps(const SphereOuter::Team &); + void epoch4_scanOps(const SphereOuter::Team &); + template void epoch5_colOps(const SphereOuter::Team &); + template void epoch6_blockOps(const SphereOuter::Team &); + template void epoch7_col(const SphereOuter::Team &); + void caar_compute(); + + public: + CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */, + const ReferenceElement &ref_FE, const HybridVCoord &hvcoord, + const SphereOperators &sphere_ops, const SimulationParams& params) + : m_num_elems(elements.num_elems()) + , m_rsplit(params.rsplit) + , m_theta_hydrostatic_mode(params.theta_hydrostatic_mode) + , m_theta_advection_form(params.theta_adv_form) + , m_pgrad_correction(params.pgrad_correction) + , m_hvcoord(hvcoord) + , m_state(elements.m_state) + , m_derived(elements.m_derived) + , m_geometry(elements.m_geometry) + , m_deriv(ref_FE.get_deriv()) + , m_sphere_ops(sphere_ops) + , m_policy_pre (Homme::get_default_team_policy(m_num_elems)) + , m_policy_post (0,m_num_elems*NP*NP) + , m_tu(m_policy_pre) + { + // Initialize equation of state + m_eos.init(params.theta_hydrostatic_mode,m_hvcoord); + + // Make sure the buffers in sph op are large enough for this functor's needs + m_sphere_ops.allocate_buffers(m_tu); + } + + CaarFunctorImpl(const int num_elems, const SimulationParams& params) + : m_num_elems(num_elems) + , m_rsplit(params.rsplit) + , m_theta_hydrostatic_mode(params.theta_hydrostatic_mode) + , m_theta_advection_form(params.theta_adv_form) + , m_pgrad_correction(params.pgrad_correction) + , m_policy_pre (Homme::get_default_team_policy(m_num_elems)) + , m_policy_post (0,num_elems*NP*NP) + , m_tu(m_policy_pre) + {} + + void setup (const Elements &elements, const Tracers &/*tracers*/, + const ReferenceElement &ref_FE, const HybridVCoord &hvcoord, + const SphereOperators &sphere_ops) + { + assert(m_num_elems == elements.num_elems()); // Sanity check + m_hvcoord = hvcoord; + m_state = elements.m_state; + m_derived = elements.m_derived; + m_geometry = elements.m_geometry; + m_deriv = ref_FE.get_deriv(); + m_sphere_ops = sphere_ops; + + // Initialize equation of state + m_eos.init(m_theta_hydrostatic_mode,m_hvcoord); + + // Make sure the buffers in sph op are large enough for this functor's needs + m_sphere_ops.allocate_buffers(m_tu); + } + + int requested_buffer_size () const { + // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs. + // + // IMPORTANT: Why we use m_num_elems instead of m_tu.get_num_ws_slots() here. + // + // The operator()(TagPreExchange) path (used when HOMMEXX_ENABLE_CAAR_OPT is + // NOT defined, or in the #else branch) allocates buffers sized by + // m_tu.get_num_ws_slots() (= nslots = number of concurrent Kokkos teams). + // In that path, each team processes one element and uses its team workspace + // slot index (kv.team_idx, which is in [0, nslots)) as the first dimension + // into the buffer arrays. At most nslots teams run simultaneously, so nslots + // entries suffice. + // + // The epoch path in CaarFunctorImpl.cpp (the caar_compute() function, which + // is selected by the `#if 1` block in run()) uses a DIFFERENT indexing + // strategy: it iterates over ALL elements and directly uses the element index + // (b.e / c.e, in [0, m_num_elems)) as the first buffer dimension. For + // example: buffers_pnh(b.e, b.x, b.y, b.z). + // + // On serial CPU execution, m_tu.get_num_ws_slots() returns 1 (only one team + // can run at a time), but m_num_elems can be much larger (e.g. 7 for a + // 160-element mesh distributed over 24 MPI ranks). If we allocate only + // nslots=1 entries, then any access with b.e > 0 is out-of-bounds into + // uninitialised (or zeroed) memory, causing NaN values in the buffers and + // ultimately NaN in the DIRK Newton loop's dphi/phi diagnostics. + // + // The fix is to size the first buffer dimension by m_num_elems so that the + // epoch path's direct element-index access is always in bounds. On GPU, + // nslots == m_num_elems (all elements processed concurrently), so this + // change has no effect there. + const int nslots = m_num_elems; + + int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf; + int num_scalar_int_buf = Buffers::num_3d_scalar_int_buf; + int num_vector_mid_buf = Buffers::num_3d_vector_mid_buf; + int num_vector_int_buf = Buffers::num_3d_vector_int_buf; + + // Depending on rsplit/hydro-mode, we may remove some + // buffers that are not needed from the counters above. + if (m_theta_hydrostatic_mode) { + // pi=pnh, and no dpnh_dp_i/phitens + // NOTE: w_tens is still allocated in hydrostatic mode because epoch2_scanOps + // repurposes it as a scratch buffer for the omega integral scan (computing + // the prefix sum of div_vdp to get omega_i). Without this buffer allocated, + // epoch2 and epoch3 would access an uninitialized view -> SIGSEGV. + num_scalar_mid_buf -= 1; + num_scalar_int_buf -= 2; + + // No grad_w_i/v_i + num_vector_int_buf -= 2; + } + if (m_rsplit>0) { + // No theta_i/eta_dot_dpdn + num_scalar_int_buf -=2; + } + + return num_scalar_mid_buf *NP*NP*NUM_LEV *VECTOR_SIZE*nslots + + num_scalar_int_buf *NP*NP*NUM_LEV_P*VECTOR_SIZE*nslots + + num_vector_mid_buf*2*NP*NP*NUM_LEV *VECTOR_SIZE*nslots + + num_vector_int_buf*2*NP*NP*NUM_LEV_P*VECTOR_SIZE*nslots; + } + + void init_buffers (const FunctorsBuffersManager& fbm) { + Errors::runtime_check(fbm.allocated_size()>=requested_buffer_size(), "Error! Buffers size not sufficient.\n"); + + Scalar* mem = reinterpret_cast(fbm.get_memory()); + // Use m_num_elems (not m_tu.get_num_ws_slots()) as the first buffer + // dimension. See the comment in requested_buffer_size() for the full + // explanation. In short: the epoch path in CaarFunctorImpl.cpp accesses + // buffers with the element index (b.e / c.e) as the first dimension, so we + // must allocate m_num_elems slots regardless of how many Kokkos teams can + // run simultaneously. + const int nslots = m_num_elems; + + // Midpoints scalars + m_buffers.pnh = decltype(m_buffers.pnh )(mem,nslots); + mem += m_buffers.pnh.size(); + if (m_theta_hydrostatic_mode) { + // pi = pnh + m_buffers.pi = m_buffers.pnh; + } else { + m_buffers.pi = decltype(m_buffers.pi )(mem,nslots); + mem += m_buffers.pi.size(); + } + + m_buffers.temp = decltype(m_buffers.temp )(mem,nslots); + mem += m_buffers.temp.size(); + m_buffers.exner = decltype(m_buffers.exner )(mem,nslots); + mem += m_buffers.exner.size(); + m_buffers.phi = decltype(m_buffers.phi )(mem,nslots); + mem += m_buffers.phi.size(); + m_buffers.div_vdp = decltype(m_buffers.div_vdp )(mem,nslots); + mem += m_buffers.div_vdp.size(); + m_buffers.omega_p = decltype(m_buffers.omega_p )(mem,nslots); + mem += m_buffers.omega_p.size(); + m_buffers.vort = decltype(m_buffers.vort)(mem,nslots); + mem += m_buffers.vort.size(); + m_buffers.theta_tens = decltype(m_buffers.theta_tens)(mem,nslots); + mem += m_buffers.theta_tens.size(); + m_buffers.dp_tens = decltype(m_buffers.dp_tens )(mem,nslots); + mem += m_buffers.dp_tens.size(); + + // Midpoints vectors + m_buffers.grad_exner = decltype(m_buffers.grad_exner)(mem,nslots); + mem += m_buffers.grad_exner.size(); + m_buffers.mgrad = decltype(m_buffers.mgrad)(mem,nslots); + mem += m_buffers.mgrad.size(); + m_buffers.grad_tmp = decltype(m_buffers.grad_tmp)(mem,nslots); + mem += m_buffers.grad_tmp.size(); + + m_buffers.vdp = decltype(m_buffers.vdp )(mem,nslots); + mem += m_buffers.vdp.size(); + m_buffers.v_tens = decltype(m_buffers.v_tens )(mem,nslots); + mem += m_buffers.v_tens.size(); + + // Interface scalars + // ALWAYS allocate dp_i because we use it to store pi_i + m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots); + mem += m_buffers.dp_i.size(); + + if (!m_theta_hydrostatic_mode) { + m_buffers.dpnh_dp_i = decltype(m_buffers.dpnh_dp_i)(mem,nslots); + mem += m_buffers.dpnh_dp_i.size(); + } + + if (m_rsplit==0) { + m_buffers.eta_dot_dpdn = decltype(m_buffers.eta_dot_dpdn)(mem,nslots); + mem += m_buffers.eta_dot_dpdn.size(); + m_buffers.vtheta_i = decltype(m_buffers.vtheta_i )(mem,nslots); + mem += m_buffers.vtheta_i.size(); + } + + if (!m_theta_hydrostatic_mode) { + m_buffers.phi_tens = decltype(m_buffers.phi_tens )(mem,nslots); + mem += m_buffers.phi_tens.size(); + } + // ALWAYS allocate w_tens: in non-hydrostatic mode it stores the w-tendency; + // in hydrostatic mode it is repurposed by epoch2_scanOps as a scratch buffer + // for the omega integral scan (prefix sum of div_vdp -> omega_i). + // Failing to allocate it in hydrostatic mode causes epoch2 and epoch3 to + // access an uninitialized view, leading to SIGSEGV. + m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots); + mem += m_buffers.w_tens.size(); + + // Interface vectors + if (!m_theta_hydrostatic_mode) { + m_buffers.v_i = decltype(m_buffers.v_i )(mem,nslots); + mem += m_buffers.v_i.size(); + m_buffers.grad_w_i = decltype(m_buffers.grad_w_i )(mem,nslots); + mem += m_buffers.grad_w_i.size(); + } + m_buffers.grad_phinh_i = decltype(m_buffers.grad_phinh_i)(mem,nslots); + mem += m_buffers.grad_phinh_i.size(); + + assert ((reinterpret_cast(mem) - fbm.get_memory())==requested_buffer_size()); + } + + void init_boundary_exchanges (const std::shared_ptr& bm_exchange) { + const auto& sp = Context::singleton().get(); + for (int tl=0; tl(); + auto& be = *m_bes[tl]; + be.set_label(std::string("CAAR-") + std::to_string(tl)); + be.set_diagnostics_level(sp.internal_diagnostics_level); + be.set_buffers_manager(bm_exchange); + if (m_theta_hydrostatic_mode) { + be.set_num_fields(0,0,4); + } else { + be.set_num_fields(0,0,4,2); + } + be.register_field(m_state.m_v,tl,2,0); + be.register_field(m_state.m_vtheta_dp,1,tl); + be.register_field(m_state.m_dp3d,1,tl); + if (!m_theta_hydrostatic_mode) { + // Note: phinh_i at the surface (last level) is constant, so it doesn't *need* bex. + // If bex(constant)=constant, we might just do it. This would not eliminate + // the need for halo-exchange of interface-based quantities though, since + // we would still need to exchange w_i. + be.register_field(m_state.m_w_i,1,tl); + be.register_field(m_state.m_phinh_i,1,tl); + } + be.registration_completed(); + } + } + + void set_rk_stage_data (const RKStageData& data) { + m_data = data; + + // Set m_scale2 to m_data.scale2 everywhere, except on last interface, + // where we set it to m_data.scale1. While at it, we already multiply by g. + constexpr auto g = PhysicalConstants::g; + + m_scale2g_last_int_pack = m_data.scale2*g; + m_scale2g_last_int_pack[ColInfo::LastPackEnd] = m_data.scale1*g; + } + + void run (const RKStageData& data) + { + + auto& limiter = Context::singleton().get(); + + set_rk_stage_data(data); + + GPTLstart("caar compute"); + +#if 1 + + caar_compute(); + Kokkos::fence(); + GPTLstop("caar compute"); + +#else + + int nerr; + Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr); + Kokkos::fence(); + GPTLstop("caar compute"); + if (nerr > 0) + check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0); + +#endif + + GPTLstart("caar_bexchV"); + m_bes[data.np1]->exchange(m_geometry.m_rspheremp); + Kokkos::fence(); + GPTLstop("caar_bexchV"); + + if (!m_theta_hydrostatic_mode) { + GPTLstart("caar compute"); + Kokkos::parallel_for("caar loop post-boundary exchange", m_policy_post, *this); + Kokkos::fence(); + GPTLstop("caar compute"); + } + + limiter.run(data.np1); + + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagPreExchange&, const TeamMember &team, int& nerr) const { + // In this body, we use '====' to separate sync epochs (delimited by barriers) + // Note: make sure the same temp is not used within each epoch! + + KernelVariables kv(team, m_tu); + + // =========== EPOCH 1 =========== // + compute_div_vdp(kv); + + // =========== EPOCH 2 =========== // + kv.team_barrier(); + + // Computes pi, omega, and phi. + const bool ok = compute_scan_quantities(kv); + if ( ! ok) nerr = 1; + + if (m_rsplit==0 || !m_theta_hydrostatic_mode) { + // ============ EPOCH 2.1 =========== // + kv.team_barrier(); + compute_interface_quantities(kv); + } + + if (m_rsplit==0) { + // ============= EPOCH 2.2 ============ // + kv.team_barrier(); + compute_vertical_advection(kv); + } + + // ============= EPOCH 3 ============== // + kv.team_barrier(); + compute_accumulated_quantities(kv); + + // Compute update quantities + if (!m_theta_hydrostatic_mode) { + compute_w_and_phi_tens (kv); + } + + compute_dp_and_theta_tens (kv); + + // ============= EPOCH 4 =========== // + // compute_v_tens reuses some buffers used by compute_dp_and_theta_tens + kv.team_barrier(); + compute_v_tens (kv); + + // Update states + if (!m_theta_hydrostatic_mode) { + compute_w_and_phi_np1(kv); + } + compute_dp3d_and_theta_np1(kv); + + // ============= EPOCH 5 =========== // + // v_tens has been computed after last barrier. Need to make sure it's done + kv.team_barrier(); + compute_v_np1(kv); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagPostExchange&, const int idx) const { + // For g + using namespace PhysicalConstants; + + using InfoM = ColInfo; + using InfoI = ColInfo; + constexpr int LAST_MID_PACK = InfoM::LastPack; + constexpr int LAST_MID_PACK_END = InfoM::LastPackEnd; + constexpr int LAST_INT_PACK = InfoI::LastPack; + constexpr int LAST_INT_PACK_END = InfoI::LastPackEnd; + + // Note: make sure you run this only in non-hydro mode + // KernelVariables kv(team); + const int ie = idx / (NP*NP); + const int igp = (idx / NP) % NP; + const int jgp = idx % NP; + + auto& u = m_state.m_v(ie,m_data.np1,0,igp,jgp,LAST_MID_PACK)[LAST_MID_PACK_END]; + auto& v = m_state.m_v(ie,m_data.np1,1,igp,jgp,LAST_MID_PACK)[LAST_MID_PACK_END]; + auto& w = m_state.m_w_i(ie,m_data.np1,igp,jgp,LAST_INT_PACK)[LAST_INT_PACK_END]; + const auto& phis_x = m_geometry.m_gradphis(ie,0,igp,jgp); + const auto& phis_y = m_geometry.m_gradphis(ie,1,igp,jgp); + + // Compute dpnh_dp_i on surface + auto dpnh_dp_i = 1 + ( ( (u*phis_x + v*phis_y)/g - w) / + (g + (phis_x*phis_x+phis_y*phis_y)/(2*g) ) ) / m_data.dt; + + // Update w_i on bottom interface + // Update v on bottom level + w += m_data.scale1*m_data.dt*g*(dpnh_dp_i-1.0); + u -= m_data.scale1*m_data.dt*(dpnh_dp_i-1.0)*phis_x/2.0; + v -= m_data.scale1*m_data.dt*(dpnh_dp_i-1.0)*phis_y/2.0; + + // TODO: you need to modify the BoundaryExchange class a bit, cause as of today + // it exchanges *all* vertical levels. For phi, we don't want/need to + // exchange the last level, since phi=phis at surface. + // So to make sure we're not messing up, set phi back to phis on last interface + // Note: this is *independent* of whether NUM_LEV==NUM_LEV_P or not. + auto& phi_surf = m_state.m_phinh_i(ie,m_data.np1,igp,jgp,LAST_INT_PACK)[LAST_INT_PACK_END]; + phi_surf = m_geometry.m_phis(ie,igp,jgp); + +#if defined(ENERGY_DIAGNOSTICS) && !defined(NDEBUG) + // Check w bc + if (fabs( (u*phis_x+v*phis_y)/g - w ) > 1e-10) { + printf("[CAAR] WARNING! w b.c. not satisfied at (ie,igp,jgp) = (%d,%d,%d):\n" + " w: %3.15f\n" + " v*grad(phis)/g: %3.15f\n" + " diff: %3.15f\n", + ie,igp,jgp,w,(u*phis_x+v*phis_y)/g,fabs( (u*phis_x+v*phis_y)/g - w )); + } + + auto phi = Homme::viewAsReal(Homme::subview(m_state.m_phinh_i,ie,m_data.np1,igp,jgp)); + for (int k=0; kmidpoints average. + // For omega, we have omega_i(k+1) = omega_i(k)+div_vdp(k), with omega_i(0)=0; + // then, omega(k) = v*grad(pi) - average(omega_i). You can see how we could + // avoid computing omega_i, and set the accumulation step to be + // omega(k+1) = omega(k)+(div_vdp(k)+div_vdp(k-1))/2, with omega(0)=div_vdp(0)/2. + // However, avoiding computing omega_i would lose BFB agreement + // with the original F90 implementation. So for now, keep + // the two step calculation for omega (interface, then midpoints) + // TODO; skip calculation of omega_i + // Note: pi_i and omega_i are not needed after computing pi and omega, + // so simply grab unused buffers + auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp); + auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp); + auto pi = Homme::subview(m_buffers.pi,kv.team_idx,igp,jgp); + auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); + auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp); + + Kokkos::single(Kokkos::PerThread(kv.team),[&]() { + pi_i(0)[0] = m_hvcoord.ps0*m_hvcoord.hybrid_ai0; + }); + kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int + + ColumnOps::column_scan_mid_to_int(kv,dp,pi_i); + + ColumnOps::compute_midpoint_values(kv,pi_i,pi); + + Kokkos::single(Kokkos::PerThread(kv.team),[&]() { + omega_i(0)[0] = 0.0; + }); + kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int + + ColumnOps::column_scan_mid_to_int(kv,div_vdp,omega_i); + kv.team_barrier(); + // Average omega_i to midpoints, and change sign, since later + // omega=v*grad(pi)-average(omega_i) + auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp); + ColumnOps::compute_midpoint_values(kv,omega_i,omega,-1.0); + }); + + kv.team_barrier(); + + // Compute grad(pi) + m_sphere_ops.gradient_sphere(kv,Homme::subview(m_buffers.pi,kv.team_idx), + Homme::subview(m_buffers.grad_tmp,kv.team_idx)); + kv.team_barrier(); + + // Update omega with v*grad(p) + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp); + auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0); + auto grad_pi = Homme::subview(m_buffers.grad_tmp,kv.team_idx); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + omega(ilev) += (v(0,igp,jgp,ilev)*grad_pi(0,igp,jgp,ilev) + + v(1,igp,jgp,ilev)*grad_pi(1,igp,jgp,ilev)); + }); + + // Use EOS to compute pnh/exner or exner/phi (depending on whether it's hydro mode). + if (m_theta_hydrostatic_mode) { + // Recall that, in this case, pnh aliases pi, and pi was already computed + m_eos.compute_exner(kv,Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp), + Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp)); + + m_eos.compute_phi_i(kv, m_geometry.m_phis(kv.ie,igp,jgp), + Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0,igp,jgp), + Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp), + Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp), + Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp)); + } else { + const bool ok1 = + m_eos.compute_pnh_and_exner(kv, + Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0,igp,jgp), + Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp), + Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp), + Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp)); + if ( ! ok1) ok = false; + } + + // Compute phi at midpoints. + // IMPORTANT: The original PR #8192 optimization guards this with (m_rsplit == 0) + // because compute_interface_quantities (the vertical advection remap step) only + // consumes m_buffers.phi when rsplit==0. However, compute_phi_vadv in + // compute_vertical_advection is called whenever (!m_theta_hydrostatic_mode), + // regardless of rsplit, and it reads m_buffers.phi. So when rsplit>0 and + // !m_theta_hydrostatic_mode, m_buffers.phi is read uninitialized. + // TODO: revisit whether compute_phi_vadv should also be guarded by (m_rsplit==0), + // or whether compute_interface_quantities should re-compute phi midpoints rather + // than relying on this buffer. For now, widen the guard to cover the + // non-hydrostatic path so the buffer is always valid when it is read. + if (m_rsplit == 0 || !m_theta_hydrostatic_mode) { + ColumnOps::compute_midpoint_values(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0,igp,jgp), + Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp)); + } + }); + kv.team_barrier(); + return ok; + } + + KOKKOS_INLINE_FUNCTION + void compute_interface_quantities(KernelVariables &kv) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp); + auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp); + + // Compute interface dp + ColumnOps::compute_interface_values(kv.team,dp,dp_i); + + if (!m_theta_hydrostatic_mode) { + auto u = Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp); + auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp); + + // Compute interface horiz velocity + auto u_i = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp); + auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp); + + ColumnOps::compute_interface_values(kv.team,dp,dp_i,u,u_i); + ColumnOps::compute_interface_values(kv.team,dp,dp_i,v,v_i); + + // grad_phinh_i is yet to be computed, so the buffer is available + auto dpnh_dp_i = Homme::subview(m_buffers.dpnh_dp_i,kv.team_idx,igp,jgp); + + m_eos.compute_dpnh_dp_i(kv,Homme::subview(m_buffers.pnh,kv.team_idx,igp,jgp), + dp_i, + dpnh_dp_i); + } + + if (m_rsplit==0) { + // Shorter names, to keep a call to ColumnOps a bit shorter + using CM = CombineMode; + + // Compute interface vtheta_i, with an energy preserving scheme + auto vtheta_i = Homme::subview(m_buffers.vtheta_i,kv.team_idx,igp,jgp); + auto dexner_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); + auto exner = Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp); + auto phi = Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp); + + // vtheta_i(k) = -dpnh_dp_i(k)*(dphi(k) / dexner(k)) / Cp + // with dX = X(k+1)-X(k) + ColumnOps::compute_interface_delta(kv,phi,vtheta_i); + + // Since bcVal is the last input, if you need bcVal != 0.0, + // you also need to pass alpha and beta as well (1.0 and 0.0 resp.). + // Here, bcVal is 1.0 (to avoid nan/inf with division by 0) + ColumnOps::compute_interface_delta(kv,exner,vtheta_i,1.0,0.0,1.0); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + vtheta_i(ilev) /= -PhysicalConstants::cp; + if (!m_theta_hydrostatic_mode) { + vtheta_i(ilev) *= m_buffers.dpnh_dp_i(kv.team_idx,igp,jgp,ilev); + } + }); + } + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_vertical_advection(KernelVariables &kv) const { + // Compute vertical advection terms: + // - eta_dot_dpdn + // - v_vadv + // - theta_vadv + // - phi_vadv_i + // - w_vadv_i + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + compute_eta_dot_dpn (kv,igp,jgp); + compute_v_vadv (kv,igp,jgp); + compute_vtheta_vadv (kv,igp,jgp); + if (!m_theta_hydrostatic_mode) { + compute_w_vadv (kv,igp,jgp); + compute_phi_vadv (kv,igp,jgp); + } + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_eta_dot_dpn (KernelVariables& kv, const int& igp, const int& jgp) const { + auto div_vdp = viewAsReal(Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp)); + auto eta_dot_dpdn = viewAsReal(Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp)); + + // Integrate -vdp + Dispatch::parallel_scan(kv.team,NUM_PHYSICAL_LEV, + [&](const int ilev, Real& accumulator, const bool last) { + accumulator += div_vdp(ilev); + if (last) { + eta_dot_dpdn(ilev+1) = -accumulator; + } + }); + + // Get the last entry, which is the sum over the whole column + const Real eta_dot_dpdn_last = -eta_dot_dpdn(NUM_INTERFACE_LEV-1); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,1,NUM_PHYSICAL_LEV), + [&](const int ilev) { + eta_dot_dpdn(ilev) += m_hvcoord.hybrid_bi(ilev)*eta_dot_dpdn_last; + }); + + // Set the boundary conditions for eta_dot_dpdn + Kokkos::single(Kokkos::PerThread(kv.team),[&](){ + eta_dot_dpdn(0) = eta_dot_dpdn(NUM_INTERFACE_LEV-1) = 0; + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_v_vadv (KernelVariables& kv, const int& igp, const int& jgp) const { + // Note: save v_vadv temp by stuffing directly in vtens + + auto u = viewAsReal(Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp)); + auto v = viewAsReal(Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp)); + auto dp = viewAsReal(Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp)); + auto eta_dot_dpdn = viewAsReal(Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp)); + auto u_vadv = viewAsReal(Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp)); + auto v_vadv = viewAsReal(Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp)); + + // TODO: vectorize this code. + const auto facp_1 = 0.5*eta_dot_dpdn(1)/dp(0); + u_vadv(0) = facp_1*(u(1)-u(0)); + v_vadv(0) = facp_1*(v(1)-v(0)); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,1,NUM_PHYSICAL_LEV-1), + [&](const int k) { + const auto facp = 0.5*eta_dot_dpdn(k+1)/dp(k); + const auto facm = 0.5*eta_dot_dpdn(k)/dp(k); + u_vadv(k) = facp*(u(k+1)-u(k)) + facm*(u(k)-u(k-1)); + v_vadv(k) = facp*(v(k+1)-v(k)) + facm*(v(k)-v(k-1)); + }); + + constexpr int last = NUM_PHYSICAL_LEV-1; + const auto facm_N = 0.5*eta_dot_dpdn(last)/dp(last); + u_vadv(last) = facm_N*(u(last)-u(last-1)); + v_vadv(last) = facm_N*(v(last)-v(last-1)); + } + + KOKKOS_INLINE_FUNCTION + void compute_vtheta_vadv (KernelVariables& kv, const int& igp, const int& jgp) const { + auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp); + auto vtheta_i = Homme::subview(m_buffers.vtheta_i,kv.team_idx,igp,jgp); + // No point in going till NUM_LEV_P, since vtheta_i=0 at the bottom + // Also, this is the last time we need vtheta_i, so we can overwrite it. + + auto provider = [&](const int ilev)->Scalar{ + return eta_dot_dpdn(ilev)*vtheta_i(ilev); + }; + + // Compute theta_vadv, store directly in theta_tens + auto theta_vadv = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp); + ColumnOps::compute_midpoint_delta(kv,provider,theta_vadv); + } + + KOKKOS_INLINE_FUNCTION + void compute_w_vadv (KernelVariables& kv, const int& igp, const int& jgp) const { + // w_vadv = average(temp)/dp3d_i. + // temp = average(eta_dot)*delta(w_i) + // Note: store directly in w_tens + auto temp = Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp); + auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp); + auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp); + auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp); + auto w_vadv = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp); + + ColumnOps::compute_midpoint_delta(kv,w_i,temp); + ColumnOps::compute_midpoint_values(kv,eta_dot_dpdn,temp); + + ColumnOps::compute_interface_values(kv,temp,w_vadv); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV_P), + [&](const int ilev) { + w_vadv(ilev) /= dp_i(ilev); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_phi_vadv (KernelVariables& kv, const int& igp, const int& jgp) const { + // phi_vadv = eta_dot*delta(phi)/dp3d_i. + // Note: store directly into phi_tens + auto phi = Homme::subview(m_buffers.phi,kv.team_idx,igp,jgp); + auto phi_vadv = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp); + auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp); + auto dp_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp); + + ColumnOps::compute_interface_delta(kv,phi,phi_vadv); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + phi_vadv(ilev) *= eta_dot_dpdn(ilev); + phi_vadv(ilev) /= dp_i(ilev); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_accumulated_quantities(KernelVariables &kv) const { + // Compute omega = v*grad(p) + average(omega_i) + // Accmuulate: dereived.omega_p += eta_ave_w*omega + // Accmuulate: dereived.eta_dot_dpdn += eta_ave_w*eta_dot_dpdn + // Accumulate: vn0 += eta_ave_w*v*dp + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + // If rsplit>0, m_buffers.eta_dot_dpdn is identically 0, so skip this step + if (m_rsplit==0) { + // Accumulate eta_dot_dpdn + // Note: m_buffers.eta_dot_dpdn is 0 at top/bottom, so we can just accumulate NUM_LEV packs + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + m_derived.m_eta_dot_dpdn(kv.ie,igp,jgp,ilev) += + m_data.eta_ave_w*m_buffers.eta_dot_dpdn(kv.team_idx,igp,jgp,ilev); + }); + } + + // Compute omega = v*grad(p)+average(omega_i) and accumulate + // Accumulate v*dp in vn0 + // Note: so far omega already contains average(omega_i), since we didn't even + // bother storing omega_i, and computed directly the average + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + m_derived.m_omega_p(kv.ie,igp,jgp,ilev) += + m_data.eta_ave_w*m_buffers.omega_p(kv.team_idx,igp,jgp,ilev); + + m_derived.m_vn0(kv.ie,0,igp,jgp,ilev) += m_data.eta_ave_w*m_buffers.vdp(kv.team_idx,0,igp,jgp,ilev); + m_derived.m_vn0(kv.ie,1,igp,jgp,ilev) += m_data.eta_ave_w*m_buffers.vdp(kv.team_idx,1,igp,jgp,ilev); + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_w_and_phi_tens (KernelVariables& kv) const { + using namespace PhysicalConstants; + // Compute grad(phinh_i) + // Compute v*grad(w_i) + // Compute w_tens = scale1*(-w_vadv_i - v*grad(w_i)) - scale2*g*(1-dpnh_dp_i) + // Compute phi_tens = scale1*(-phi_vadv_i - v*grad(phinh_i)) + scale2*g*w_i + auto grad_w_i = Homme::subview(m_buffers.grad_w_i,kv.team_idx); + auto grad_phinh_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx); + m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0), + grad_phinh_i); + kv.team_barrier(); + m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_w_i,kv.ie,m_data.n0), + grad_w_i); + kv.team_barrier(); + + auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx); + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto w_tens = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp); + auto phi_tens = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV_P), + [&](const int ilev) { + + // Note: if rsplit=0, phi_tens/w_tens already contains phi_vadv/w_vadv, + // otherwise, just garbage from previous team + + // Compute w_tens + Scalar v_grad = v_i(0,igp,jgp,ilev)*grad_w_i(0,igp,jgp,ilev) + + v_i(1,igp,jgp,ilev)*grad_w_i(1,igp,jgp,ilev); + if (m_rsplit==0) { + w_tens(ilev) += v_grad; + } else { + w_tens(ilev) = v_grad; + } + w_tens(ilev) *= -m_data.scale1; + w_tens(ilev) += (m_buffers.dpnh_dp_i(kv.team_idx,igp,jgp,ilev)-1) * + (ilev==(NUM_LEV_P-1) ? m_scale2g_last_int_pack : m_data.scale2*g); + + // Compute phi_tens. + v_grad = v_i(0,igp,jgp,ilev)*grad_phinh_i(0,igp,jgp,ilev) + + v_i(1,igp,jgp,ilev)*grad_phinh_i(1,igp,jgp,ilev); + if (m_rsplit==0) { + phi_tens(ilev) += v_grad; + } else { + phi_tens(ilev) = v_grad; + } + phi_tens(ilev) *= -m_data.scale1; + phi_tens(ilev) += m_state.m_w_i(kv.ie,m_data.n0,igp,jgp,ilev) * + (ilev==NUM_LEV_P ? m_scale2g_last_int_pack : m_data.scale2*g); + + if (m_data.scale1!=m_data.scale2) { + // add imex phi_h splitting + // use approximate phi_h = hybi*phis + // could also use true hydrostatic pressure, but this requires extra DSS in dirk() + phi_tens(ilev) += (m_data.scale1-m_data.scale2) * + (v_i(0,igp,jgp,ilev)*m_geometry.m_gradphis(kv.ie,0,igp,jgp) + + v_i(1,igp,jgp,ilev)*m_geometry.m_gradphis(kv.ie,1,igp,jgp) ) * m_hvcoord.hybrid_bi_packed(ilev); + } + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_w_and_phi_np1(KernelVariables &kv) const { + // Update w_i(np1) = spheremp*(scale3*w_i(nm1) + dt*w_tens) + // Update phi_i(np1) = spheremp*(scale3*phi_i(nm1) + dt*phi_tens) + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp); + + auto w_tens = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp); + auto phi_tens = Homme::subview(m_buffers.phi_tens,kv.team_idx,igp,jgp); + + auto w_np1 = Homme::subview(m_state.m_w_i,kv.ie,m_data.np1,igp,jgp); + auto phi_np1 = Homme::subview(m_state.m_phinh_i,kv.ie,m_data.np1,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + + // Add w_tens to w_nm1 and multiply by spheremp + w_tens(ilev) *= m_data.dt*spheremp; + w_np1(ilev) = m_state.m_w_i(kv.ie,m_data.nm1,igp,jgp,ilev); + w_np1(ilev) *= m_data.scale3*spheremp; + w_np1(ilev) += w_tens(ilev); + + // Add phi_tens to phi_nm1 and multiply by spheremp + // temp *= m_data.dt*spheremp; + phi_tens(ilev) *= m_data.dt*spheremp; + phi_np1(ilev) = m_state.m_phinh_i(kv.ie,m_data.nm1,igp,jgp,ilev); + phi_np1(ilev) *= m_data.scale3*spheremp; + phi_np1(ilev) += phi_tens(ilev); + }); + + // Last interface only for w, not phi (since phi=phis there) + Kokkos::single(Kokkos::PerThread(kv.team),[&](){ + using Info = ColInfo; + constexpr int ilev = Info::LastPack; + constexpr int ivec = Info::LastPackEnd; + + // Note: we only have 1 physical entry in the pack, + // so we may as well operate on the Real level + if (NUM_LEV==NUM_LEV_P) { + // We processed last interface too, so fix phi. + phi_np1(ilev)[ivec] = m_geometry.m_phis(kv.ie,igp,jgp); + } else { + // We didn't do anything on last interface, so update w + w_tens(ilev)[ivec] *= m_data.dt*spheremp; + w_np1(ilev)[ivec] = m_state.m_w_i(kv.ie,m_data.nm1,igp,jgp,ilev)[ivec]; + w_np1(ilev)[ivec] *= m_data.scale3*spheremp; + w_np1(ilev)[ivec] += w_tens(ilev)[ivec]; + } + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_dp_and_theta_tens (KernelVariables &kv) const { + // Compute dp_tens=scale1*(div(vdp) + delta(eta_dot_dpdn)) + // Compute theta_tens=scale1*(-theta_vadv-div(v*vtheta_dp) + // Note: div(v*vtheta_dp) can be computed in conservative or + // non concervative form (div(ab) vs a*div(b)+grad(a)*b) + + // Use a couple of lambdas to compute input of diff operators on the fly + auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0); + auto vtheta_dp = Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.n0); + auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0); + + auto vtheta = [&](const int igp,const int jgp,const int ilev)->Scalar { + return vtheta_dp(igp,jgp,ilev) / dp(igp,jgp,ilev); + }; + + auto v_vtheta_dp = [&](const int icomp, const int igp, const int jgp, const int ilev)->Scalar { + return v(icomp,igp,jgp,ilev) * vtheta_dp(igp,jgp,ilev); + }; + + if (m_theta_advection_form==AdvectionForm::Conservative) { + if (m_rsplit==0) { + using CM = CombineMode; + // If you want a CombineMode different than Replace, unfortunately you have to specify + // all the template args, since the CombineMode is the last one... + m_sphere_ops.divergence_sphere_cm(kv,v_vtheta_dp, + Homme::subview(m_buffers.theta_tens,kv.team_idx)); + } else { + m_sphere_ops.divergence_sphere(kv,v_vtheta_dp, + Homme::subview(m_buffers.theta_tens,kv.team_idx)); + } + } else { + m_sphere_ops.gradient_sphere(kv,vtheta, + Homme::subview(m_buffers.grad_tmp,kv.team_idx)); + } + kv.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto dp_tens = Homme::subview(m_buffers.dp_tens,kv.team_idx,igp,jgp); + auto theta_tens = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp); + + auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp); + + if (m_rsplit==0) { + auto eta_dot_dpdn = Homme::subview(m_buffers.eta_dot_dpdn,kv.team_idx,igp,jgp); + ColumnOps::compute_midpoint_delta(kv,eta_dot_dpdn,dp_tens); + } + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + // Compute dp_tens + if (m_rsplit>0) { + dp_tens(ilev) = div_vdp(ilev); + } else { + dp_tens(ilev) += div_vdp(ilev); + } + + // Compute theta_tens + // NOTE: if the condition is false, then theta_tens already contains div(v*theta*dp) already + if (m_theta_advection_form==AdvectionForm::NonConservative) { + // We need a temp, since, if rsplit=0, theta_tens is already storing theta_vadv + + Scalar temp = div_vdp(ilev)*vtheta(igp,jgp,ilev); + temp += m_buffers.grad_tmp(kv.team_idx,0,igp,jgp,ilev)*m_buffers.vdp(kv.team_idx,0,igp,jgp,ilev); + temp += m_buffers.grad_tmp(kv.team_idx,1,igp,jgp,ilev)*m_buffers.vdp(kv.team_idx,1,igp,jgp,ilev); + if (m_rsplit>0) { + theta_tens(ilev) = temp; + } else { + theta_tens(ilev) += temp; + } + } + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_dp3d_and_theta_np1(KernelVariables &kv) const { + // Update dp3d(np1) = spheremp*(scale3*dp3d(nm1) + dt*dp_tens) + // Update vtheta_dp(np1) = spheremp*(scale3*vtheta_dp(nm1) + dt*theta_tens) + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + const auto& spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp); + + auto dp_tens = Homme::subview(m_buffers.dp_tens,kv.team_idx,igp,jgp); + auto theta_tens = Homme::subview(m_buffers.theta_tens,kv.team_idx,igp,jgp); + auto dp_np1 = Homme::subview(m_state.m_dp3d,kv.ie,m_data.np1,igp,jgp); + auto vtheta_np1 = Homme::subview(m_state.m_vtheta_dp,kv.ie,m_data.np1,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + + // Add dp_tens to dp_nm1 and multiply by spheremp + dp_tens(ilev) *= m_data.scale1*m_data.dt*spheremp; + dp_np1(ilev) = m_data.scale3*spheremp*m_state.m_dp3d(kv.ie,m_data.nm1,igp,jgp,ilev); + dp_np1(ilev) -= dp_tens(ilev); + + // Add theta_tens to vtheta_nm1 and multiply by spheremp + theta_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp; + vtheta_np1(ilev) = m_state.m_vtheta_dp(kv.ie,m_data.nm1,igp,jgp,ilev); + vtheta_np1(ilev) *= m_data.scale3*spheremp; + vtheta_np1(ilev) += theta_tens(ilev); + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_v_tens (KernelVariables &kv) const { + // Not necessarily in this order, + // - Compute vort=vorticity(v) + // - Compute wvor = grad[average(w_i^2/2)] - average[w_i*grad(w_i)] + // - Compute gradKE = grad(v*v/2) + // - Compute gradExner = grad(exner) + // - Compute mgrad = average[dpnh_dp_i*gradphinh_i] + // + cp*T0*(grad(log(exner))-grad(exner)/exner) (pgrad_correction, if applicable) + // - Compute v_tens = + // scale1*(-v_vadv + v2*(fcor+vort)-gradKE -mgrad -cp*vtheta*gradExner - (mgrad + wvor)) + // scale1*(-v_vadv - v1*(fcor+vort)-gradKE -mgrad -cp*vtheta*gradExner - (mgrad + wvor)) + + auto vort = Homme::subview(m_buffers.vort,kv.team_idx); + auto wvor = Homme::subview(m_buffers.vdp,kv.team_idx); + auto grad_exner = Homme::subview(m_buffers.grad_exner,kv.team_idx); + auto mgrad = Homme::subview(m_buffers.mgrad,kv.team_idx); + auto grad_tmp = Homme::subview(m_buffers.grad_tmp,kv.team_idx); + + // Compute vorticity(v) + m_sphere_ops.vorticity_sphere(kv, Homme::subview(m_state.m_v,kv.ie,m_data.n0), + vort); + + if (m_theta_hydrostatic_mode) { + // In nh mode, gradphinh has already been computed, but in hydro mode + // we skip the whole compute_w_and_phi_tens call + m_sphere_ops.gradient_sphere(kv,Homme::subview(m_state.m_phinh_i,kv.ie,m_data.n0), + Homme::subview(m_buffers.grad_phinh_i,kv.team_idx)); + kv.team_barrier(); + } else { + // Compute average(w^2/2) + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp); + + // Use lambda as a provider for w_i*w_i + const auto w_sq = [&w_i] (const int ilev) -> Scalar{ + return w_i(ilev)*w_i(ilev); + }; + + // Rescale by 2 (we are averaging kinetic energy w_i*w_i/2, but the provider has no '/2') + ColumnOps::compute_midpoint_values(kv, w_sq, Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp),0.5); + }); + kv.team_barrier(); + // Compute grad(average(w^2/2)). Store in wvor. + m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx), + wvor); + kv.team_barrier(); + } + + // Compute grad(exner) + // Note: exner = (pi/p0)^k, therefore grad(exner) = k*(exner/pi)*grad(pi). + // So you *could* avoid computing this grad, at the price of some arithmetic op. + m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.exner,kv.team_idx), + grad_exner); + + // If pgrad_correction=1, we need the gradient sphere + // for log(exner) below. Store into m_buffers.grad_tmp. + if (m_pgrad_correction) { + const auto exner = Homme::subview(m_buffers.exner,kv.team_idx); + const auto log_exner = [&exner](const int igp, const int jgp, const int ilev)->Scalar { + return log(exner(igp, jgp, ilev)); + }; + m_sphere_ops.gradient_sphere(kv, log_exner, + grad_tmp); + } + kv.team_barrier(); + + // Scalar w_vor,mgrad,w_gradw,gradw2; + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + auto wvor_x = Homme::subview(wvor,0,igp,jgp); + auto wvor_y = Homme::subview(wvor,1,igp,jgp); + + if (!m_theta_hydrostatic_mode) { + // Compute wvor = grad(average(w^2/2)) - average(w*grad(w)) + // Note: vtens is already storing grad(avg(w^2/2)) + auto gradw_x = Homme::subview(m_buffers.grad_w_i,kv.team_idx,0,igp,jgp); + auto gradw_y = Homme::subview(m_buffers.grad_w_i,kv.team_idx,1,igp,jgp); + auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp); + + const auto w_gradw_x = [&gradw_x,&w_i] (const int ilev) -> Scalar { + return gradw_x(ilev)*w_i(ilev); + }; + const auto w_gradw_y = [&gradw_y,&w_i] (const int ilev) -> Scalar { + return gradw_y(ilev)*w_i(ilev); + }; + + ColumnOps::compute_midpoint_values(kv, + w_gradw_x, wvor_x, -1.0); + ColumnOps::compute_midpoint_values(kv, + w_gradw_y, wvor_y, -1.0); + } else { + // wvor is not used if theta_hydrostatic_mode=1. Set to zero + // here to avoid adding in uninitialized values into v_tens. + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + wvor_x(ilev) = 0; + wvor_y(ilev) = 0; + }); + } + + auto mgrad_x = Homme::subview(mgrad,0,igp,jgp); + auto mgrad_y = Homme::subview(mgrad,1,igp,jgp); + + // Compute mgrad = average(dpnh_dp_i*grad(phinh_i)) + const auto phinh_i_x = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); + const auto phinh_i_y = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp); + if (m_theta_hydrostatic_mode) { + ColumnOps::compute_midpoint_values(kv,phinh_i_x,mgrad_x); + ColumnOps::compute_midpoint_values(kv,phinh_i_y,mgrad_y); + } else { + const auto dpnh_dp_i = Homme::subview(m_buffers.dpnh_dp_i,kv.team_idx,igp,jgp); + const auto prod_x = [&phinh_i_x,&dpnh_dp_i](const int ilev)->Scalar { + return phinh_i_x(ilev)*dpnh_dp_i(ilev); + }; + const auto prod_y = [&phinh_i_y,&dpnh_dp_i](const int ilev)->Scalar { + return phinh_i_y(ilev)*dpnh_dp_i(ilev); + }; + + ColumnOps::compute_midpoint_values(kv,prod_x,mgrad_x); + ColumnOps::compute_midpoint_values(kv,prod_y,mgrad_y); + } + + // Apply pgrad_correction: mgrad += cp*T0*(grad(log(exner))-grad(exner)/exner) (if applicable) + if (m_pgrad_correction) { + using namespace PhysicalConstants; + constexpr Real T0 = Tref - Tref_lapse_rate*Tref*cp/g; + + const auto grad_tmp_i_x = Homme::subview(grad_tmp,0,igp,jgp); + const auto grad_tmp_i_y = Homme::subview(grad_tmp,1,igp,jgp); + const auto grad_exner_i_x = Homme::subview(grad_exner,0,igp,jgp); + const auto grad_exner_i_y = Homme::subview(grad_exner,1,igp,jgp); + const auto exner_i = Homme::subview(m_buffers.exner,kv.team_idx,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + mgrad_x(ilev) += cp*T0*(grad_tmp_i_x(ilev) - grad_exner_i_x(ilev)/exner_i(ilev)); + mgrad_y(ilev) += cp*T0*(grad_tmp_i_y(ilev) - grad_exner_i_y(ilev)/exner_i(ilev)); + }); + } + + // Compute KE. Also, add fcor to vort + auto u = Homme::subview(m_state.m_v,kv.ie,m_data.n0,0,igp,jgp); + auto v = Homme::subview(m_state.m_v,kv.ie,m_data.n0,1,igp,jgp); + auto KE = Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp); + const auto& fcor = m_geometry.m_fcor(kv.ie,igp,jgp); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + KE(ilev) = u(ilev)*u(ilev); + KE(ilev) += v(ilev)*v(ilev); + KE(ilev) /= 2.0; + + vort(igp,jgp,ilev) += fcor; + }); + }); + kv.team_barrier(); + + // Compute grad(KE), and put it directly in v_tens + if (m_rsplit==0) { + // v_tens already contains v_vadv. Need to sum into it. + m_sphere_ops.gradient_sphere_update(kv, Homme::subview(m_buffers.temp,kv.team_idx), + Homme::subview(m_buffers.v_tens,kv.team_idx)); + } else { + // v_tens contains garbage from previos team. Overwrite it. + m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx), + Homme::subview(m_buffers.v_tens,kv.team_idx)); + } + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + // Assemble vtens (both components) + // Note: right now, vtens already contains gradKE+v_vadv + // (or just gradKE if rsplit>0) + auto u_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp); + auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp); + auto vort = Homme::subview(m_buffers.vort,kv.team_idx,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + // grad(exner)*vtheta*cp + Scalar cp_vtheta = PhysicalConstants::cp * + (m_state.m_vtheta_dp(kv.ie,m_data.n0,igp,jgp,ilev) / + m_state.m_dp3d(kv.ie,m_data.n0,igp,jgp,ilev)); + + u_tens(ilev) += cp_vtheta*grad_exner(0,igp,jgp,ilev); + v_tens(ilev) += cp_vtheta*grad_exner(1,igp,jgp,ilev); + + u_tens(ilev) += (mgrad(0,igp,jgp,ilev) + wvor(0,igp,jgp,ilev)); + v_tens(ilev) += (mgrad(1,igp,jgp,ilev) + wvor(1,igp,jgp,ilev)); + + // Add +/- v_j*(vort+fcor), where v_j=v for u_tens and v_j=u for v_tens + u_tens(ilev) -= m_state.m_v(kv.ie,m_data.n0,1,igp,jgp,ilev)*vort(ilev); + v_tens(ilev) += m_state.m_v(kv.ie,m_data.n0,0,igp,jgp,ilev)*vort(ilev); + }); + }); + } + + KOKKOS_INLINE_FUNCTION + void compute_v_np1(KernelVariables &kv) const { + // Update v(np1) = spheremp*(scale3*v(nm1) + dt*v_tens) + // NOTE: quite a few buffers no longer needed in this call of operator() are going to be reused here. + + auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(kv.team,NP*NP), + [&](const int idx) { + const int igp = idx / NP; + const int jgp = idx % NP; + + const auto& spheremp = m_geometry.m_spheremp(kv.ie,igp,jgp); + + // Add vtens to v_nm1 and multiply by spheremp + auto u_np1 = Homme::subview(m_state.m_v,kv.ie,m_data.np1,0,igp,jgp); + auto v_np1 = Homme::subview(m_state.m_v,kv.ie,m_data.np1,1,igp,jgp); + auto u_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,0,igp,jgp); + auto v_tens = Homme::subview(m_buffers.v_tens,kv.team_idx,1,igp,jgp); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(kv.team,NUM_LEV), + [&](const int ilev) { + + // Add v_tens to v_nm1 and multiply by spheremp + u_np1(ilev) = m_state.m_v(kv.ie,m_data.nm1,0,igp,jgp,ilev); + v_np1(ilev) = m_state.m_v(kv.ie,m_data.nm1,1,igp,jgp,ilev); + + u_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp; + v_tens(ilev) *= -m_data.scale1*m_data.dt*spheremp; + + u_np1(ilev) *= m_data.scale3*spheremp; + v_np1(ilev) *= m_data.scale3*spheremp; + + u_np1(ilev) += u_tens(ilev); + v_np1(ilev) += v_tens(ilev); + }); + }); + } + +}; + +} // Namespace Homme + +#endif // HOMMEXX_CAAR_FUNCTOR_IMPL_OPT_HPP diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp new file mode 100644 index 000000000000..10aad89f8154 --- /dev/null +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp @@ -0,0 +1,647 @@ +#include "CaarFunctorImpl-caar-opt.hpp" + +namespace Homme { + +template +void CaarFunctorImpl::epoch1_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_exner = m_buffers.exner; + auto buffers_phi = m_buffers.phi; + auto buffers_pnh = m_buffers.pnh; + auto buffers_theta_tens = m_buffers.theta_tens; + auto buffers_vdp = m_buffers.vdp; + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_vn0 = m_derived.m_vn0; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + + SphereBlockOps::parallel_for( + team, 4, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b); + + SPHERE_BLOCK_START3(b, v0, v1, vtheta); + + if (!HYDROSTATIC) { + + const Scalar dphi = b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_phinh_i(b.e,data_n0,b.x,b.y,b.z); + const Scalar vtheta_dp = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + for (int i = 0; i < VECTOR_SIZE; i++) { + if ((vtheta_dp[i] < 0) || (dphi[i] > 0)) { + if (b.z * VECTOR_SIZE + i < NUM_PHYSICAL_LEV) abort(); + } + } + + EquationOfState::compute_pnh_and_exner(vtheta_dp, dphi, buffers_pnh(b.e,b.x,b.y,b.z), buffers_exner(b.e,b.x,b.y,b.z)); + + // Zero out padding slots in the last pack to prevent NaN propagation. + // When NUM_PHYSICAL_LEV % VECTOR_SIZE != 0, the last midpoint pack (z=NUM_LEV-1) + // has slots NUM_PHYSICAL_LEV%VECTOR_SIZE..VECTOR_SIZE-1 that are padding (vtheta_dp=0, + // dphi=0 or -phis), causing 0/0=NaN in compute_pnh_and_exner. + if (NUM_PHYSICAL_LEV % VECTOR_SIZE != 0 && b.z == NUM_LEV-1) { + for (int i = NUM_PHYSICAL_LEV % VECTOR_SIZE; i < VECTOR_SIZE; i++) { + buffers_pnh(b.e,b.x,b.y,b.z)[i] = 1.0; + buffers_exner(b.e,b.x,b.y,b.z)[i] = 1.0; + } + } + + buffers_phi(b.e,b.x,b.y,b.z) = 0.5 * (state_phinh_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1))); + } + + v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + b.divInit(ttmp0, ttmp1, v0, v1); + buffers_vdp(b.e,0,b.x,b.y,b.z) = v0; + buffers_vdp(b.e,1,b.x,b.y,b.z) = v1; + derived_vn0(b.e,0,b.x,b.y,b.z) += data_eta_ave_w * v0; + derived_vn0(b.e,1,b.x,b.y,b.z) += data_eta_ave_w * v1; + + vtheta = 0; + + if (CONSERVATIVE) { + const Scalar sv0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + const Scalar sv1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + b.divInit(ttmp2, ttmp3, sv0, sv1); + } else { + vtheta = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z); + b.gradInit(ttmp2, vtheta); + } + + SPHERE_BLOCK_MIDDLE3(b, v0, v1, vtheta); + + const Scalar dvdp = b.div(ttmp0, ttmp1); + buffers_dp_tens(b.e,b.x,b.y,b.z) = dvdp; + + if (CONSERVATIVE) { + buffers_theta_tens(b.e,b.x,b.y,b.z) = b.div(ttmp2, ttmp3); + } else { + Scalar grad0, grad1; + b.grad(grad0, grad1, ttmp2); + Scalar theta_tens = dvdp * vtheta; + theta_tens += grad0 * v0; + theta_tens += grad1 * v1; + buffers_theta_tens(b.e,b.x,b.y,b.z) = theta_tens; + } + + SPHERE_BLOCK_END(); + }); +} + +template +void CaarFunctorImpl::epoch2_scanOps(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_dp_i = viewAsReal(m_buffers.dp_i); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const int data_n0 = m_data.n0; + auto &hvcoord_hybrid_bi = m_hvcoord.hybrid_bi; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + auto state_dp3d = viewAsReal(m_state.m_dp3d); + + SphereScanOps::parallel_for( + team, + KOKKOS_LAMBDA(const SphereScanOps::Team &team) { + SphereScanOps s(team); + + s.scan(buffers_dp_i, state_dp3d, data_n0, pi_i00); + s.scan(buffers_w_tens, buffers_dp_tens, 0); + + if (RSPLIT_ZERO) { + + s.scan(buffers_eta_dot_dpdn, buffers_dp_tens, 0); + + const Real last = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, 1, NUM_PHYSICAL_LEV), + [&](const int z) { + Real eta_dot_dpdn = -buffers_eta_dot_dpdn(s.e,s.x,s.y,z); + eta_dot_dpdn += hvcoord_hybrid_bi(z) * last; + buffers_eta_dot_dpdn(s.e,s.x,s.y,z) = eta_dot_dpdn; + }); + + Kokkos::single( + Kokkos::PerThread(s.t), + [&]() { + buffers_eta_dot_dpdn(s.e,s.x,s.y,0) = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV) = 0; + }); + } + }); +} + +template +void CaarFunctorImpl::epoch3_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dp_i = m_buffers.dp_i; + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_exner = m_buffers.exner; + auto buffers_omega_p = m_buffers.omega_p; + auto buffers_pnh = m_buffers.pnh; + auto buffers_temp = m_buffers.temp; + auto buffers_v_tens = m_buffers.v_tens; + auto buffers_w_tens = m_buffers.w_tens; + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_eta_dot_dpdn = m_derived.m_eta_dot_dpdn; + auto derived_omega_p = m_derived.m_omega_p; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereBlockOps::parallel_for( + team, 1, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b); + + SPHERE_BLOCK_START0(b); + + const Scalar pi = 0.5 * (buffers_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_dp_i(b.e,b.x,b.y,b.z), &buffers_dp_i(b.e,b.x,b.y,b.z+1))); + b.gradInit(ttmp0, pi); + + if (HYDROSTATIC) { + Scalar exner = pi; + EquationOfState::pressure_to_exner(exner); + buffers_exner(b.e,b.x,b.y,b.z) = exner; + buffers_pnh(b.e,b.x,b.y,b.z) = EquationOfState::compute_dphi(state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z), exner, pi); + } + + SPHERE_BLOCK_MIDDLE0(b); + + Scalar grad0, grad1; + b.grad(grad0, grad1, ttmp0); + + Scalar omega = -0.5 * (buffers_w_tens(b.e,b.x,b.y,b.z) + b.zabove(&buffers_w_tens(b.e,b.x,b.y,b.z), &buffers_w_tens(b.e,b.x,b.y,b.z+1))); + const Scalar uz = state_v(b.e,data_n0,0,b.x,b.y,b.z); + const Scalar vz = state_v(b.e,data_n0,1,b.x,b.y,b.z); + omega += uz * grad0 + vz * grad1; + buffers_omega_p(b.e,b.x,b.y,b.z) = omega; + + derived_omega_p(b.e,b.x,b.y,b.z) += data_eta_ave_w * omega; + + if (RSPLIT_ZERO) { + + const Scalar dp = state_dp3d(b.e,data_n0,b.x,b.y,b.z); + const Scalar etap = b.zabove(&buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z), &buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z+1)); + const Scalar etaz = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z); + + buffers_dp_tens(b.e,b.x,b.y,b.z) += etap - etaz; + + derived_eta_dot_dpdn(b.e,b.x,b.y,b.z) += data_eta_ave_w * etaz; + + if (!HYDROSTATIC) { + const Scalar dw = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_w_i(b.e,data_n0,b.x,b.y,b.z); + const Scalar eta = 0.5 * (etaz + etap); + buffers_temp(b.e,b.x,b.y,b.z) = dw * eta; + } + + const Scalar facp = 0.5 * etap / dp; + Scalar u = facp * (b.zabove(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z), &state_v(b.e,data_n0,0,b.x,b.y,b.z+1)) - uz); + Scalar v = facp * (b.zabove(vz, &state_v(b.e,data_n0,1,b.x,b.y,b.z), &state_v(b.e,data_n0,1,b.x,b.y,b.z+1)) - vz); + + const Scalar facm = 0.5 * etaz / dp; + u += facm * (uz - b.zbelow(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z-1), &state_v(b.e,data_n0,0,b.x,b.y,b.z))); + v += facm * (vz - b.zbelow(uz, &state_v(b.e,data_n0,1,b.x,b.y,b.z-1), &state_v(b.e,data_n0,1,b.x,b.y,b.z))); + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v; + } + SPHERE_BLOCK_END(); + }); +} + +void CaarFunctorImpl::epoch4_scanOps(const SphereOuter::Team &team) +{ + auto buffers_phi = viewAsReal(m_buffers.phi); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + + const int data_n0 = m_data.n0; + + auto &geometry_phis = m_geometry.m_phis; + + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + + SphereScanOps::parallel_for( + team, + KOKKOS_LAMBDA(const SphereScanOps::Team &team) { + SphereScanOps s(team); + s.nacs(state_phinh_i, data_n0, buffers_pnh, geometry_phis); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, NUM_PHYSICAL_LEV), + [&](const int z) { + buffers_phi(s.e,s.x,s.y,z) = 0.5 * (state_phinh_i(s.e,data_n0,s.x,s.y,z) + state_phinh_i(s.e,data_n0,s.x,s.y,z+1)); + }); + }); +} + +template +void CaarFunctorImpl::epoch5_colOps(const SphereOuter::Team &team) +{ + auto buffers_dp_i = m_buffers.dp_i; + auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_exner = m_buffers.exner; + auto buffers_grad_phinh_i = m_buffers.grad_phinh_i; + auto buffers_grad_w_i = m_buffers.grad_w_i; + auto buffers_phi = m_buffers.phi; + auto buffers_phi_tens = m_buffers.phi_tens; + auto buffers_pnh = m_buffers.pnh; + auto buffers_temp = m_buffers.temp; + auto buffers_v_i = m_buffers.v_i; + auto buffers_vtheta_i = m_buffers.vtheta_i; + auto buffers_w_tens = m_buffers.w_tens; + + const int data_n0 = m_data.n0; + const Real dscale = m_data.scale1 - m_data.scale2; + + auto &geometry_gradphis = m_geometry.m_gradphis; + + const Real gscale1 = m_data.scale1 * PhysicalConstants::g; + const Real gscale2 = m_data.scale2 * PhysicalConstants::g; + + auto hvcoord_hybrid_bi_packed = m_hvcoord.hybrid_bi_packed; + + const Real ndata_scale1 = -m_data.scale1; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_w_i = m_state.m_w_i; + + SphereColOps::parallel_for( + team, NUM_LEV_P, + KOKKOS_LAMBDA(const SphereColOps::Team &team) { + SphereColOps c(sg, team); + + c.grad(buffers_grad_phinh_i, state_phinh_i, data_n0); + if (!HYDROSTATIC) c.grad(buffers_grad_w_i, state_w_i, data_n0); + + const Scalar dm = c.zbelow(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z-1), &state_dp3d(c.e,data_n0,c.x,c.y,c.z));; + const Scalar dz = c.zabove(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z)); + const Scalar dp_i = c.ztrim(dz, dm, 0.5 * (dz + dm)); + buffers_dp_i(c.e,c.x,c.y,c.z) = dp_i; + + if (!HYDROSTATIC) { + + const Scalar v0m = c.zbelow(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z-1), &state_v(c.e,data_n0,0,c.x,c.y,c.z));; + const Scalar v0z = c.zabove(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z)); + const Scalar v_i0 = c.ztrim(v0z, v0m, (dz * v0z + dm * v0m) / (dm + dz)); + buffers_v_i(c.e,0,c.x,c.y,c.z) = v_i0; + + const Scalar v1m = c.zbelow(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z-1), &state_v(c.e,data_n0,1,c.x,c.y,c.z));; + const Scalar v1z = c.zabove(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z)); + const Scalar v_i1 = c.ztrim(v1z, v1m, (dz * v1z + dm * v1m) / (dm + dz)); + buffers_v_i(c.e,1,c.x,c.y,c.z) = v_i1; + + const Scalar pm = c.zbelow(pi_i00, &buffers_pnh(c.e,c.x,c.y,c.z-1), &buffers_pnh(c.e,c.x,c.y,c.z));; + const Scalar pz = c.zabove(pm + 0.5 * dm, &buffers_pnh(c.e,c.x,c.y,c.z)); + buffers_dpnh_dp_i(c.e,c.x,c.y,c.z) = 2.0 * (pz - pm) / (dm + dz); + } + + if (RSPLIT_ZERO) { + + const Scalar phim = c.zbelow(0, &buffers_phi(c.e,c.x,c.y,c.z-1), &buffers_phi(c.e,c.x,c.y,c.z));; + const Scalar phiz = c.zabove(0, &buffers_phi(c.e,c.x,c.y,c.z)); + + if (!HYDROSTATIC) { + const Scalar phi_vadv = c.ztrim(0, 0, (phiz - phim) * buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z) / dp_i); + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_vadv; + } + + Scalar vtheta_i = phiz - phim; + + const Scalar exnerm = c.zbelow(0, &buffers_exner(c.e,c.x,c.y,c.z-1), &buffers_exner(c.e,c.x,c.y,c.z));; + const Scalar exnerz = c.zabove(0, &buffers_exner(c.e,c.x,c.y,c.z)); + const Scalar dexner = c.ztrim(1, 1, exnerz - exnerm); + vtheta_i /= dexner; + + vtheta_i /= -PhysicalConstants::cp; // separate for BFB equality with original Fortran + + if (!HYDROSTATIC) vtheta_i *= buffers_dpnh_dp_i(c.e,c.x,c.y,c.z); + buffers_vtheta_i(c.e,c.x,c.y,c.z) = vtheta_i; + } + + if (!HYDROSTATIC) { + + Scalar w_tens = 0; + if (RSPLIT_ZERO) { + const Scalar tempm = c.zbelow(0, &buffers_temp(c.e,c.x,c.y,c.z-1), &buffers_temp(c.e,c.x,c.y,c.z));; + const Scalar tempz = c.zabove(0, &buffers_temp(c.e,c.x,c.y,c.z)); + const Scalar dw = c.ztrim(tempz, tempm, 0.5 * (tempz + tempm)); + w_tens = dw / dp_i; + } + w_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_w_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_w_i(c.e,1,c.x,c.y,c.z); + w_tens *= ndata_scale1; + const Scalar gscale = c.ztop(gscale1, gscale2); + w_tens += (buffers_dpnh_dp_i(c.e,c.x,c.y,c.z)-Scalar(1)) * gscale; + buffers_w_tens(c.e,c.x,c.y,c.z) = w_tens; + + Scalar phi_tens = (RSPLIT_ZERO) ? buffers_phi_tens(c.e,c.x,c.y,c.z) : 0; + phi_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,1,c.x,c.y,c.z); + phi_tens *= ndata_scale1; + phi_tens += state_w_i(c.e,data_n0,c.x,c.y,c.z) * gscale; + + if (dscale) phi_tens += dscale * (buffers_v_i(c.e,0,c.x,c.y,c.z) * geometry_gradphis(c.e,0,c.x,c.y) + buffers_v_i(c.e,1,c.x,c.y,c.z) * geometry_gradphis(c.e,1,c.x,c.y)) * hvcoord_hybrid_bi_packed(c.z); + + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_tens; + } + }); +} + +template +void CaarFunctorImpl::epoch6_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i; + auto buffers_exner = m_buffers.exner; + auto buffers_grad_phinh_i = m_buffers.grad_phinh_i; + auto buffers_grad_w_i = m_buffers.grad_w_i; + auto buffers_v_tens = m_buffers.v_tens; + + const int data_n0 = m_data.n0; + + auto &geometry_fcor = m_geometry.m_fcor; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereBlockOps::parallel_for( + team, 6, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b), ttmp4(b), ttmp5(b); + + SPHERE_BLOCK_START3(b, exneriz, v0, v1); + + if (!HYDROSTATIC) { + const Scalar wz = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)); + const Scalar w2 = 0.25 * (state_w_i(b.e,data_n0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + wz * wz); + b.gradInit(ttmp0, w2); + } + + exneriz = buffers_exner(b.e,b.x,b.y,b.z); + b.gradInit(ttmp1, exneriz); + + const Scalar log_exneriz = (PGRAD_CORRECTION) ? log(exneriz) : 0; + b.gradInit(ttmp2, log_exneriz); + + v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z); + v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z); + + b.vortInit(ttmp3, ttmp4, v0, v1); + b.gradInit(ttmp5, 0.5 * (v0 * v0 + v1 * v1)); + + SPHERE_BLOCK_MIDDLE3(b, exneriz, v0, v1); + + Scalar grad_v0, grad_v1; + b.grad(grad_v0, grad_v1, ttmp5); + + Scalar u_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,0,b.x,b.y,b.z) : 0; + Scalar v_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,1,b.x,b.y,b.z) : 0; + u_tens += grad_v0; + v_tens += grad_v1; + + const Scalar cp_vtheta = PhysicalConstants::cp * (state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z)); + + Scalar grad_exner0, grad_exner1; + b.grad(grad_exner0, grad_exner1, ttmp1); + + u_tens += cp_vtheta * grad_exner0; + v_tens += cp_vtheta * grad_exner1; + + Scalar mgrad_x, mgrad_y; + if (HYDROSTATIC) { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1))); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1))); + + } else { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1))); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1))); + + } + + if (PGRAD_CORRECTION) { + + Scalar grad_lexner0, grad_lexner1; + b.grad(grad_lexner0, grad_lexner1, ttmp2); + + namespace PC = PhysicalConstants; + constexpr Real cpt0 = PC::cp * (PC::Tref - PC::Tref_lapse_rate * PC::Tref * PC::cp / PC::g); + mgrad_x += cpt0 * (grad_lexner0 - grad_exner0 / exneriz); + mgrad_y += cpt0 * (grad_lexner1 - grad_exner1 / exneriz); + } + + Scalar wvor_x = 0; + Scalar wvor_y = 0; + if (!HYDROSTATIC) { + b.grad(wvor_x, wvor_y, ttmp0); + wvor_x -= 0.5 * (buffers_grad_w_i(b.e,0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,0,b.x,b.y,b.z), &buffers_grad_w_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1))); + wvor_y -= 0.5 * (buffers_grad_w_i(b.e,1,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,1,b.x,b.y,b.z), &buffers_grad_w_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1))); + } + + u_tens += mgrad_x + wvor_x; + v_tens += mgrad_y + wvor_y; + + const Scalar vort = b.vort(ttmp3, ttmp4) + geometry_fcor(b.e,b.x,b.y); + u_tens -= v1 * vort; + v_tens += v0 * vort; + + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u_tens; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v_tens; + + SPHERE_BLOCK_END(); + }); +} + +template +void CaarFunctorImpl::epoch7_col(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_phi_tens = m_buffers.phi_tens; + auto buffers_theta_tens = m_buffers.theta_tens; + auto buffers_v_tens = m_buffers.v_tens; + auto buffers_vtheta_i = m_buffers.vtheta_i; + auto buffers_w_tens = m_buffers.w_tens; + + const Real data_dt = m_data.dt; + const int data_nm1 = m_data.nm1; + const int data_np1 = m_data.np1; + const Real data_scale3 = m_data.scale3; + + auto &geometry_spheremp = m_geometry.m_spheremp; + + const Real scale1_dt = m_data.scale1 * m_data.dt; + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_vtheta_dp = m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereCol::parallel_for( + team, NUM_LEV, + KOKKOS_LAMBDA(const SphereCol::Team &team) { + const SphereCol c(team); + + const Real spheremp = geometry_spheremp(c.e,c.x,c.y); + const Real scale1_dt_spheremp = scale1_dt * spheremp; + const Real scale3_spheremp = data_scale3 * spheremp; + + Scalar dp_tens = buffers_dp_tens(c.e,c.x,c.y,c.z); + dp_tens *= scale1_dt_spheremp; + Scalar dp_np1 = scale3_spheremp * state_dp3d(c.e,data_nm1,c.x,c.y,c.z); + dp_np1 -= dp_tens; + state_dp3d(c.e,data_np1,c.x,c.y,c.z) = dp_np1; + + Scalar theta_tens = buffers_theta_tens(c.e,c.x,c.y,c.z); + if (RSPLIT_ZERO) { + const Scalar etap = c.zplus(&buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z), &buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z+1)); + const Scalar etaz = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z); + const Scalar thetap = etap * c.zplus(&buffers_vtheta_i(c.e,c.x,c.y,c.z), &buffers_vtheta_i(c.e,c.x,c.y,c.z+1)); + const Scalar thetaz = etaz * buffers_vtheta_i(c.e,c.x,c.y,c.z); + theta_tens += thetap - thetaz; + } + theta_tens *= -scale1_dt_spheremp; + Scalar vtheta_np1 = state_vtheta_dp(c.e,data_nm1,c.x,c.y,c.z); + vtheta_np1 *= scale3_spheremp; + vtheta_np1 += theta_tens; + state_vtheta_dp(c.e,data_np1,c.x,c.y,c.z) = vtheta_np1; + + Scalar u_tens = buffers_v_tens(c.e,0,c.x,c.y,c.z); + u_tens *= -scale1_dt_spheremp; + Scalar u_np1 = state_v(c.e,data_nm1,0,c.x,c.y,c.z); + u_np1 *= scale3_spheremp; + u_np1 += u_tens; + state_v(c.e,data_np1,0,c.x,c.y,c.z) = u_np1; + + Scalar v_tens = buffers_v_tens(c.e,1,c.x,c.y,c.z); + v_tens *= -scale1_dt_spheremp; + Scalar v_np1 = state_v(c.e,data_nm1,1,c.x,c.y,c.z); + v_np1 *= scale3_spheremp; + v_np1 += v_tens; + state_v(c.e,data_np1,1,c.x,c.y,c.z) = v_np1; + + if (!HYDROSTATIC) { + + const Real dt_spheremp = data_dt * spheremp; + + Scalar phi_tens = buffers_phi_tens(c.e,c.x,c.y,c.z); + phi_tens *= dt_spheremp; + Scalar phi_np1 = state_phinh_i(c.e,data_nm1,c.x,c.y,c.z); + phi_np1 *= scale3_spheremp; + phi_np1 += phi_tens; + state_phinh_i(c.e,data_np1,c.x,c.y,c.z) = phi_np1; + + Scalar w_tens = buffers_w_tens(c.e,c.x,c.y,c.z); + w_tens *= dt_spheremp; + Scalar w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,c.z); + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,c.z) = w_np1; + + if (c.z == NUM_LEV-1) { + constexpr int z = NUM_LEV_P-1; + constexpr int dz = NUM_PHYSICAL_LEV%VECTOR_SIZE; + Real w_tens = buffers_w_tens(c.e,c.x,c.y,z)[dz]; + w_tens *= dt_spheremp; + buffers_w_tens(c.e,c.x,c.y,z)[dz] = w_tens; + Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,z)[dz]; + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,z)[dz] = w_np1; + } + } + }); +} + +void CaarFunctorImpl::caar_compute() +{ + SphereOuter::parallel_for( + m_num_elems, + KOKKOS_LAMBDA(const SphereOuter::Team &team) { + + if (m_theta_hydrostatic_mode) { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(team); + else epoch1_blockOps(team); + } else { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(team); + else epoch1_blockOps(team); + } + + if (m_rsplit == 0) epoch2_scanOps(team); + else epoch2_scanOps(team); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch3_blockOps(team); + else epoch3_blockOps(team); + } else { + if (m_rsplit == 0) epoch3_blockOps(team); + else epoch3_blockOps(team); + } + + if (m_theta_hydrostatic_mode) epoch4_scanOps(team); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch5_colOps(team); + else epoch5_colOps(team); + } else { + if (m_rsplit == 0) epoch5_colOps(team); + else epoch5_colOps(team); + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } else { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } + } else { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } else { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch7_col(team); + else epoch7_col(team); + } else { + if (m_rsplit == 0) epoch7_col(team); + else epoch7_col(team); + } + }); +} + +} diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 25cdc133fa6f..1655229bd04c 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -30,6 +30,8 @@ #include "profiling.hpp" #include "ErrorDefs.hpp" +#include "Hommexx_config.h" + #include namespace Homme { @@ -107,7 +109,7 @@ struct CaarFunctorImpl { struct TagPostExchange {}; // Policies -#ifndef NDEBUG +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else @@ -123,6 +125,7 @@ struct CaarFunctorImpl { Kokkos::Array, NUM_TIME_LEVELS> m_bes; + public: CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */, const ReferenceElement &ref_FE, const HybridVCoord &hvcoord, const SphereOperators &sphere_ops, const SimulationParams& params) @@ -179,7 +182,7 @@ struct CaarFunctorImpl { } int requested_buffer_size () const { - // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs + // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs. const int nslots = m_tu.get_num_ws_slots(); int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf; @@ -190,8 +193,9 @@ struct CaarFunctorImpl { // Depending on rsplit/hydro-mode, we may remove some // buffers that are not needed from the counters above. if (m_theta_hydrostatic_mode) { - // pi=pnh, and no wtens/phitens + // pi=pnh, and no dpnh_dp_i/phitens num_scalar_mid_buf -= 1; + // No dpnh_dp_i/phitens/wtens num_scalar_int_buf -= 3; // No grad_w_i/v_i @@ -347,9 +351,9 @@ struct CaarFunctorImpl { int nerr; Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr); Kokkos::fence(); - GPTLstop("caar compute"); if (nerr > 0) check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0); + GPTLstop("caar compute"); GPTLstart("caar_bexchV"); m_bes[data.np1]->exchange(m_geometry.m_rspheremp); @@ -559,11 +563,13 @@ struct CaarFunctorImpl { kv.team_barrier(); // necessary to avoid race in column_scan_mid_to_int ColumnOps::column_scan_mid_to_int(kv,div_vdp,omega_i); + kv.team_barrier(); // Average omega_i to midpoints, and change sign, since later // omega=v*grad(pi)-average(omega_i) auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp); ColumnOps::compute_midpoint_values(kv,omega_i,omega,-1.0); }); + kv.team_barrier(); // Compute grad(pi) diff --git a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp index dd97720f1be2..037041d10315 100644 --- a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp @@ -248,6 +248,12 @@ class EquationOfState { ColumnOps::column_scan_mid_to_int(kv,integrand_provider,phi_i); } + KOKKOS_INLINE_FUNCTION static + Scalar compute_dphi (const Scalar vtheta_dp, const Scalar exner, const Scalar p) { + return PhysicalConstants::Rgas*vtheta_dp*exner/p; + } + + public: bool m_theta_hydrostatic_mode; diff --git a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp index f36966a4bca5..ca8d4f6ac7ab 100644 --- a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp @@ -49,7 +49,7 @@ struct LimiterFunctor { struct TagDp3dLimiter {}; // Policies -#ifndef NDEBUG +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else diff --git a/components/homme/test/reg_test/run_tests/test-list.cmake b/components/homme/test/reg_test/run_tests/test-list.cmake index 09607d387dcb..5eb238b90071 100644 --- a/components/homme/test/reg_test/run_tests/test-list.cmake +++ b/components/homme/test/reg_test/run_tests/test-list.cmake @@ -106,6 +106,13 @@ IF (BUILD_HOMME_THETA_KOKKOS) thetah-sl-test11conv-r0t1-cdr30-rrm thetanh-moist-bubble-sl thetanh-moist-bubble-sl-pg2) + # QLT vertical_levels not yet supported on GPU (compose_cedr_qlt.cpp:182) + IF (NOT (Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)) + LIST(APPEND HOMME_TESTS + thetah-sl-dcmip16_test1pg2-kokkos.cmake) + LIST(APPEND HOMME_ONEOFF_CVF_TESTS + thetah-sl-dcmip16_test1pg2) + ENDIF() ENDIF() ENDIF() IF (HOMMEXX_BFB_TESTING) diff --git a/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake b/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake new file mode 100644 index 000000000000..73142474416e --- /dev/null +++ b/components/homme/test/reg_test/run_tests/thetah-sl-dcmip16_test1pg2-kokkos.cmake @@ -0,0 +1,13 @@ +# The name of this test (should be the basename of this file) +SET(TEST_NAME thetah-sl-dcmip16_test1pg2-kokkos) +# The specifically compiled executable that this test uses +SET(EXEC_NAME theta-l-nlev30-kokkos) + +SET(NUM_CPUS 16) + +SET(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/thetah-sl-dcmip16_test1pg2.nl) +SET(VCOORD_FILES ${HOMME_ROOT}/test/vcoord/*30*) + +# compare all of these files against baselines: +SET(NC_OUTPUT_FILES + dcmip2016_test1_pg21.nc)