stfc · sergisiso · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 5, 2026
diff --git a/.github/workflows/lfric_test.yml b/.github/workflows/lfric_test.yml
@@ -149,8 +149,13 @@ jobs:
         rm -rf working-gh-ompoffload
         export BUILD_START="${SECONDS}"
         LFRIC_OFFLOAD_DIRECTIVES=omp ./build/local_build.py -v -j ${NUM_PARALLEL} -p psyclone-test \
-          -w working-gh-ompoffload gungho_model
+          -w working-gh-ompoffload gungho_model |& tee output.txt
+        # Piping to tee hides the build's exit code. Save it straight away:
+        # PIPESTATUS only describes the most recently executed pipeline.
+        BUILD_STATUS=${PIPESTATUS[0]}
+        if [[ ${BUILD_STATUS} -ne 0 ]]; then exit "${BUILD_STATUS}"; fi
         export BUILD_ELAPSED=$((${SECONDS}-${BUILD_START}))
+        ${PSYCLONE_LFRIC_DIR}/aggregate_gpu_stats.sh output.txt
         cd applications/gungho_model/example
         rm -f timer.txt gungho_model-checksums.txt  # In case there were from a previous run
         export OMP_NUM_THREADS=12

diff --git a/examples/lfric/scripts/aggregate_gpu_stats.sh b/examples/lfric/scripts/aggregate_gpu_stats.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2018-2026, Science and Technology Facilities Council.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Authors: S. Siso, STFC Daresbury Lab
+
+# Check a filename argument is given
+if [[ $# -ne 1 ]]; then
+    echo "Usage: gpu_stats.sh <filename>"
+    exit 1
+fi
+filename=$1
+if [[ ! -r "$filename" ]]; then
+    echo "$filename does not exist or is not readable" 
+    exit 1
+fi
+
+
+# Print "<text>: <n>", n = number of distinct lines of $filename containing $1
+count_uniq() {
+    echo "$1: $(grep -F -- "$1" "$filename" | sort -u | wc -l)"
+}
+
+# Exit with status 1 unless at least $2 distinct lines of $filename contain $1
+check_above() {
+    if [[ $(grep -F -- "$1" "$filename" | sort -u | wc -l) -lt $2 ]]; then
+        echo >&2
+        echo "Error: Number of $1 is below $2" >&2
+        exit 1
+    fi
+}
+
+echo " --- First we need to be able to modify kernels ---"
+count_uniq "Module-inline successful"
+count_uniq "Module-inline failed"
+echo -n "   -> "
+count_uniq "because it accesses data from its outer scope"
+echo
+echo " --- Then we need to inline them, or fallback to GPU routine annotations ---"
+count_uniq "Inline successful"
+count_uniq "Inline failed"
+count_uniq "Annotation successful"
+count_uniq "Annotation failed"
+echo -n "   -> "
+count_uniq "accesses the imported symbol"
+echo -n "   -> "
+count_uniq "calls another routine"
+echo -n "   -> "
+count_uniq "calls intrinsic"
+echo -n "   -> "
+count_uniq "only supports the transformation of a MATMUL operation when"
+
+echo
+echo " --- Then offload each loop with kernels inside ---"
+count_uniq "Offload independent loop"
+count_uniq "Offload with dof loop"
+count_uniq "Offload with atomics"
+count_uniq "Offload with cell colouring"
+count_uniq "Offload with cell tile-colouring"
+count_uniq "Failed to offload"
+count_uniq "Added inner loop nested parallelism"
+count_uniq "Added OMP threading"
+
+check_above "Module-inline successful" 277
+check_above "Offload independent loop" 88
+check_above "Offload with cell colouring" 40
diff --git a/examples/lfric/scripts/gpu_offloading.py b/examples/lfric/scripts/gpu_offloading.py
@@ -36,9 +36,8 @@
 #          S. Siso, STFC Daresbury Lab
 #          L. Mosimann, NVIDIA.
 
-'''PSyclone transformation script for LFRic to apply colouring and GPU
-offloading. Also adds redundant computation to the level-1 halo for
-setval_* generically.
+'''PSyclone transformation script for LFRic to apply GPU offloading directives.
+Also adds redundant computation to the level-1 halo for setval_* generically.
 
 '''
 import os
@@ -52,23 +51,25 @@
     Call, Directive, IntrinsicCall, Loop, Routine, Schedule)
 from psyclone.psyir.transformations import (
     ACCKernelsTrans, Matmul2CodeTrans, OMPTargetTrans, TransformationError,
-    OMPDeclareTargetTrans, OMPParallelTrans, ACCLoopTrans)
+    OMPDeclareTargetTrans, OMPParallelTrans, InlineTrans)
 from psyclone.transformations import (
     LFRicColourTrans, LFRicOMPLoopTrans,
     ACCParallelTrans, ACCRoutineTrans,
     OMPLoopTrans)
+from psyclone.psyir.transformations import ACCLoopTrans
 
 
 # Names of any invoke that we won't add any GPU offloading
-INVOKE_EXCLUSIONS = [
-]
+INVOKE_EXCLUSIONS = []
 
 # We won't attempt to inline calls to routines with names that contain
 # these strings (because they're not computationally important).
 INLINE_EXCLUSIONS = ["abort", "logging"]
 
 OFFLOAD_DIRECTIVES = os.getenv('LFRIC_OFFLOAD_DIRECTIVES', "none")
 
+RESOLVE_IMPORTS = ['constants_mod']
+
 
 def _replace_matmuls(sched: Schedule):
     '''
@@ -98,8 +99,7 @@ def _replace_matmuls(sched: Schedule):
 def trans(psyir):
     '''Applies PSyclone colouring and GPU offloading transformations. Any
     kernels that cannot be offloaded to GPU are parallelised using OpenMP
-    on the CPU. Any setval_* kernels are transformed so as to compute
-    into the L1 halos.
+    on the CPU.
 
     :param psyir: the PSyIR of the PSy-layer.
     :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer`
@@ -111,6 +111,7 @@ def trans(psyir):
     const = LFRicConstants()
     cpu_parallel = OMPParallelTrans()
     mod_inline_trans = KernelModuleInlineTrans()
+    inline_trans = InlineTrans()
 
     if OFFLOAD_DIRECTIVES == "omp":
         # Use OpenMP offloading
@@ -141,13 +142,13 @@ def trans(psyir):
 
         print(f"Transforming invoke '{subroutine.name}' ...")
 
-        # Make setval_* compute redundantly to the level 1 halo if it
+        # Make setval_c compute redundantly to the level 1 halo if it
         # is in its own loop and only if it has an iteration space that is
         # *not* restricted to owned dofs.
         for loop in subroutine.loops():
             if loop.iteration_space == "dof":
                 if len(loop.kernels()) == 1:
-                    if loop.kernels()[0].name in ["setval_c", "setval_x"]:
+                    if loop.kernels()[0].name in ["setval_c"]:
                         rtrans.apply(loop, options={"depth": 1})
 
         if psyir.name.lower() in INVOKE_EXCLUSIONS:
@@ -168,41 +169,52 @@ def trans(psyir):
                         const.VALID_DISCONTINUOUS_NAMES):
                     ctrans.apply(loop)
 
-        # Module-inline the Kernels inside the loops and then mark them as
-        # GPU-enabled.
-        # (The latter step won't be necessary if/when we fully inline them.)
+        # Module-inline every Kernel inside the loops, then either fully
+        # inline it or, failing that, mark it with a GPU-enabling directive.
         for loop in subroutine.loops():
-            if offload:
-                for kern in loop.kernels():
-                    if isinstance(kern, LFRicBuiltIn):
-                        # BuiltIns are replaced with inlined code when lowering
-                        continue
-                    # Attempt to module-inline the kernel.
-                    try:
-                        mod_inline_trans.apply(kern)
-                        print(f"Module-inlined kernel '{kern.name}'")
-                    except TransformationError as err:
-                        failed_inline.add(kern.name.lower())
-                        print(f"Failed to module-inline kernel "
-                              f"'{kern.name}' due to:\n{err.value}")
+            for kern in loop.kernels():
+                if not offload or isinstance(kern, LFRicBuiltIn):
+                    # BuiltIns and kernels inside excluded invokes do
+                    # not need inlining/annotations.
+                    continue
+
+                # Attempt to module-inline the kernel.
+                try:
+                    mod_inline_trans.apply(kern)
+                    print(f"Module-inline successful for kernel "
+                          f"'{kern.name}'")
+                except TransformationError as err:
+                    failed_inline.add(kern.name.lower())
+                    print(f"Module-inline failed for kernel "
+                          f"'{kern.name}' due to:\n{err.value}")
+
+                # Attempt to fully inline the kernel
+                try:
+                    # Ensure any MATMULs within the kernel are also inlined
+                    for routine in kern.get_callees():
+                        _replace_matmuls(routine)
+
+                    inline_trans.apply(kern)
+                    print(f"Inline successful for kernel "
+                          f"'{kern.name}'")
+                    continue
+                except TransformationError as err:
+                    failed_inline.add(kern.name.lower())
+                    print(f"Inline failed for kernel "
+                          f"'{kern.name}' due to:\n{err.value}")
+
+                    # If it cannot be inlined, fall back to annotating the
+                    # kernel with GPU routine directives.
                     try:
-                        # Ensure any MATMULs within the kernel are
-                        # replaced.
-                        for routine in kern.get_callees():
-                            _replace_matmuls(routine)
-                        # Finally, annotate the kernel routine for GPU.
                         gpu_annotation_trans.apply(kern)
-                        print(f"Annotated kernel '{kern.name}'")
+                        print(f"Annotation successful for kernel "
+                              f"'{kern.name}'")
                     except TransformationError as err:
                         failed_to_offload.add(kern.name.lower())
-                        print(f"Failed to annotate '{kern.name}' with "
-                              f"GPU-enabled directive due to:\n"
-                              f"{err.value}")
-                    # For annotated/inlined kernels we could attempt to
-                    # provide compile-time dimensions for temporary arrays
-                    # and convert to code any unsupported intrinsics.
-
-        # Add GPU offloading to loops unless they are over colours or are null.
+                        print(f"Annotation failed for kernel '{kern.name}' "
+                              f"due to:\n{err.value}")
+
+        # Add GPU offloading to loops
         for loop in subroutine.walk(Loop):
             kernel_names = [k.name.lower() for k in loop.kernels()]
             if offload and all(name not in failed_to_offload for name in
@@ -214,10 +226,12 @@ def trans(psyir):
                         loop_offloading_trans.apply(
                             loop, options={"independent": True})
                         gpu_region_trans.apply(loop.ancestor(Directive))
+                        print(f"Offload with cell colouring: {kernel_names}")
                     if loop.loop_type == "":
                         loop_offloading_trans.apply(
                             loop, options={"independent": True})
                         gpu_region_trans.apply(loop.ancestor(Directive))
+                        print(f"Offload independent loop: {kernel_names}")
                     if loop.loop_type == "dof":
                         # Loops over dofs can contains reductions
                         if kernels_trans:
@@ -231,9 +245,9 @@ def trans(psyir):
                             loop_offloading_trans.apply(
                                 loop, options={"independent": True})
                             gpu_region_trans.apply(loop.ancestor(Directive))
+                        print(f"Offload with dof loop: {kernel_names}")
                         # Alternatively we could use loop parallelism with
                         # reduction clauses
-                    print(f"Successfully offloaded loop with {kernel_names}")
                 except TransformationError as err:
                     print(f"Failed to offload loop with {kernel_names} "
                           f"because: {err}")

diff --git a/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py b/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py
@@ -136,8 +136,8 @@ def validate_it_can_run_on_gpu(self, node, options, **kwargs):
                         # An import of a compile-time constant is fine.
                         continue
                     raise TransformationError(
-                        f"{k_or_r} '{node.name}' accesses the symbol "
-                        f"'{symbol}' which is imported. If this symbol "
+                        f"{k_or_r} '{node.name}' accesses the imported symbol "
+                        f"'{symbol}'. If this symbol "
                         f"represents data then it must first be converted to a"
                         f" {k_or_r} argument using the "
                         f"KernelImportsToArguments transformation.")

diff --git a/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py b/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py
@@ -1488,9 +1488,9 @@ def test_accroutinetrans_module_use():
     rtrans = ACCRoutineTrans()
     with pytest.raises(TransformationError) as err:
         rtrans.apply(kernels[0])
-    assert ("accesses the symbol 'magic: Symbol<Import(container='model_mod'"
-            ")>' which is imported. If this symbol "
-            "represents data then it must first" in str(err.value))
+    assert ("accesses the imported symbol 'magic: Symbol<Import(container="
+            "'model_mod')>'. If this symbol represents data then it must first"
+            in str(err.value))
     # Tell the ModuleManager where to find the module that is being USED by
     # the kernel.
     mod_man = ModuleManager.get()
@@ -1500,9 +1500,9 @@ def test_accroutinetrans_module_use():
     with pytest.raises(TransformationError) as err:
         rtrans.apply(kernels[0])
     assert ("Transformation Error: Kernel 'kernel_with_use_code' accesses "
-            "the symbol 'magic: DataSymbol<Scalar<REAL, Reference"
-            "[name:'go_wp']>, Import(container='model_mod')>' which is "
-            "imported. If this symbol represents data then it must first be "
+            "the imported symbol 'magic: DataSymbol<Scalar<REAL, Reference"
+            "[name:'go_wp']>, Import(container='model_mod')>'. "
+            "If this symbol represents data then it must first be "
             "converted to a Kernel argument using the "
             "KernelImportsToArguments transformation." in str(err.value))
 

diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py
@@ -176,9 +176,9 @@ def test_gpumixin_validate_no_import(fortran_reader):
     rtrans = ACCRoutineTrans()
     with pytest.raises(TransformationError) as err:
         rtrans.validate(routine)
-    assert ("Transformation Error: routine 'my_sub' accesses the symbol "
-            "'some_data: Symbol<Import(container='other_mod')>' which is "
-            "imported. If this symbol represents data "
+    assert ("Transformation Error: routine 'my_sub' accesses the imported "
+            "symbol 'some_data: Symbol<Import(container='other_mod')>'. "
+            "If this symbol represents data "
             "then it must first be converted to a routine argument using the "
             "KernelImportsToArguments transformation." in str(err.value))
     # Specialise the imported symbol and make it constant.
@@ -239,8 +239,8 @@ def test_gpumixin_validate_no_cblock(fortran_reader):
     routine = psyir.walk(Routine)[0]
     with pytest.raises(TransformationError) as err:
         rtrans.validate(routine, options={'force': True})
-    assert ("Transformation Error: routine 'my_sub' accesses the symbol "
-            "'some_data: Symbol<Import" in str(err.value))
+    assert ("Transformation Error: routine 'my_sub' accesses the imported "
+            "symbol 'some_data: Symbol<Import" in str(err.value))
 
 
 def test_gpumixin_validate_no_call():

diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py
@@ -271,7 +271,7 @@ def __getitem__(self, _):
     monkeypatch.setattr(routine, "reference_accesses", lambda: DummyVAM())
     with pytest.raises(TransformationError) as err:
         ompdeclaretargettrans.apply(routine)
-    assert "which is imported" in str(err.value)
+    assert "accesses the imported symbol" in str(err.value)
 
 
 def test_omptaskloop_no_collapse():
@@ -429,9 +429,9 @@ def test_ompdeclaretargettrans_with_globals(sample_psyir, parser):
     ref1.symbol.interface = ImportInterface(ContainerSymbol('my_mod'))
     with pytest.raises(TransformationError) as err:
         ompdeclaretargettrans.apply(routine)
-    assert ("routine 'my_subroutine' accesses the symbol 'a: DataSymbol<Array"
-            "<Scalar<INTEGER, UNDEFINED>, shape=[10, 10]>, "
-            "Import(container='my_mod')>' which is imported. If this symbol "
+    assert ("routine 'my_subroutine' accesses the imported symbol "
+            "'a: DataSymbol<Array<Scalar<INTEGER, UNDEFINED>, shape=[10, 10]>,"
+            " Import(container='my_mod')>'. If this symbol "
             "represents data then it must first be converted to a routine "
             "argument using the KernelImportsToArguments transformation."
             in str(err.value))
@@ -448,9 +448,9 @@ def test_ompdeclaretargettrans_with_globals(sample_psyir, parser):
     ref1.replace_with(block)
     with pytest.raises(TransformationError) as err:
         ompdeclaretargettrans.apply(routine)
-    assert ("routine 'my_subroutine' accesses the symbol 'a: DataSymbol<Array<"
-            "Scalar<INTEGER, UNDEFINED>, shape=[10, 10]>, "
-            "Import(container='my_mod')>' which is imported. If this symbol "
+    assert ("routine 'my_subroutine' accesses the imported symbol "
+            "'a: DataSymbol<Array<Scalar<INTEGER, UNDEFINED>, shape=[10, 10]>,"
+            " Import(container='my_mod')>'. If this symbol "
             "represents data then it must first be converted to a routine "
             "argument using the KernelImportsToArguments transformation."
             in str(err.value))