diff --git a/.github/workflows/lfric_test.yml b/.github/workflows/lfric_test.yml
index 254e0e3b7f..1c6c4b4ba1 100644
--- a/.github/workflows/lfric_test.yml
+++ b/.github/workflows/lfric_test.yml
@@ -149,8 +149,16 @@ jobs:
         rm -rf working-gh-ompoffload
         export BUILD_START="${SECONDS}"
         LFRIC_OFFLOAD_DIRECTIVES=omp ./build/local_build.py -v -j ${NUM_PARALLEL} -p psyclone-test \
-          -w working-gh-ompoffload gungho_model
+          -w working-gh-ompoffload gungho_model |& tee output.txt
+        # Piping to tee hides the exit code of the build command. Save it
+        # straight away: PIPESTATUS is overwritten by the very next command
+        # (including the '[[ ]]' test below), so it cannot be read twice.
+        build_status=${PIPESTATUS[0]}
+        if [[ ${build_status} -ne 0 ]]; then
+          exit ${build_status}
+        fi
         export BUILD_ELAPSED=$((${SECONDS}-${BUILD_START}))
+        ${PSYCLONE_LFRIC_DIR}/aggregate_gpu_stats.sh output.txt
         cd applications/gungho_model/example
         rm -f timer.txt gungho_model-checksums.txt # In case there were from a previous run
         export OMP_NUM_THREADS=12
diff --git a/examples/lfric/scripts/aggregate_gpu_stats.sh b/examples/lfric/scripts/aggregate_gpu_stats.sh
new file mode 100755
index 0000000000..af21075cb6
--- /dev/null
+++ b/examples/lfric/scripts/aggregate_gpu_stats.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2018-2026, Science and Technology Facilities Council.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Authors: S. Siso, STFC Daresbury Lab
+
+# Summarise the GPU-offloading messages found in a build log: print how many
+# distinct lines report each kind of success/failure, then exit with an error
+# if the number of successful transformations is below the expected minimum.
+#
+# Usage: aggregate_gpu_stats.sh <build-output-file>
+
+# Check a filename argument is given
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $(basename "$0") <build-output-file>" >&2
+    exit 1
+fi
+filename=$1
+if [[ ! -r "$filename" ]]; then
+    echo "$filename does not exist or is not readable" >&2
+    exit 1
+fi
+
+
+# Print the number of distinct lines in $filename that contain the fixed
+# string $1 (identical lines are only counted once).
+num_uniq() {
+    grep -F -- "$1" "$filename" | sort -u | wc -l
+}
+
+# Print "<message>: <count>" for the message given in $1.
+count_uniq() {
+    echo -n "$1: "
+    num_uniq "$1"
+}
+
+# Exit with an error if fewer than $2 distinct lines contain the message $1.
+check_above() {
+    local value
+    value=$(num_uniq "$1")
+    if [[ $value -lt $2 ]]; then
+        echo
+        echo "Error: Number of $1 is $value, which is below $2" >&2
+        exit 1
+    fi
+}
+
+echo " --- First we need to be able to modify kernels ---"
+count_uniq "Module-inline successful"
+count_uniq "Module-inline failed"
+echo -n " -> "
+count_uniq "because it accesses data from its outer scope"
+echo
+echo " --- Then we need to inline them, or fallback to GPU routine annotations ---"
+count_uniq "Inline successful"
+count_uniq "Inline failed"
+count_uniq "Annotation successful"
+count_uniq "Annotation failed"
+echo -n " -> "
+count_uniq "accesses the imported symbol"
+echo -n " -> "
+count_uniq "calls another routine"
+echo -n " -> "
+count_uniq "calls intrinsic"
+echo -n " -> "
+count_uniq "only supports the transformation of a MATMUL operation when"
+
+echo
+echo " --- Then offload each loop with kernels inside ---"
+count_uniq "Offload independent loop"
+count_uniq "Offload with dof loop"
+count_uniq "Offload with atomics"
+count_uniq "Offload with cell colouring"
+count_uniq "Offload with cell tile-colouring"
+count_uniq "Failed to offload"
+count_uniq "Added inner loop nested parallelism"
+count_uniq "Added OMP threading"
+
+# Fail if any of these counts is below its minimum.
+check_above "Module-inline successful" 277
+check_above "Offload independent loop" 88
+check_above "Offload with cell colouring" 40
diff --git a/examples/lfric/scripts/gpu_offloading.py b/examples/lfric/scripts/gpu_offloading.py
index 4f7d6de924..d88cfa2d61 100644
--- a/examples/lfric/scripts/gpu_offloading.py
+++ b/examples/lfric/scripts/gpu_offloading.py
@@ -36,9 +36,8 @@
 #          S. Siso, STFC Daresbury Lab
 #          L. Mosimann, NVIDIA.
 
-'''PSyclone transformation script for LFRic to apply colouring and GPU
-offloading. Also adds redundant computation to the level-1 halo for
-setval_* generically.
+'''PSyclone transformation script for LFRic to apply GPU offloading directives. +Also adds redundant computation to the level-1 halo for setval_c kernels. ''' import os @@ -52,16 +51,16 @@ Call, Directive, IntrinsicCall, Loop, Routine, Schedule) from psyclone.psyir.transformations import ( ACCKernelsTrans, Matmul2CodeTrans, OMPTargetTrans, TransformationError, - OMPDeclareTargetTrans, OMPParallelTrans, ACCLoopTrans) + OMPDeclareTargetTrans, OMPParallelTrans, InlineTrans) from psyclone.transformations import ( LFRicColourTrans, LFRicOMPLoopTrans, ACCParallelTrans, ACCRoutineTrans, OMPLoopTrans) +from psyclone.psyir.transformations import ACCLoopTrans # Names of any invoke that we won't add any GPU offloading -INVOKE_EXCLUSIONS = [ -] +INVOKE_EXCLUSIONS = [] # We won't attempt to inline calls to routines with names that contain # these strings (because they're not computationally important). @@ -69,6 +68,8 @@ OFFLOAD_DIRECTIVES = os.getenv('LFRIC_OFFLOAD_DIRECTIVES', "none") +RESOLVE_IMPORTS = ['constants_mod'] + def _replace_matmuls(sched: Schedule): ''' @@ -98,8 +99,7 @@ def _replace_matmuls(sched: Schedule): def trans(psyir): '''Applies PSyclone colouring and GPU offloading transformations. Any kernels that cannot be offloaded to GPU are parallelised using OpenMP - on the CPU. Any setval_* kernels are transformed so as to compute - into the L1 halos. + on the CPU. :param psyir: the PSyIR of the PSy-layer.
:type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` @@ -111,6 +111,7 @@ def trans(psyir): const = LFRicConstants() cpu_parallel = OMPParallelTrans() mod_inline_trans = KernelModuleInlineTrans() + inline_trans = InlineTrans() if OFFLOAD_DIRECTIVES == "omp": # Use OpenMP offloading @@ -141,13 +142,13 @@ def trans(psyir): print(f"Transforming invoke '{subroutine.name}' ...") - # Make setval_* compute redundantly to the level 1 halo if it + # Make setval_c compute redundantly to the level 1 halo if it # is in its own loop and only if it has an iteration space that is # *not* restricted to owned dofs. for loop in subroutine.loops(): if loop.iteration_space == "dof": if len(loop.kernels()) == 1: - if loop.kernels()[0].name in ["setval_c", "setval_x"]: + if loop.kernels()[0].name in ["setval_c"]: rtrans.apply(loop, options={"depth": 1}) if psyir.name.lower() in INVOKE_EXCLUSIONS: @@ -168,41 +169,52 @@ def trans(psyir): const.VALID_DISCONTINUOUS_NAMES): ctrans.apply(loop) - # Module-inline the Kernels inside the loops and then mark them as - # GPU-enabled. - # (The latter step won't be necessary if/when we fully inline them.) + # We need to Module-inline all Kernels inside the loops and then mark + # them with GPU-enabling directive or Inline them. for loop in subroutine.loops(): - if offload: - for kern in loop.kernels(): - if isinstance(kern, LFRicBuiltIn): - # BuiltIns are replaced with inlined code when lowering - continue - # Attempt to module-inline the kernel. - try: - mod_inline_trans.apply(kern) - print(f"Module-inlined kernel '{kern.name}'") - except TransformationError as err: - failed_inline.add(kern.name.lower()) - print(f"Failed to module-inline kernel " - f"'{kern.name}' due to:\n{err.value}") + for kern in loop.kernels(): + if not offload or isinstance(kern, LFRicBuiltIn): + # BuiltIns and kernels inside excluded invokes do + # not need inlining/annotations. + continue + + # Attempt to module-inline the kernel. 
+ try: + mod_inline_trans.apply(kern) + print(f"Module-inline successful for kernel " + f"'{kern.name}'") + except TransformationError as err: + failed_inline.add(kern.name.lower()) + print(f"Module-inline failed for kernel " + f"'{kern.name}' due to:\n{err.value}") + + # Attempt to fully inline the kernel + try: + # Ensure any MATMULs within the kernel are also inlined + for routine in kern.get_callees(): + _replace_matmuls(routine) + + inline_trans.apply(kern) + print(f"Inline successful for kernel " + f"'{kern.name}'") + continue + except TransformationError as err: + failed_inline.add(kern.name.lower()) + print(f"Inline failed for kernel " + f"'{kern.name}' due to:\n{err.value}") + + # If it cannot be inlined, fallback to annotate the + # kernel with GPU routine directives. try: - # Ensure any MATMULs within the kernel are - # replaced. - for routine in kern.get_callees(): - _replace_matmuls(routine) - # Finally, annotate the kernel routine for GPU. gpu_annotation_trans.apply(kern) - print(f"Annotated kernel '{kern.name}'") + print(f"Annotation successful for kernel " + f"'{kern.name}'") except TransformationError as err: failed_to_offload.add(kern.name.lower()) - print(f"Failed to annotate '{kern.name}' with " - f"GPU-enabled directive due to:\n" - f"{err.value}") - # For annotated/inlined kernels we could attempt to - # provide compile-time dimensions for temporary arrays - # and convert to code any unsupported intrinsics. - - # Add GPU offloading to loops unless they are over colours or are null. 
+ print(f"Annotation failed for kernel '{kern.name}' " + f"due to:\n{err.value}") + + # Add GPU offloading to loops for loop in subroutine.walk(Loop): kernel_names = [k.name.lower() for k in loop.kernels()] if offload and all(name not in failed_to_offload for name in @@ -214,10 +226,12 @@ def trans(psyir): loop_offloading_trans.apply( loop, options={"independent": True}) gpu_region_trans.apply(loop.ancestor(Directive)) + print(f"Offload with cell colouring: {kernel_names}") if loop.loop_type == "": loop_offloading_trans.apply( loop, options={"independent": True}) gpu_region_trans.apply(loop.ancestor(Directive)) + print(f"Offload independent loop: {kernel_names}") if loop.loop_type == "dof": # Loops over dofs can contains reductions if kernels_trans: @@ -231,9 +245,9 @@ def trans(psyir): loop_offloading_trans.apply( loop, options={"independent": True}) gpu_region_trans.apply(loop.ancestor(Directive)) + print(f"Offload with dof loop: {kernel_names}") # Alternatively we could use loop parallelism with # reduction clauses - print(f"Successfully offloaded loop with {kernel_names}") except TransformationError as err: print(f"Failed to offload loop with {kernel_names} " f"because: {err}") diff --git a/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py b/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py index 27bea9bf14..1b559832a4 100644 --- a/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py +++ b/src/psyclone/psyir/transformations/mark_routine_for_gpu_mixin.py @@ -136,8 +136,8 @@ def validate_it_can_run_on_gpu(self, node, options, **kwargs): # An import of a compile-time constant is fine. continue raise TransformationError( - f"{k_or_r} '{node.name}' accesses the symbol " - f"'{symbol}' which is imported. If this symbol " + f"{k_or_r} '{node.name}' accesses the imported symbol " + f"'{symbol}'. 
If this symbol " f"represents data then it must first be converted to a" f" {k_or_r} argument using the " f"KernelImportsToArguments transformation.") diff --git a/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py b/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py index fda730c02c..c2a190a44c 100644 --- a/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py +++ b/src/psyclone/tests/domain/gocean/transformations/gocean1p0_transformations_test.py @@ -1488,9 +1488,9 @@ def test_accroutinetrans_module_use(): rtrans = ACCRoutineTrans() with pytest.raises(TransformationError) as err: rtrans.apply(kernels[0]) - assert ("accesses the symbol 'magic: Symbol' which is imported. If this symbol " - "represents data then it must first" in str(err.value)) + assert ("accesses the imported symbol 'magic: Symbol'. If this symbol represents data then it must first" + in str(err.value)) # Tell the ModuleManager where to find the module that is being USED by # the kernel. mod_man = ModuleManager.get() @@ -1500,9 +1500,9 @@ def test_accroutinetrans_module_use(): with pytest.raises(TransformationError) as err: rtrans.apply(kernels[0]) assert ("Transformation Error: Kernel 'kernel_with_use_code' accesses " - "the symbol 'magic: DataSymbol, Import(container='model_mod')>' which is " - "imported. If this symbol represents data then it must first be " + "the imported symbol 'magic: DataSymbol, Import(container='model_mod')>'. " + "If this symbol represents data then it must first be " "converted to a Kernel argument using the " "KernelImportsToArguments transformation." 
in str(err.value)) diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index 1f2c647fd6..6672a243da 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -176,9 +176,9 @@ def test_gpumixin_validate_no_import(fortran_reader): rtrans = ACCRoutineTrans() with pytest.raises(TransformationError) as err: rtrans.validate(routine) - assert ("Transformation Error: routine 'my_sub' accesses the symbol " - "'some_data: Symbol' which is " - "imported. If this symbol represents data " + assert ("Transformation Error: routine 'my_sub' accesses the imported " + "symbol 'some_data: Symbol'. " + "If this symbol represents data " "then it must first be converted to a routine argument using the " "KernelImportsToArguments transformation." in str(err.value)) # Specialise the imported symbol and make it constant. @@ -239,8 +239,8 @@ def test_gpumixin_validate_no_cblock(fortran_reader): routine = psyir.walk(Routine)[0] with pytest.raises(TransformationError) as err: rtrans.validate(routine, options={'force': True}) - assert ("Transformation Error: routine 'my_sub' accesses the symbol " - "'some_data: Symbol, shape=[10, 10]>, " - "Import(container='my_mod')>' which is imported. If this symbol " + assert ("routine 'my_subroutine' accesses the imported symbol " + "'a: DataSymbol, shape=[10, 10]>," + " Import(container='my_mod')>'. If this symbol " "represents data then it must first be converted to a routine " "argument using the KernelImportsToArguments transformation." 
in str(err.value)) @@ -448,9 +448,9 @@ def test_ompdeclaretargettrans_with_globals(sample_psyir, parser): ref1.replace_with(block) with pytest.raises(TransformationError) as err: ompdeclaretargettrans.apply(routine) - assert ("routine 'my_subroutine' accesses the symbol 'a: DataSymbol, shape=[10, 10]>, " - "Import(container='my_mod')>' which is imported. If this symbol " + assert ("routine 'my_subroutine' accesses the imported symbol " + "'a: DataSymbol, shape=[10, 10]>," + " Import(container='my_mod')>'. If this symbol " "represents data then it must first be converted to a routine " "argument using the KernelImportsToArguments transformation." in str(err.value))